Switch legacy build to GCC and OpenMPI

2026-04-13 19:39:30 +08:00
7 changed files with 224 additions and 186 deletions
--- a/AMSS_NCKU_source/FFT.f90
+++ b/AMSS_NCKU_source/FFT.f90
@@ -37,51 +37,56 @@ close(77)
 end program checkFFT
 #endif
-!-------------
+!-------------
+! In-place radix-2 complex FFT (bit reversal followed by Danielson-Lanczos
+! butterflies), used instead of the Intel oneMKL DFTI call.
+!
+! dataa : 2*nn reals holding nn complex values, interleaved as
+!         [Re(1),Im(1),Re(2),Im(2),...]; overwritten with the transform.
+! nn    : number of complex points. NOTE(review): the bit-reversal and
+!         butterfly stages assume nn is a power of two; this is not checked.
+! isign : sign of the exponent. The twiddle recurrence below advances by
+!         exp(+i*theta) with theta = 2*pi/(isign*mmax), so isign=+1 applies
+!         exp(+2*pi*i*k*n/nn) and isign=-1 applies exp(-2*pi*i*k*n/nn).
+!         This is the opposite sign to the Forward (isign=1) convention
+!         documented for the removed MKL version. TODO confirm which
+!         convention the callers expect.
+!
+! No 1/nn normalisation is applied in either direction.
+! All twiddle arithmetic is kept in double precision (the earlier sngl()
+! truncation of wr and wi limited the accuracy to single precision).
+!-------------
+SUBROUTINE four1(dataa,nn,isign)
-! Optimized FFT using Intel oneMKL DFTI
+implicit none
-! Mathematical equivalence: Standard DFT definition
+INTEGER,intent(in)::isign,nn
-!   Forward (isign=1):  X[k] = sum_{n=0}^{N-1} x[n] * exp(-2*pi*i*k*n/N)
+double precision,dimension(2*nn),intent(inout)::dataa
-!   Backward (isign=-1): X[k] = sum_{n=0}^{N-1} x[n] * exp(+2*pi*i*k*n/N)
+INTEGER::i,istep,j,m,mmax,n
-! Input/Output: dataa is interleaved complex array [Re(0),Im(0),Re(1),Im(1),...]
+double precision::tempi,tempr
-!-------------
+DOUBLE PRECISION::theta,wi,wpi,wpr,wr,wtemp
-SUBROUTINE four1(dataa,nn,isign)
+n=2*nn
-use MKL_DFTI
+! Stage 1: bit-reversal permutation; i and j index the real part of a pair.
+j=1
-implicit none
+do i=1,n,2
-INTEGER, intent(in) :: isign, nn
+  if(j.gt.i)then
-DOUBLE PRECISION, dimension(2*nn), intent(inout) :: dataa
+     tempr=dataa(j)
-
+     tempi=dataa(j+1)
-type(DFTI_DESCRIPTOR), pointer :: desc
+     dataa(j)=dataa(i)
-integer :: status
+     dataa(j+1)=dataa(i+1)
-
+     dataa(i)=tempr
-! Create DFTI descriptor for 1D complex-to-complex transform
+     dataa(i+1)=tempi
-status = DftiCreateDescriptor(desc, DFTI_DOUBLE, DFTI_COMPLEX, 1, nn)
+  endif
-if (status /= 0) return
+  m=nn
-
+! Advance j to the bit-reversed successor of its current value.
+1 if ((m.ge.2).and.(j.gt.m)) then
-! Set input/output storage as interleaved complex (default)
+  j=j-m
-status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_INPLACE)
+  m=m/2
-if (status /= 0) then
+goto 1
-   status = DftiFreeDescriptor(desc)
+  endif
-   return
+j=j+m
-endif
+enddo
-
+! Stage 2: butterflies; mmax is the current sub-transform length in reals.
+mmax=2
-! Commit the descriptor
+2  if (n.gt.mmax) then
-status = DftiCommitDescriptor(desc)
+     istep=2*mmax
-if (status /= 0) then
+     theta=6.28318530717959d0/(isign*mmax)
-   status = DftiFreeDescriptor(desc)
+! wpr = cos(theta)-1 and wpi = sin(theta) drive the twiddle recurrence.
+     wpr=-2.d0*sin(0.5d0*theta)**2
-   return
+     wpi=sin(theta)
-endif
+     wr=1.d0
-
+     wi=0.d0
-! Execute FFT based on direction
+     do m=1,mmax,2
-if (isign == 1) then
+       do i=m,n,istep
-   ! Forward FFT: exp(-2*pi*i*k*n/N)
+         j=i+mmax
-   status = DftiComputeForward(desc, dataa)
+! Complex product (wr,wi)*(dataa(j),dataa(j+1)) in full double precision.
+         tempr=wr*dataa(j)-wi*dataa(j+1)
-else
+         tempi=wr*dataa(j+1)+wi*dataa(j)
-   ! Backward FFT: exp(+2*pi*i*k*n/N)
+         dataa(j)=dataa(i)-tempr
-   status = DftiComputeBackward(desc, dataa)
+         dataa(j+1)=dataa(i+1)-tempi
-endif
+         dataa(i)=dataa(i)+tempr
-
+         dataa(i+1)=dataa(i+1)+tempi
-! Free descriptor
+       enddo
-status = DftiFreeDescriptor(desc)
+! (wr,wi) <- (wr,wi)*exp(i*theta)
+          wtemp=wr
-
+          wr=wr*wpr-wi*wpi+wr
-return
+          wi=wi*wpr+wtemp*wpi+wi
-END SUBROUTINE four1
+     enddo
 mmax=istep
 goto 2
 endif
 return
 END SUBROUTINE four1
--- a/AMSS_NCKU_source/TwoPunctures.C
+++ b/AMSS_NCKU_source/TwoPunctures.C
@@ -25,9 +25,23 @@ using namespace std;
 #include <math.h>
 #include <complex.h>
 #endif
-
+
-#include "TwoPunctures.h"
+#include "TwoPunctures.h"
-#include <mkl_cblas.h>
+
 // CBLAS entry points declared by hand instead of pulling in a vendor header
 // (the <mkl_cblas.h> include was dropped). The layout/transpose selectors are
 // passed as plain int; the constants below supply the two values used.
 extern "C" {
 double cblas_ddot(int n, const double *x, int incx,
                   const double *y, int incy);
 double cblas_dnrm2(int n, const double *x, int incx);
 void cblas_dgemm(int layout, int trans_a, int trans_b,
                  int m, int n, int k,
                  double alpha, const double *a, int lda,
                  const double *b, int ldb,
                  double beta, double *c, int ldc);
 }
 // Selector constants for cblas_dgemm (names follow the CBLAS interface).
 enum
 {
   CblasRowMajor = 101, // matrices are stored row-major
   CblasNoTrans = 111   // operand is used without transposition
 };
 TwoPunctures::TwoPunctures(double mp, double mm, double b,
                           double P_plusx, double P_plusy, double P_plusz,
--- a/AMSS_NCKU_source/gaussj.C
+++ b/AMSS_NCKU_source/gaussj.C
@@ -17,68 +17,106 @@ using namespace std;
 #include <math.h>
 #endif
-// Intel oneMKL LAPACK interface
+/* Linear equation solution by Gauss-Jordan elimination with full pivoting.
-#include <mkl_lapacke.h>
+a[0..n-1][0..n-1] is the input matrix, stored row-major in one flat
+array (element (r,c) is a[r * n + c]). b[0..n-1] is input
-/* Linear equation solution using Intel oneMKL LAPACK.
+containing the right-hand side vector. On output a is
-a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input
+replaced by its matrix inverse, and b is replaced by the
-containing the right-hand side vectors. On output a is
+corresponding solution vector.
+Returns 0 on success. Returns 1, after printing a message to cout, when
+the matrix is found to be singular; a and b are then left partially
+reduced. The scratch index arrays are released on every return path. */
-replaced by its matrix inverse, and b is replaced by the
+
-corresponding set of solution vectors.
+int gaussj(double *a, double *b, int n)
-
+{
-Mathematical equivalence:
+  double swap;
-  Solves: A * x = b  =>  x = A^(-1) * b
+
-  Original Gauss-Jordan and LAPACK dgesv/dgetri produce identical results
+  // indxr[i]/indxc[i]: row and column of the i-th pivot;
+  // ipiv[c]: how many times column c has been used as a pivot column.
+  int *indxc, *indxr, *ipiv;
-  within numerical precision. */
+  indxc = new int[n];
-
+  indxr = new int[n];
-int gaussj(double *a, double *b, int n)
+  ipiv = new int[n];
-{
+
-  // Allocate pivot array and workspace
+  // irow/icol start at 0 so they are never read uninitialised when no
+  // entry passes the fabs() >= big test below (e.g. NaN entries).
+  int i, icol = 0, irow = 0, j, k, l, ll;
-  lapack_int *ipiv = new lapack_int[n];
+  double big, dum, pivinv;
-  lapack_int info;
+
-
+  for (j = 0; j < n; j++)
-  // Make a copy of matrix a for solving (dgesv modifies it to LU form)
+    ipiv[j] = 0;
-  double *a_copy = new double[n * n];
+  for (i = 0; i < n; i++)
-  for (int i = 0; i < n * n; i++) {
+  {
-    a_copy[i] = a[i];
+    // Pivot search: largest |a| among rows and columns not yet pivoted.
+    big = 0.0;
-  }
+    for (j = 0; j < n; j++)
-
+      if (ipiv[j] != 1)
-  // Step 1: Solve linear system A*x = b using LU decomposition
+        for (k = 0; k < n; k++)
-  // LAPACKE_dgesv uses column-major by default, but we use row-major
+        {
-  info = LAPACKE_dgesv(LAPACK_ROW_MAJOR, n, 1, a_copy, n, ipiv, b, 1);
+          if (ipiv[k] == 0)
-
+          {
-  if (info != 0) {
+            if (fabs(a[j * n + k]) >= big)
-    cout << "gaussj: Singular Matrix (dgesv info=" << info << ")" << endl;
+            {
-    delete[] ipiv;
+              big = fabs(a[j * n + k]);
-    delete[] a_copy;
+              irow = j;
-    return 1;
+              icol = k;
-  }
+            }
-
+          }
-  // Step 2: Compute matrix inverse A^(-1) using LU factorization
+          else if (ipiv[k] > 1)
-  // First do LU factorization of original matrix a
+          {
-  info = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, n, n, a, n, ipiv);
+            cout << "gaussj: Singular Matrix-1" << endl;
-
+            // Release the scratch arrays before bailing out.
+            delete[] indxc;
+            delete[] indxr;
+            delete[] ipiv;
+            return 1;
-  if (info != 0) {
+          }
-    cout << "gaussj: Singular Matrix (dgetrf info=" << info << ")" << endl;
+        }
-    delete[] ipiv;
+
-    delete[] a_copy;
+    ipiv[icol] = ipiv[icol] + 1;
-    return 1;
+    // Swap rows so the pivot lands on the diagonal at (icol, icol).
+    if (irow != icol)
-  }
+    {
-
+      for (l = 0; l < n; l++)
-  // Then compute inverse from LU factorization
+      {
-  info = LAPACKE_dgetri(LAPACK_ROW_MAJOR, n, a, n, ipiv);
+        swap = a[irow * n + l];
-
+        a[irow * n + l] = a[icol * n + l];
-  if (info != 0) {
+        a[icol * n + l] = swap;
-    cout << "gaussj: Singular Matrix (dgetri info=" << info << ")" << endl;
+      }
-    delete[] ipiv;
+
-    delete[] a_copy;
+      swap = b[irow];
-    return 1;
+      b[irow] = b[icol];
-  }
+      b[icol] = swap;
-
+    }
-  delete[] ipiv;
+
-  delete[] a_copy;
+    indxr[i] = irow;
-
+    indxc[i] = icol;
-  return 0;
+
-}
+    if (a[icol * n + icol] == 0.0)
    {
      cout << "gaussj: Singular Matrix-2" << endl;
      // Release the scratch arrays before bailing out.
      delete[] indxc;
      delete[] indxr;
      delete[] ipiv;
      return 1;
    }
    // Scale the pivot row so the pivot becomes 1 (building the inverse in place).
    pivinv = 1.0 / a[icol * n + icol];
    a[icol * n + icol] = 1.0;
    for (l = 0; l < n; l++)
      a[icol * n + l] *= pivinv;
    b[icol] *= pivinv;
    // Eliminate the pivot column from every other row.
    for (ll = 0; ll < n; ll++)
      if (ll != icol)
      {
        dum = a[ll * n + icol];
        a[ll * n + icol] = 0.0;
        for (l = 0; l < n; l++)
          a[ll * n + l] -= a[icol * n + l] * dum;
        b[ll] -= b[icol] * dum;
      }
  }
  // Undo the row swaps by exchanging columns in reverse pivot order.
  for (l = n - 1; l >= 0; l--)
  {
    if (indxr[l] != indxc[l])
      for (k = 0; k < n; k++)
      {
        swap = a[k * n + indxr[l]];
        a[k * n + indxr[l]] = a[k * n + indxc[l]];
        a[k * n + indxc[l]] = swap;
      }
  }
  delete[] indxc;
  delete[] indxr;
  delete[] ipiv;
  return 0;
 }
 // for check usage
 /*
 int main()
--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -8,27 +8,16 @@ include makefile.inc
 POLINT6_USE_BARY ?= 1
 POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
-## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
+## Legacy GNU/OpenMPI flags
-##   make                        -> opt  (PGO-guided, maximum performance)
+CXXBASEFLAGS = -O3 -march=native -Wno-deprecated -Dfortran3 -Dnewc $(INTERP_LB_FLAGS)
-##   make PGO_MODE=instrument    -> instrument (Phase 1: collect fresh profile data)
+F90BASEFLAGS = -O3 -march=native -cpp -fallow-argument-mismatch $(POLINT6_FLAG)
-PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
+
-
+ifeq ($(PGO_MODE),instrument)
-ifeq ($(PGO_MODE),instrument)
+CXXAPPFLAGS = $(CXXBASEFLAGS)
-## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
+f90appflags = $(F90BASEFLAGS)
 CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
 f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
              -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
 else
-## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \
+CXXAPPFLAGS = $(CXXBASEFLAGS)
-## PGO has been turned off, now tested and found to be negative optimization
+f90appflags = $(F90BASEFLAGS)
 ## INTERP_LB_FLAGS has been turned off too, now tested and found to be negative optimization
 CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
 f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
 endif
 .SUFFIXES: .o .f90 .C .for .cu
@@ -67,17 +56,14 @@ lopsided_kodis_c.o: lopsided_kodis_c.C
 #interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
 #	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
-## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
+## TwoPunctures/TwoPunctureABE objects are built with CXXBASEFLAGS plus TP_OPENMP_FLAGS (no PGO profile is used on the GNU toolchain)
-TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
+TP_OPTFLAGS = $(CXXBASEFLAGS) $(TP_OPENMP_FLAGS)
-TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
+
-              -fprofile-instr-use=$(TP_PROFDATA) \
+TwoPunctures.o: TwoPunctures.C
-              -Dfortran3 -Dnewc -I${MKLROOT}/include
+	${CXX} $(TP_OPTFLAGS) -c $< -o $@
-
+
-TwoPunctures.o: TwoPunctures.C
+TwoPunctureABE.o: TwoPunctureABE.C
-	${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
+	${CXX} $(TP_OPTFLAGS) -c $< -o $@
 TwoPunctureABE.o: TwoPunctureABE.C
 	${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
 # Input files
@@ -184,8 +170,8 @@ ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
 ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
 	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
-TwoPunctureABE: $(TwoPunctureFILES)
+TwoPunctureABE: $(TwoPunctureFILES)
-	$(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
+	$(CLINKER) $(TP_OPTFLAGS) -o $@ $(TwoPunctureFILES) $(LDLIBS)
 clean:
 	rm *.o ABE ABEGPU TwoPunctureABE make.log -f
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -1,33 +1,27 @@
-## GCC version (commented out)
+## Legacy GNU/OpenMPI toolchain configuration
 ## filein  = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
 ## filein  = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
 ## LDLIBS  = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
-## Intel oneAPI version with oneMKL (Optimized for performance)
+## OpenMPI wrappers are installed but may not be on PATH.
-filein  = -I/usr/include/ -I${MKLROOT}/include
+OMPI_BIN ?= /usr/lib64/openmpi/bin
-## Using sequential MKL (OpenMP disabled for better single-threaded performance)
+## Wrapper compilers
-## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
+f90          = $(OMPI_BIN)/mpifort
-LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
+f77          = $(OMPI_BIN)/mpifort
 CXX          = $(OMPI_BIN)/mpicxx
 CC           = $(OMPI_BIN)/mpicc
 CLINKER      = $(OMPI_BIN)/mpicxx
-## Memory allocator switch
+## Extra include flags are not needed when using the OpenMPI wrappers.
-##   1 (default) : link Intel oneTBB allocator (libtbbmalloc)
+filein       =
 ##   0           : use system default allocator (ptmalloc)
 USE_TBBMALLOC ?= 1
 TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
 ifneq ($(wildcard $(TBBMALLOC_SO)),)
 TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
 else
 TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
 endif
 ifeq ($(USE_TBBMALLOC),1)
 LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
 endif
-## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
+## BLAS/LAPACK backend:
-##   opt        : (default) maximum performance with PGO profile-guided optimization
+## OpenBLAS on this system provides BLAS, CBLAS and LAPACK symbols.
-##   instrument : PGO Phase 1 instrumentation to collect fresh profile data
+BLAS_LAPACK_LIB ?= /lib64/libopenblaso.so.0
-PGO_MODE ?= opt
+LDLIBS  = $(BLAS_LAPACK_LIB) -lgfortran -lpthread -lm -ldl
 ## PGO build mode switch
 ##   off        : default legacy GNU build without PGO
 ##   instrument : accepted for compatibility, currently same as off
 PGO_MODE ?= off
 ## Interp_Points load balance profiling mode
 ##   off        : (default) no load balance instrumentation
@@ -49,17 +43,13 @@ endif
 USE_CXX_KERNELS ?= 1
 ## RK4 kernel implementation switch
-##   1 (default) : use C/C++ rewrite of rungekutta4_rout (for optimization experiments)
+##   1 (default) : use C/C++ rewrite of rungekutta4_rout
 ##   0           : use original Fortran rungekutta4_rout.o
 USE_CXX_RK4 ?= 1
-f90          = ifx
+## OpenMP is only used for TwoPunctures on the legacy toolchain.
-f77          = ifx
+TP_OPENMP_FLAGS ?= -fopenmp
 CXX          = icpx
 CC           = icx
 CLINKER      = mpiicpx
 Cu = nvcc
 CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
 #CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
 CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc
--- a/README.md
+++ b/README.md
@@ -93,11 +93,13 @@ Here, we take the Ubuntu 22.04 system as an example
 #### How to use AMSS-NCKU
-0.  Setting the parameters for compilation
+0.  Setting the parameters for compilation
-
+
-    Modify the makefile.inc file in the AMSS_NCKU_source directory and change the settings according to your computer.
+    Modify the makefile.inc file in the AMSS_NCKU_source directory and change the settings according to your computer.
-
+
-    The settings for the Ubuntu 22.04 system do not need to be modified.
+    The default configuration in this branch uses GNU compilers through the OpenMPI wrappers under `/usr/lib64/openmpi/bin`.
    If your OpenMPI installation is in another location, update `OMPI_BIN` in `AMSS_NCKU_source/makefile.inc` (the compiler wrappers used for the build) and export `AMSS_OPENMPI_BIN` (used by the Python launcher to locate `mpirun`) before running the Python launcher.
 1.  Enter the AMSS-NCKU Python code folder and modify the input.
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -9,6 +9,7 @@
 import AMSS_NCKU_Input as input_data
 import os
 import subprocess
 import time
@@ -52,6 +53,8 @@ NUMACTL_CPU_BIND = get_last_n_cores_per_socket(n=32)
 ## Build parallelism: match the number of bound cores
 BUILD_JOBS = 64
 OPENMPI_BIN = os.environ.get("AMSS_OPENMPI_BIN", "/usr/lib64/openmpi/bin")
 MPI_RUNNER = os.path.join(OPENMPI_BIN, "mpirun")
 ##################################################################
@@ -147,11 +150,11 @@ def run_ABE():
    ## Define the command to run; cast other values to strings as needed
    if (input_data.GPU_Calculation == "no"):
-        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
+        mpi_command         = NUMACTL_CPU_BIND + " " + MPI_RUNNER + " -np " + str(input_data.MPI_processes) + " ./ABE"
        #mpi_command         = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
        mpi_command_outfile = "ABE_out.log"
    elif (input_data.GPU_Calculation == "yes"):
-        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
+        mpi_command         = NUMACTL_CPU_BIND + " " + MPI_RUNNER + " -np " + str(input_data.MPI_processes) + " ./ABEGPU"
        mpi_command_outfile = "ABEGPU_out.log"
    ## Execute the MPI command and stream output