From 3f3f16e881e6c60e566d6bd5851792f373a26b13 Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Mon, 13 Apr 2026 19:39:30 +0800 Subject: [PATCH] Switch legacy build to GCC and OpenMPI --- AMSS_NCKU_source/FFT.f90 | 101 ++++++++++---------- AMSS_NCKU_source/TwoPunctures.C | 20 +++- AMSS_NCKU_source/gaussj.C | 162 ++++++++++++++++++++------------ AMSS_NCKU_source/makefile | 52 ++++------ AMSS_NCKU_source/makefile.inc | 56 +++++------ README.md | 12 ++- makefile_and_run.py | 7 +- 7 files changed, 224 insertions(+), 186 deletions(-) mode change 100755 => 100644 AMSS_NCKU_source/makefile.inc diff --git a/AMSS_NCKU_source/FFT.f90 b/AMSS_NCKU_source/FFT.f90 index 7dfe727..57acf6e 100644 --- a/AMSS_NCKU_source/FFT.f90 +++ b/AMSS_NCKU_source/FFT.f90 @@ -37,51 +37,56 @@ close(77) end program checkFFT #endif -!------------- -! Optimized FFT using Intel oneMKL DFTI -! Mathematical equivalence: Standard DFT definition -! Forward (isign=1): X[k] = sum_{n=0}^{N-1} x[n] * exp(-2*pi*i*k*n/N) -! Backward (isign=-1): X[k] = sum_{n=0}^{N-1} x[n] * exp(+2*pi*i*k*n/N) -! Input/Output: dataa is interleaved complex array [Re(0),Im(0),Re(1),Im(1),...] -!------------- -SUBROUTINE four1(dataa,nn,isign) -use MKL_DFTI -implicit none -INTEGER, intent(in) :: isign, nn -DOUBLE PRECISION, dimension(2*nn), intent(inout) :: dataa - -type(DFTI_DESCRIPTOR), pointer :: desc -integer :: status - -! Create DFTI descriptor for 1D complex-to-complex transform -status = DftiCreateDescriptor(desc, DFTI_DOUBLE, DFTI_COMPLEX, 1, nn) -if (status /= 0) return - -! Set input/output storage as interleaved complex (default) -status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_INPLACE) -if (status /= 0) then - status = DftiFreeDescriptor(desc) - return -endif - -! Commit the descriptor -status = DftiCommitDescriptor(desc) -if (status /= 0) then - status = DftiFreeDescriptor(desc) - return -endif - -! Execute FFT based on direction -if (isign == 1) then - ! Forward FFT: exp(-2*pi*i*k*n/N) - status = DftiComputeForward(desc, dataa) -else - ! Backward FFT: exp(+2*pi*i*k*n/N) - status = DftiComputeBackward(desc, dataa) -endif - -! Free descriptor -status = DftiFreeDescriptor(desc) - -return -END SUBROUTINE four1 +SUBROUTINE four1(dataa,nn,isign) +implicit none +INTEGER::isign,nn +double precision,dimension(2*nn)::dataa +INTEGER::i,istep,j,m,mmax,n +double precision::tempi,tempr +DOUBLE PRECISION::theta,wi,wpi,wpr,wr,wtemp +n=2*nn +j=1 +do i=1,n,2 + if(j.gt.i)then + tempr=dataa(j) + tempi=dataa(j+1) + dataa(j)=dataa(i) + dataa(j+1)=dataa(i+1) + dataa(i)=tempr + dataa(i+1)=tempi + endif + m=nn +1 if ((m.ge.2).and.(j.gt.m)) then + j=j-m + m=m/2 +goto 1 + endif +j=j+m +enddo +mmax=2 +2 if (n.gt.mmax) then + istep=2*mmax + theta=6.28318530717959d0/(isign*mmax) + wpr=-2.d0*sin(0.5d0*theta)**2 + wpi=sin(theta) + wr=1.d0 + wi=0.d0 + do m=1,mmax,2 + do i=m,n,istep + j=i+mmax + tempr=sngl(wr)*dataa(j)-sngl(wi)*dataa(j+1) + tempi=sngl(wr)*dataa(j+1)+sngl(wi)*dataa(j) + dataa(j)=dataa(i)-tempr + dataa(j+1)=dataa(i+1)-tempi + dataa(i)=dataa(i)+tempr + dataa(i+1)=dataa(i+1)+tempi + enddo + wtemp=wr + wr=wr*wpr-wi*wpi+wr + wi=wi*wpr+wtemp*wpi+wi + enddo +mmax=istep +goto 2 +endif +return +END SUBROUTINE four1 diff --git a/AMSS_NCKU_source/TwoPunctures.C b/AMSS_NCKU_source/TwoPunctures.C index 1b6e590..c6614dc 100644 --- a/AMSS_NCKU_source/TwoPunctures.C +++ b/AMSS_NCKU_source/TwoPunctures.C @@ -25,9 +25,23 @@ using namespace std; #include #include #endif - -#include "TwoPunctures.h" -#include + +#include "TwoPunctures.h" + +extern "C" { +double cblas_ddot(const int, const double *, const int, const double *, const int); +double cblas_dnrm2(const int, const double *, const int); +void cblas_dgemm(const int, const int, const int, + const int, const int, const int, + const double, const double *, const int, + const double *, const int, const double, + double *, const int); +} + +enum { + CblasRowMajor = 101, + CblasNoTrans = 111 +}; TwoPunctures::TwoPunctures(double mp, double mm, double b, double P_plusx, double P_plusy, double P_plusz, diff --git a/AMSS_NCKU_source/gaussj.C b/AMSS_NCKU_source/gaussj.C index 86c7777..c562205 100644 --- a/AMSS_NCKU_source/gaussj.C +++ b/AMSS_NCKU_source/gaussj.C @@ -17,68 +17,106 @@ using namespace std; #include #endif -// Intel oneMKL LAPACK interface -#include -/* Linear equation solution using Intel oneMKL LAPACK. -a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input -containing the right-hand side vectors. On output a is -replaced by its matrix inverse, and b is replaced by the -corresponding set of solution vectors. - -Mathematical equivalence: - Solves: A * x = b => x = A^(-1) * b - Original Gauss-Jordan and LAPACK dgesv/dgetri produce identical results - within numerical precision. */ - -int gaussj(double *a, double *b, int n) -{ - // Allocate pivot array and workspace - lapack_int *ipiv = new lapack_int[n]; - lapack_int info; - - // Make a copy of matrix a for solving (dgesv modifies it to LU form) - double *a_copy = new double[n * n]; - for (int i = 0; i < n * n; i++) { - a_copy[i] = a[i]; - } - - // Step 1: Solve linear system A*x = b using LU decomposition - // LAPACKE_dgesv uses column-major by default, but we use row-major - info = LAPACKE_dgesv(LAPACK_ROW_MAJOR, n, 1, a_copy, n, ipiv, b, 1); - - if (info != 0) { - cout << "gaussj: Singular Matrix (dgesv info=" << info << ")" << endl; - delete[] ipiv; - delete[] a_copy; - return 1; - } - - // Step 2: Compute matrix inverse A^(-1) using LU factorization - // First do LU factorization of original matrix a - info = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, n, n, a, n, ipiv); - - if (info != 0) { - cout << "gaussj: Singular Matrix (dgetrf info=" << info << ")" << endl; - delete[] ipiv; - delete[] a_copy; - return 1; - } - - // Then compute inverse from LU factorization - info = LAPACKE_dgetri(LAPACK_ROW_MAJOR, n, a, n, ipiv); - - if (info != 0) { - cout << "gaussj: Singular Matrix (dgetri info=" << info << ")" << endl; - delete[] ipiv; - delete[] a_copy; - return 1; - } - - delete[] ipiv; - delete[] a_copy; - - return 0; -} +/* Linear equation solution by Gauss-Jordan elimination. +a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input +containing the right-hand side vectors. On output a is +replaced by its matrix inverse, and b is replaced by the +corresponding set of solution vectors. */ + +int gaussj(double *a, double *b, int n) +{ + double swap; + + int *indxc, *indxr, *ipiv; + indxc = new int[n]; + indxr = new int[n]; + ipiv = new int[n]; + + int i, icol, irow, j, k, l, ll; + double big, dum, pivinv; + + for (j = 0; j < n; j++) + ipiv[j] = 0; + for (i = 0; i < n; i++) + { + big = 0.0; + for (j = 0; j < n; j++) + if (ipiv[j] != 1) + for (k = 0; k < n; k++) + { + if (ipiv[k] == 0) + { + if (fabs(a[j * n + k]) >= big) + { + big = fabs(a[j * n + k]); + irow = j; + icol = k; + } + } + else if (ipiv[k] > 1) + { + cout << "gaussj: Singular Matrix-1" << endl; + return 1; + } + } + + ipiv[icol] = ipiv[icol] + 1; + if (irow != icol) + { + for (l = 0; l < n; l++) + { + swap = a[irow * n + l]; + a[irow * n + l] = a[icol * n + l]; + a[icol * n + l] = swap; + } + + swap = b[irow]; + b[irow] = b[icol]; + b[icol] = swap; + } + + indxr[i] = irow; + indxc[i] = icol; + + if (a[icol * n + icol] == 0.0) + { + cout << "gaussj: Singular Matrix-2" << endl; + return 1; + } + + pivinv = 1.0 / a[icol * n + icol]; + a[icol * n + icol] = 1.0; + for (l = 0; l < n; l++) + a[icol * n + l] *= pivinv; + b[icol] *= pivinv; + for (ll = 0; ll < n; ll++) + if (ll != icol) + { + dum = a[ll * n + icol]; + a[ll * n + icol] = 0.0; + for (l = 0; l < n; l++) + a[ll * n + l] -= a[icol * n + l] * dum; + b[ll] -= b[icol] * dum; + } + } + + for (l = n - 1; l >= 0; l--) + { + if (indxr[l] != indxc[l]) + for (k = 0; k < n; k++) + { + swap = a[k * n + indxr[l]]; + a[k * n + indxr[l]] = a[k * n + indxc[l]]; + a[k * n + indxc[l]] = swap; + } + } + + delete[] indxc; + delete[] indxr; + delete[] ipiv; + + return 0; +} // for check usage /* int main() diff --git a/AMSS_NCKU_source/makefile b/AMSS_NCKU_source/makefile index 72b9cbd..e0eb23c 100644 --- a/AMSS_NCKU_source/makefile +++ b/AMSS_NCKU_source/makefile @@ -8,27 +8,16 @@ include makefile.inc POLINT6_USE_BARY ?= 1 POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY) -## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt) -## make -> opt (PGO-guided, maximum performance) -## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data) -PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata - -ifeq ($(PGO_MODE),instrument) -## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability -CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \ - -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) -f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \ - -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG) +## Legacy GNU/OpenMPI flags +CXXBASEFLAGS = -O3 -march=native -Wno-deprecated -Dfortran3 -Dnewc $(INTERP_LB_FLAGS) +F90BASEFLAGS = -O3 -march=native -cpp -fallow-argument-mismatch $(POLINT6_FLAG) + +ifeq ($(PGO_MODE),instrument) +CXXAPPFLAGS = $(CXXBASEFLAGS) +f90appflags = $(F90BASEFLAGS) else -## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \ -## PGO has been turned off, now tested and found to be negative optimization -## INTERP_LB_FLAGS has been turned off too, now tested and found to be negative optimization - - -CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ - -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) -f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \ - -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG) +CXXAPPFLAGS = $(CXXBASEFLAGS) +f90appflags = $(F90BASEFLAGS) endif .SUFFIXES: .o .f90 .C .for .cu @@ -67,17 +56,14 @@ lopsided_kodis_c.o: lopsided_kodis_c.C #interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h # ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ -## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS -TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata -TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ - -fprofile-instr-use=$(TP_PROFDATA) \ - -Dfortran3 -Dnewc -I${MKLROOT}/include - -TwoPunctures.o: TwoPunctures.C - ${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@ - -TwoPunctureABE.o: TwoPunctureABE.C - ${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@ +## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS +TP_OPTFLAGS = $(CXXBASEFLAGS) $(TP_OPENMP_FLAGS) + +TwoPunctures.o: TwoPunctures.C + ${CXX} $(TP_OPTFLAGS) -c $< -o $@ + +TwoPunctureABE.o: TwoPunctureABE.C + ${CXX} $(TP_OPTFLAGS) -c $< -o $@ # Input files @@ -184,8 +170,8 @@ ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS) -TwoPunctureABE: $(TwoPunctureFILES) - $(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS) +TwoPunctureABE: $(TwoPunctureFILES) + $(CLINKER) $(TP_OPTFLAGS) -o $@ $(TwoPunctureFILES) $(LDLIBS) clean: rm *.o ABE ABEGPU TwoPunctureABE make.log -f diff --git a/AMSS_NCKU_source/makefile.inc b/AMSS_NCKU_source/makefile.inc old mode 100755 new mode 100644 index 331cff1..4b1db6f --- a/AMSS_NCKU_source/makefile.inc +++ b/AMSS_NCKU_source/makefile.inc @@ -1,33 +1,27 @@ -## GCC version (commented out) -## filein = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/ -## filein = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/ -## LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran +## Legacy GNU/OpenMPI toolchain configuration -## Intel oneAPI version with oneMKL (Optimized for performance) -filein = -I/usr/include/ -I${MKLROOT}/include +## OpenMPI wrappers are installed but may not be on PATH. +OMPI_BIN ?= /usr/lib64/openmpi/bin -## Using sequential MKL (OpenMP disabled for better single-threaded performance) -## Added -lifcore for Intel Fortran runtime and -limf for Intel math library -LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5 +## Wrapper compilers +f90 = $(OMPI_BIN)/mpifort +f77 = $(OMPI_BIN)/mpifort +CXX = $(OMPI_BIN)/mpicxx +CC = $(OMPI_BIN)/mpicc +CLINKER = $(OMPI_BIN)/mpicxx -## Memory allocator switch -## 1 (default) : link Intel oneTBB allocator (libtbbmalloc) -## 0 : use system default allocator (ptmalloc) -USE_TBBMALLOC ?= 1 -TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so -ifneq ($(wildcard $(TBBMALLOC_SO)),) -TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed -else -TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed -endif -ifeq ($(USE_TBBMALLOC),1) -LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS) -endif +## Extra include flags are not needed when using the OpenMPI wrappers. +filein = -## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags) -## opt : (default) maximum performance with PGO profile-guided optimization -## instrument : PGO Phase 1 instrumentation to collect fresh profile data -PGO_MODE ?= opt +## BLAS/LAPACK backend: +## OpenBLAS on this system provides BLAS, CBLAS and LAPACK symbols. +BLAS_LAPACK_LIB ?= /lib64/libopenblaso.so.0 +LDLIBS = $(BLAS_LAPACK_LIB) -lgfortran -lpthread -lm -ldl + +## PGO build mode switch +## off : default legacy GNU build without PGO +## instrument : accepted for compatibility, currently same as off +PGO_MODE ?= off ## Interp_Points load balance profiling mode ## off : (default) no load balance instrumentation @@ -49,17 +43,13 @@ endif USE_CXX_KERNELS ?= 1 ## RK4 kernel implementation switch -## 1 (default) : use C/C++ rewrite of rungekutta4_rout (for optimization experiments) +## 1 (default) : use C/C++ rewrite of rungekutta4_rout ## 0 : use original Fortran rungekutta4_rout.o USE_CXX_RK4 ?= 1 -f90 = ifx -f77 = ifx -CXX = icpx -CC = icx -CLINKER = mpiicpx +## OpenMP is only used for TwoPunctures on the legacy toolchain. +TP_OPENMP_FLAGS ?= -fopenmp Cu = nvcc CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include -#CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc diff --git a/README.md b/README.md index 0f83d45..2b854bd 100644 --- a/README.md +++ b/README.md @@ -93,11 +93,13 @@ Here, we take the Ubuntu 22.04 system as an example #### How to use AMSS-NCKU -0. Setting the parameters for compilation - - Modify the makefile.inc file in the AMSS_NCKU_source directory and change the settings according to your computer. - - The settings for the Ubuntu 22.04 system do not need to be modified. +0. Setting the parameters for compilation + + Modify the makefile.inc file in the AMSS_NCKU_source directory and change the settings according to your computer. + + The default configuration in this branch uses GNU compilers through the OpenMPI wrappers under `/usr/lib64/openmpi/bin`. + + If your OpenMPI installation is in another location, update `OMPI_BIN` in `AMSS_NCKU_source/makefile.inc` or export `AMSS_OPENMPI_BIN` before running the Python launcher. 1. Enter the AMSS-NCKU Python code folder and modify the input. diff --git a/makefile_and_run.py b/makefile_and_run.py index 5682476..6f5c6fa 100755 --- a/makefile_and_run.py +++ b/makefile_and_run.py @@ -9,6 +9,7 @@ import AMSS_NCKU_Input as input_data +import os import subprocess import time @@ -52,6 +53,8 @@ NUMACTL_CPU_BIND = get_last_n_cores_per_socket(n=32) ## Build parallelism: match the number of bound cores BUILD_JOBS = 64 +OPENMPI_BIN = os.environ.get("AMSS_OPENMPI_BIN", "/usr/lib64/openmpi/bin") +MPI_RUNNER = os.path.join(OPENMPI_BIN, "mpirun") ################################################################## @@ -147,11 +150,11 @@ def run_ABE(): ## Define the command to run; cast other values to strings as needed if (input_data.GPU_Calculation == "no"): - mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" + mpi_command = NUMACTL_CPU_BIND + " " + MPI_RUNNER + " -np " + str(input_data.MPI_processes) + " ./ABE" #mpi_command = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" mpi_command_outfile = "ABE_out.log" elif (input_data.GPU_Calculation == "yes"): - mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU" + mpi_command = NUMACTL_CPU_BIND + " " + MPI_RUNNER + " -np " + str(input_data.MPI_processes) + " ./ABEGPU" mpi_command_outfile = "ABEGPU_out.log" ## Execute the MPI command and stream output