Compare commits

...

1 Commits
main ... legacy

Author SHA1 Message Date
3f3f16e881 Switch legacy build to GCC and OpenMPI 2026-04-13 19:39:30 +08:00
7 changed files with 224 additions and 186 deletions

View File

@@ -37,51 +37,56 @@ close(77)
end program checkFFT end program checkFFT
#endif #endif
!------------- SUBROUTINE four1(dataa,nn,isign)
! Optimized FFT using Intel oneMKL DFTI implicit none
! Mathematical equivalence: Standard DFT definition INTEGER::isign,nn
! Forward (isign=1): X[k] = sum_{n=0}^{N-1} x[n] * exp(-2*pi*i*k*n/N) double precision,dimension(2*nn)::dataa
! Backward (isign=-1): X[k] = sum_{n=0}^{N-1} x[n] * exp(+2*pi*i*k*n/N) INTEGER::i,istep,j,m,mmax,n
! Input/Output: dataa is interleaved complex array [Re(0),Im(0),Re(1),Im(1),...] double precision::tempi,tempr
!------------- DOUBLE PRECISION::theta,wi,wpi,wpr,wr,wtemp
SUBROUTINE four1(dataa,nn,isign) n=2*nn
use MKL_DFTI j=1
implicit none do i=1,n,2
INTEGER, intent(in) :: isign, nn if(j.gt.i)then
DOUBLE PRECISION, dimension(2*nn), intent(inout) :: dataa tempr=dataa(j)
tempi=dataa(j+1)
type(DFTI_DESCRIPTOR), pointer :: desc dataa(j)=dataa(i)
integer :: status dataa(j+1)=dataa(i+1)
dataa(i)=tempr
! Create DFTI descriptor for 1D complex-to-complex transform dataa(i+1)=tempi
status = DftiCreateDescriptor(desc, DFTI_DOUBLE, DFTI_COMPLEX, 1, nn) endif
if (status /= 0) return m=nn
1 if ((m.ge.2).and.(j.gt.m)) then
! Set input/output storage as interleaved complex (default) j=j-m
status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_INPLACE) m=m/2
if (status /= 0) then goto 1
status = DftiFreeDescriptor(desc) endif
return j=j+m
endif enddo
mmax=2
! Commit the descriptor 2 if (n.gt.mmax) then
status = DftiCommitDescriptor(desc) istep=2*mmax
if (status /= 0) then theta=6.28318530717959d0/(isign*mmax)
status = DftiFreeDescriptor(desc) wpr=-2.d0*sin(0.5d0*theta)**2
return wpi=sin(theta)
endif wr=1.d0
wi=0.d0
! Execute FFT based on direction do m=1,mmax,2
if (isign == 1) then do i=m,n,istep
! Forward FFT: exp(-2*pi*i*k*n/N) j=i+mmax
status = DftiComputeForward(desc, dataa) tempr=sngl(wr)*dataa(j)-sngl(wi)*dataa(j+1)
else tempi=sngl(wr)*dataa(j+1)+sngl(wi)*dataa(j)
! Backward FFT: exp(+2*pi*i*k*n/N) dataa(j)=dataa(i)-tempr
status = DftiComputeBackward(desc, dataa) dataa(j+1)=dataa(i+1)-tempi
endif dataa(i)=dataa(i)+tempr
dataa(i+1)=dataa(i+1)+tempi
! Free descriptor enddo
status = DftiFreeDescriptor(desc) wtemp=wr
wr=wr*wpr-wi*wpi+wr
return wi=wi*wpr+wtemp*wpi+wi
END SUBROUTINE four1 enddo
mmax=istep
goto 2
endif
return
END SUBROUTINE four1

View File

@@ -25,9 +25,23 @@ using namespace std;
#include <math.h> #include <math.h>
#include <complex.h> #include <complex.h>
#endif #endif
#include "TwoPunctures.h" #include "TwoPunctures.h"
#include <mkl_cblas.h>
extern "C" {
double cblas_ddot(const int, const double *, const int, const double *, const int);
double cblas_dnrm2(const int, const double *, const int);
void cblas_dgemm(const int, const int, const int,
const int, const int, const int,
const double, const double *, const int,
const double *, const int, const double,
double *, const int);
}
enum {
CblasRowMajor = 101,
CblasNoTrans = 111
};
TwoPunctures::TwoPunctures(double mp, double mm, double b, TwoPunctures::TwoPunctures(double mp, double mm, double b,
double P_plusx, double P_plusy, double P_plusz, double P_plusx, double P_plusy, double P_plusz,

View File

@@ -17,68 +17,106 @@ using namespace std;
#include <math.h> #include <math.h>
#endif #endif
// Intel oneMKL LAPACK interface /* Linear equation solution by Gauss-Jordan elimination.
#include <mkl_lapacke.h> a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input
/* Linear equation solution using Intel oneMKL LAPACK. containing the right-hand side vectors. On output a is
a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input replaced by its matrix inverse, and b is replaced by the
containing the right-hand side vectors. On output a is corresponding set of solution vectors. */
replaced by its matrix inverse, and b is replaced by the
corresponding set of solution vectors. int gaussj(double *a, double *b, int n)
{
Mathematical equivalence: double swap;
Solves: A * x = b => x = A^(-1) * b
Original Gauss-Jordan and LAPACK dgesv/dgetri produce identical results int *indxc, *indxr, *ipiv;
within numerical precision. */ indxc = new int[n];
indxr = new int[n];
int gaussj(double *a, double *b, int n) ipiv = new int[n];
{
// Allocate pivot array and workspace int i, icol, irow, j, k, l, ll;
lapack_int *ipiv = new lapack_int[n]; double big, dum, pivinv;
lapack_int info;
for (j = 0; j < n; j++)
// Make a copy of matrix a for solving (dgesv modifies it to LU form) ipiv[j] = 0;
double *a_copy = new double[n * n]; for (i = 0; i < n; i++)
for (int i = 0; i < n * n; i++) { {
a_copy[i] = a[i]; big = 0.0;
} for (j = 0; j < n; j++)
if (ipiv[j] != 1)
// Step 1: Solve linear system A*x = b using LU decomposition for (k = 0; k < n; k++)
// LAPACKE_dgesv uses column-major by default, but we use row-major {
info = LAPACKE_dgesv(LAPACK_ROW_MAJOR, n, 1, a_copy, n, ipiv, b, 1); if (ipiv[k] == 0)
{
if (info != 0) { if (fabs(a[j * n + k]) >= big)
cout << "gaussj: Singular Matrix (dgesv info=" << info << ")" << endl; {
delete[] ipiv; big = fabs(a[j * n + k]);
delete[] a_copy; irow = j;
return 1; icol = k;
} }
}
// Step 2: Compute matrix inverse A^(-1) using LU factorization else if (ipiv[k] > 1)
// First do LU factorization of original matrix a {
info = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, n, n, a, n, ipiv); cout << "gaussj: Singular Matrix-1" << endl;
return 1;
if (info != 0) { }
cout << "gaussj: Singular Matrix (dgetrf info=" << info << ")" << endl; }
delete[] ipiv;
delete[] a_copy; ipiv[icol] = ipiv[icol] + 1;
return 1; if (irow != icol)
} {
for (l = 0; l < n; l++)
// Then compute inverse from LU factorization {
info = LAPACKE_dgetri(LAPACK_ROW_MAJOR, n, a, n, ipiv); swap = a[irow * n + l];
a[irow * n + l] = a[icol * n + l];
if (info != 0) { a[icol * n + l] = swap;
cout << "gaussj: Singular Matrix (dgetri info=" << info << ")" << endl; }
delete[] ipiv;
delete[] a_copy; swap = b[irow];
return 1; b[irow] = b[icol];
} b[icol] = swap;
}
delete[] ipiv;
delete[] a_copy; indxr[i] = irow;
indxc[i] = icol;
return 0;
} if (a[icol * n + icol] == 0.0)
{
cout << "gaussj: Singular Matrix-2" << endl;
return 1;
}
pivinv = 1.0 / a[icol * n + icol];
a[icol * n + icol] = 1.0;
for (l = 0; l < n; l++)
a[icol * n + l] *= pivinv;
b[icol] *= pivinv;
for (ll = 0; ll < n; ll++)
if (ll != icol)
{
dum = a[ll * n + icol];
a[ll * n + icol] = 0.0;
for (l = 0; l < n; l++)
a[ll * n + l] -= a[icol * n + l] * dum;
b[ll] -= b[icol] * dum;
}
}
for (l = n - 1; l >= 0; l--)
{
if (indxr[l] != indxc[l])
for (k = 0; k < n; k++)
{
swap = a[k * n + indxr[l]];
a[k * n + indxr[l]] = a[k * n + indxc[l]];
a[k * n + indxc[l]] = swap;
}
}
delete[] indxc;
delete[] indxr;
delete[] ipiv;
return 0;
}
// for check usage // for check usage
/* /*
int main() int main()

View File

@@ -8,27 +8,16 @@ include makefile.inc
POLINT6_USE_BARY ?= 1 POLINT6_USE_BARY ?= 1
POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY) POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt) ## Legacy GNU/OpenMPI flags
## make -> opt (PGO-guided, maximum performance) CXXBASEFLAGS = -O3 -march=native -Wno-deprecated -Dfortran3 -Dnewc $(INTERP_LB_FLAGS)
## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data) F90BASEFLAGS = -O3 -march=native -cpp -fallow-argument-mismatch $(POLINT6_FLAG)
PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
ifeq ($(PGO_MODE),instrument)
ifeq ($(PGO_MODE),instrument) CXXAPPFLAGS = $(CXXBASEFLAGS)
## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability f90appflags = $(F90BASEFLAGS)
CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
else else
## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \ CXXAPPFLAGS = $(CXXBASEFLAGS)
## PGO has been turned off, now tested and found to be negative optimization f90appflags = $(F90BASEFLAGS)
## INTERP_LB_FLAGS has been turned off too, now tested and found to be negative optimization
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
endif endif
.SUFFIXES: .o .f90 .C .for .cu .SUFFIXES: .o .f90 .C .for .cu
@@ -67,17 +56,14 @@ lopsided_kodis_c.o: lopsided_kodis_c.C
#interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h #interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
# ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ # ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS ## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata TP_OPTFLAGS = $(CXXBASEFLAGS) $(TP_OPENMP_FLAGS)
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=$(TP_PROFDATA) \ TwoPunctures.o: TwoPunctures.C
-Dfortran3 -Dnewc -I${MKLROOT}/include ${CXX} $(TP_OPTFLAGS) -c $< -o $@
TwoPunctures.o: TwoPunctures.C TwoPunctureABE.o: TwoPunctureABE.C
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@ ${CXX} $(TP_OPTFLAGS) -c $< -o $@
TwoPunctureABE.o: TwoPunctureABE.C
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
# Input files # Input files
@@ -184,8 +170,8 @@ ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS) $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
TwoPunctureABE: $(TwoPunctureFILES) TwoPunctureABE: $(TwoPunctureFILES)
$(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS) $(CLINKER) $(TP_OPTFLAGS) -o $@ $(TwoPunctureFILES) $(LDLIBS)
clean: clean:
rm *.o ABE ABEGPU TwoPunctureABE make.log -f rm *.o ABE ABEGPU TwoPunctureABE make.log -f

56
AMSS_NCKU_source/makefile.inc Executable file → Normal file
View File

@@ -1,33 +1,27 @@
## GCC version (commented out) ## Legacy GNU/OpenMPI toolchain configuration
## filein = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
## filein = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
## LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
## Intel oneAPI version with oneMKL (Optimized for performance) ## OpenMPI wrappers are installed but may not be on PATH.
filein = -I/usr/include/ -I${MKLROOT}/include OMPI_BIN ?= /usr/lib64/openmpi/bin
## Using sequential MKL (OpenMP disabled for better single-threaded performance) ## Wrapper compilers
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library f90 = $(OMPI_BIN)/mpifort
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5 f77 = $(OMPI_BIN)/mpifort
CXX = $(OMPI_BIN)/mpicxx
CC = $(OMPI_BIN)/mpicc
CLINKER = $(OMPI_BIN)/mpicxx
## Memory allocator switch ## Extra include flags are not needed when using the OpenMPI wrappers.
## 1 (default) : link Intel oneTBB allocator (libtbbmalloc) filein =
## 0 : use system default allocator (ptmalloc)
USE_TBBMALLOC ?= 1
TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
ifneq ($(wildcard $(TBBMALLOC_SO)),)
TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
else
TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
endif
ifeq ($(USE_TBBMALLOC),1)
LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
endif
## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags) ## BLAS/LAPACK backend:
## opt : (default) maximum performance with PGO profile-guided optimization ## OpenBLAS on this system provides BLAS, CBLAS and LAPACK symbols.
## instrument : PGO Phase 1 instrumentation to collect fresh profile data BLAS_LAPACK_LIB ?= /lib64/libopenblaso.so.0
PGO_MODE ?= opt LDLIBS = $(BLAS_LAPACK_LIB) -lgfortran -lpthread -lm -ldl
## PGO build mode switch
## off : default legacy GNU build without PGO
## instrument : accepted for compatibility, currently same as off
PGO_MODE ?= off
## Interp_Points load balance profiling mode ## Interp_Points load balance profiling mode
## off : (default) no load balance instrumentation ## off : (default) no load balance instrumentation
@@ -49,17 +43,13 @@ endif
USE_CXX_KERNELS ?= 1 USE_CXX_KERNELS ?= 1
## RK4 kernel implementation switch ## RK4 kernel implementation switch
## 1 (default) : use C/C++ rewrite of rungekutta4_rout (for optimization experiments) ## 1 (default) : use C/C++ rewrite of rungekutta4_rout
## 0 : use original Fortran rungekutta4_rout.o ## 0 : use original Fortran rungekutta4_rout.o
USE_CXX_RK4 ?= 1 USE_CXX_RK4 ?= 1
f90 = ifx ## OpenMP is only used for TwoPunctures on the legacy toolchain.
f77 = ifx TP_OPENMP_FLAGS ?= -fopenmp
CXX = icpx
CC = icx
CLINKER = mpiicpx
Cu = nvcc Cu = nvcc
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
#CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc

View File

@@ -93,11 +93,13 @@ Here, we take the Ubuntu 22.04 system as an example
#### How to use AMSS-NCKU #### How to use AMSS-NCKU
0. Setting the parameters for compilation 0. Setting the parameters for compilation
Modify the makefile.inc file in the AMSS_NCKU_source directory and change the settings according to your computer. Modify the makefile.inc file in the AMSS_NCKU_source directory and change the settings according to your computer.
The settings for the Ubuntu 22.04 system do not need to be modified. The default configuration in this branch uses GNU compilers through the OpenMPI wrappers under `/usr/lib64/openmpi/bin`.
If your OpenMPI installation is in another location, update `OMPI_BIN` in `AMSS_NCKU_source/makefile.inc` or export `AMSS_OPENMPI_BIN` before running the Python launcher.
1. Enter the AMSS-NCKU Python code folder and modify the input. 1. Enter the AMSS-NCKU Python code folder and modify the input.

View File

@@ -9,6 +9,7 @@
import AMSS_NCKU_Input as input_data import AMSS_NCKU_Input as input_data
import os
import subprocess import subprocess
import time import time
@@ -52,6 +53,8 @@ NUMACTL_CPU_BIND = get_last_n_cores_per_socket(n=32)
## Build parallelism: match the number of bound cores ## Build parallelism: match the number of bound cores
BUILD_JOBS = 64 BUILD_JOBS = 64
OPENMPI_BIN = os.environ.get("AMSS_OPENMPI_BIN", "/usr/lib64/openmpi/bin")
MPI_RUNNER = os.path.join(OPENMPI_BIN, "mpirun")
################################################################## ##################################################################
@@ -147,11 +150,11 @@ def run_ABE():
## Define the command to run; cast other values to strings as needed ## Define the command to run; cast other values to strings as needed
if (input_data.GPU_Calculation == "no"): if (input_data.GPU_Calculation == "no"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" mpi_command = NUMACTL_CPU_BIND + " " + MPI_RUNNER + " -np " + str(input_data.MPI_processes) + " ./ABE"
#mpi_command = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE" #mpi_command = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
mpi_command_outfile = "ABE_out.log" mpi_command_outfile = "ABE_out.log"
elif (input_data.GPU_Calculation == "yes"): elif (input_data.GPU_Calculation == "yes"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU" mpi_command = NUMACTL_CPU_BIND + " " + MPI_RUNNER + " -np " + str(input_data.MPI_processes) + " ./ABEGPU"
mpi_command_outfile = "ABEGPU_out.log" mpi_command_outfile = "ABEGPU_out.log"
## Execute the MPI command and stream output ## Execute the MPI command and stream output