From a0dab90bcb217d18564b2d87f2b1be7482999a06 Mon Sep 17 00:00:00 2001
From: ianchb
Date: Wed, 29 Apr 2026 08:30:47 +0800
Subject: [PATCH] Switch to NVIDIA HPC Toolchain

Add a TOOLCHAIN switch to makefile.inc (nvhpc is the default, intel is
kept as the legacy path) and derive the compilers, compile flags, OpenMP
flag and link libraries from it.  The MKL and CUDA locations become
overridable variables (MKLROOT, MKL_LIBDIR, MKL_INC, NVHPC_ROOT,
CUDA_HOME, CUDA_ARCH) instead of hard-coded paths.

---
Reviewer notes (ignored by git am, not part of the commit message):

 * Line breaks, blank lines and indentation were reconstructed from a
   whitespace-collapsed copy of this patch.  Column alignment inside
   comments and tab-vs-space indentation of continuation lines could not
   be recovered, so apply with "git apply --ignore-whitespace" (or
   "git am --ignore-whitespace").
 * The blob ids on the "index" lines are those of the original patch and
   do not describe the post-image produced after the fixes below.
 * TP_PROFDATA is still defined (now inside the Intel branch): the Intel
   TP_OPTFLAGS expands -fprofile-instr-use=$(TP_PROFDATA), which would
   otherwise expand to an empty path.
 * -arch=$(CUDA_ARCH) is again only appended when CUDA_ARCH is non-empty,
   as in the pre-image.
 * An unsupported TOOLCHAIN value now stops the build with an error
   instead of leaving the compiler variables unset.
 * makefile.inc now ends with a newline.

 AMSS_NCKU_source/makefile     | 42 ++++++++++++------
 AMSS_NCKU_source/makefile.inc | 89 ++++++++++++++++++++++++--------------
 2 files changed, 85 insertions(+), 46 deletions(-)

diff --git a/AMSS_NCKU_source/makefile b/AMSS_NCKU_source/makefile
index 97de0c8..b1646b1 100644
--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -13,12 +13,15 @@ POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
 ## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data)
 PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
 
+ifeq ($(TOOLCHAIN),intel)
+OMP_FLAG = -qopenmp
+
 ifeq ($(PGO_MODE),instrument)
-## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
+## Intel Phase 1: instrumentation — omit -fp-model fast=2 for numerical stability (-ipo is still enabled)
 CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-	-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
+	-Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
 f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-	-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
+	-align array64byte -fpp $(MKL_INC) $(POLINT6_FLAG)
 else
 ## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \
 ## PGO has been turned off, now tested and found to be negative optimization
@@ -26,9 +29,26 @@ else
 
 
 CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-	-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
+	-Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
 f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
-	-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
+	-align array64byte -fpp $(MKL_INC) $(POLINT6_FLAG)
+endif
+
+## TwoPunctureABE uses its own PGO profile, independent of CXXAPPFLAGS
+TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
+TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
+	-fprofile-instr-use=$(TP_PROFDATA) \
+	-Dfortran3 -Dnewc $(MKL_INC)
+else
+## NVHPC defaults: mpicc/mpicxx/mpifort wrappers
+## PGO_MODE is ignored in this branch.
+OMP_FLAG = -mp
+CXXAPPFLAGS = -O3 -tp=host -Mcache_align -Mfma \
+	-Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
+f90appflags = -O3 -tp=host -Mcache_align -Mfma -Mpreprocess \
+	$(MKL_INC) $(POLINT6_FLAG)
+TP_OPTFLAGS = -O3 -tp=host -Mcache_align -Mfma \
+	-Dfortran3 -Dnewc $(MKL_INC)
 endif
 
 .SUFFIXES: .o .f90 .C .for .cu
@@ -78,17 +98,11 @@ z4c_rhs_c.o: z4c_rhs_c.C
 #interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
 #	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 
-## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
-TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
-TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-	-fprofile-instr-use=$(TP_PROFDATA) \
-	-Dfortran3 -Dnewc -I${MKLROOT}/include
-
 TwoPunctures.o: TwoPunctures.C
-	${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
+	${CXX} $(TP_OPTFLAGS) $(OMP_FLAG) -c $< -o $@
 
 TwoPunctureABE.o: TwoPunctureABE.C
-	${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
+	${CXX} $(TP_OPTFLAGS) $(OMP_FLAG) -c $< -o $@
 
 # Input files
 
@@ -242,7 +256,7 @@ ABE_CUDA: $(C++FILES) $(ABE_CUDA_CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
 #	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
 
 TwoPunctureABE: $(TwoPunctureFILES)
-	$(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
+	$(CLINKER) $(TP_OPTFLAGS) $(OMP_FLAG) -o $@ $(TwoPunctureFILES) $(LDLIBS)
 
 clean:
 	rm *.o ABE ABE_CUDA ABEGPU TwoPunctureABE make.log -f
diff --git a/AMSS_NCKU_source/makefile.inc b/AMSS_NCKU_source/makefile.inc
index ed578dd..b9b9ab0 100755
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -1,28 +1,7 @@
-## GCC version (commented out)
-## filein = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
-## filein = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
-## LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
-
-## Intel oneAPI version with oneMKL (Optimized for performance)
-filein = -I/usr/include/ -I${MKLROOT}/include
-
-## Using sequential MKL (OpenMP disabled for better single-threaded performance)
-## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
-LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
-
-## Memory allocator switch
-## 1 (default) : link Intel oneTBB allocator (libtbbmalloc)
-## 0 : use system default allocator (ptmalloc)
-USE_TBBMALLOC ?= 1
-TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
-ifneq ($(wildcard $(TBBMALLOC_SO)),)
-TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
-else
-TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
-endif
-ifeq ($(USE_TBBMALLOC),1)
-LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
-endif
+## Toolchain selection
+## nvhpc : NVIDIA HPC SDK + CUDA-aware MPI (default)
+## intel : Intel oneAPI toolchain (legacy path)
+TOOLCHAIN ?= nvhpc
 
 ## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
 ## opt : (default) maximum performance with PGO profile-guided optimization
@@ -43,6 +22,16 @@ else
 INTERP_LB_FLAGS =
 endif
 
+## MKL location (each value can be overridden on the command line or from the environment)
+MKLROOT ?= /home/intel/oneapi/mkl/latest
+MKL_LIBDIR ?= $(MKLROOT)/lib/intel64
+MKL_INC ?= -I$(MKLROOT)/include
+
+## NVIDIA HPC SDK / CUDA locations and the GPU architecture passed to nvcc
+NVHPC_ROOT ?= /home/nvidia/hpc_sdk/Linux_x86_64/25.11
+CUDA_HOME ?= $(NVHPC_ROOT)/cuda
+CUDA_ARCH ?= sm_80
+
 ## Kernel implementation switch
 ## 1 (default) : use C++ rewrite of bssn_rhs and helper kernels (faster)
 ## 0 : fall back to original Fortran kernels
@@ -58,17 +47,53 @@ USE_CXX_Z4C_KERNELS ?= 1
 ## 0 : use original Fortran rungekutta4_rout.o
 USE_CXX_RK4 ?= 1
 
+## Memory allocator switch
+## 1 (default) : link Intel oneTBB allocator (libtbbmalloc)
+## 0 : use system default allocator (ptmalloc)
+USE_TBBMALLOC ?= 1
+TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
+ifneq ($(wildcard $(TBBMALLOC_SO)),)
+TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
+else
+TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
+endif
+
+ifeq ($(TOOLCHAIN),intel)
 f90 = ifx
 f77 = ifx
 CXX = icpx
 CC = icx
 CLINKER = mpiicpx
+filein = -I/usr/include/ $(MKL_INC) -I$(CUDA_HOME)/include
+LDLIBS = -L$(MKL_LIBDIR) -Wl,-rpath,$(MKL_LIBDIR) \
+	-lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
+	-lifcore -limf -liomp5 -lpthread -lm -ldl \
+	-L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64 -lcuda -lcudart
+else ifeq ($(TOOLCHAIN),nvhpc)
+f90 = mpifort
+f77 = mpifort
+CXX = mpicxx
+CC = mpicc
+CLINKER = mpicxx
 
-Cu = nvcc
-CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
-#CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
-CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc
-CUDA_ARCH ?= sm_80
-ifneq ($(strip $(CUDA_ARCH)),)
-CUDA_APP_FLAGS += -arch=$(CUDA_ARCH)
+filein = -I/usr/include/ $(MKL_INC) -I$(CUDA_HOME)/include
+LDLIBS = -L$(MKL_LIBDIR) -Wl,-rpath,$(MKL_LIBDIR) \
+	-lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
+	-lpthread -lm -ldl \
+	-L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64 -lcuda -lcudart \
+	-fortranlibs
+else
+$(error Unsupported TOOLCHAIN '$(TOOLCHAIN)'; expected 'nvhpc' or 'intel')
 endif
+
+ifeq ($(USE_TBBMALLOC),1)
+LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
+endif
+
+## CUDA compiler and flags; -arch is only passed when CUDA_ARCH is non-empty
+Cu = $(NVHPC_ROOT)/compilers/bin/nvcc
+CUDA_LIB_PATH = -L$(CUDA_HOME)/lib64 -I$(CUDA_HOME)/include
+CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc
+ifneq ($(strip $(CUDA_ARCH)),)
+CUDA_APP_FLAGS += -arch=$(CUDA_ARCH)
+endif