- Add TOOLCHAIN=aocc option with flang/clang++/mpicxx compilers - Replace Intel flags (-xHost/-fma/-ipo/-qopenmp) with AOCC flags (-march=znver5/-ffast-math/-flto/-fopenmp) targeting EPYC 9755 - Replace Intel oneMKL with AMD AOCL (BLIS + libFLAME + amdlibm) - Replace Intel TBBMALLOC with system jemalloc - Change MKL-specific headers to standard CBLAS/LAPACKE (TwoPunctures.C, gaussj.C) - Guard TBBMALLOC to Intel toolchain only Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
109 lines
3.8 KiB
PHP
Executable File
109 lines
3.8 KiB
PHP
Executable File
## Compiler/library toolchain used for the whole build.
## Accepted values:
##   nvhpc : NVIDIA HPC SDK + CUDA-aware MPI
##   intel : Intel oneAPI toolchain (legacy path)
##   aocc  : AMD AOCC + AOCL + OpenMPI (for AMD EPYC Zen 5, with CUDA)
## Override from the command line, e.g. `make TOOLCHAIN=aocc`.
TOOLCHAIN ?= intel

## Profile-guided-optimization (PGO) build mode.
## Applies to ABE only; TwoPunctureABE always uses the opt flags.
##   opt        : (default) maximum performance, using PGO profile-guided optimization
##   instrument : PGO phase 1, instrumented build used to collect fresh profile data
PGO_MODE ?= opt

## Interp_Points load-balance profiling mode.
##   off      : (default) no load-balance instrumentation
##   profile  : pass 1 - instrument Interp_Points to collect a timing profile
##   optimize : pass 2 - read the profile and apply block rebalancing
INTERP_LB_MODE ?= off

## Translate the mode into the preprocessor define stored in INTERP_LB_FLAGS.
## The mode is stripped first so stray whitespace around the value is tolerated.
## Any value other than the three listed above still yields empty flags (same
## result as before), but now emits a warning so that typos are not silent.
ifeq ($(strip $(INTERP_LB_MODE)),profile)
INTERP_LB_FLAGS = -DINTERP_LB_PROFILE
else ifeq ($(strip $(INTERP_LB_MODE)),optimize)
INTERP_LB_FLAGS = -DINTERP_LB_OPTIMIZE
else
ifneq ($(strip $(INTERP_LB_MODE)),off)
$(warning Unknown INTERP_LB_MODE '$(INTERP_LB_MODE)'; treating it as 'off'. Valid values: off, profile, optimize)
endif
INTERP_LB_FLAGS =
endif

## Intel oneMKL install locations (consumed by the intel and nvhpc toolchains).
##   MKLROOT    : root of the MKL installation
##   MKL_LIBDIR : directory passed to -L / -rpath when linking MKL
##   MKL_INC    : include flag for the MKL headers
MKLROOT    ?= /home/intel/oneapi/mkl/latest
MKL_LIBDIR ?= $(MKLROOT)/lib/intel64
MKL_INC    ?= -I$(MKLROOT)/include

## Install locations for the AMD toolchain (only read when TOOLCHAIN=aocc).
##   AOCL_ROOT   : AOCL tree providing the include/ and lib/ directories
##   OMPI_PREFIX : OpenMPI installation whose bin/mpicxx is used as the linker
AOCL_ROOT   ?= /home/aocc/aocl/5.2.0/aocc
OMPI_PREFIX ?= /home/aocc/aocc-openmpi

## NVIDIA HPC SDK / CUDA locations and GPU target.
##   NVHPC_ROOT : root of the NVIDIA HPC SDK release (nvcc is taken from here)
##   CUDA_HOME  : CUDA tree supplying include/ and lib64/ for every toolchain
##   CUDA_ARCH  : value handed to nvcc via -arch=
NVHPC_ROOT ?= /home/nvidia/hpc_sdk/Linux_x86_64/25.11
CUDA_HOME  ?= $(NVHPC_ROOT)/cuda
CUDA_ARCH  ?= sm_80

## Selects the implementation of bssn_rhs and its helper kernels.
##   1 : (default) C++ rewrite of the kernels (faster)
##   0 : original Fortran kernels
USE_CXX_KERNELS ?= 1

## Selects the implementation of the Z4C Cartesian RHS kernel.
##   1 : (default) C++ rewrite of Z4c_rhs (main Cartesian path faster)
##   0 : original Fortran Z4c_rhs.o
USE_CXX_Z4C_KERNELS ?= 1

## Selects the implementation of the RK4 kernel (rungekutta4_rout).
##   1 : (default) C/C++ rewrite of rungekutta4_rout (for optimization experiments)
##   0 : original Fortran rungekutta4_rout.o
USE_CXX_RK4 ?= 1

## Memory allocator switch. Elsewhere in this file the allocator is only
## prepended to LDLIBS when TOOLCHAIN=intel; the aocc toolchain links jemalloc.
##   1 : (default) link the Intel oneTBB allocator (libtbbmalloc)
##   0 : use the system default allocator (ptmalloc)
USE_TBBMALLOC ?= 1

## Preferred location of the allocator shared object.
TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so

## Link flags for the allocator: fall back to a plain -ltbbmalloc lookup when
## TBBMALLOC_SO does not exist on this machine, otherwise link that exact file.
## The library is wrapped in --no-as-needed/--as-needed, presumably so it is
## kept on the link line even when no symbol references it directly.
ifeq ($(wildcard $(TBBMALLOC_SO)),)
TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
else
TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
endif

## Per-toolchain selection of compilers, include paths and link libraries.
## Each branch defines:
##   f90, f77 : Fortran compilers
##   CXX, CC  : C++ / C compilers
##   CLINKER  : driver used for the final link
##   filein   : include-path flags
##   LDLIBS   : libraries for the final link
## Every branch links libcuda/libcudart from $(CUDA_HOME)/lib64 with an rpath.
ifeq ($(TOOLCHAIN),intel)
## Intel oneAPI compilers, sequential oneMKL, Intel Fortran/math/OpenMP runtimes.
f90     = ifx
f77     = ifx
CXX     = icpx
CC      = icx
CLINKER = mpiicpx
filein  = -I/usr/include/ $(MKL_INC) -I$(CUDA_HOME)/include
LDLIBS  = -L$(MKL_LIBDIR) -Wl,-rpath,$(MKL_LIBDIR) \
          -lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
          -lifcore -limf -liomp5 -lpthread -lm -ldl \
          -L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64 -lcuda -lcudart
else ifeq ($(TOOLCHAIN),aocc)
## AMD AOCC compilers, AOCL (BLIS + libFLAME + amdlibm), jemalloc, OpenMPI linker.
## The AOCL library directory is also recorded as an rpath, matching how the
## MKL and CUDA directories are handled, so the shared libraries resolve at
## run time without relying on LD_LIBRARY_PATH.
f90     = flang
f77     = flang
CXX     = clang++
CC      = clang
CLINKER = $(OMPI_PREFIX)/bin/mpicxx
filein  = -I/usr/include/ -I$(AOCL_ROOT)/include -I$(CUDA_HOME)/include
LDLIBS  = -L$(AOCL_ROOT)/lib -Wl,-rpath,$(AOCL_ROOT)/lib \
          -lblis -lflame -lamdlibm -lflang -lpgmath \
          -ljemalloc -lpthread -lm -ldl -lomp \
          -L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64 -lcuda -lcudart
else ifeq ($(TOOLCHAIN),nvhpc)
## MPI compiler wrappers for every language; still links sequential oneMKL.
## -fortranlibs is presumably the NVHPC driver option that pulls in the
## Fortran runtime when linking with the C++ driver - TODO confirm.
f90     = mpifort
f77     = mpifort
CXX     = mpicxx
CC      = mpicc
CLINKER = mpicxx

filein  = -I/usr/include/ $(MKL_INC) -I$(CUDA_HOME)/include
LDLIBS  = -L$(MKL_LIBDIR) -Wl,-rpath,$(MKL_LIBDIR) \
          -lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
          -lpthread -lm -ldl \
          -L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64 -lcuda -lcudart \
          -fortranlibs
else
## Fail early: without this, an unrecognised value leaves f90/filein/LDLIBS
## undefined and the build breaks later with a far less obvious message.
$(error Unknown TOOLCHAIN '$(TOOLCHAIN)'; expected one of: nvhpc, intel, aocc)
endif

## Put the oneTBB allocator at the front of the link libraries.
## Applies only when both conditions hold: TOOLCHAIN is intel and
## USE_TBBMALLOC is 1 (tested together through one composite key).
ifeq ($(TOOLCHAIN)/$(USE_TBBMALLOC),intel/1)
LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
endif

## CUDA compilation settings.
##   Cu             : nvcc driver shipped inside the NVIDIA HPC SDK
##   CUDA_LIB_PATH  : CUDA library search path plus CUDA include path
##   CUDA_APP_FLAGS : nvcc flags (compile only, debug info, -O3, verbose ptxas,
##                    the fortran3/newc defines, and the target architecture)
Cu = $(NVHPC_ROOT)/compilers/bin/nvcc
CUDA_LIB_PATH = -L$(CUDA_HOME)/lib64 -I$(CUDA_HOME)/include
CUDA_APP_FLAGS = -c -g -O3 \
                 --ptxas-options=-v \
                 -Dfortran3 -Dnewc \
                 -arch=$(CUDA_ARCH)