Files
AMSS-NCKU/AMSS_NCKU_source/makefile.inc
CGH0S7 3d3a3ba759 Add thread-safe ShellPatch::setupintintstuff with OpenMP
Split prolongpointstru into search-only (prolongpointstru_search) and
append-only (prolongpointstru_append) functions. Parallelize shell-point
interpolation table construction with #pragma omp parallel for collapse(3)
and per-thread linked lists (merged after the loop to avoid data races).

Add OMP_FLAG = -fopenmp in makefile.inc and ShellPatch.o override rule
in makefile for AOCC OpenMP runtime (-lomp already linked).

Speedup: setupintintstuff ~2.2x faster on multi-core.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-10 22:09:10 +08:00

69 lines
2.4 KiB
PHP
Executable File

## AMD AOCC version with AOCL (Optimized for AMD EPYC Zen 4)
## AOCL root path for includes and libraries
AOCL_ROOT ?= /home/aocc/aocl/5.2.0/aocc
## AOCC-built OpenMPI prefix
OMPI_PREFIX ?= /home/aocc/openmpi-5.0.10
filein = -I/usr/include/ -I$(AOCL_ROOT)/include
## Using AOCL BLIS + libFLAME for BLAS/LAPACK
## AOCC Fortran runtime: -lflang (includes FortranRuntime)
## AOCC OpenMP runtime: -lomp (LLVM OpenMP)
LDLIBS = -L$(AOCL_ROOT)/lib -lblis -lflame -lamdlibm -lflang -lpgmath -lpthread -lm -ldl -lomp
# OpenMP flag for selective compilation
OMP_FLAG = -fopenmp
## Interp_Points load balance profiling mode
## off : (default) no load balance instrumentation
## profile : Pass 1 — instrument Interp_Points to collect timing profile
## optimize : Pass 2 — read profile and apply block rebalancing
INTERP_LB_MODE ?= off
ifeq ($(INTERP_LB_MODE),profile)
INTERP_LB_FLAGS = -DINTERP_LB_PROFILE
else ifeq ($(INTERP_LB_MODE),optimize)
INTERP_LB_FLAGS = -DINTERP_LB_OPTIMIZE
else
INTERP_LB_FLAGS =
endif
## Kernel implementation switch
## 1 (default) : use C++ rewrite of bssn_rhs and helper kernels (faster)
## 0 : fall back to original Fortran kernels
USE_CXX_KERNELS ?= 1
## Z4C Cartesian RHS kernel switch
## 1 (default) : use C++ rewrite of Z4c_rhs (main Cartesian path faster)
## 0 : use original Fortran Z4c_rhs.o
USE_CXX_Z4C_KERNELS ?= 1
## BSSN-EScalar RHS switch
## 1 (default) : use BSSN-EScalar C wrapper on the normal patch path
## 0 : keep the original Fortran BSSN-EScalar RHS for precision-safe runs
## Note: this requires USE_CXX_KERNELS=1 because the wrapper reuses the C BSSN kernel.
USE_CXX_ESCALAR_KERNEL ?= 1
## Cached transfer switch
## auto (default): enable for BSSN vacuum, keep other paths on the safe uncached path
## 1 : force cached Sync/Restrict/OutBd transfer on evolution hot paths
## 0 : force the original uncached transfer path
USE_TRANSFER_CACHE ?= auto
## RK4 kernel implementation switch
## 1 (default) : use C/C++ rewrite of rungekutta4_rout (for optimization experiments)
## 0 : use original Fortran rungekutta4_rout.o
USE_CXX_RK4 ?= 1
f90 = flang
f77 = flang
CXX = clang++
CC = clang
CLINKER = $(OMPI_PREFIX)/bin/mpicxx
Cu = nvcc
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc