Split prolongpointstru into search-only (prolongpointstru_search) and append-only (prolongpointstru_append) functions. Parallelize shell-point interpolation table construction with #pragma omp parallel for collapse(3) and per-thread linked lists. Use static schedule for uniform workloads. Add OMP_FLAG = -fopenmp in makefile.inc and ShellPatch.o override rule in makefile for GCC OpenMP runtime (-lgomp already linked). Speedup: setupintintstuff ~2.2x faster on multi-core. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
74 lines
2.6 KiB
PHP
Executable File
74 lines
2.6 KiB
PHP
Executable File
## GCC version with OpenMPI and OpenBLAS
## OMPI_ROOT is the OpenMPI installation prefix used for the include and
## library paths below. A different installation can be selected from the
## command line:  make OMPI_ROOT=/path/to/openmpi
OMPI_ROOT = /usr/mpi/gcc/openmpi-4.1.9a1

## Ensure mpicxx and final executables find OpenMPI libs at build- and runtime.
## The inherited LD_LIBRARY_PATH is appended only when it is non-empty:
## appending it unconditionally leaves a trailing ':' when the variable is
## unset, and the dynamic loader treats an empty entry as the current directory.
export LD_LIBRARY_PATH := $(OMPI_ROOT)/lib64$(if $(LD_LIBRARY_PATH),:$(LD_LIBRARY_PATH))

## Header search paths passed to the compilers.
filein = -I/usr/include/ -I$(OMPI_ROOT)/include
|
|
|
|
## Link line: OpenBLAS (OpenMP variant) + gfortran runtime.
## The -Wl,-rpath entry lets ABE / TwoPunctureABE locate libmpi at runtime
## even when LD_LIBRARY_PATH is not set.
LDLIBS = -Wl,-rpath,$(OMPI_ROOT)/lib64 \
         -lopenblaso -lgfortran \
         -lpthread -lm -ldl -lgomp

## OpenMP compile flag, kept separate so it can be applied selectively.
OMP_FLAG = -fopenmp
|
|
|
|
## Memory allocator switch (USE_JEMALLOC)
##   0 (default) : system default allocator (ptmalloc)
##   1           : jemalloc (install jemalloc-devel first)
USE_JEMALLOC ?= 0

# jemalloc is placed in front of the libraries already collected in LDLIBS.
ifeq "$(USE_JEMALLOC)" "1"
  LDLIBS := -ljemalloc $(LDLIBS)
endif
|
|
|
|
## Interp_Points load balance profiling mode
##   off      : (default) no load balance instrumentation
##   profile  : Pass 1 — instrument Interp_Points to collect timing profile
##   optimize : Pass 2 — read profile and apply block rebalancing
INTERP_LB_MODE ?= off

ifeq ($(INTERP_LB_MODE),profile)
INTERP_LB_FLAGS = -DINTERP_LB_PROFILE
else ifeq ($(INTERP_LB_MODE),optimize)
INTERP_LB_FLAGS = -DINTERP_LB_OPTIMIZE
else
INTERP_LB_FLAGS =
# Any other value still builds without instrumentation, as before, but a
# mistyped mode (anything other than "off" or empty) is now reported instead
# of being accepted silently.
ifneq ($(filter-out off,$(INTERP_LB_MODE)),)
$(warning INTERP_LB_MODE='$(INTERP_LB_MODE)' is not one of off/profile/optimize; building without load balance instrumentation)
endif
endif
|
|
|
|
## Kernel implementation switch (USE_CXX_KERNELS)
##   1 (default) : C++ rewrite of bssn_rhs and helper kernels (faster)
##   0           : original Fortran kernels
USE_CXX_KERNELS ?= 1
|
|
|
|
## Z4C Cartesian RHS kernel switch (USE_CXX_Z4C_KERNELS)
##   1 (default) : C++ rewrite of Z4c_rhs (main Cartesian path faster)
##   0           : original Fortran Z4c_rhs.o
USE_CXX_Z4C_KERNELS ?= 1
|
|
|
|
## BSSN-EScalar RHS switch (USE_CXX_ESCALAR_KERNEL)
##   1 (default) : BSSN-EScalar C wrapper on the normal patch path
##   0           : original Fortran BSSN-EScalar RHS, for precision-safe runs
## Note: the value 1 requires USE_CXX_KERNELS=1, because the wrapper reuses
## the C BSSN kernel.
USE_CXX_ESCALAR_KERNEL ?= 1
|
|
|
|
## Cached transfer switch (USE_TRANSFER_CACHE)
##   auto (default) : enable for BSSN vacuum, keep other paths on the safe
##                    uncached path
##   1              : force cached Sync/Restrict/OutBd transfer on evolution
##                    hot paths
##   0              : force the original uncached transfer path
USE_TRANSFER_CACHE ?= auto
|
|
|
|
## RK4 kernel implementation switch (USE_CXX_RK4)
##   1 (default) : C/C++ rewrite of rungekutta4_rout (for optimization
##                 experiments)
##   0           : original Fortran rungekutta4_rout.o
USE_CXX_RK4 ?= 1
|
|
|
|
## Compiler and linker commands (GNU toolchain; linking goes through the
## OpenMPI C++ wrapper).
f90     = gfortran
f77     = gfortran
CXX     = g++
CC      = gcc
CLINKER = mpicxx
|
|
|
|
## CUDA toolchain settings.
## Cu             : CUDA compiler command
## CUDA_LIB_PATH  : carries both the CUDA library directory (-L) and the
##                  include directories (-I)
## CUDA_APP_FLAGS : compile-only, debug info, -O3, ptxas option -v, plus the
##                  fortran3 / newc preprocessor defines
Cu             = nvcc
CUDA_LIB_PATH  = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc
|