## AMSS-NCKU/AMSS_NCKU_source/makefile.inc

## GCC toolchain with OpenMPI and OpenBLAS
OMPI_ROOT    = /usr/mpi/gcc/openmpi-4.1.9a1

## Ensure mpicxx and final executables find OpenMPI libs at build- and runtime.
## The ':' separator is emitted only when a non-empty LD_LIBRARY_PATH is
## inherited: a zero-length entry (which a trailing ':' would create) makes the
## dynamic loader search the current working directory as well.
export LD_LIBRARY_PATH := $(OMPI_ROOT)/lib64$(if $(LD_LIBRARY_PATH),:$(LD_LIBRARY_PATH))

## Header search paths: /usr/include plus the OpenMPI headers
filein  = -I/usr/include/ -I$(OMPI_ROOT)/include

## OpenBLAS (OpenMP variant) + gfortran runtime
## -Wl,-rpath ensures ABE / TwoPunctureABE find libmpi at runtime without LD_LIBRARY_PATH
LDLIBS  = -Wl,-rpath,$(OMPI_ROOT)/lib64 -lopenblaso -lgfortran -lpthread -lm -ldl -lgomp

# OpenMP flag for selective compilation
OMP_FLAG = -fopenmp

## Allocator selection
##   USE_JEMALLOC=0 (default) : keep the system default allocator (ptmalloc)
##   USE_JEMALLOC=1           : link against jemalloc (install jemalloc-devel first)
USE_JEMALLOC ?= 0
ifeq "$(USE_JEMALLOC)" "1"
  # Place -ljemalloc ahead of every library already listed in LDLIBS.
  LDLIBS := -ljemalloc $(LDLIBS)
endif

## Interp_Points load balance profiling mode
##   off        : (default) no load balance instrumentation
##   profile    : Pass 1 — instrument Interp_Points to collect timing profile
##   optimize   : Pass 2 — read profile and apply block rebalancing
## Any other value is reported with a warning and treated like "off".
INTERP_LB_MODE ?= off

ifeq ($(INTERP_LB_MODE),profile)
INTERP_LB_FLAGS = -DINTERP_LB_PROFILE
else ifeq ($(INTERP_LB_MODE),optimize)
INTERP_LB_FLAGS = -DINTERP_LB_OPTIMIZE
else
# Surface typos (e.g. "optimise") instead of silently building without instrumentation.
ifneq ($(INTERP_LB_MODE),off)
$(warning Unrecognized INTERP_LB_MODE='$(INTERP_LB_MODE)'; expected off, profile or optimize. Load balance instrumentation stays disabled.)
endif
INTERP_LB_FLAGS =
endif

## ---- Kernel / transfer implementation switches ----
## Each switch is set with ?= so a value supplied on the command line or in
## the environment takes precedence over the default given here.

## BSSN RHS and helper kernels
##   USE_CXX_KERNELS=1 (default) : C++ rewrite of bssn_rhs and helper kernels (faster)
##   USE_CXX_KERNELS=0           : original Fortran kernels
USE_CXX_KERNELS ?= 1

## Z4C Cartesian RHS kernel
##   USE_CXX_Z4C_KERNELS=1 (default) : C++ rewrite of Z4c_rhs (main Cartesian path faster)
##   USE_CXX_Z4C_KERNELS=0           : original Fortran Z4c_rhs.o
USE_CXX_Z4C_KERNELS ?= 1

## BSSN-EScalar RHS
##   USE_CXX_ESCALAR_KERNEL=1 (default) : BSSN-EScalar C wrapper on the normal patch path
##   USE_CXX_ESCALAR_KERNEL=0           : original Fortran BSSN-EScalar RHS, for precision-safe runs
## The wrapper reuses the C BSSN kernel, so value 1 also requires USE_CXX_KERNELS=1.
USE_CXX_ESCALAR_KERNEL ?= 1

## Cached transfer
##   USE_TRANSFER_CACHE=auto (default) : enabled for BSSN vacuum; other paths stay on the safe uncached path
##   USE_TRANSFER_CACHE=1              : force cached Sync/Restrict/OutBd transfer on evolution hot paths
##   USE_TRANSFER_CACHE=0              : force the original uncached transfer path
USE_TRANSFER_CACHE ?= auto

## RK4 kernel
##   USE_CXX_RK4=1 (default) : C/C++ rewrite of rungekutta4_rout (for optimization experiments)
##   USE_CXX_RK4=0           : original Fortran rungekutta4_rout.o
USE_CXX_RK4 ?= 1

## Fortran compilers: both variables select gfortran
f77     = gfortran
f90     = gfortran

## C and C++ compilers
CC      = gcc
CXX     = g++

## MPI C++ wrapper; presumably drives the final link step (its use is outside this fragment)
CLINKER = mpicxx

## CUDA compiler driver
Cu             = nvcc
## Despite its name this carries both the library search path (-L) and the
## header search paths (-I) for CUDA.
CUDA_LIB_PATH  = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
## Compile-only (-c) with debug info (-g) and -O3; --ptxas-options=-v forwards
## -v to ptxas. -Dfortran3 and -Dnewc define preprocessor macros whose
## consumers are not visible in this fragment.
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc