

include makefile.inc

## Kernel choice for polint when ordn=6.  The value is handed to the Fortran
## preprocessor as the macro POLINT6_USE_BARYCENTRIC (see f90appflags):
##   POLINT6_USE_BARY=1 (default) -> barycentric fast path
##   POLINT6_USE_BARY=0           -> fallback to the Neville path
POLINT6_USE_BARY ?= 1
POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=${POLINT6_USE_BARY}

## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
##   make                        -> opt  (maximum-performance flags; the profile-use
##                                        flag is currently disabled, see the comments
##                                        in the Intel opt branch below)
##   make PGO_MODE=instrument    -> instrument (Phase 1: collect fresh profile data)
## PROFDATA is the profile file that an Intel opt build would pass to
## -fprofile-instr-use.  Nothing in the visible part of this file expands it at
## present.  The path assumes the user's home directory lives under /home.
PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata

## Toolchain-specific flag sets: OMP_FLAG, CXXAPPFLAGS, f90appflags, TP_OPTFLAGS.
## TOOLCHAIN is not assigned in this file (presumably set in makefile.inc --
## confirm); any value other than "intel" selects the NVHPC branch.
ifeq ($(TOOLCHAIN),intel)
OMP_FLAG = -qopenmp

ifeq ($(PGO_MODE),instrument)
## Intel Phase 1: instrumentation build.  Compared with the opt flags below,
## -fp-model fast=2 is dropped and -fprofile-instr-generate is added.
## NOTE(review): this comment used to say that -ipo is omitted as well (for a
## faster build), but both flag sets below still pass -ipo -- confirm which of
## the two is intended.
CXXAPPFLAGS = -O3 -march=znver5  -fma -fprofile-instr-generate -ipo \
              -Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
f90appflags = -O3 -march=znver5  -fma -fprofile-instr-generate -ipo \
              -align array64byte -fpp $(MKL_INC) $(POLINT6_FLAG)
else
## opt (default): maximum-performance flags.
## The profile-use flag -fprofile-instr-use=$(PROFDATA) has been removed from
## these flags: PGO was tested and found to be a negative optimization.
## INTERP_LB_FLAGS has been turned off too (also tested as a negative
## optimization); it is still referenced below, so it is presumably left empty
## in makefile.inc -- confirm.


CXXAPPFLAGS = -O3 -march=znver5  -fp-model fast=2 -fma -ipo \
              -Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
f90appflags = -O3 -march=znver5  -fp-model fast=2 -fma -ipo \
              -align array64byte -fpp $(MKL_INC) $(POLINT6_FLAG)
endif

## Flags for the TwoPunctures objects and link step; not affected by PGO_MODE.
TP_OPTFLAGS = -O3 -march=znver5  -fp-model fast=2 -fma -ipo \
              -Dfortran3 -Dnewc $(MKL_INC)
else
## NVHPC defaults: mpicc/mpicxx/mpifort wrappers
## PGO_MODE is ignored in this branch.
OMP_FLAG = -mp
CXXAPPFLAGS = -O3 -march=znver5 -tp=host -Mcache_align -Mfma \
              -Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
f90appflags = -O3 -march=znver5 -tp=host -Mcache_align -Mfma -Mpreprocess \
              $(MKL_INC) $(POLINT6_FLAG)
TP_OPTFLAGS = -O3 -march=znver5 -tp=host -Mcache_align -Mfma \
              -Dfortran3 -Dnewc $(MKL_INC)
endif

# Suffixes handled by the suffix rules below.
.SUFFIXES: .o .f90 .C .for .cu

# Fortran 90 source -> object
.f90.o:
	$(f90) $(f90appflags) -c $< -o $@

# C++ source -> object; $(filein) is passed between the source file and -o
.C.o:
	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@

# ShellPatch.C uses OpenMP for setupintintstuff search loops, so it gets
# $(OMP_FLAG) on top of the generic C++ flags.
# NOTE(review): this is the first explicit (non-suffix) target in this file;
# unless makefile.inc defines an earlier target or sets .DEFAULT_GOAL, a bare
# "make" would build only ShellPatch.o -- confirm.
ShellPatch.o: ShellPatch.C
	$(CXX) $(CXXAPPFLAGS) $(OMP_FLAG) -c $< $(filein) -o $@

# Fixed-form Fortran source -> object (no extra flags)
.for.o:
	$(f77) -c $< -o $@

# CUDA source -> object
.cu.o:
	$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)

# CUDA rewrites of the RHS kernels.  Each object is compiled from the .cu file
# of the same name with one shared recipe; the extra header prerequisites are
# listed per object underneath.
#   bssn_rhs_cuda.o   : BSSN RHS (drop-in replacement for bssn_rhs_c + stencil helpers)
#   bssn_gpu_rhs_ss.o : BSSN Shell-Patch RHS (drop-in replacement for bssn_rhs_ss)
#   z4c_rhs_cuda.o    : Z4C Cartesian RHS
bssn_rhs_cuda.o bssn_gpu_rhs_ss.o z4c_rhs_cuda.o: %.o: %.cu
	$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)

bssn_rhs_cuda.o: bssn_rhs.h macrodef.h fd_cuda_helpers.cuh
bssn_gpu_rhs_ss.o: bssn_gpu.h gpu_rhsSS_mem.h bssn_macro.h macrodef.fh
z4c_rhs_cuda.o: z4c_rhs_cuda.h bssn_rhs.h macrodef.h ricci_gamma.h fd_cuda_helpers.cuh

# C rewrite of the BSSN RHS kernel and its helpers, plus the Z4C RHS kernel.
# Every object listed here is compiled from the .C file of the same name with
# the generic C++ flags.
bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o \
lopsided_kodis_c.o z4c_rhs_c.o: %.o: %.C
	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@

#interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
#	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@

# TwoPunctures objects: compiled with $(TP_OPTFLAGS) and OpenMP instead of the
# generic C++ flags, and without $(filein).
TwoPunctures.o TwoPunctureABE.o: %.o: %.C
	$(CXX) $(TP_OPTFLAGS) $(OMP_FLAG) -c $< -o $@

# Input files

## Build switches; each defaults to 0 unless it is already set (for example on
## the make command line):
##   USE_CUDA_BSSN : 1 = use the rewritten CUDA bssn_rhs backend
##                   0 = keep the normal CPU/Fortran selection below
##   USE_CUDA_Z4C  : 1 = select z4c_rhs_cuda.o for the Z4C RHS (see below)
##   AMSS_Z4C_MRBD : only forwarded as a -D macro here; its meaning is not
##                   visible in this file
USE_CUDA_BSSN ?= 0
USE_CUDA_Z4C ?= 0
AMSS_Z4C_MRBD ?= 0

## Every switch is forwarded as a preprocessor macro to both the C++ and the
## CUDA compile flags.
CXXAPPFLAGS += -DUSE_CUDA_BSSN=$(USE_CUDA_BSSN) \
               -DUSE_CUDA_Z4C=$(USE_CUDA_Z4C) \
               -DAMSS_Z4C_MRBD=$(AMSS_Z4C_MRBD)
CUDA_APP_FLAGS += -DUSE_CUDA_BSSN=$(USE_CUDA_BSSN) \
                  -DUSE_CUDA_Z4C=$(USE_CUDA_Z4C) \
                  -DAMSS_Z4C_MRBD=$(AMSS_Z4C_MRBD)

## Kernel implementation switch for the CPU build:
##   USE_CXX_KERNELS=0 -> Fortran mode: no C rewrite objects; bssn_rhs.o is
##                        added through F90FILES further down
##   any other value   -> C++ mode (default): C rewrite of bssn_rhs and the
##                        helper kernels
CFILES_CPU = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o lopsided_kodis_c.o
ifeq ($(USE_CXX_KERNELS),0)
CFILES_CPU =
endif

## Objects of the CUDA BSSN backend
CFILES_CUDA_BSSN = bssn_rhs_cuda.o bssn_gpu_rhs_ss.o

## CFILES starts out as the CPU kernel set unless USE_CUDA_BSSN is exactly 1.
ifneq ($(USE_CUDA_BSSN),1)
CFILES = $(CFILES_CPU)
else
CFILES = $(CFILES_CUDA_BSSN)
endif

## Z4C RHS implementation; the first matching line wins:
##   USE_CUDA_Z4C=1        -> z4c_rhs_cuda.o appended to CFILES
##   USE_CXX_Z4C_KERNELS=1 -> z4c_rhs_c.o appended to CFILES
##   otherwise             -> Fortran Z4c_rhs.o, pulled in through Z4C_F90_OBJ
Z4C_F90_OBJ =
ifeq ($(USE_CUDA_Z4C),1)
CFILES += z4c_rhs_cuda.o
else ifeq ($(USE_CXX_Z4C_KERNELS),1)
CFILES += z4c_rhs_c.o
else
Z4C_F90_OBJ = Z4c_rhs.o
endif

## RK4 kernel switch (independent from USE_CXX_KERNELS):
##   USE_CXX_RK4=1   -> rungekutta4_rout_c.o (added to CFILES)
##   any other value -> rungekutta4_rout.o   (added to F90FILES_BASE)
RK4_C_OBJ =
RK4_F90_OBJ = rungekutta4_rout.o
ifeq ($(USE_CXX_RK4),1)
RK4_C_OBJ = rungekutta4_rout_c.o
RK4_F90_OBJ =
endif

CFILES += $(RK4_C_OBJ)

## Kernel objects of the ABE_CUDA target: always both CUDA RHS sets, whatever
## the USE_CUDA_* switches were when this file was read.
ABE_CUDA_CFILES = $(CFILES_CUDA_BSSN) z4c_rhs_cuda.o $(RK4_C_OBJ)

## Link libraries for ABE: $(LDLIBS), plus the CUDA runtime when at least one
## of the CUDA backends (USE_CUDA_BSSN, USE_CUDA_Z4C) is set to 1.  The runtime
## is appended exactly once, even when both switches are enabled.
ABE_LDLIBS = $(LDLIBS)
ifneq ($(filter 1,$(USE_CUDA_BSSN) $(USE_CUDA_Z4C)),)
ABE_LDLIBS += -lcudart $(CUDA_LIB_PATH)
endif

## C++ objects linked into ABE and ABE_CUDA (built by the .C.o suffix rule,
## except ShellPatch.o which has its own rule).
C++FILES = ABE.o Ansorg.o Block.o misc.o \
           monitor.o Parallel.o MPatch.o var.o \
           cgh.o bssn_class.o surface_integral.o ShellPatch.o \
           bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o \
           bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o \
           Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o \
           NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o

#C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
           cgh.o surface_integral.o ShellPatch.o\
	   bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
	   bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
           Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
	   NullShellPatch2_Evo.o \
	   bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o

## Fortran 90 objects that are always linked.  $(RK4_F90_OBJ) and
## $(Z4C_F90_OBJ) are empty when the corresponding C++/CUDA kernel was selected.
F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o \
                prolongrestrict_cell.o prolongrestrict_vertex.o \
                $(RK4_F90_OBJ) diff_new.o kodiss.o kodiss_sh.o \
                lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o \
                shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o \
                getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o \
                fadmquantites_bssn.o $(Z4C_F90_OBJ) Z4c_rhs_ss.o point_diff_new_sh.o \
                cpbc.o getnp4old.o NullEvol.o initial_null.o initial_maxwell.o \
                getnpem2.o empart.o NullNews.o fourdcurvature.o \
                bssn2adm.o adm_constraint.o adm_ricci_gamma.o \
                scalar_rhs.o initial_scalar.o NullEvol2.o initial_null2.o \
                NullNews2.o tool_f.o

## C++ mode (default): bssn_rhs.o is replaced by the C++ kernel, so only the
## base list is used.  Fortran mode (USE_CXX_KERNELS=0) adds the original
## bssn_rhs.o back.
F90FILES = $(F90FILES_BASE)
ifeq ($(USE_CXX_KERNELS),0)
F90FILES += bssn_rhs.o
endif

## Fixed-form Fortran object (built by the .for.o suffix rule)
F77FILES = zbesh.o

## Objects linked into ABE that depend on the cctk_*.h / myglobal.h headers
AHFDOBJS = expansion.o expansion_Jacobian.o patch.o coords.o \
           patch_info.o patch_interp.o patch_system.o \
           tgrid.o fd_grid.o ghost_zone.o array.o \
           round.o norm.o fuzzy.o error_exit.o miscfp.o \
           linear_map.o cpm_map.o BH_diagnostics.o setup.o \
           horizon_sequence.o find_horizons.o \
           initial_guess.o Newton.o Jacobian.o ilucg.o \
           IntPnts0.o IntPnts.o

## Objects of the TwoPunctureABE executable
TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o

#CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o

# File dependencies
#
# All object lists are rebuilt when macrodef.fh changes.  C++FILES_GPU and
# CUDAFILES are assigned only in commented-out lines in the visible part of
# this file, so they expand to nothing unless makefile.inc defines them.
$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) \
$(ABE_CUDA_CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh

# Headers shared by every C++ object
$(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h \
             misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h \
             rungekutta4_rout.h var.h bssn_class.h bssn_rhs.h sommerfeld_rout.h \
             cgh.h surface_integral.h ShellPatch.h shellfunctions.h perf.h \
             fadmquantites_bssn.h cpbc.h getnp4.h initial_null.h NullEvol.h \
             NullShellPatch.h initial_maxwell.h bssnEM_class.h getnpem2.h \
             empart.h NullNews.h kodiss.h Parallel_bam.h ricci_gamma.h \
             initial_null2.h NullShellPatch2.h
             
#$(C++FILES_GPU): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
	     misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\
	     rungekutta4_rout.h var.h bssn_rhs.h sommerfeld_rout.h\
	     cgh.h surface_integral.h ShellPatch.h shellfunctions.h perf.h\
             fadmquantites_bssn.h cpbc.h getnp4.h initial_null.h NullEvol.h\
	     NullShellPatch.h initial_maxwell.h bssnEM_class.h getnpem2.h\
	     empart.h NullNews.h kodiss.h Parallel_bam.h ricci_gamma.h\
             initial_null2.h NullShellPatch2.h \
             bssn_gpu_class.h bssn_macro.h
             
# Headers required by every object in AHFDOBJS
$(AHFDOBJS): cctk.h cctk_Config.h cctk_Types.h cctk_Constants.h myglobal.h

# All C/C++/CUDA object lists are rebuilt when macrodef.h changes
$(C++FILES) $(C++FILES_GPU) $(CFILES) $(ABE_CUDA_CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.h

# The TwoPunctures objects depend on TwoPunctures.h.  The target list must be
# the expanded variable; a bare "TwoPunctureFILES" would only declare an
# unrelated target of that literal name and the objects would never be rebuilt
# after a header change.
$(TwoPunctureFILES): TwoPunctures.h

# CUDAFILES is assigned only in a commented-out line in the visible part of
# this file; with an empty target list GNU make ignores this rule.
$(CUDAFILES): bssn_gpu.h gpu_mem.h gpu_rhsSS_mem.h

# NOTE(review): object-on-object prerequisite; presumably forces zbesh.o to be
# built before misc.o -- confirm the intent.
misc.o : zbesh.o

# projects

# ABE: main executable, linked from the object lists chosen by the switches
# above plus $(ABE_LDLIBS).
ABE_OBJS = $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
ABE: $(ABE_OBJS)
	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(ABE_OBJS) $(ABE_LDLIBS)

# ABE_CUDA: same as ABE but with the fixed CUDA kernel list ABE_CUDA_CFILES and
# the CUDA runtime always linked.  The target-specific assignments set both
# USE_CUDA_* switches to 1 while recipes for this target and its prerequisites
# are expanded, which changes the -DUSE_CUDA_* values in $(CXXAPPFLAGS).  The
# ifeq blocks earlier in this file are evaluated while the makefile is read and
# are not affected by them, hence the explicit object list.
ABE_CUDA_OBJS = $(C++FILES) $(ABE_CUDA_CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
ABE_CUDA: USE_CUDA_BSSN=1
ABE_CUDA: USE_CUDA_Z4C=1
ABE_CUDA: $(ABE_CUDA_OBJS)
	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(ABE_CUDA_OBJS) $(LDLIBS) -lcudart $(CUDA_LIB_PATH)
	
#ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
#	$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)

# TwoPunctureABE: linked from its two objects with the TwoPunctures flags and
# OpenMP.  $^ expands to the prerequisites listed on this rule.
TwoPunctureABE: $(TwoPunctureFILES)
	$(CLINKER) $(TP_OPTFLAGS) $(OMP_FLAG) -o $@ $^ $(LDLIBS)

# Remove all objects, the executables and the build log.  Declared phony so
# that a file named "clean" in this directory cannot make the target look up
# to date; -f comes before the operands so the command does not depend on GNU
# option permutation.
.PHONY: clean
clean:
	rm -f *.o ABE ABE_CUDA ABEGPU TwoPunctureABE make.log
