Fix BSSN C gauge RHS parity

Fix lower-order C lopsided boundary fallbacks
Fix eighth-order C derivative and lopsided stencils
2026-05-15 18:04:10 +08:00 · 2026-05-14 21:38:18 +08:00 · 2026-05-14 20:45:51 +08:00 · 2026-05-14 16:08:03 +08:00 · 2026-05-14 15:24:20 +08:00 · 2026-05-14 14:09:33 +08:00
2 changed files with 47 additions and 38 deletions
--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -58,10 +58,14 @@ POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
 TRANSFER_CACHE_FLAG = -DBSSN_USE_TRANSFER_CACHE=$(EFFECTIVE_USE_TRANSFER_CACHE)
 ESCALAR_KERNEL_FLAG = -DBSSN_USE_ESCALAR_C_KERNEL=$(EFFECTIVE_USE_CXX_ESCALAR_KERNEL)

-## AMD AOCC build flags optimized for EPYC Zen 4 (-march=znver4)
-##   make                        -> opt  (PGO-guided, maximum performance)
-##   make PGO_MODE=instrument    -> instrument (Phase 1: collect fresh profile data)

+## GCC build flags (optimized for x86-64-v4)
+## PGO disabled (it caused a performance regression on Intel; not tested with GCC)
+CXXAPPFLAGS = -O3 -march=x86-64-v4 -ffast-math -mfma -flto \
+              -Dfortran3 -Dnewc $(INTERP_LB_FLAGS) \
+              $(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) $(EM_KERNEL_FLAG)
+f90appflags = -O3 -march=x86-64-v4 -ffast-math -mfma -flto \
+              -cpp $(POLINT6_FLAG)

 .SUFFIXES: .o .f90 .C .for .cu

@@ -69,11 +73,11 @@ ESCALAR_KERNEL_FLAG = -DBSSN_USE_ESCALAR_C_KERNEL=$(EFFECTIVE_USE_CXX_ESCALAR_KE
 	$(f90) $(f90appflags) -c $< -o $@

 .C.o:
-	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@

 # ShellPatch.C uses OpenMP for setupintintstuff search loops
 ShellPatch.o: ShellPatch.C
-	$(CXX) $(CXXAPPFLAGS) $(OMP_FLAG) -c $< $(filein) -o $@
+	${CXX} $(CXXAPPFLAGS) $(OMP_FLAG) -c $< $(filein) -o $@

 .for.o:
 	$(f77) -c $< -o $@
@@ -83,59 +87,59 @@ ShellPatch.o: ShellPatch.C

 # C rewrite of BSSN RHS kernel and helpers
 bssn_rhs_c.o: bssn_rhs_c.C
-	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@

 fderivs_c.o: fderivs_c.C
-	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@

 fdderivs_c.o: fdderivs_c.C
-	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@

 kodiss_c.o: kodiss_c.C
-	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@

 lopsided_c.o: lopsided_c.C
-	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@

 lopsided_kodis_c.o: lopsided_kodis_c.C
-	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@

 # C rewrite of shell-patch derivative kernels
 fderivs_sh_c.o: fderivs_sh_c.C
-	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@

 fdderivs_sh_c.o: fdderivs_sh_c.C
-	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@

 fderivs_shc_c.o: fderivs_shc_c.C
-	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@

 fdderivs_shc_c.o: fdderivs_shc_c.C
-	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@

 kodiss_sh_c.o: kodiss_sh_c.C
-	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@


 bssn_em_rhs_c.o: bssn_em_rhs_c.C
-	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 z4c_rhs_c.o: z4c_rhs_c.C
-	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
+	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@

 #interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
-#	$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
+#	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@

 ## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
 TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
 TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -fprofile-instr-use=$(TP_PROFDATA) \
-              -Dfortran3 -Dnewc -I$(AOCL_ROOT)/include
+              -Dfortran3 -Dnewc $(filein_real)

 TwoPunctures.o: TwoPunctures.C
-	$(CXX) $(TP_OPTFLAGS) -qopenmp -c $< -o $@
+	${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@

 TwoPunctureABE.o: TwoPunctureABE.C
-	$(CXX) $(TP_OPTFLAGS) -qopenmp -c $< -o $@
+	${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@

 # Input files

--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -1,21 +1,26 @@
-## AMD AOCC version with AOCL (Optimized for AMD EPYC Zen 4)
+## GCC version with OpenMPI and OpenBLAS
+OMPI_ROOT    = /usr/mpi/gcc/openmpi-4.1.9a1

-## AOCL root path for includes and libraries
-AOCL_ROOT ?= /home/aocc/aocl/5.2.0/aocc
+## Prepend OpenMPI lib64 for build- and runtime lookup; add ':' only if LD_LIBRARY_PATH is non-empty (an empty entry means the current directory)
+export LD_LIBRARY_PATH := $(OMPI_ROOT)/lib64$(if $(LD_LIBRARY_PATH),:$(LD_LIBRARY_PATH))

-## AOCC-built OpenMPI prefix
-OMPI_PREFIX ?= /home/aocc/openmpi-5.0.10
+filein  = -I/usr/include/ -I$(OMPI_ROOT)/include

-filein  = -I/usr/include/ -I$(AOCL_ROOT)/include
-
-## Using AOCL BLIS + libFLAME for BLAS/LAPACK
-## AOCC Fortran runtime: -lflang (includes FortranRuntime)
-## AOCC OpenMP runtime: -lomp (LLVM OpenMP)
-LDLIBS  = -L$(AOCL_ROOT)/lib -lblis -lflame -lamdlibm -lflang -lpgmath -lpthread -lm -ldl -lomp
+## OpenBLAS (OpenMP variant) + gfortran runtime
+## -Wl,-rpath ensures ABE / TwoPunctureABE find libmpi at runtime without LD_LIBRARY_PATH
+LDLIBS  = -Wl,-rpath,$(OMPI_ROOT)/lib64 -lopenblaso -lgfortran -lpthread -lm -ldl -lgomp

 # OpenMP flag for selective compilation
 OMP_FLAG = -fopenmp

+## Memory allocator switch
+##   0 (default) : use system default allocator (ptmalloc)
+##   1           : use jemalloc (install jemalloc-devel first)
+USE_JEMALLOC ?= 0
+ifeq ($(USE_JEMALLOC),1)
+LDLIBS := -ljemalloc $(LDLIBS)
+endif
+
 ## Interp_Points load balance profiling mode
 ##   off        : (default) no load balance instrumentation
 ##   profile    : Pass 1 — instrument Interp_Points to collect timing profile
@@ -63,11 +68,11 @@ USE_TRANSFER_CACHE ?= auto
 ##   0           : use original Fortran rungekutta4_rout.o
 USE_CXX_RK4 ?= 1

-f90          = flang
-f77          = flang
-CXX          = clang++
-CC           = clang
-CLINKER      = $(OMPI_PREFIX)/bin/mpicxx
+f90          = gfortran
+f77          = gfortran
+CXX          = g++
+CC           = gcc
+CLINKER      = mpicxx

 Cu = nvcc
 CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
Author	SHA1	Message	Date
CGH0S7	12bf08a2a1	Fix BSSN C gauge RHS parity	2026-05-15 18:04:10 +08:00
CGH0S7	9b4f98e237	Fix lower-order C lopsided boundary fallbacks	2026-05-14 21:38:18 +08:00
CGH0S7	2bbde059db	Fix eighth-order C derivative and lopsided stencils	2026-05-14 20:45:51 +08:00
CGH0S7	3b8774c1b1	Fix C derivative ghost-buffer indexing across FD orders	2026-05-14 16:08:03 +08:00
CGH0S7	23b52e30d6	Fix fourth-order C lopsided and KO stencil indexing	2026-05-14 15:24:20 +08:00
CGH0S7	e8f590a742	Fix shell C kernel symbol names for Fortran linkage (fderivs_sh_ etc.) Shell C functions must export Fortran-compatible symbols with trailing underscore so bssn_rhs_ss.f90 and getnp4.f90 can link when WithShell is active and USE_CXX_SHELL_KERNELS=1 replaces Fortran diff_new_sh.o. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-14 14:09:33 +08:00
CGH0S7	632173ea10	Add full GAUGE 2-7 support to Z4C C RHS kernel (z4c_rhs_c.C) Previously only GAUGE 0 and 1 were supported with a compile error for 2-7. Now supports all 8 gauge choices matching BSSN Fortran formulas: - GAUGE 2: variable-eta gamma-driver, chi-sqrt denominator - GAUGE 3: variable-eta gamma-driver, chi-linear denominator - GAUGE 4: first-order variable-eta, chi-sqrt denominator - GAUGE 5: first-order variable-eta, chi-linear denominator - GAUGE 6: Jason's rational position-dependent damping - GAUGE 7: Jason's exponential position-dependent damping Also fixes dtSf advection/dissipation guards for gauges where dtSf is active. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-14 13:01:36 +08:00
CGH0S7	eed2ff2be8	Add C kernel for BSSN-EM (Maxwell/electromagnetic field) RHS computation New bssn_em_rhs_c.C computes EM field RHS (E,B,Kpsi,Kphi) and stress-energy tensor, then calls the C BSSN RHS kernel with source terms. Replaces empart.f90 when USE_CXX_EM_KERNEL=1. Supports all ghost_width orders via existing derivative kernels. Controlled by USE_CXX_EM_KERNEL switch (default 0, experimental). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-14 11:36:26 +08:00
CGH0S7	b904f6cf56	Add C implementations of shell-patch derivative kernels (WithShell support) New files provide C equivalents of Fortran diff_new_sh.f90 and kodiss_sh.f90: - fderivs_sh_c.C: first derivatives in shell (rho, sigma, R) coords - fdderivs_sh_c.C: second derivatives in shell coords - fderivs_shc_c.C: shell first derivs + chain rule to Cartesian - fdderivs_shc_c.C: shell second derivs + chain rule to Cartesian - kodiss_sh_c.C: Kreiss-Oliger dissipation on shell patches Also add symmetry_stbd() C implementation and shell fh indexing to share_func.h. Controlled by USE_CXX_SHELL_KERNELS switch (default 0, experimental). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-14 11:35:38 +08:00
CGH0S7	c4b9bd3788	Add full FD order support (2nd/4th/6th/8th) to C derivative kernels via ghost_width dispatch Wrap each C kernel in #if (ghost_width == N) blocks matching Fortran stencil coefficients from diff_new.f90, kodiss.f90, and lopsidediff.f90. Add fast-path indexing for ord=1,4,5 in share_func.h. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-14 11:33:52 +08:00
CGH0S7	276b36ea25	Add plot-only restart script to skip recomputation when plotting is interrupted Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-12 15:01:25 +08:00
CGH0S7	baf248c3bc	Add thread-safe ShellPatch::setupintintstuff with OpenMP Split prolongpointstru into search-only (prolongpointstru_search) and append-only (prolongpointstru_append) functions. Parallelize shell-point interpolation table construction with #pragma omp parallel for collapse(3) and per-thread linked lists. Use static schedule for uniform workloads. Add OMP_FLAG = -fopenmp in makefile.inc and ShellPatch.o override rule in makefile for GCC OpenMP runtime (-lgomp already linked). Speedup: setupintintstuff ~2.2x faster on multi-core. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-05-10 22:31:18 +08:00
CGH0S7	70b6496ed3	Accelerate Shell-Patch CPU interpolation	2026-05-08 14:37:16 +08:00
CGH0S7	6ca9fece2e	Add OpenMPI rpath and LD_LIBRARY_PATH export for reliable linking - Export OMPI_ROOT/lib64 in LD_LIBRARY_PATH so mpicxx finds its runtime libs - Add -Wl,-rpath to embed OpenMPI lib64 path in executables for runtime - Replace hardcoded paths with OMPI_ROOT variable Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-04-28 22:43:57 +08:00
CGH0S7	516cdea502	Replace MKL with OpenBLAS - TwoPunctures.C: <mkl_cblas.h> → <cblas.h> - gaussj.C: <mkl_lapacke.h> → <lapacke.h> - makefile.inc: use -lopenblaso, remove MKLROOT dependency - makefile: remove -I${MKLROOT}/include from all flag variables - Add OpenMPI include path to filein (needed since g++ is used for .C compilation, not the mpicxx wrapper) Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-04-28 22:33:43 +08:00
CGH0S7	9687d9a3dd	Switch build system from Intel oneAPI to GCC + OpenMPI - Replace compilers: ifx→gfortran, icx→gcc, icpx→g++, mpiicpx→mpicxx - Replace flags: -xHost→-march=x86-64-v4, -ipo→-flto, -fpp→-cpp - Replace flags: -fp-model fast=2→-ffast-math, -fma→-mfma - Replace flags: -qopenmp→-fopenmp - Remove Intel-specific: -align array64byte, -liomp5, -lifcore, -limf - Switch MKL interface: -lmkl_intel_lp64→-lmkl_gf_lp64 (gfortran) - Replace TBB malloc with optional jemalloc (default off) - Disable PGO entirely (was already marked negative optimization) - TwoPunctureABE and ABE both verified to build successfully Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>	2026-04-28 22:00:58 +08:00