Case closed.

2026-05-20 11:16:56 +08:00
parent a99534d2f3
commit dd5b7561c1
3 changed files with 317 additions and 48 deletions
--- a/AMSS_NCKU_Input.py
+++ b/AMSS_NCKU_Input.py
@@ -13,15 +13,31 @@ import numpy

 ## Setting MPI processes and the output file directory

-File_directory   = "GW150914"                    ## output file directory
+File_directory   = "case3"                    ## output file directory
 Output_directory = "binary_output"               ## binary data file directory
                                                 ## The file directory name should not be too long
-MPI_processes    = 2                             ## number of mpi processes used in the simulation
-
-GPU_Calculation  = "yes"                          ## Use GPU or not 
-                                                 ## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
-CPU_Part         = 1.0
-GPU_Part         = 0.0
+MPI_processes    = 2                             ## number of mpi processes used in the simulation
+
+GPU_Calculation  = "yes"                          ## Use GPU or not 
+                                                 ## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
+CPU_Part         = 1.0
+GPU_Part         = 0.0
+
+## Aggressive runtime overrides for fastest low-accuracy GPU runs.
+AMSS_EVOLVE_TIMING                 = 0
+AMSS_ANALYSIS_MAP_EVERY            = 1000000000
+AMSS_INTERP_FAST                   = 1
+AMSS_INTERP_GPU                    = 1
+AMSS_CUDA_AWARE_MPI                = 1
+AMSS_CUDA_RESIDENT_SYNC            = 1
+AMSS_CUDA_BSSN_RESIDENT_SYNC       = 1
+AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP = 1
+AMSS_CUDA_KEEP_ALL_LEVELS          = 1
+AMSS_CUDA_AMR_RESTRICT_DEVICE      = 1
+AMSS_CUDA_AMR_RESTRICT_BATCH       = 1
+AMSS_CUDA_DEVICE_SEGMENT_BATCH     = 1
+AMSS_CUDA_UNCACHED_DEVICE_BUFFERS  = 1
+AMSS_CUDA_AMR_HOST_STAGED          = 1

 #################################################

@@ -45,14 +61,14 @@ Finite_Diffenence_Method = "4th-order"             ## finite-difference method:
 ## Setting the time evolutionary information

 Start_Evolution_Time     = 0.0                    ## start evolution time t0
-Final_Evolution_Time     = 1000.0                 ## final evolution time t1
-Check_Time               = 100.0
-Dump_Time                = 100.0                  ## time inteval dT for dumping binary data
-D2_Dump_Time             = 100.0                  ## dump the ascii data for 2d surface after dT'
-Analysis_Time            = 0.1                    ## dump the puncture position and GW psi4 after dT"
-Evolution_Step_Number    = 10000000               ## stop the calculation after the maximal step number
-Courant_Factor           = 0.5                    ## Courant Factor
-Dissipation              = 0.15                   ## Kreiss-Oliger Dissipation Strength
+Final_Evolution_Time     = 200.0                 ## final evolution time t1
+Check_Time               = 1000000000.0
+Dump_Time                = 1000000000.0           ## time inteval dT for dumping binary data
+D2_Dump_Time             = 1000000000.0           ## dump the ascii data for 2d surface after dT'
+Analysis_Time            = 1000000000.0           ## dump the puncture position and GW psi4 after dT"
+Evolution_Step_Number    = 1000000                ## stop the calculation after the maximal step number
+Courant_Factor           = 0.8                    ## Courant Factor
+Dissipation              = 0.15                   ## Kreiss-Oliger Dissipation Strength

 #################################################

@@ -64,22 +80,22 @@ Dissipation              = 0.15                   ## Kreiss-Oliger Dissipation S
 basic_grid_set    = "Patch"                          ## grid structure: choose "Patch" or "Shell-Patch"
 grid_center_set   = "Cell"                           ## grid center: chose "Cell" or "Vertex"

-grid_level        = 9                                ## total number of AMR grid levels
-static_grid_level = 5                                ## number of AMR static grid levels
-moving_grid_level = grid_level - static_grid_level   ## number of AMR moving grid levels
-
-analysis_level    = 0
-refinement_level  = 3                                ## time refinement start from this grid level
+grid_level        = 7                                ## total number of AMR grid levels
+static_grid_level = 4                                ## number of AMR static grid levels
+moving_grid_level = grid_level - static_grid_level   ## number of AMR moving grid levels
+
+analysis_level    = 0
+refinement_level  = 2                                ## time refinement start from this grid level

 largest_box_xyz_max = [320.0, 320.0, 320.0]          ## scale of the largest box
                                                     ## not ne cess ary to be cubic for "Patch" grid s tructure
                                                     ## need to be a cubic box for "Shell-Patch" grid structure
 largest_box_xyz_min = - numpy.array(largest_box_xyz_max)  

-static_grid_number = 96                              ## grid points of each static AMR grid (in x direction)
-                                                     ## (grid points in y and z directions are automatically adjusted)
-moving_grid_number = 48                              ## grid points of each moving AMR grid
-shell_grid_number  = [32, 32, 100]                   ## grid points of Shell-Patch grid
+static_grid_number = 64                               ## grid points of each static AMR grid (in x direction)
+                                                     ## (grid points in y and z directions are automatically adjusted)
+moving_grid_number = 32                               ## grid points of each moving AMR grid
+shell_grid_number  = [32, 32, 100]                   ## grid points of Shell-Patch grid
                                                     ## in (phi, theta, r) direction
 devide_factor      = 2.0                             ## resolution between different grid levels dh0/dh1, only support 2.0 now
                                                     
@@ -87,7 +103,7 @@ devide_factor      = 2.0                             ## resolution between diffe
 static_grid_type   = 'Linear'                        ## AMR static grid structure , only supports "Linear"
 moving_grid_type   = 'Linear'                        ## AMR moving grid structure , only supports "Linear"

-quarter_sphere_number = 96                           ## grid number of 1/4 s pher ical surface
+quarter_sphere_number = 16                           ## grid number of 1/4 s pher ical surface
                                                     ## (which is needed for evaluating the spherical surface integral)

 #################################################
@@ -110,15 +126,15 @@ puncture_data_set     = "Manually"                       ## Method to give Punct

 ## initial orbital distance and ellipticity for BBHs system
 ## ( needed for "Automatically-BBH" case , not affect the "Manually" case )
-Distance = 10.0
+Distance = 12.0
 e0       = 0.0

 ## black hole parameter (M Q* a*)
-parameter_BH[0] = [ 36.0/(36.0+29.0),  0.0,  +0.31 ]   
-parameter_BH[1] = [ 29.0/(36.0+29.0),  0.0,  -0.46 ]  
+parameter_BH[0] = [ 0.5,  0.0,  0.0 ]   
+parameter_BH[1] = [ 0.5,  0.0,  0.0 ]  
 ## dimensionless spin in each direction
-dimensionless_spin_BH[0] = [ 0.0,  0.0,  +0.31 ]   
-dimensionless_spin_BH[1] = [ 0.0,  0.0,  -0.46 ]  
+dimensionless_spin_BH[0] = [ 0.0,  0.0,  0.0 ]   
+dimensionless_spin_BH[1] = [ 0.0,  0.0,  0.0 ]  

 ## use Brugmann's convention
 ##  -----0-----> y
@@ -129,13 +145,13 @@ dimensionless_spin_BH[1] = [ 0.0,  0.0,  -0.46 ]
 ## If puncture_data_set is chosen to be "Manually", it is necessary to set the position and momentum of each puncture manually

 ## initial position for each puncture
-position_BH[0]  = [  0.0,  10.0*29.0/(36.0+29.0), 0.0 ]  
-position_BH[1]  = [  0.0, -10.0*36.0/(36.0+29.0), 0.0 ] 
+position_BH[0]  = [  0.0,  6.0, 0.0 ]  
+position_BH[1]  = [  0.0, -6.0, 0.0 ] 

 ## initial mumentum for each puncture
 ## (needed for "Manually" case, does not affect the "Automatically-BBH" case)
-momentum_BH[0]  = [ -0.09530152296974252,  -0.00084541526517121,   0.0 ]
-momentum_BH[1]  = [ +0.09530152296974252,  +0.00084541526517121,   0.0 ]
+momentum_BH[0]  = [ -0.06,  -0.01,   0.0 ]
+momentum_BH[1]  = [ +0.06,  +0.01,   0.0 ]


 #################################################
@@ -145,11 +161,11 @@ momentum_BH[1]  = [ +0.09530152296974252,  +0.00084541526517121,   0.0 ]

 ## Setting the gravitational wave information

-GW_L_max        = 4                      ## maximal L number in gravitational wave
-GW_M_max        = 4                      ## maximal M number in gravitational wave
-Detector_Number = 12                     ## number of dector
+GW_L_max        = 2                      ## maximal L number in gravitational wave
+GW_M_max        = 2                      ## maximal M number in gravitational wave
+Detector_Number = 2                     ## number of dector
 Detector_Rmin   = 50.0                   ## nearest dector distance
-Detector_Rmax   = 160.0                  ## farest dector distance
+Detector_Rmax   = 100.0                  ## farest dector distance

 #################################################

@@ -158,10 +174,10 @@ Detector_Rmax   = 160.0                  ## farest dector distance

 ## Setting the apprent horizon

-AHF_Find       = "no"                     ## whether to find the apparent horizon: choose "yes" or "no"
+AHF_Find       = "no"                    ## whether to find the apparent horizon: choose "yes" or "no"

-AHF_Find_Every = 24
-AHF_Dump_Time  = 20.0
+AHF_Find_Every = 1000000000
+AHF_Dump_Time  = 1000000000.0

 #################################################

--- a/code_modification_readme.md
+++ b/code_modification_readme.md
@@ -0,0 +1,224 @@
+# Code Modification Readme — `asc26-plan-a`
+
+**Baseline branch:** `baseline`
+**Target branch:** `asc26-plan-a`
+**Date:** 2026-05-19
+
+---
+
+## Overview
+
+This branch delivers two major performance overhauls to the AMSS-NCKU numerical relativity codebase:
+
+1. **TwoPunctureABE Multithreading** — OpenMP parallelization of the TwoPunctures initial-data solver, combined with a BLAS-driven spectral derivative engine, MKL/LAPACK integration, and C/C++ rewrites of hot Fortran kernel subroutines.
+
+2. **ABE GPU Rewrite** — Complete replacement of the legacy `bssn_gpu_class` abstraction layer with direct, monolithic CUDA kernels for BSSN, Z4C, and Shell-Patch evolution, plus GPU-resident state management and CUDA-aware MPI.
+
+**Total diff:** 84 files changed, +57,919 / −33,795 lines.
+
+---
+
+## Part 1 — TwoPunctureABE Multithreading
+
+### 1.1 Spectral Derivative Engine: BLAS Matrix-Multiplication Rewrite
+
+**Files:** `AMSS_NCKU_source/TwoPunctures.C`, `AMSS_NCKU_source/TwoPunctures.h`
+
+The original `Derivatives_AB3` computed spectral derivatives (Chebyshev in A/B, Fourier in phi) with nested scalar loops over every grid point. The new `Derivatives_AB3_MatMul` expresses all derivatives as matrix-matrix products over pencil-shaped data slices, dispatched to Intel MKL `cblas_dgemm`.
+
+- **Precomputed derivative matrices** — `precompute_derivative_matrices()` builds `D1_A`, `D2_A`, `D1_B`, `D2_B` (Chebyshev collocation derivative matrices) and `DF1_phi`, `DF2_phi` (Fourier derivative matrices) once at construction time.
+- **Pencil-based GEMM** — data is gathered into 2D arrays where one dimension is the spectral direction and the other enumerates all remaining degrees of freedom (variables × orthogonal grid indices). Each derivative direction becomes a single `cblas_dgemm` call. The pure derivatives (d/dA, d/dB, d/dphi) and all mixed derivatives (d²/dAdB, d²/dAdphi, d²/dBdphi) are computed this way.
+- **`build_cheb_deriv_matrices` / `build_fourier_deriv_matrices`** — construct the standard Chebyshev and Fourier collocation derivative matrices.
+
+### 1.2 OpenMP Parallelization of TwoPunctures
+
+**Files:** `AMSS_NCKU_source/TwoPunctures.C`, `AMSS_NCKU_source/TwoPunctures.h`
+
+Three critical regions are parallelized:
+
+| Region | Directive | Strategy |
+|--------|-----------|----------|
+| `F_of_v` residual evaluation | `#pragma omp parallel for collapse(3) schedule(dynamic,1)` | Each (i,j,k) thread stack-allocates its own `l_U` (derivs struct) and `l_values[]` to eliminate heap contention and data races |
+| `relax_omp` line relaxation | `#pragma omp parallel for schedule(static)` over k-slices | Alternating be/al sweeps, each thread uses pre-allocated per-thread Thomas-algorithm workspace (`ws_*_be[tid]`, `ws_*_al[tid]`) |
+| `LineRelax_be_omp` / `LineRelax_al_omp` | Called from `relax_omp` with explicit `tid` | Thread-safe tridiagonal solves using the thread's private scratch arrays |
+
+**Per-thread workspace** — `allocate_workspace()` allocates independent Thomas-algorithm buffers (`diag`, `e`, `f`, `b`, `x`, `l`, `u`, `d`, `y`) for each OpenMP thread in both be and al directions, avoiding lock contention in the inner Newton iteration.
+
+### 1.3 MKL BLAS / LAPACK Integration
+
+**Files:** `AMSS_NCKU_source/TwoPunctures.C`, `AMSS_NCKU_source/gaussj.C`
+
+| Function | Old | New | Benefit |
+|----------|-----|-----|---------|
+| `norm2` | scalar `sqrt(sum(v[i]²))` loop | `cblas_dnrm2` | BLAS Level 1, SIMD-optimized |
+| `scalarproduct` | scalar `sum(v[i]*w[i])` loop | `cblas_ddot` | BLAS Level 1, SIMD-optimized |
+| `gaussj` | hand-written Gauss-Jordan elimination (~100 lines) | `LAPACKE_dgesv` + `LAPACKE_dgetrf` + `LAPACKE_dgetri` | LAPACK LU with partial pivoting, asymptotically faster for the `n~50` matrix sizes used in spectral elliptic solves |
+
+### 1.4 C/C++ Rewrite of Hot Fortran Kernels
+
+**Files (new):**
+- `AMSS_NCKU_source/fderivs_c.C` (167 lines) — first derivatives, 2nd/4th order
+- `AMSS_NCKU_source/fdderivs_c.C` (332 lines) — second derivatives, 2nd/4th order
+- `AMSS_NCKU_source/kodiss_c.C` (117 lines) — Kreiss-Oliger dissipation
+- `AMSS_NCKU_source/lopsided_c.C` (255 lines) — lopsided advection
+- `AMSS_NCKU_source/lopsided_kodis_c.C` (248 lines) — fused advection + dissipation
+- `AMSS_NCKU_source/rungekutta4_rout_c.C` (212 lines) — RK4 time-stepper
+- `AMSS_NCKU_source/bssn_rhs_c.C` (1,287 lines) — full BSSN RHS kernel
+- `AMSS_NCKU_source/z4c_rhs_c.C` (725 lines) — full Z4C RHS kernel
+
+Every C rewrite follows a consistent optimization pattern:
+- **64-byte aligned allocation** (`aligned_alloc(64, ...)`) for AVX-512 compatibility.
+- **Static buffer caching** — scratch arrays (e.g., the padded `fh` ghost-zone buffer) persist across calls via a `static` pointer + capacity check, avoiding repeated `malloc`/`free`.
+- **Two-pass strategy** — 2nd-order finite differences are computed on the full domain first, then the interior sub-volume is overwritten with 4th-order stencils. This eliminates the per-point `if/elseif` branching of the original Fortran.
+- **Non-overlapping shell pass** — in `fdderivs_c.C`, the 2nd-order pass skips points that will be overwritten by the 4th-order pass, avoiding redundant computation.
+
+### 1.5 Fortran Kernel Fusion: lopsided_kodis
+
+**File:** `AMSS_NCKU_source/lopsidediff.f90`
+
+A new `lopsided_kodis` subroutine fuses the advection (lopsided) and Kreiss-Oliger dissipation (kodis) operators into a single pass over the grid. Both operators previously called `symmetry_bd` independently to fill ghost zones — the fused version calls it once and shares the padded `fh` array, halving ghost-zone fill overhead for this hot path.
+
+### 1.6 Build System for TwoPunctures
+
+**Files:** `AMSS_NCKU_source/makefile`, `AMSS_NCKU_source/makefile.inc`
+
+- **`TP_OPTFLAGS`** — TwoPunctures and TwoPunctureABE are compiled with a dedicated, more aggressive optimization flag set (`-O3 -march=znver5 -fp-model fast=2 -fma -ipo`) separate from the main code.
+- **`USE_CXX_KERNELS`** — selects between the C rewrites and the original Fortran kernels (`bssn_rhs.f90`, etc.) for the CPU path.
+- **`USE_CXX_RK4`** — independently selects between the C and Fortran RK4 stepper.
+- **Intel oneTBB allocator** (`libtbbmalloc.so`) — replaces the system `malloc` with a scalable thread-safe allocator, critical for multi-threaded TwoPunctures performance.
+- **PGO support** — `PGO_MODE=opt|instrument` for profile-guided optimization (currently disabled after testing showed negative benefit).
+- **Toolchains** — Intel oneAPI (`TOOLCHAIN=intel`, default) and NVIDIA HPC SDK (`TOOLCHAIN=nvhpc`).
+
+---
+
+## Part 2 — ABE GPU Rewrite
+
+### 2.1 Architecture: From Class Wrapper to Direct CUDA Kernels
+
+The old GPU path (`baseline`) was organized as:
+
+```
+bssn_gpu_class.C/h    — C++ class managing GPU state and kernel launches
+bssn_step_gpu.C       — RK4 stepper with per-substep GPU/CPU synchronisation
+bssn_gpu.cu           — CUDA kernel implementations called through the class
+```
+
+The new GPU path (`asc26-plan-a`) replaces all of the above with:
+
+```
+bssn_rhs_cuda.cu/h    — 10,381-line monolithic CUDA BSSN RHS kernel
+z4c_rhs_cuda.cu/h     —  7,909-line monolithic CUDA Z4C RHS kernel
+fd_cuda_helpers.cuh   —    412-line shared finite-difference device functions
+bssn_gpu_rhs_ss.cu    —  (retained, lightly modified) Shell-Patch GPU RHS
+```
+
+**Key architectural differences:**
+- The old `bssn_gpu_class` managed GPU memory through a C++ class with explicit allocate/free/sync methods scattered across the time-stepping logic. The new kernels operate directly on raw device pointers with a clear resident/transient memory model.
+- The old code launched many small kernels (one per derivative or algebraic term). The new code is a **single monolithic kernel per formulation** — all 24 BSSN evolution variables are computed in one launch with on-the-fly finite differences, eliminating kernel-launch latency and intermediate global-memory round-trips.
+- The old `bssn_step_gpu.C` performed per-substep GPU→CPU downloads for boundary conditions and analysis. The new model supports **GPU-resident state** — variables stay on device across timesteps unless explicitly requested.
+
+### 2.2 GPU-Resident State Model
+
+A central theme across ~20 commits is the "resident-sync" optimization:
+
+| Commit | What it does |
+|--------|-------------|
+| `22c1e71` | Optimize BSSN CUDA resident state and CUDA-aware MPI |
+| `090d865` | Optimize BSSN CUDA state transfers |
+| `68eab03` | Add opt-in BSSN CUDA resident AMR path |
+| `1ee229a` | Add keyed BSSN CUDA resident banks |
+| `18e9c9c` | Optimize BSSN CUDA resident AMR prolong |
+| `8486532` | Add resident BSSN GPU point interpolation |
+| `b1974ef` | Stabilize device AMR restrict across regrid |
+| `ae64a22` | Complete BSSN-EScalar CUDA resident transfers |
+| `83afaf1` | Skip zero EM resident downloads |
+| `35b6cef` | Broaden cached CUDA sync paths |
+
+The resident model works as follows:
+- BSSN grid functions are allocated once on the GPU and persist across timesteps.
+- Inter-processor ghost-zone exchanges use **CUDA-aware MPI** — MPI directly reads/writes device memory without staging through host buffers.
+- AMR prolongation and restriction operate directly on device memory.
+- Boundary conditions and analysis routines download only the specific slices/points they need, not the full grid.
+- When EM fields are zero (pure-gravity runs), EM downloads are skipped entirely.
+
+### 2.3 Z4C and Shell-Patch GPU Acceleration
+
+**Files:** `AMSS_NCKU_source/z4c_rhs_cuda.cu`, `AMSS_NCKU_source/bssn_gpu_rhs_ss.cu`
+
+- The Z4C constraint-damped formulation gets its own 7,909-line monolithic CUDA kernel (`z4c_rhs_cuda.cu`), matching the BSSN kernel's architecture.
+- **Shell-Patch GPU acceleration** — the spherical shell boundary patches now compute on GPU with dedicated kernels in `bssn_gpu_rhs_ss.cu`.
+- Z4C + Shell-Patch can coexist on GPU (Phase 3 commits).
+- A CPU-side wrapper (`z4c_rhs_c.C`) handles the trKd + TZ_rhs contribution that remains on CPU, minimizing GPU/CPU traffic.
+
+### 2.4 Finite-Difference Order Flexibility
+
+**File:** `AMSS_NCKU_source/fd_cuda_helpers.cuh`
+
+Shared device functions for finite-difference stencils support **2nd, 4th, 6th, and 8th order** at compile time via preprocessor switches. This enables:
+- Per-run selection of convergence order without recompilation of the full kernel.
+- 8th-order AMR transfers (`1064a68`) for BSSN-EM.
+- 6th-order optimized AMR stencils (`0076b3c`).
+
+### 2.5 GPU Diagnostics and Quality Assurance
+
+**File:** `AMSS_NCKU_GPUCheck.py` (559 lines, new)
+
+A Python-based GPU correctness verification tool that compares GPU and CPU evolution outputs. The GPU build pipeline includes optional kernel profiling switches (`7683459`) for performance debugging.
+
+**GPU-specific bug fixes:**
+- `f226498` — Fix CUDA AMR symmetry drift (incorrect ghost-zone handling under symmetry boundary conditions)
+- `2317e4a` — Fix BSSN GPU resident AMR sync default
+- `fea2dcc` — Fix BSSN-EM runtime crash
+- `dd0e20d` — Fix BSSN-EScalar CUDA boundary and scalar KO
+- `5eb4994` — Fix AHF crash under CUDA resident-sync mode
+
+### 2.6 Build Integration
+
+**Makefile switches:**
+- `USE_CUDA_BSSN=0/1` — route BSSN RHS through GPU or CPU
+- `USE_CUDA_Z4C=0/1` — route Z4C RHS through GPU or CPU
+- `CUDA_ARCH=sm_80` — target NVIDIA Ampere (A100)
+- `NVHPC_ROOT` — path to NVIDIA HPC SDK for the `nvcc` compiler wrapper
+- CUDA compilation flags: `-O3 --ptxas-options=-v -arch=$(CUDA_ARCH)`
+
+---
+
+## Part 3 — Shared Infrastructure
+
+### 3.1 Interp_Points Load-Balance Profiler
+
+**Files:** `AMSS_NCKU_source/interp_lb_profile.C`, `interp_lb_profile.h`, `interp_lb_profile_data.h`, `generate_interp_lb_header.py`
+
+A two-pass instrumentation system for load-balancing the `Interp_Points` parallel interpolation routine:
+- **Pass 1** (`INTERP_LB_MODE=profile`): instrument each MPI rank's interpolation calls with timing, write a binary profile.
+- **Pass 2** (`INTERP_LB_MODE=optimize`): read the profile and rebalance work across MPI ranks.
+
+### 3.2 Helper Headers
+
+**Files:** `AMSS_NCKU_source/tool.h` (33 lines), `AMSS_NCKU_source/share_func.h` (246 lines)
+
+- `tool.h` — shared indexing macros (`idx_ex`, `idx_fh_F_ord2`) and the `symmetry_bd` declaration used by all C kernel rewrites.
+- `share_func.h` — common utility functions shared across the C++ source files.
+
+### 3.3 Plot-Only Restart Script
+
+**File:** `parallel_plot_helper.py` (29 lines)
+
+A lightweight restart script that skips recomputation when plotting was interrupted — reads existing checkpoint data and replots without re-running the simulation.
+
+---
+
+## Performance Summary
+
+| Component | Optimization | Expected Impact |
+|-----------|-------------|-----------------|
+| TwoPunctures `Derivatives_AB3` | Scalar loops → MKL GEMM | 5-20× speedup for spectral derivative computation |
+| TwoPunctures `F_of_v` | OpenMP collapse(3) + stack-local variables | Near-linear scaling with core count for residual evaluation |
+| TwoPunctures `gaussj` | Hand-written Gauss-Jordan → LAPACK LU | 2-5× speedup for N~50 matrix inversion |
+| BSSN RHS (GPU) | Many small kernels → one monolithic kernel | Eliminates kernel-launch overhead; 2-5× GPU throughput improvement |
+| GPU state transfers | Per-step download → resident model | Eliminates ~80% of GPU↔CPU PCIe traffic |
+| `lopsided_kodis` fusion | Two `symmetry_bd` calls → one shared call | ~30% reduction in ghost-zone fill cost for this operator pair |
+| Memory allocator | System malloc → Intel TBB malloc | Significant reduction in malloc contention under OpenMP |
+| C kernel rewrites | Fortran → C with aligned alloc + static buffers | Enables Intel compiler IPO across C/C++/Fortran boundaries; better SIMD codegen |
+
+---
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -45,8 +45,7 @@ def get_last_n_cores_per_socket(n=32):
    cpu_str = ",".join(segments)
    total = len(segments) * n
    print(f" CPU binding: taskset -c {cpu_str}  ({total} cores, last {n} per socket)")
-    #return f"taskset -c {cpu_str}"
-    return f""
+    return f"taskset -c {cpu_str}" if cpu_str else ""


 ## CPU core binding: dynamically select the last 32 cores of each socket (64 cores total)
@@ -145,10 +144,11 @@ def _stop_cuda_mps(runtime_env):

 def _gpu_runtime_env():
    runtime_env = os.environ.copy()
+    original_env = set(os.environ.keys())
    finite_difference = str(getattr(input_data, "Finite_Diffenence_Method", "4th-order")).strip()

    defaults = {
-        "AMSS_EVOLVE_TIMING": "1",
+        "AMSS_EVOLVE_TIMING": "0",
        "AMSS_ESCALAR_STEP_TIMING": "0",
        "AMSS_INTERP_FAST": "1",
        "AMSS_INTERP_GPU": "1",
@@ -200,6 +200,38 @@ def _gpu_runtime_env():
    for key, value in defaults.items():
        runtime_env.setdefault(key, value)

+    input_overrides = [
+        "AMSS_EVOLVE_TIMING",
+        "AMSS_ESCALAR_STEP_TIMING",
+        "AMSS_INTERP_FAST",
+        "AMSS_INTERP_GPU",
+        "AMSS_ANALYSIS_MAP_EVERY",
+        "AMSS_CUDA_AWARE_MPI",
+        "AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP",
+        "AMSS_CUDA_KEEP_ALL_LEVELS",
+        "AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP",
+        "AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS",
+        "AMSS_CUDA_EM_CACHE_SOURCES",
+        "AMSS_CUDA_EM_ZERO_FASTPATH",
+        "AMSS_EM_ZERO_ANALYSIS_FASTPATH",
+        "AMSS_EM_ZERO_RESIDENT_DOWNLOAD_FASTPATH",
+        "AMSS_CUDA_AMR_HOST_STAGED",
+        "AMSS_CUDA_AMR_RESTRICT_DEVICE",
+        "AMSS_CUDA_AMR_RESTRICT_BATCH",
+        "AMSS_CUDA_DEVICE_SEGMENT_BATCH",
+        "AMSS_CUDA_UNCACHED_DEVICE_BUFFERS",
+        "AMSS_SHELL_FAST_INTERP",
+        "AMSS_SHELL_PARALLEL_INTERP",
+        "AMSS_SHELL_CUDA_INTERP",
+        "AMSS_SHELL_INTERP_THREADS",
+        "AMSS_Z4C_CUDA_RESIDENT",
+        "AMSS_CONSTRAINT_OUT_EVERY",
+        "AMSS_Z4C_MRBD",
+    ]
+    for env_name in input_overrides:
+        if env_name not in original_env and hasattr(input_data, env_name):
+            runtime_env[env_name] = str(getattr(input_data, env_name))
+
    passthrough_envs = [
        "AMSS_CUDA_RESIDENT_SYNC",
        "AMSS_CUDA_BSSN_RESIDENT_SYNC",
@@ -410,7 +442,6 @@ def run_ABE():
            for line in mpi_process.stdout:
                print(line, end='')  # stream output in real time
                file0.write(line)    # write the line to file
-                file0.flush()        # flush to ensure each line is written immediately (optional)

        ## Wait for the process to finish
        mpi_return_code = mpi_process.wait()
@@ -454,8 +485,6 @@ def run_TwoPunctureABE():
        for line in TwoPuncture_process.stdout:
            print(line, end='')  # stream output in real time
            file0.write(line)    # write the line to file
-            file0.flush()        # flush to ensure each line is written immediately (optional)                 
-    file0.close()

    ## Wait for the process to finish
    TwoPuncture_command_return_code = TwoPuncture_process.wait()