Batch symbd_pack/lopsided/kodiss over all state variables

2026-04-13 11:02:55 +08:00
parent 1b3c0b80d2
commit c49a4e00c9
1 changed files with 483 additions and 89 deletions
--- a/AMSS_NCKU_source/bssn_rhs_cuda.cu
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu
@@ -12,6 +12,7 @@
 #include <cmath>
 #include <cstring>
 #include <unordered_map>
+#include <vector>
 #include <cuda_runtime.h>
 #include "macrodef.h"
 #include "bssn_rhs.h"
@@ -196,6 +197,33 @@ __device__ __forceinline__ int idx_fh3(int iF, int jF, int kF) {
    return (iF + 2) + (jF + 2) * d_gp.fh3_nx + (kF + 2) * d_gp.fh3_nx * d_gp.fh3_ny;
 }

+__device__ __forceinline__ double fetch_sym_ord3_direct(const double *src,
+                                                        int iF, int jF, int kF,
+                                                        int SoA0, int SoA1, int SoA2)
+{
+    int siF = iF;
+    int sjF = jF;
+    int skF = kF;
+    double sign = 1.0;
+
+    if (iF <= 0) {
+        siF = 1 - iF;
+        sign *= (double)SoA0;
+    }
+    if (jF <= 0) {
+        sjF = 1 - jF;
+        sign *= (double)SoA1;
+    }
+    if (kF <= 0) {
+        skF = 1 - kF;
+        sign *= (double)SoA2;
+    }
+
+    return sign * src[(siF - 1)
+                    + (sjF - 1) * d_gp.ex[0]
+                    + (skF - 1) * d_gp.ex[0] * d_gp.ex[1]];
+}
+
 /* ------------------------------------------------------------------ */
 /*  GPU buffer management                                              */
 /* ------------------------------------------------------------------ */
@@ -290,6 +318,7 @@ static const int STAGE_SLOT_COUNT =

 static constexpr int BSSN_STATE_COUNT = 24;
 static constexpr int BSSN_MATTER_COUNT = 10;
+static constexpr int BSSN_LK_FIELD_COUNT = 24;

 static const int k_state_input_slots[BSSN_STATE_COUNT] = {
    S_chi, S_trK, S_dxx, S_gxy, S_gxz, S_dyy, S_gyz, S_dzz,
@@ -312,6 +341,52 @@ static const int k_matter_slots[BSSN_MATTER_COUNT] = {
    S_rho, S_Sx, S_Sy, S_Sz, S_Sxx, S_Sxy, S_Sxz, S_Syy, S_Syz, S_Szz
 };

+static const int k_lk_adv_slots[BSSN_LK_FIELD_COUNT] = {
+    S_gxx, S_Gamz, S_gxy, S_Lap, S_gxz, S_betax, S_gyy, S_betay,
+    S_gyz, S_betaz, S_gzz, S_dtSfx, S_Axx, S_dtSfy, S_Axy, S_dtSfz,
+    S_Axz, S_Ayy, S_Ayz, S_Azz, S_chi, S_trK, S_Gamx, S_Gamy
+};
+
+static const int k_lk_ko_slots[BSSN_LK_FIELD_COUNT] = {
+    S_dxx, S_Gamz, S_gxy, S_Lap, S_gxz, S_betax, S_dyy, S_betay,
+    S_gyz, S_betaz, S_dzz, S_dtSfx, S_Axx, S_dtSfy, S_Axy, S_dtSfz,
+    S_Axz, S_Ayy, S_Ayz, S_Azz, S_chi, S_trK, S_Gamx, S_Gamy
+};
+
+static const int k_lk_rhs_slots[BSSN_LK_FIELD_COUNT] = {
+    S_gxx_rhs, S_Gamz_rhs, S_gxy_rhs, S_Lap_rhs, S_gxz_rhs, S_betax_rhs,
+    S_gyy_rhs, S_betay_rhs, S_gyz_rhs, S_betaz_rhs, S_gzz_rhs, S_dtSfx_rhs,
+    S_Axx_rhs, S_dtSfy_rhs, S_Axy_rhs, S_dtSfz_rhs, S_Axz_rhs, S_Ayy_rhs,
+    S_Ayz_rhs, S_Azz_rhs, S_chi_rhs, S_trK_rhs, S_Gamx_rhs, S_Gamy_rhs
+};
+
+static const int k_lk_soa_signs[3 * BSSN_LK_FIELD_COUNT] = {
+     1,  1,  1,
+     1,  1, -1,
+    -1, -1,  1,
+     1,  1,  1,
+    -1,  1, -1,
+    -1,  1,  1,
+     1,  1,  1,
+     1, -1,  1,
+     1, -1, -1,
+     1,  1, -1,
+     1,  1,  1,
+    -1,  1,  1,
+     1,  1,  1,
+     1, -1,  1,
+    -1, -1,  1,
+     1,  1, -1,
+    -1,  1, -1,
+     1,  1,  1,
+     1, -1, -1,
+     1,  1,  1,
+     1,  1,  1,
+     1,  1,  1,
+    -1,  1,  1,
+     1, -1,  1
+};
+
 struct StepContext {
    double *d_state0_mem;
    double *d_accum_mem;
@@ -326,9 +401,101 @@ struct StepContext {
    size_t cap_all;
    bool matter_ready;
    bool state_ready;
+
+    StepContext()
+        : d_state0_mem(nullptr), d_accum_mem(nullptr),
+          d_state_curr_mem(nullptr), d_state_next_mem(nullptr),
+          d_matter_mem(nullptr), cap_all(0),
+          matter_ready(false), state_ready(false)
+    {
+        d_state0.fill(nullptr);
+        d_accum.fill(nullptr);
+        d_state_curr.fill(nullptr);
+        d_state_next.fill(nullptr);
+        d_matter.fill(nullptr);
+    }
+};
+
+struct StepAllocation {
+    double *d_state0_mem;
+    double *d_accum_mem;
+    double *d_state_curr_mem;
+    double *d_state_next_mem;
+    double *d_matter_mem;
+    size_t cap_all;
 };

 static std::unordered_map<void *, StepContext> g_step_ctx;
+static std::vector<StepAllocation> g_step_pool;
+
+static StepAllocation empty_step_allocation()
+{
+    StepAllocation alloc = {nullptr, nullptr, nullptr, nullptr, nullptr, 0};
+    return alloc;
+}
+
+static bool has_step_allocation(const StepAllocation &alloc)
+{
+    return alloc.cap_all != 0;
+}
+
+static StepAllocation detach_step_allocation(StepContext &ctx)
+{
+    StepAllocation alloc = {
+        ctx.d_state0_mem, ctx.d_accum_mem, ctx.d_state_curr_mem,
+        ctx.d_state_next_mem, ctx.d_matter_mem, ctx.cap_all
+    };
+    ctx.d_state0_mem = nullptr;
+    ctx.d_accum_mem = nullptr;
+    ctx.d_state_curr_mem = nullptr;
+    ctx.d_state_next_mem = nullptr;
+    ctx.d_matter_mem = nullptr;
+    ctx.cap_all = 0;
+    ctx.matter_ready = false;
+    ctx.state_ready = false;
+    ctx.d_state0.fill(nullptr);
+    ctx.d_accum.fill(nullptr);
+    ctx.d_state_curr.fill(nullptr);
+    ctx.d_state_next.fill(nullptr);
+    ctx.d_matter.fill(nullptr);
+    return alloc;
+}
+
+static void attach_step_allocation(StepContext &ctx, const StepAllocation &alloc)
+{
+    ctx.d_state0_mem = alloc.d_state0_mem;
+    ctx.d_accum_mem = alloc.d_accum_mem;
+    ctx.d_state_curr_mem = alloc.d_state_curr_mem;
+    ctx.d_state_next_mem = alloc.d_state_next_mem;
+    ctx.d_matter_mem = alloc.d_matter_mem;
+    ctx.cap_all = alloc.cap_all;
+    ctx.matter_ready = false;
+    ctx.state_ready = false;
+}
+
+static void recycle_step_allocation(StepAllocation &alloc)
+{
+    if (!has_step_allocation(alloc)) return;
+    g_step_pool.push_back(alloc);
+    alloc = empty_step_allocation();
+}
+
+static StepAllocation acquire_step_allocation(size_t all)
+{
+    size_t best = g_step_pool.size();
+    for (size_t i = 0; i < g_step_pool.size(); ++i) {
+        if (g_step_pool[i].cap_all < all) continue;
+        if (best == g_step_pool.size() || g_step_pool[i].cap_all < g_step_pool[best].cap_all)
+            best = i;
+    }
+    if (best == g_step_pool.size())
+        return empty_step_allocation();
+
+    StepAllocation alloc = g_step_pool[best];
+    g_step_pool[best] = g_step_pool.back();
+    g_step_pool.pop_back();
+    return alloc;
+}

 static void ensure_gpu_buffers(int nx, int ny, int nz) {
    size_t all = (size_t)nx * ny * nz;
@@ -385,34 +552,19 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all)
 {
    StepContext &ctx = g_step_ctx[block_tag];
    if (ctx.cap_all < all) {
-        if (ctx.d_state0_mem) {
-            cudaFree(ctx.d_state0_mem);
-            ctx.d_state0_mem = nullptr;
+        StepAllocation old_alloc = detach_step_allocation(ctx);
+        recycle_step_allocation(old_alloc);
+
+        StepAllocation alloc = acquire_step_allocation(all);
+        if (!has_step_allocation(alloc)) {
+            CUDA_CHECK(cudaMalloc(&alloc.d_state0_mem, BSSN_STATE_COUNT * all * sizeof(double)));
+            CUDA_CHECK(cudaMalloc(&alloc.d_accum_mem, BSSN_STATE_COUNT * all * sizeof(double)));
+            CUDA_CHECK(cudaMalloc(&alloc.d_state_curr_mem, BSSN_STATE_COUNT * all * sizeof(double)));
+            CUDA_CHECK(cudaMalloc(&alloc.d_state_next_mem, BSSN_STATE_COUNT * all * sizeof(double)));
+            CUDA_CHECK(cudaMalloc(&alloc.d_matter_mem, BSSN_MATTER_COUNT * all * sizeof(double)));
+            alloc.cap_all = all;
        }
-        if (ctx.d_accum_mem) {
-            cudaFree(ctx.d_accum_mem);
-            ctx.d_accum_mem = nullptr;
-        }
-        if (ctx.d_state_curr_mem) {
-            cudaFree(ctx.d_state_curr_mem);
-            ctx.d_state_curr_mem = nullptr;
-        }
-        if (ctx.d_state_next_mem) {
-            cudaFree(ctx.d_state_next_mem);
-            ctx.d_state_next_mem = nullptr;
-        }
-        if (ctx.d_matter_mem) {
-            cudaFree(ctx.d_matter_mem);
-            ctx.d_matter_mem = nullptr;
-        }
-        CUDA_CHECK(cudaMalloc(&ctx.d_state0_mem, BSSN_STATE_COUNT * all * sizeof(double)));
-        CUDA_CHECK(cudaMalloc(&ctx.d_accum_mem, BSSN_STATE_COUNT * all * sizeof(double)));
-        CUDA_CHECK(cudaMalloc(&ctx.d_state_curr_mem, BSSN_STATE_COUNT * all * sizeof(double)));
-        CUDA_CHECK(cudaMalloc(&ctx.d_state_next_mem, BSSN_STATE_COUNT * all * sizeof(double)));
-        CUDA_CHECK(cudaMalloc(&ctx.d_matter_mem, BSSN_MATTER_COUNT * all * sizeof(double)));
-        ctx.cap_all = all;
-        ctx.matter_ready = false;
-        ctx.state_ready = false;
+        attach_step_allocation(ctx, alloc);
    }
    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
        ctx.d_state0[i] = ctx.d_state0_mem + (size_t)i * all;
@@ -430,11 +582,8 @@ static void release_step_ctx(void *block_tag)
 {
    auto it = g_step_ctx.find(block_tag);
    if (it == g_step_ctx.end()) return;
-    if (it->second.d_state0_mem) cudaFree(it->second.d_state0_mem);
-    if (it->second.d_accum_mem) cudaFree(it->second.d_accum_mem);
-    if (it->second.d_state_curr_mem) cudaFree(it->second.d_state_curr_mem);
-    if (it->second.d_state_next_mem) cudaFree(it->second.d_state_next_mem);
-    if (it->second.d_matter_mem) cudaFree(it->second.d_matter_mem);
+    StepAllocation alloc = detach_step_allocation(it->second);
+    recycle_step_allocation(alloc);
    g_step_ctx.erase(it);
 }

@@ -1042,6 +1191,15 @@ void kern_kodis(const double * __restrict__ fh,
 /* ================================================================== */
 /*  Host wrapper helpers                                               */
 /* ================================================================== */
+__constant__ const double *d_lk_adv_fields[BSSN_LK_FIELD_COUNT];
+__constant__ const double *d_lk_ko_fields[BSSN_LK_FIELD_COUNT];
+__constant__ double *d_lk_rhs_fields[BSSN_LK_FIELD_COUNT];
+__constant__ int d_lk_soa_signs[3 * BSSN_LK_FIELD_COUNT];
+
+__constant__ const double *d_rk4_f0_fields[BSSN_STATE_COUNT];
+__constant__ double *d_rk4_rhs_fields[BSSN_STATE_COUNT];
+__constant__ double *d_rk4_accum_fields[BSSN_STATE_COUNT];
+
 static const int BLK = 128;
 static inline int grid(size_t n) {
    if (n == 0) return 1;
@@ -1106,6 +1264,239 @@ static void gpu_lopsided_kodis(double *d_f_adv, double *d_f_ko, double *d_f_rhs,
    }
 }

+__global__ __launch_bounds__(128, 4)
+void kern_lopsided_kodis_batched(const double * __restrict__ Sfx,
+                                 const double * __restrict__ Sfy,
+                                 const double * __restrict__ Sfz,
+                                 double eps_val)
+{
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= d_gp.all) return;
+
+    const int field = blockIdx.y;
+    const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
+    const int iminF = d_gp.iminF3, jminF = d_gp.jminF3, kminF = d_gp.kminF3;
+    const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
+    const int SoA0 = d_lk_soa_signs[3 * field + 0];
+    const int SoA1 = d_lk_soa_signs[3 * field + 1];
+    const int SoA2 = d_lk_soa_signs[3 * field + 2];
+    const double *adv_src = d_lk_adv_fields[field];
+    const double *ko_src = d_lk_ko_fields[field];
+    double *rhs = d_lk_rhs_fields[field];
+
+    const int i0 = tid % nx;
+    const int j0 = (tid / nx) % ny;
+    const int k0 = tid / (nx * ny);
+    const int iF = i0 + 1;
+    const int jF = j0 + 1;
+    const int kF = k0 + 1;
+
+    if (i0 <= nx - 2 && j0 <= ny - 2 && k0 <= nz - 2) {
+        double val = 0.0;
+
+        const double sfx = Sfx[tid];
+        if (sfx > 0.0) {
+            if (i0 <= nx - 4) {
+                val += sfx * d_gp.d12dx * (
+                    -3.0 * fetch_sym_ord3_direct(adv_src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
+                    -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2)
+                    +18.0 * fetch_sym_ord3_direct(adv_src, iF + 1, jF, kF, SoA0, SoA1, SoA2)
+                    - 6.0 * fetch_sym_ord3_direct(adv_src, iF + 2, jF, kF, SoA0, SoA1, SoA2)
+                    +       fetch_sym_ord3_direct(adv_src, iF + 3, jF, kF, SoA0, SoA1, SoA2));
+            } else if (i0 <= nx - 3) {
+                val += sfx * d_gp.d12dx * (
+                    fetch_sym_ord3_direct(adv_src, iF - 2, jF, kF, SoA0, SoA1, SoA2)
+                    - 8.0 * fetch_sym_ord3_direct(adv_src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
+                    + 8.0 * fetch_sym_ord3_direct(adv_src, iF + 1, jF, kF, SoA0, SoA1, SoA2)
+                    -       fetch_sym_ord3_direct(adv_src, iF + 2, jF, kF, SoA0, SoA1, SoA2));
+            } else {
+                val -= sfx * d_gp.d12dx * (
+                    -3.0 * fetch_sym_ord3_direct(adv_src, iF + 1, jF, kF, SoA0, SoA1, SoA2)
+                    -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2)
+                    +18.0 * fetch_sym_ord3_direct(adv_src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
+                    - 6.0 * fetch_sym_ord3_direct(adv_src, iF - 2, jF, kF, SoA0, SoA1, SoA2)
+                    +       fetch_sym_ord3_direct(adv_src, iF - 3, jF, kF, SoA0, SoA1, SoA2));
+            }
+        } else if (sfx < 0.0) {
+            if ((i0 - 2) >= iminF) {
+                val -= sfx * d_gp.d12dx * (
+                    -3.0 * fetch_sym_ord3_direct(adv_src, iF + 1, jF, kF, SoA0, SoA1, SoA2)
+                    -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2)
+                    +18.0 * fetch_sym_ord3_direct(adv_src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
+                    - 6.0 * fetch_sym_ord3_direct(adv_src, iF - 2, jF, kF, SoA0, SoA1, SoA2)
+                    +       fetch_sym_ord3_direct(adv_src, iF - 3, jF, kF, SoA0, SoA1, SoA2));
+            } else if ((i0 - 1) >= iminF) {
+                val += sfx * d_gp.d12dx * (
+                    fetch_sym_ord3_direct(adv_src, iF - 2, jF, kF, SoA0, SoA1, SoA2)
+                    - 8.0 * fetch_sym_ord3_direct(adv_src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
+                    + 8.0 * fetch_sym_ord3_direct(adv_src, iF + 1, jF, kF, SoA0, SoA1, SoA2)
+                    -       fetch_sym_ord3_direct(adv_src, iF + 2, jF, kF, SoA0, SoA1, SoA2));
+            } else if (i0 >= iminF) {
+                val += sfx * d_gp.d12dx * (
+                    -3.0 * fetch_sym_ord3_direct(adv_src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
+                    -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2)
+                    +18.0 * fetch_sym_ord3_direct(adv_src, iF + 1, jF, kF, SoA0, SoA1, SoA2)
+                    - 6.0 * fetch_sym_ord3_direct(adv_src, iF + 2, jF, kF, SoA0, SoA1, SoA2)
+                    +       fetch_sym_ord3_direct(adv_src, iF + 3, jF, kF, SoA0, SoA1, SoA2));
+            }
+        }
+
+        const double sfy = Sfy[tid];
+        if (sfy > 0.0) {
+            if (j0 <= ny - 4) {
+                val += sfy * d_gp.d12dy * (
+                    -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
+                    -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2)
+                    +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 1, kF, SoA0, SoA1, SoA2)
+                    - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 2, kF, SoA0, SoA1, SoA2)
+                    +       fetch_sym_ord3_direct(adv_src, iF, jF + 3, kF, SoA0, SoA1, SoA2));
+            } else if (j0 <= ny - 3) {
+                val += sfy * d_gp.d12dy * (
+                    fetch_sym_ord3_direct(adv_src, iF, jF - 2, kF, SoA0, SoA1, SoA2)
+                    - 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
+                    + 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 1, kF, SoA0, SoA1, SoA2)
+                    -       fetch_sym_ord3_direct(adv_src, iF, jF + 2, kF, SoA0, SoA1, SoA2));
+            } else {
+                val -= sfy * d_gp.d12dy * (
+                    -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 1, kF, SoA0, SoA1, SoA2)
+                    -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2)
+                    +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
+                    - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 2, kF, SoA0, SoA1, SoA2)
+                    +       fetch_sym_ord3_direct(adv_src, iF, jF - 3, kF, SoA0, SoA1, SoA2));
+            }
+        } else if (sfy < 0.0) {
+            if ((j0 - 2) >= jminF) {
+                val -= sfy * d_gp.d12dy * (
+                    -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 1, kF, SoA0, SoA1, SoA2)
+                    -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2)
+                    +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
+                    - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 2, kF, SoA0, SoA1, SoA2)
+                    +       fetch_sym_ord3_direct(adv_src, iF, jF - 3, kF, SoA0, SoA1, SoA2));
+            } else if ((j0 - 1) >= jminF) {
+                val += sfy * d_gp.d12dy * (
+                    fetch_sym_ord3_direct(adv_src, iF, jF - 2, kF, SoA0, SoA1, SoA2)
+                    - 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
+                    + 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 1, kF, SoA0, SoA1, SoA2)
+                    -       fetch_sym_ord3_direct(adv_src, iF, jF + 2, kF, SoA0, SoA1, SoA2));
+            } else if (j0 >= jminF) {
+                val += sfy * d_gp.d12dy * (
+                    -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
+                    -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2)
+                    +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 1, kF, SoA0, SoA1, SoA2)
+                    - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 2, kF, SoA0, SoA1, SoA2)
+                    +       fetch_sym_ord3_direct(adv_src, iF, jF + 3, kF, SoA0, SoA1, SoA2));
+            }
+        }
+
+        const double sfz = Sfz[tid];
+        if (sfz > 0.0) {
+            if (k0 <= nz - 4) {
+                val += sfz * d_gp.d12dz * (
+                    -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
+                    -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2)
+                    +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 1, SoA0, SoA1, SoA2)
+                    - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 2, SoA0, SoA1, SoA2)
+                    +       fetch_sym_ord3_direct(adv_src, iF, jF, kF + 3, SoA0, SoA1, SoA2));
+            } else if (k0 <= nz - 3) {
+                val += sfz * d_gp.d12dz * (
+                    fetch_sym_ord3_direct(adv_src, iF, jF, kF - 2, SoA0, SoA1, SoA2)
+                    - 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
+                    + 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 1, SoA0, SoA1, SoA2)
+                    -       fetch_sym_ord3_direct(adv_src, iF, jF, kF + 2, SoA0, SoA1, SoA2));
+            } else {
+                val -= sfz * d_gp.d12dz * (
+                    -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 1, SoA0, SoA1, SoA2)
+                    -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2)
+                    +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
+                    - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 2, SoA0, SoA1, SoA2)
+                    +       fetch_sym_ord3_direct(adv_src, iF, jF, kF - 3, SoA0, SoA1, SoA2));
+            }
+        } else if (sfz < 0.0) {
+            if ((k0 - 2) >= kminF) {
+                val -= sfz * d_gp.d12dz * (
+                    -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 1, SoA0, SoA1, SoA2)
+                    -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2)
+                    +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
+                    - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 2, SoA0, SoA1, SoA2)
+                    +       fetch_sym_ord3_direct(adv_src, iF, jF, kF - 3, SoA0, SoA1, SoA2));
+            } else if ((k0 - 1) >= kminF) {
+                val += sfz * d_gp.d12dz * (
+                    fetch_sym_ord3_direct(adv_src, iF, jF, kF - 2, SoA0, SoA1, SoA2)
+                    - 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
+                    + 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 1, SoA0, SoA1, SoA2)
+                    -       fetch_sym_ord3_direct(adv_src, iF, jF, kF + 2, SoA0, SoA1, SoA2));
+            } else if (k0 >= kminF) {
+                val += sfz * d_gp.d12dz * (
+                    -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
+                    -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2)
+                    +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 1, SoA0, SoA1, SoA2)
+                    - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 2, SoA0, SoA1, SoA2)
+                    +       fetch_sym_ord3_direct(adv_src, iF, jF, kF + 3, SoA0, SoA1, SoA2));
+            }
+        }
+
+        rhs[tid] += val;
+    }
+
+    if (eps_val > 0.0 &&
+        (iF - 3) >= iminF && (iF + 3) <= imaxF &&
+        (jF - 3) >= jminF && (jF + 3) <= jmaxF &&
+        (kF - 3) >= kminF && (kF + 3) <= kmaxF)
+    {
+        const double cof = 64.0;
+        const double Dx =
+            (fetch_sym_ord3_direct(ko_src, iF - 3, jF, kF, SoA0, SoA1, SoA2)
+           + fetch_sym_ord3_direct(ko_src, iF + 3, jF, kF, SoA0, SoA1, SoA2))
+          - 6.0 * (fetch_sym_ord3_direct(ko_src, iF - 2, jF, kF, SoA0, SoA1, SoA2)
+                 + fetch_sym_ord3_direct(ko_src, iF + 2, jF, kF, SoA0, SoA1, SoA2))
+          + 15.0 * (fetch_sym_ord3_direct(ko_src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
+                  + fetch_sym_ord3_direct(ko_src, iF + 1, jF, kF, SoA0, SoA1, SoA2))
+          - 20.0 * fetch_sym_ord3_direct(ko_src, iF, jF, kF, SoA0, SoA1, SoA2);
+
+        const double Dy =
+            (fetch_sym_ord3_direct(ko_src, iF, jF - 3, kF, SoA0, SoA1, SoA2)
+           + fetch_sym_ord3_direct(ko_src, iF, jF + 3, kF, SoA0, SoA1, SoA2))
+          - 6.0 * (fetch_sym_ord3_direct(ko_src, iF, jF - 2, kF, SoA0, SoA1, SoA2)
+                 + fetch_sym_ord3_direct(ko_src, iF, jF + 2, kF, SoA0, SoA1, SoA2))
+          + 15.0 * (fetch_sym_ord3_direct(ko_src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
+                  + fetch_sym_ord3_direct(ko_src, iF, jF + 1, kF, SoA0, SoA1, SoA2))
+          - 20.0 * fetch_sym_ord3_direct(ko_src, iF, jF, kF, SoA0, SoA1, SoA2);
+
+        const double Dz =
+            (fetch_sym_ord3_direct(ko_src, iF, jF, kF - 3, SoA0, SoA1, SoA2)
+           + fetch_sym_ord3_direct(ko_src, iF, jF, kF + 3, SoA0, SoA1, SoA2))
+          - 6.0 * (fetch_sym_ord3_direct(ko_src, iF, jF, kF - 2, SoA0, SoA1, SoA2)
+                 + fetch_sym_ord3_direct(ko_src, iF, jF, kF + 2, SoA0, SoA1, SoA2))
+          + 15.0 * (fetch_sym_ord3_direct(ko_src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
+                  + fetch_sym_ord3_direct(ko_src, iF, jF, kF + 1, SoA0, SoA1, SoA2))
+          - 20.0 * fetch_sym_ord3_direct(ko_src, iF, jF, kF, SoA0, SoA1, SoA2);
+
+        rhs[tid] += (eps_val / cof) * (Dx / d_gp.dX + Dy / d_gp.dY + Dz / d_gp.dZ);
+    }
+}
+
+static void gpu_lopsided_kodis_state_batch(double eps_val, int all)
+{
+    const double *adv_fields[BSSN_LK_FIELD_COUNT];
+    const double *ko_fields[BSSN_LK_FIELD_COUNT];
+    double *rhs_fields[BSSN_LK_FIELD_COUNT];
+
+    for (int i = 0; i < BSSN_LK_FIELD_COUNT; ++i) {
+        adv_fields[i] = g_buf.slot[k_lk_adv_slots[i]];
+        ko_fields[i] = g_buf.slot[k_lk_ko_slots[i]];
+        rhs_fields[i] = g_buf.slot[k_lk_rhs_slots[i]];
+    }
+
+    CUDA_CHECK(cudaMemcpyToSymbol(d_lk_adv_fields, adv_fields, sizeof(adv_fields)));
+    CUDA_CHECK(cudaMemcpyToSymbol(d_lk_ko_fields, ko_fields, sizeof(ko_fields)));
+    CUDA_CHECK(cudaMemcpyToSymbol(d_lk_rhs_fields, rhs_fields, sizeof(rhs_fields)));
+    CUDA_CHECK(cudaMemcpyToSymbol(d_lk_soa_signs, k_lk_soa_signs, sizeof(k_lk_soa_signs)));
+
+    dim3 launch_grid((unsigned int)grid((size_t)all), (unsigned int)BSSN_LK_FIELD_COUNT);
+    kern_lopsided_kodis_batched<<<launch_grid, BLK>>>(
+        g_buf.slot[S_betax], g_buf.slot[S_betay], g_buf.slot[S_betaz], eps_val);
+}
+
 __global__ void kern_rk4_finalize(const double * __restrict__ f0,
                                  double * __restrict__ frhs,
                                  double * __restrict__ accum,
@@ -1137,6 +1528,63 @@ __global__ void kern_rk4_finalize(const double * __restrict__ f0,
    }
 }

+__global__ __launch_bounds__(128, 4)
+void kern_rk4_finalize_batched(double dT, int rk4_stage, double chitiny)
+{
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= d_gp.all) return;
+
+    const int field = blockIdx.y;
+    const double *f0 = d_rk4_f0_fields[field];
+    double *frhs = d_rk4_rhs_fields[field];
+    double *accum = d_rk4_accum_fields[field];
+
+    const double rhs = frhs[tid];
+    switch (rk4_stage) {
+    case 0:
+        accum[tid] = rhs;
+        frhs[tid] = f0[tid] + 0.5 * dT * rhs;
+        break;
+    case 1:
+        accum[tid] += 2.0 * rhs;
+        frhs[tid] = f0[tid] + 0.5 * dT * rhs;
+        break;
+    case 2:
+        accum[tid] += 2.0 * rhs;
+        frhs[tid] = f0[tid] + dT * rhs;
+        break;
+    default:
+        frhs[tid] = f0[tid] + (dT / 6.0) * (accum[tid] + rhs);
+        break;
+    }
+
+    if (field == 0 && frhs[tid] < chitiny) frhs[tid] = chitiny;
+}
+
+static void gpu_rk4_finalize_batch(const StepContext &ctx,
+                                   size_t all,
+                                   double dT,
+                                   int rk4_stage,
+                                   double chitiny)
+{
+    const double *f0_fields[BSSN_STATE_COUNT];
+    double *rhs_fields[BSSN_STATE_COUNT];
+    double *accum_fields[BSSN_STATE_COUNT];
+
+    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
+        f0_fields[i] = ctx.d_state0[i];
+        rhs_fields[i] = g_buf.slot[k_state_rhs_slots[i]];
+        accum_fields[i] = ctx.d_accum[i];
+    }
+
+    CUDA_CHECK(cudaMemcpyToSymbol(d_rk4_f0_fields, f0_fields, sizeof(f0_fields)));
+    CUDA_CHECK(cudaMemcpyToSymbol(d_rk4_rhs_fields, rhs_fields, sizeof(rhs_fields)));
+    CUDA_CHECK(cudaMemcpyToSymbol(d_rk4_accum_fields, accum_fields, sizeof(accum_fields)));
+
+    dim3 launch_grid((unsigned int)grid(all), (unsigned int)BSSN_STATE_COUNT);
+    kern_rk4_finalize_batched<<<launch_grid, BLK>>>(dT, rk4_stage, chitiny);
+}
+
 __global__ void kern_enforce_ga_cuda(double * __restrict__ dxx,
                                     double * __restrict__ gxy,
                                     double * __restrict__ gxz,
@@ -2862,30 +3310,7 @@ static void launch_rhs_pipeline(int all, double eps, int co)
        D(S_Gamx_rhs), D(S_Gamy_rhs), D(S_Gamz_rhs),
        D(S_f_arr), D(S_S_arr));

-    gpu_lopsided_kodis(D(S_gxx),   D(S_dxx),   D(S_gxx_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM,    eps, all);
-    gpu_lopsided_kodis(D(S_Gamz),  D(S_Gamz),  D(S_Gamz_rhs),  D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI,   eps, all);
-    gpu_lopsided_kodis(D(S_gxy),   D(S_gxy),   D(S_gxy_rhs),   D(S_betax),D(S_betay),D(S_betaz), ANTI,ANTI,SYM,   eps, all);
-    gpu_lopsided_kodis(D(S_Lap),   D(S_Lap),   D(S_Lap_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM,    eps, all);
-    gpu_lopsided_kodis(D(S_gxz),   D(S_gxz),   D(S_gxz_rhs),   D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,ANTI,   eps, all);
-    gpu_lopsided_kodis(D(S_betax), D(S_betax), D(S_betax_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM,    eps, all);
-    gpu_lopsided_kodis(D(S_gyy),   D(S_dyy),   D(S_gyy_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM,    eps, all);
-    gpu_lopsided_kodis(D(S_betay), D(S_betay), D(S_betay_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM,    eps, all);
-    gpu_lopsided_kodis(D(S_gyz),   D(S_gyz),   D(S_gyz_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,ANTI,   eps, all);
-    gpu_lopsided_kodis(D(S_betaz), D(S_betaz), D(S_betaz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI,    eps, all);
-    gpu_lopsided_kodis(D(S_gzz),   D(S_dzz),   D(S_gzz_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM,    eps, all);
-    gpu_lopsided_kodis(D(S_dtSfx), D(S_dtSfx), D(S_dtSfx_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM,    eps, all);
-    gpu_lopsided_kodis(D(S_Axx),   D(S_Axx),   D(S_Axx_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM,    eps, all);
-    gpu_lopsided_kodis(D(S_dtSfy), D(S_dtSfy), D(S_dtSfy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM,    eps, all);
-    gpu_lopsided_kodis(D(S_Axy),   D(S_Axy),   D(S_Axy_rhs),   D(S_betax),D(S_betay),D(S_betaz), ANTI,ANTI,SYM,   eps, all);
-    gpu_lopsided_kodis(D(S_dtSfz), D(S_dtSfz), D(S_dtSfz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI,    eps, all);
-    gpu_lopsided_kodis(D(S_Axz),   D(S_Axz),   D(S_Axz_rhs),   D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,ANTI,   eps, all);
-    gpu_lopsided_kodis(D(S_Ayy),   D(S_Ayy),   D(S_Ayy_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM,    eps, all);
-    gpu_lopsided_kodis(D(S_Ayz),   D(S_Ayz),   D(S_Ayz_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,ANTI,   eps, all);
-    gpu_lopsided_kodis(D(S_Azz),   D(S_Azz),   D(S_Azz_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM,    eps, all);
-    gpu_lopsided_kodis(D(S_chi),   D(S_chi),   D(S_chi_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM,    eps, all);
-    gpu_lopsided_kodis(D(S_trK),   D(S_trK),   D(S_trK_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM,    eps, all);
-    gpu_lopsided_kodis(D(S_Gamx),  D(S_Gamx),  D(S_Gamx_rhs),  D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM,    eps, all);
-    gpu_lopsided_kodis(D(S_Gamy),  D(S_Gamy),  D(S_Gamy_rhs),  D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM,    eps, all);
+    gpu_lopsided_kodis_state_batch(eps, all);

    if (co == 0) {
        gpu_fderivs(D(S_Axx), D(S_gxxx),D(S_gxxy),D(S_gxxz), SYM,SYM,SYM,      all);
@@ -3395,30 +3820,7 @@ int f_compute_rhs_bssn(int *ex, double &T,
    /* ============================================================ */
    /*  Phase 16/17: advection + KO dissipation (shared ord=3 pack)  */
    /* ============================================================ */
-    gpu_lopsided_kodis(D(S_gxx),   D(S_dxx),   D(S_gxx_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM,    eps, all);
-    gpu_lopsided_kodis(D(S_Gamz),  D(S_Gamz),  D(S_Gamz_rhs),  D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI,   eps, all);
-    gpu_lopsided_kodis(D(S_gxy),   D(S_gxy),   D(S_gxy_rhs),   D(S_betax),D(S_betay),D(S_betaz), ANTI,ANTI,SYM,   eps, all);
-    gpu_lopsided_kodis(D(S_Lap),   D(S_Lap),   D(S_Lap_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM,    eps, all);
-    gpu_lopsided_kodis(D(S_gxz),   D(S_gxz),   D(S_gxz_rhs),   D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,ANTI,   eps, all);
-    gpu_lopsided_kodis(D(S_betax), D(S_betax), D(S_betax_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM,    eps, all);
-    gpu_lopsided_kodis(D(S_gyy),   D(S_dyy),   D(S_gyy_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM,    eps, all);
-    gpu_lopsided_kodis(D(S_betay), D(S_betay), D(S_betay_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM,    eps, all);
-    gpu_lopsided_kodis(D(S_gyz),   D(S_gyz),   D(S_gyz_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,ANTI,   eps, all);
-    gpu_lopsided_kodis(D(S_betaz), D(S_betaz), D(S_betaz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI,    eps, all);
-    gpu_lopsided_kodis(D(S_gzz),   D(S_dzz),   D(S_gzz_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM,    eps, all);
-    gpu_lopsided_kodis(D(S_dtSfx), D(S_dtSfx), D(S_dtSfx_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM,    eps, all);
-    gpu_lopsided_kodis(D(S_Axx),   D(S_Axx),   D(S_Axx_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM,    eps, all);
-    gpu_lopsided_kodis(D(S_dtSfy), D(S_dtSfy), D(S_dtSfy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM,    eps, all);
-    gpu_lopsided_kodis(D(S_Axy),   D(S_Axy),   D(S_Axy_rhs),   D(S_betax),D(S_betay),D(S_betaz), ANTI,ANTI,SYM,   eps, all);
-    gpu_lopsided_kodis(D(S_dtSfz), D(S_dtSfz), D(S_dtSfz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI,    eps, all);
-    gpu_lopsided_kodis(D(S_Axz),   D(S_Axz),   D(S_Axz_rhs),   D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,ANTI,   eps, all);
-    gpu_lopsided_kodis(D(S_Ayy),   D(S_Ayy),   D(S_Ayy_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM,    eps, all);
-    gpu_lopsided_kodis(D(S_Ayz),   D(S_Ayz),   D(S_Ayz_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,ANTI,   eps, all);
-    gpu_lopsided_kodis(D(S_Azz),   D(S_Azz),   D(S_Azz_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM,    eps, all);
-    gpu_lopsided_kodis(D(S_chi),   D(S_chi),   D(S_chi_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM,    eps, all);
-    gpu_lopsided_kodis(D(S_trK),   D(S_trK),   D(S_trK_rhs),   D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM,    eps, all);
-    gpu_lopsided_kodis(D(S_Gamx),  D(S_Gamx),  D(S_Gamx_rhs),  D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM,    eps, all);
-    gpu_lopsided_kodis(D(S_Gamy),  D(S_Gamy),  D(S_Gamy_rhs),  D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM,    eps, all);
+    gpu_lopsided_kodis_state_batch(eps, all);

    /* ============================================================ */
    /*  Phase 18: Hamilton & momentum constraints (co==0)             */
@@ -3604,15 +4006,7 @@ int bssn_cuda_rk4_substep(void *block_tag,
    }

    t0 = profile ? cuda_profile_now_ms() : 0.0;
-    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
-        kern_rk4_finalize<<<grid(all), BLK>>>(ctx.d_state0[i],
-                                              g_buf.slot[k_state_rhs_slots[i]],
-                                              ctx.d_accum[i],
-                                              dT,
-                                              RK4);
-    }
-
-    kern_lowerboundset_cuda<<<grid(all), BLK>>>(g_buf.slot[S_chi_rhs], chitiny);
+    gpu_rk4_finalize_batch(ctx, all, dT, RK4, chitiny);
    if (profile) {
        cuda_profile_sync();
        finalize_ms += cuda_profile_now_ms() - t0;