AMSS-NCKU/AMSS_NCKU_source/bssn_rhs_cuda.cu

/*
 * bssn_rhs_cuda.cu — GPU implementation of f_compute_rhs_bssn
 *
 * Drop-in replacement for bssn_rhs_c.C.
 * Compile with nvcc, link bssn_rhs_cuda.o in place of bssn_rhs_c.o.
 */

#include <array>
#include <cerrno>
#include <chrono>
#include <climits>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <unordered_map>
#include <vector>
#include <cuda_runtime.h>
#include "macrodef.h"
#include "bssn_rhs.h"

/*
 * C-linkage declarations of the external set_escalar_parameter routine.
 * Which spelling is declared depends on which of the fortran1 / fortran2 /
 * fortran3 macros is defined: lower-case, upper-case, or lower-case with a
 * trailing underscore. All five arguments are passed by reference.
 * NOTE(review): presumably these macros select the Fortran compiler's symbol
 * naming convention and come from macrodef.h — confirm.
 */
extern "C" {
#ifdef fortran1
void set_escalar_parameter(double &, double &, double &, double &, double &);
#endif
#ifdef fortran2
void SET_ESCALAR_PARAMETER(double &, double &, double &, double &, double &);
#endif
#ifdef fortran3
void set_escalar_parameter_(double &, double &, double &, double &, double &);
#endif
}

/* ------------------------------------------------------------------ */
/*  Multi-GPU dispatch: distribute ranks across available GPUs          */
/* ------------------------------------------------------------------ */
/* Process-wide GPU dispatch state, filled in once by init_gpu_dispatch(). */
static struct {
    int  num_gpus;       /* device count from cudaGetDeviceCount, forced to >= 1 */
    int  my_rank;        /* global rank read from launcher environment variables */
    int  my_local_rank;  /* node-local rank from the environment, or -1 if none is set */
    int  my_device;      /* device index chosen for this process */
    bool inited;         /* true once init_gpu_dispatch() has completed */
} g_dispatch = {0, -1, -1, -1, false};

/*
 * Read an integer from the environment variable `name`.
 *
 * Returns `fallback` when the variable is unset, empty, does not start with
 * a decimal integer (after optional leading whitespace), or holds a value
 * that does not fit in an int. Trailing non-numeric characters after a valid
 * integer prefix are ignored, as atoi() did.
 */
static int env_to_int(const char *name, int fallback = -1) {
    const char *v = getenv(name);
    if (!v || !*v) return fallback;

    errno = 0;
    char *end = nullptr;
    const long parsed = strtol(v, &end, 10);

    /* No digits consumed: the value is not a number at all. */
    if (end == v) return fallback;
    /* Reject values that overflow long or do not fit in int. */
    if (errno == ERANGE || parsed < INT_MIN || parsed > INT_MAX) return fallback;

    return (int)parsed;
}

/*
 * One-time selection of the CUDA device for this process.
 *
 * Reads the device count, derives the global and node-local rank from
 * launcher environment variables, and binds the process to device
 * (rank mod device-count), preferring the local rank when one is available.
 * Subsequent calls return immediately.
 *
 * Failures of cudaGetDeviceCount / cudaSetDevice are reported on stderr but
 * are not fatal here; the recorded runtime error is cleared so it does not
 * surface later as an unrelated failure.
 */
static void init_gpu_dispatch() {
    if (g_dispatch.inited) return;

    cudaError_t err = cudaGetDeviceCount(&g_dispatch.num_gpus);
    if (err != cudaSuccess) {
        fprintf(stderr, "[AMSS-GPU] cudaGetDeviceCount failed: %s\n",
                cudaGetErrorString(err));
        (void)cudaGetLastError();   /* reset the recorded error */
        g_dispatch.num_gpus = 1;
    }
    if (g_dispatch.num_gpus < 1) g_dispatch.num_gpus = 1;

    /* Get MPI rank from environment (set by mpirun/mpiexec). */
    g_dispatch.my_rank = env_to_int("PMI_RANK",
                         env_to_int("OMPI_COMM_WORLD_RANK",
                         env_to_int("MV2_COMM_WORLD_RANK",
                         env_to_int("SLURM_PROCID", 0))));

    /* Prefer local rank for per-node GPU mapping (avoids cross-node skew). */
    g_dispatch.my_local_rank = env_to_int("OMPI_COMM_WORLD_LOCAL_RANK",
                               env_to_int("MV2_COMM_WORLD_LOCAL_RANK",
                               env_to_int("MPI_LOCALRANKID",
                               env_to_int("SLURM_LOCALID", -1))));

    const int rank_for_map = (g_dispatch.my_local_rank >= 0)
                           ? g_dispatch.my_local_rank : g_dispatch.my_rank;

    /* C++ '%' keeps the sign of the dividend; fold a negative rank back
     * into [0, num_gpus) so the device index is always valid. */
    int device = rank_for_map % g_dispatch.num_gpus;
    if (device < 0) device += g_dispatch.num_gpus;
    g_dispatch.my_device = device;

    err = cudaSetDevice(g_dispatch.my_device);
    if (err != cudaSuccess) {
        fprintf(stderr, "[AMSS-GPU] rank %d: cudaSetDevice(%d) failed: %s\n",
                g_dispatch.my_rank, g_dispatch.my_device,
                cudaGetErrorString(err));
        (void)cudaGetLastError();   /* reset the recorded error */
    }

    if (g_dispatch.my_rank == 0) {
        printf("[AMSS-GPU] %d GPU(s) detected, device map uses %s rank\n",
               g_dispatch.num_gpus,
               (g_dispatch.my_local_rank >= 0) ? "local" : "global");
    }
    g_dispatch.inited = true;
}

/*
 * Running totals for the coarse CUDA profile. cuda_profile_maybe_log()
 * prints the *_ms totals divided by `calls` (or by the matching *_calls
 * counter for uploads / resident downloads) and the *_gb totals as-is.
 * Plain aggregate: keep the field order stable, since positional
 * initialization depends on it.
 */
struct CudaProfileStats {
    long long calls;                    /* number of profiled calls */
    double total_ms;
    double state_ms;
    double matter_ms;
    double rhs_ms;
    double bc_ms;
    double finalize_ms;
    double output_ms;
    long long upload_calls;             /* divisor for upload_ms */
    long long resident_download_calls;  /* divisor for resident_download_ms */
    double upload_ms;
    double resident_download_ms;
    double upload_gb;
    double resident_download_gb;
};

/*
 * Index of each timed RHS stage. The values are used as indices into
 * RhsStageProfileStats::ms, and RHS_STAGE_COUNT (the last enumerator) is the
 * number of stages, so new stages must be inserted before it.
 */
enum RhsStageId {
    RHS_STAGE_PREP = 0,
    RHS_STAGE_DERIV1,
    RHS_STAGE_METRIC,
    RHS_STAGE_GAUGE_DERIV,
    RHS_STAGE_GAMMA_CONTRACT,
    RHS_STAGE_RICCI_DIFF,
    RHS_STAGE_RICCI_FUSED,
    RHS_STAGE_CHI,
    RHS_STAGE_GAUGE_RHS,
    RHS_STAGE_KODIS,
    RHS_STAGE_CONSTRAINTS,
    RHS_STAGE_COUNT
};

/* Per-stage timing totals, accumulated by rhs_stage_profile_accumulate(). */
struct RhsStageProfileStats {
    long long calls;              /* number of accumulated samples */
    double ms[RHS_STAGE_COUNT];   /* summed milliseconds, indexed by RhsStageId */
};

/*
 * Totals for the auxiliary "prepare" / "writeback" profile reported by
 * cuda_aux_profile_maybe_log(), which prints per-call averages of the *_ms
 * totals and the writeback_gb total.
 */
struct CudaAuxProfileStats {
    long long prepare_calls;
    long long writeback_calls;
    double prepare_ms;
    double writeback_ms;
    double writeback_gb;
};

/*
 * Accessor for the process-wide CudaProfileStats instance.
 * The instance is a function-local static that starts with every counter
 * and total at zero; the same object is returned on every call.
 */
static CudaProfileStats &cuda_profile_stats() {
    /* Empty braces value-initialize every member to zero. */
    static CudaProfileStats totals = {};
    return totals;
}

/*
 * Accessor for the process-wide CudaAuxProfileStats instance
 * (function-local static, all members start at zero).
 */
static CudaAuxProfileStats &cuda_aux_profile_stats() {
    static CudaAuxProfileStats aux_totals = CudaAuxProfileStats();
    return aux_totals;
}

/*
 * True when the AMSS_PROFILE_CUDA_AUX environment variable is set to a
 * value that atoi() parses as non-zero. The environment is read on the
 * first call only; the answer is cached afterwards.
 */
static bool cuda_aux_profile_enabled() {
    static int cached_flag = -1;   /* -1: not yet queried, else 0 or 1 */
    if (cached_flag >= 0) return cached_flag != 0;

    const char *raw = getenv("AMSS_PROFILE_CUDA_AUX");
    const bool on = (raw != nullptr) && (atoi(raw) != 0);
    cached_flag = on ? 1 : 0;
    return on;
}

/*
 * Logging interval for the auxiliary profile: the positive integer in
 * AMSS_PROFILE_CUDA_AUX_EVERY, or 100 when the variable is unset or not a
 * positive number. Read once and cached.
 */
static int cuda_aux_profile_every() {
    static int interval = -1;      /* -1 until the first query */
    if (interval >= 0) return interval;

    const char *raw = getenv("AMSS_PROFILE_CUDA_AUX_EVERY");
    const int requested = raw ? atoi(raw) : 0;
    interval = (requested > 0) ? requested : 100;
    return interval;
}

/*
 * Print the auxiliary profile summary to stderr when auxiliary profiling is
 * enabled and the combined prepare + writeback call count is a positive
 * multiple of cuda_aux_profile_every(). Averages are reported as 0 when the
 * corresponding call counter is zero.
 */
static void cuda_aux_profile_maybe_log() {
    if (!cuda_aux_profile_enabled()) return;

    CudaAuxProfileStats &aux = cuda_aux_profile_stats();
    const long long combined = aux.prepare_calls + aux.writeback_calls;
    if (combined <= 0) return;
    if (combined % cuda_aux_profile_every() != 0) return;

    double avg_prepare = 0.0;
    if (aux.prepare_calls) avg_prepare = aux.prepare_ms / (double)aux.prepare_calls;

    double avg_writeback = 0.0;
    if (aux.writeback_calls) avg_writeback = aux.writeback_ms / (double)aux.writeback_calls;

    fprintf(stderr,
            "[AMSS-CUDA-AUX][rank %d][dev %d] prepare=%lld avg_prepare=%.3f ms writebacks=%lld avg_writeback=%.3f ms writeback_GB=%.3f\n",
            g_dispatch.my_rank, g_dispatch.my_device,
            aux.prepare_calls, avg_prepare,
            aux.writeback_calls, avg_writeback,
            aux.writeback_gb);
    fflush(stderr);
}

/*
 * Accessor for the process-wide RhsStageProfileStats instance
 * (function-local static, call count and all stage totals start at zero).
 */
static RhsStageProfileStats &rhs_stage_profile_stats() {
    static RhsStageProfileStats stage_totals = RhsStageProfileStats();
    return stage_totals;
}

/*
 * True when AMSS_PROFILE_CUDA is set to a value atoi() parses as non-zero.
 * The environment is consulted on the first call and the result is cached.
 */
static bool cuda_profile_enabled() {
    static int state = -1;         /* -1 = unknown, 0 = off, 1 = on */
    if (state == -1) {
        const char *text = getenv("AMSS_PROFILE_CUDA");
        state = 0;
        if (text && atoi(text) != 0) state = 1;
    }
    return state != 0;
}

/*
 * Logging interval for the coarse profile: the positive integer found in
 * AMSS_PROFILE_CUDA_EVERY, otherwise 100. Evaluated once, then cached.
 */
static int cuda_profile_every() {
    static int period = -1;
    if (period < 0) {
        const char *text = getenv("AMSS_PROFILE_CUDA_EVERY");
        int wanted = 0;
        if (text) wanted = atoi(text);
        period = (wanted > 0) ? wanted : 100;
    }
    return period;
}

/*
 * True when AMSS_GPU_STAGE_TIMING is set to a value atoi() parses as
 * non-zero. Read once; later calls return the cached answer.
 */
static bool rhs_stage_timing_enabled() {
    static int memo = -1;          /* -1 until the environment has been read */
    if (memo >= 0) return memo != 0;

    const char *setting = getenv("AMSS_GPU_STAGE_TIMING");
    memo = (setting != nullptr && atoi(setting) != 0) ? 1 : 0;
    return memo != 0;
}

/*
 * Logging interval for per-stage timing: the positive integer in
 * AMSS_GPU_STAGE_TIMING_EVERY, falling back to cuda_profile_every() when the
 * variable is unset or not positive. Evaluated once and cached; the fallback
 * is only queried when it is actually needed.
 */
static int rhs_stage_timing_every() {
    static int interval = -1;
    if (interval >= 0) return interval;

    const char *setting = getenv("AMSS_GPU_STAGE_TIMING_EVERY");
    const int wanted = setting ? atoi(setting) : 0;
    if (wanted > 0) {
        interval = wanted;
    } else {
        interval = cuda_profile_every();
    }
    return interval;
}

/*
 * Current std::chrono::steady_clock reading, expressed as milliseconds since
 * that clock's epoch (double precision). Only differences between two
 * readings are meaningful.
 */
static double cuda_profile_now_ms() {
    typedef std::chrono::steady_clock steady;
    typedef std::chrono::duration<double, std::milli> millis;

    const millis since_epoch = steady::now().time_since_epoch();
    return since_epoch.count();
}

/*
 * Block until the device has finished all outstanding work. On any error
 * from cudaDeviceSynchronize() the error string is printed to stderr and
 * the process exits with EXIT_FAILURE.
 */
static void cuda_profile_sync() {
    const cudaError_t status = cudaDeviceSynchronize();
    if (status == cudaSuccess) return;

    fprintf(stderr, "CUDA error %s:%d: %s\n",
            __FILE__, __LINE__, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
}

/*
 * Print the coarse CUDA profile summary to stderr when profiling is enabled
 * and the call count is a positive multiple of cuda_profile_every().
 * Phase times are averaged over `calls`; upload / resident-download times
 * are averaged over their own counters (0 when that counter is zero).
 */
static void cuda_profile_maybe_log() {
    if (!cuda_profile_enabled()) return;

    CudaProfileStats &totals = cuda_profile_stats();
    if (totals.calls <= 0) return;
    if (totals.calls % cuda_profile_every() != 0) return;

    const double n_calls = (double)totals.calls;

    double avg_upload = 0.0;
    if (totals.upload_calls)
        avg_upload = totals.upload_ms / (double)totals.upload_calls;

    double avg_resident_download = 0.0;
    if (totals.resident_download_calls)
        avg_resident_download =
            totals.resident_download_ms / (double)totals.resident_download_calls;

    fprintf(stderr,
            "[AMSS-CUDA][rank %d][dev %d] calls=%lld avg_total=%.3f ms avg_state=%.3f ms avg_matter=%.3f ms avg_rhs=%.3f ms avg_bc=%.3f ms avg_finalize=%.3f ms avg_output=%.3f ms"
            " uploads=%lld avg_upload=%.3f ms upload_GB=%.3f resident_downloads=%lld avg_resident_download=%.3f ms resident_download_GB=%.3f\n",
            g_dispatch.my_rank, g_dispatch.my_device, totals.calls,
            totals.total_ms / n_calls,
            totals.state_ms / n_calls,
            totals.matter_ms / n_calls,
            totals.rhs_ms / n_calls,
            totals.bc_ms / n_calls,
            totals.finalize_ms / n_calls,
            totals.output_ms / n_calls,
            totals.upload_calls,
            avg_upload,
            totals.upload_gb,
            totals.resident_download_calls,
            avg_resident_download,
            totals.resident_download_gb);
    fflush(stderr);
}

/*
 * Add one sample of per-stage timings to the running totals and, every
 * rhs_stage_timing_every() samples, print the per-stage averages to stderr.
 *
 * stage_ms: array of RHS_STAGE_COUNT millisecond values indexed by
 *           RhsStageId. Nothing is read when stage timing is disabled.
 */
static void rhs_stage_profile_accumulate(const double *stage_ms) {
    if (!rhs_stage_timing_enabled()) return;

    RhsStageProfileStats &acc = rhs_stage_profile_stats();
    ++acc.calls;
    for (int stage = 0; stage < RHS_STAGE_COUNT; ++stage)
        acc.ms[stage] += stage_ms[stage];

    const bool report_now =
        (acc.calls > 0) && (acc.calls % rhs_stage_timing_every() == 0);
    if (!report_now) return;

    /* Per-stage averages over all samples accumulated so far. */
    const double n_calls = (double)acc.calls;
    double avg[RHS_STAGE_COUNT];
    for (int stage = 0; stage < RHS_STAGE_COUNT; ++stage)
        avg[stage] = acc.ms[stage] / n_calls;

    fprintf(stderr,
            "[AMSS-CUDA-STAGE][rank %d][dev %d] calls=%lld"
            " prep=%.3f deriv1=%.3f metric=%.3f gauge_deriv=%.3f"
            " gamma_contract=%.3f ricci_diff=%.3f ricci_fused=%.3f"
            " chi=%.3f gauge_rhs=%.3f kodis=%.3f constraints=%.3f ms\n",
            g_dispatch.my_rank, g_dispatch.my_device, acc.calls,
            avg[RHS_STAGE_PREP],
            avg[RHS_STAGE_DERIV1],
            avg[RHS_STAGE_METRIC],
            avg[RHS_STAGE_GAUGE_DERIV],
            avg[RHS_STAGE_GAMMA_CONTRACT],
            avg[RHS_STAGE_RICCI_DIFF],
            avg[RHS_STAGE_RICCI_FUSED],
            avg[RHS_STAGE_CHI],
            avg[RHS_STAGE_GAUGE_RHS],
            avg[RHS_STAGE_KODIS],
            avg[RHS_STAGE_CONSTRAINTS]);
    fflush(stderr);
}

/* ------------------------------------------------------------------ */
/*  Error checking                                                     */
/* ------------------------------------------------------------------ */
/*
 * CUDA_CHECK(call): evaluate `call` exactly once; if it does not yield
 * cudaSuccess, print the error string with file/line to stderr and exit
 * with EXIT_FAILURE.
 *
 * The macro-local variable carries a reserved-looking name so that it cannot
 * shadow an identifier used inside `call` (e.g. CUDA_CHECK(err) with a
 * caller variable named `err` would otherwise self-initialize).
 */
#define CUDA_CHECK(call) do {                                         \
    const cudaError_t cuda_check_err_ = (call);                       \
    if (cuda_check_err_ != cudaSuccess) {                             \
        fprintf(stderr, "CUDA error %s:%d: %s\n",                    \
                __FILE__, __LINE__,                                   \
                cudaGetErrorString(cuda_check_err_));                 \
        exit(EXIT_FAILURE);                                           \
    }                                                                 \
} while(0)

/* ------------------------------------------------------------------ */
/*  Physical / gauge constants (matching bssn_rhs_c.C)                 */
/* ------------------------------------------------------------------ */
/* pi to double precision. */
static const double PI_VAL  = 3.14159265358979323846;
/* NOTE(review): FF_VAL and ETA_VAL are presumably the gauge parameters
 * mirrored from bssn_rhs_c.C (see the section header) — confirm values there. */
static const double FF_VAL  = 0.75;
static const double ETA_VAL = 2.0;

/* ------------------------------------------------------------------ */
/*  Constant memory for grid parameters and stencil coefficients       */
/* ------------------------------------------------------------------ */
/*
 * Grid dimensions, spacings and precomputed finite-difference coefficients.
 * One instance lives in constant memory (d_gp) and is read by the device
 * index helpers.
 * NOTE(review): `all` is a 32-bit int, so nx*ny*nz must stay below 2^31.
 */
struct GridParams {
    int    ex[3];          /* nx, ny, nz */
    int    all;            /* nx*ny*nz   */
    double dX, dY, dZ;
    /* fderivs coefficients */
    double d12dx, d12dy, d12dz;   /* 1/(12*dX) etc */
    double d2dx,  d2dy,  d2dz;   /* 1/(2*dX)  etc */
    /* fdderivs coefficients */
    double Fdxdx, Fdydy, Fdzdz;  /* 1/(12*dX^2) etc */
    double Sdxdx, Sdydy, Sdzdz;  /* 1/(dX^2)    etc */
    double Fdxdy, Fdxdz, Fdydz;  /* 1/(144*dX*dY) etc */
    double Sdxdy, Sdxdz, Sdydz;  /* 1/(4*dX*dY)   etc */
    /* symmetry bounds (Fortran 1-based) */
    int iminF, jminF, kminF;
    int imaxF, jmaxF, kmaxF;
    /* symmetry bounds for ord=3 (lopsided/kodis) */
    int iminF3, jminF3, kminF3;
    int Symmetry;
    double eps;
    int co;
    /* padded sizes; fh2_nx/fh2_ny and fh3_nx/fh3_ny are the strides used by
     * idx_fh2() and idx_fh3() respectively */
    int fh2_nx, fh2_ny, fh2_nz;  /* (nx+2), (ny+2), (nz+2) for ord=2 */
    int fh3_nx, fh3_ny, fh3_nz;  /* (nx+3), (ny+3), (nz+3) for ord=3 */
};

/* Device-side grid parameters in constant memory, read by the index helpers. */
__constant__ GridParams d_gp;
/* Host-side GridParams copy plus a validity flag.
 * NOTE(review): presumably used to skip re-uploading d_gp when nothing
 * changed — the code that uses them is not in this part of the file. */
static GridParams g_gp_host_cache = {};
static bool g_gp_host_cache_valid = false;

/* ------------------------------------------------------------------ */
/*  Device indexing helpers                                             */
/* ------------------------------------------------------------------ */
/*
 * Flat offset of the 0-based grid point (i0, j0, k0) in an array whose
 * fastest-varying dimension is i, using the extents stored in d_gp.ex.
 */
__device__ __forceinline__ int idx_ex_d(int i0, int j0, int k0) {
    const int nx = d_gp.ex[0];
    const int j_part = j0 * nx;
    const int k_part = k0 * nx * d_gp.ex[1];
    return i0 + j_part + k_part;
}

/*
 * Read src at the 1-based (Fortran-style) index (iF, jF, kF), reflecting
 * across the lower boundary: any index <= 0 is replaced by 1 - index and the
 * value is multiplied by the matching factor (SoA0 for i, SoA1 for j,
 * SoA2 for k). Indices above the upper extent are not handled here.
 * src is addressed with the unpadded extents d_gp.ex[0] x d_gp.ex[1].
 */
__device__ __forceinline__ double fetch_sym_ord2_direct(const double *src,
                                                        int iF, int jF, int kF,
                                                        int SoA0, int SoA1, int SoA2)
{
    const bool mirror_i = (iF <= 0);
    const bool mirror_j = (jF <= 0);
    const bool mirror_k = (kF <= 0);

    /* Mirrored 1-based indices. */
    const int mi = mirror_i ? (1 - iF) : iF;
    const int mj = mirror_j ? (1 - jF) : jF;
    const int mk = mirror_k ? (1 - kF) : kF;

    /* Accumulated parity factor, multiplied in i, j, k order. */
    const double fi = mirror_i ? (double)SoA0 : 1.0;
    const double fj = mirror_j ? (double)SoA1 : 1.0;
    const double fk = mirror_k ? (double)SoA2 : 1.0;
    const double parity = fi * fj * fk;

    const int offset = (mi - 1)
                     + (mj - 1) * d_gp.ex[0]
                     + (mk - 1) * d_gp.ex[0] * d_gp.ex[1];
    return parity * src[offset];
}

/*
 * ord=2 ghost-padded array: Fortran index (iF, jF, kF) -> flat offset.
 * Each index is shifted by +1, so iF = -1 maps to offset 0 along i;
 * strides come from d_gp.fh2_nx and d_gp.fh2_ny.
 */
__device__ __forceinline__ int idx_fh2(int iF, int jF, int kF) {
    const int pi = iF + 1;
    const int pj = jF + 1;
    const int pk = kF + 1;
    return pi + pj * d_gp.fh2_nx + pk * d_gp.fh2_nx * d_gp.fh2_ny;
}

/*
 * ord=3 ghost-padded array: Fortran index (iF, jF, kF) -> flat offset.
 * Each index is shifted by +2, so iF = -2 maps to offset 0 along i;
 * strides come from d_gp.fh3_nx and d_gp.fh3_ny.
 */
__device__ __forceinline__ int idx_fh3(int iF, int jF, int kF) {
    const int pi = iF + 2;
    const int pj = jF + 2;
    const int pk = kF + 2;
    return pi + pj * d_gp.fh3_nx + pk * d_gp.fh3_nx * d_gp.fh3_ny;
}

/*
 * Same lower-boundary reflection as fetch_sym_ord2_direct: a 1-based index
 * <= 0 along any axis is mapped to 1 - index and the fetched value picks up
 * the corresponding SoA factor. The value is read straight from src using
 * the unpadded extents in d_gp.ex.
 */
__device__ __forceinline__ double fetch_sym_ord3_direct(const double *src,
                                                        int iF, int jF, int kF,
                                                        int SoA0, int SoA1, int SoA2)
{
    int idx[3] = { iF, jF, kF };
    const int soa[3] = { SoA0, SoA1, SoA2 };
    double sign = 1.0;

    /* Reflect each axis independently, in i, j, k order. */
    for (int axis = 0; axis < 3; ++axis) {
        if (idx[axis] > 0) continue;
        idx[axis] = 1 - idx[axis];
        sign *= (double)soa[axis];
    }

    const int flat = (idx[0] - 1)
                   + (idx[1] - 1) * d_gp.ex[0]
                   + (idx[2] - 1) * d_gp.ex[0] * d_gp.ex[1];
    return sign * src[flat];
}

#include "fd_cuda_helpers.cuh"

/* ------------------------------------------------------------------ */
/*  GPU buffer management                                              */
/* ------------------------------------------------------------------ */
/*
 * Array slot indices — all arrays live in one big cudaMalloc block.
 * Counts per group of the slot enum below:
 * INPUT arrays (H2D): 37 slots (34 field arrays + X/Y/Z coordinate arrays)
 * OUTPUT arrays (D2H): 55 slots
 * TEMPORARY / auxiliary arrays: 135 slots
 * Plus 2 extended arrays for ghost-padded stencils (fh_ord2, fh_ord3)
 */

/* Total number of "all"-sized slots (upper bound; a static_assert checks
 * that the slot enum fits) */
#define NUM_SLOTS 256

/*
 * Global working buffers, (re)allocated by ensure_gpu_buffers().
 * Plain aggregate: keep the field order stable, since positional
 * initialization depends on it.
 */
struct GpuBuffers {
    double *d_mem;          /* single big allocation */
    double *d_fh2;          /* ghost-padded ord=2: (nx+2)*(ny+2)*(nz+2) */
    double *d_fh3;          /* ghost-padded ord=3: (nx+3)*(ny+3)*(nz+3) */
    double *h_stage;        /* host staging buffer for bulk H2D/D2H */
    bool    h_stage_pinned; /* true if allocated by cudaMallocHost */
    double *slot[NUM_SLOTS]; /* pointers into d_mem */
    size_t  cap_all;        /* per-slot element capacity of d_mem */
    size_t  cap_fh2_size;   /* element capacity of d_fh2 */
    size_t  cap_fh3_size;   /* element capacity of d_fh3 */
    int     prev_nx, prev_ny, prev_nz; /* dimensions of the latest ensure_gpu_buffers() call */
    bool    initialized;    /* false until the first allocation */
};

/* Global buffer set; starts fully zeroed (null pointers, zero capacities,
 * not pinned, not initialized). */
static GpuBuffers g_buf = {};

/*
 * Slot assignments. Enumerator values are consecutive from 0 and index
 * g_buf.slot[]; several constants further down are computed from contiguous
 * ranges (e.g. S_Szz - S_chi + 1), so do not reorder entries within a group.
 * NUM_USED_SLOTS (last) is the number of slots actually used.
 */
/* Slot assignments — INPUT (H2D) */
enum {
    S_chi=0, S_trK, S_dxx, S_gxy, S_gxz, S_dyy, S_gyz, S_dzz,
    S_Axx, S_Axy, S_Axz, S_Ayy, S_Ayz, S_Azz,
    S_Gamx, S_Gamy, S_Gamz,
    S_Lap, S_betax, S_betay, S_betaz,
    S_dtSfx, S_dtSfy, S_dtSfz,
    S_rho, S_Sx, S_Sy, S_Sz,
    S_Sxx, S_Sxy, S_Sxz, S_Syy, S_Syz, S_Szz,
    S_X, S_Y, S_Z,                /* coordinate arrays — only nx/ny/nz long */
    /* 37 input slots so far; X/Y/Z are special-sized */

    /* OUTPUT (D2H): 55 slots, S_chi_rhs .. S_Gmz_Res */
    S_chi_rhs, S_trK_rhs,
    S_gxx_rhs, S_gxy_rhs, S_gxz_rhs, S_gyy_rhs, S_gyz_rhs, S_gzz_rhs,
    S_Axx_rhs, S_Axy_rhs, S_Axz_rhs, S_Ayy_rhs, S_Ayz_rhs, S_Azz_rhs,
    S_Gamx_rhs, S_Gamy_rhs, S_Gamz_rhs,
    S_Lap_rhs, S_betax_rhs, S_betay_rhs, S_betaz_rhs,
    S_dtSfx_rhs, S_dtSfy_rhs, S_dtSfz_rhs,
    S_Gamxxx, S_Gamxxy, S_Gamxxz, S_Gamxyy, S_Gamxyz, S_Gamxzz,
    S_Gamyxx, S_Gamyxy, S_Gamyxz, S_Gamyyy, S_Gamyyz, S_Gamyzz,
    S_Gamzxx, S_Gamzxy, S_Gamzxz, S_Gamzyy, S_Gamzyz, S_Gamzzz,
    S_Rxx, S_Rxy, S_Rxz, S_Ryy, S_Ryz, S_Rzz,
    S_ham_Res, S_movx_Res, S_movy_Res, S_movz_Res,
    S_Gmx_Res, S_Gmy_Res, S_Gmz_Res,

    /* TEMPORARY (GPU-only) */
    S_gxx, S_gyy, S_gzz,          /* physical metric = dxx+1 etc */
    S_alpn1, S_chin1,
    S_chix, S_chiy, S_chiz,
    S_gxxx, S_gxyx, S_gxzx, S_gyyx, S_gyzx, S_gzzx,
    S_gxxy, S_gxyy, S_gxzy, S_gyyy, S_gyzy, S_gzzy,
    S_gxxz, S_gxyz, S_gxzz, S_gyyz, S_gyzz, S_gzzz,
    S_Lapx, S_Lapy, S_Lapz,
    S_betaxx, S_betaxy, S_betaxz,
    S_betayx, S_betayy, S_betayz,
    S_betazx, S_betazy, S_betazz,
    S_Gamxx, S_Gamxy, S_Gamxz,
    S_Gamyx, S_Gamyy_t, S_Gamyz_t,
    S_Gamzx, S_Gamzy, S_Gamzz_t,
    S_Kx, S_Ky, S_Kz,
    S_S_arr, S_f_arr,
    S_fxx, S_fxy, S_fxz, S_fyy, S_fyz, S_fzz,
    S_Gamxa, S_Gamya, S_Gamza,
    S_gupxx, S_gupxy, S_gupxz,
    S_gupyy, S_gupyz, S_gupzz,
    /* NOTE(review): the S_Sphi* / S_Spi* and S_EM_* slots below also appear
     * in the escalar / EM state tables, so they are not purely scratch. */
    S_Sphi, S_Spi, S_Sphi_rhs, S_Spi_rhs,
    S_Sphi0, S_Spi0, S_Sphi_accum, S_Spi_accum,
    S_Sphi_next, S_Spi_next,
    S_Sphi_x, S_Sphi_y, S_Sphi_z,
    S_Sphi_xx, S_Sphi_xy, S_Sphi_xz, S_Sphi_yy, S_Sphi_yz, S_Sphi_zz,
    S_trK_x, S_trK_y, S_trK_z,
    S_EM_Kpsi, S_EM_Kphi,
    S_EM_Ex, S_EM_Ey, S_EM_Ez, S_EM_Bx, S_EM_By, S_EM_Bz,
    S_EM_Jx, S_EM_Jy, S_EM_Jz, S_EM_qchar,
    S_EM_Kpsi_rhs, S_EM_Kphi_rhs,
    S_EM_Ex_rhs, S_EM_Ey_rhs, S_EM_Ez_rhs, S_EM_Bx_rhs, S_EM_By_rhs, S_EM_Bz_rhs,
    S_EM_Kpsix, S_EM_Kpsiy, S_EM_Kpsiz,
    S_EM_Kphix, S_EM_Kphiy, S_EM_Kphiz,
    S_EM_Exx, S_EM_Exy, S_EM_Exz,
    S_EM_Eyx, S_EM_Eyy, S_EM_Eyz,
    S_EM_Ezx, S_EM_Ezy, S_EM_Ezz,
    S_EM_Bxx, S_EM_Bxy, S_EM_Bxz,
    S_EM_Byx, S_EM_Byy, S_EM_Byz,
    S_EM_Bzx, S_EM_Bzy, S_EM_Bzz,
    NUM_USED_SLOTS
};

/* The slot pointer table in GpuBuffers has NUM_SLOTS entries; make sure the
 * enum fits. */
static_assert(NUM_USED_SLOTS <= NUM_SLOTS, "Increase NUM_SLOTS");

/* Sizes of contiguous slot ranges in the enum above:
 *   H2D_INPUT_SLOT_COUNT      : S_chi     .. S_Szz     (excludes X/Y/Z)
 *   D2H_BASE_SLOT_COUNT       : S_chi_rhs .. S_Rzz
 *   D2H_CONSTRAINT_SLOT_COUNT : S_ham_Res .. S_Gmz_Res
 * STAGE_SLOT_COUNT is the larger of the input count and the combined output
 * count; it sizes the host staging buffer in ensure_gpu_buffers(). */
static const int H2D_INPUT_SLOT_COUNT = (S_Szz - S_chi + 1);
static const int D2H_BASE_SLOT_COUNT = (S_Rzz - S_chi_rhs + 1);
static const int D2H_CONSTRAINT_SLOT_COUNT = (S_Gmz_Res - S_ham_Res + 1);
static const int STAGE_SLOT_COUNT =
    (H2D_INPUT_SLOT_COUNT > (D2H_BASE_SLOT_COUNT + D2H_CONSTRAINT_SLOT_COUNT))
        ? H2D_INPUT_SLOT_COUNT
        : (D2H_BASE_SLOT_COUNT + D2H_CONSTRAINT_SLOT_COUNT);

/* Field counts used to size the slot tables and the per-step buffers below.
 * BSSN_RESIDENT_STATE_CAPACITY equals the largest state count (the EM one),
 * so a resident bank can hold any of the state variants. */
static constexpr int BSSN_STATE_COUNT = 24;
static constexpr int BSSN_MATTER_COUNT = 10;
static constexpr int BSSN_LK_FIELD_COUNT = 24;
static constexpr int BSSN_ESCALAR_LK_FIELD_COUNT = 26;
static constexpr int BSSN_EM_STATE_COUNT = 32;
static constexpr int BSSN_EM_SOURCE_COUNT = 4;
static constexpr int BSSN_EM_LK_FIELD_COUNT = 32;
static constexpr int BSSN_RESIDENT_BANK_COUNT = 6;
static constexpr int BSSN_ESCALAR_STATE_COUNT = 26;
static constexpr int BSSN_RESIDENT_STATE_CAPACITY = BSSN_EM_STATE_COUNT;

/* Slot index of each of the 24 base state fields, in state order. */
static const int k_state_input_slots[BSSN_STATE_COUNT] = {
    S_chi, S_trK, S_dxx, S_gxy, S_gxz, S_dyy, S_gyz, S_dzz,
    S_Axx, S_Axy, S_Axz, S_Ayy, S_Ayz, S_Azz,
    S_Gamx, S_Gamy, S_Gamz,
    S_Lap, S_betax, S_betay, S_betaz,
    S_dtSfx, S_dtSfy, S_dtSfz
};

/* RHS slot for each base state field; entry i pairs with
 * k_state_input_slots[i]. */
static const int k_state_rhs_slots[BSSN_STATE_COUNT] = {
    S_chi_rhs, S_trK_rhs,
    S_gxx_rhs, S_gxy_rhs, S_gxz_rhs, S_gyy_rhs, S_gyz_rhs, S_gzz_rhs,
    S_Axx_rhs, S_Axy_rhs, S_Axz_rhs, S_Ayy_rhs, S_Ayz_rhs, S_Azz_rhs,
    S_Gamx_rhs, S_Gamy_rhs, S_Gamz_rhs,
    S_Lap_rhs, S_betax_rhs, S_betay_rhs, S_betaz_rhs,
    S_dtSfx_rhs, S_dtSfy_rhs, S_dtSfz_rhs
};

/* Base state fields followed by S_Sphi and S_Spi (26 entries). */
static const int k_escalar_state_input_slots[BSSN_ESCALAR_STATE_COUNT] = {
    S_chi, S_trK, S_dxx, S_gxy, S_gxz, S_dyy, S_gyz, S_dzz,
    S_Axx, S_Axy, S_Axz, S_Ayy, S_Ayz, S_Azz,
    S_Gamx, S_Gamy, S_Gamz,
    S_Lap, S_betax, S_betay, S_betaz,
    S_dtSfx, S_dtSfy, S_dtSfz,
    S_Sphi, S_Spi
};

/* RHS slots matching k_escalar_state_input_slots entry for entry. */
static const int k_escalar_state_rhs_slots[BSSN_ESCALAR_STATE_COUNT] = {
    S_chi_rhs, S_trK_rhs,
    S_gxx_rhs, S_gxy_rhs, S_gxz_rhs, S_gyy_rhs, S_gyz_rhs, S_gzz_rhs,
    S_Axx_rhs, S_Axy_rhs, S_Axz_rhs, S_Ayy_rhs, S_Ayz_rhs, S_Azz_rhs,
    S_Gamx_rhs, S_Gamy_rhs, S_Gamz_rhs,
    S_Lap_rhs, S_betax_rhs, S_betay_rhs, S_betaz_rhs,
    S_dtSfx_rhs, S_dtSfy_rhs, S_dtSfz_rhs,
    S_Sphi_rhs, S_Spi_rhs
};

/* Base state fields followed by the eight S_EM_* field slots (32 entries). */
static const int k_em_state_input_slots[BSSN_EM_STATE_COUNT] = {
    S_chi, S_trK, S_dxx, S_gxy, S_gxz, S_dyy, S_gyz, S_dzz,
    S_Axx, S_Axy, S_Axz, S_Ayy, S_Ayz, S_Azz,
    S_Gamx, S_Gamy, S_Gamz,
    S_Lap, S_betax, S_betay, S_betaz,
    S_dtSfx, S_dtSfy, S_dtSfz,
    S_EM_Kpsi, S_EM_Kphi,
    S_EM_Ex, S_EM_Ey, S_EM_Ez,
    S_EM_Bx, S_EM_By, S_EM_Bz
};

/* RHS slots matching k_em_state_input_slots entry for entry. */
static const int k_em_state_rhs_slots[BSSN_EM_STATE_COUNT] = {
    S_chi_rhs, S_trK_rhs,
    S_gxx_rhs, S_gxy_rhs, S_gxz_rhs, S_gyy_rhs, S_gyz_rhs, S_gzz_rhs,
    S_Axx_rhs, S_Axy_rhs, S_Axz_rhs, S_Ayy_rhs, S_Ayz_rhs, S_Azz_rhs,
    S_Gamx_rhs, S_Gamy_rhs, S_Gamz_rhs,
    S_Lap_rhs, S_betax_rhs, S_betay_rhs, S_betaz_rhs,
    S_dtSfx_rhs, S_dtSfy_rhs, S_dtSfz_rhs,
    S_EM_Kpsi_rhs, S_EM_Kphi_rhs,
    S_EM_Ex_rhs, S_EM_Ey_rhs, S_EM_Ez_rhs,
    S_EM_Bx_rhs, S_EM_By_rhs, S_EM_Bz_rhs
};

/* Slots of the ten matter source fields. */
static const int k_matter_slots[BSSN_MATTER_COUNT] = {
    S_rho, S_Sx, S_Sy, S_Sz, S_Sxx, S_Sxy, S_Sxz, S_Syy, S_Syz, S_Szz
};

/*
 * The three k_lk_* tables share one field ordering (different from the state
 * ordering above); entry i of each table refers to the same field.
 * k_lk_adv_slots and k_lk_ko_slots differ only in the diagonal metric
 * entries: S_gxx/S_gyy/S_gzz in the former, S_dxx/S_dyy/S_dzz in the latter.
 * NOTE(review): "lk" presumably stands for the lopsided-advection /
 * Kreiss-Oliger pass — confirm against the kernels that consume these.
 */
static const int k_lk_adv_slots[BSSN_LK_FIELD_COUNT] = {
    S_gxx, S_Gamz, S_gxy, S_Lap, S_gxz, S_betax, S_gyy, S_betay,
    S_gyz, S_betaz, S_gzz, S_dtSfx, S_Axx, S_dtSfy, S_Axy, S_dtSfz,
    S_Axz, S_Ayy, S_Ayz, S_Azz, S_chi, S_trK, S_Gamx, S_Gamy
};

static const int k_lk_ko_slots[BSSN_LK_FIELD_COUNT] = {
    S_dxx, S_Gamz, S_gxy, S_Lap, S_gxz, S_betax, S_dyy, S_betay,
    S_gyz, S_betaz, S_dzz, S_dtSfx, S_Axx, S_dtSfy, S_Axy, S_dtSfz,
    S_Axz, S_Ayy, S_Ayz, S_Azz, S_chi, S_trK, S_Gamx, S_Gamy
};

static const int k_lk_rhs_slots[BSSN_LK_FIELD_COUNT] = {
    S_gxx_rhs, S_Gamz_rhs, S_gxy_rhs, S_Lap_rhs, S_gxz_rhs, S_betax_rhs,
    S_gyy_rhs, S_betay_rhs, S_gyz_rhs, S_betaz_rhs, S_gzz_rhs, S_dtSfx_rhs,
    S_Axx_rhs, S_dtSfy_rhs, S_Axy_rhs, S_dtSfz_rhs, S_Axz_rhs, S_Ayy_rhs,
    S_Ayz_rhs, S_Azz_rhs, S_chi_rhs, S_trK_rhs, S_Gamx_rhs, S_Gamy_rhs
};

/* Constant-memory tables sized for the largest state variant.
 * NOTE(review): the code that fills and reads them is outside this part of
 * the file; by name they hold a subset of state indices and three symmetry
 * factors per state field — confirm. */
__constant__ int d_subset_state_indices[BSSN_RESIDENT_STATE_CAPACITY];
__constant__ double d_comm_state_soa[3 * BSSN_RESIDENT_STATE_CAPACITY];

/*
 * Three +/-1 factors per field (one row per field, 24 rows).
 * NOTE(review): rows appear to follow the k_lk_*_slots ordering with columns
 * (x, y, z) — e.g. row 1 (Gamz) is {1, 1, -1} and row 2 (gxy) is {-1, -1, 1}
 * — confirm against the consumer.
 */
static const int k_lk_soa_signs[3 * BSSN_LK_FIELD_COUNT] = {
     1,  1,  1,
     1,  1, -1,
    -1, -1,  1,
     1,  1,  1,
    -1,  1, -1,
    -1,  1,  1,
     1,  1,  1,
     1, -1,  1,
     1, -1, -1,
     1,  1, -1,
     1,  1,  1,
    -1,  1,  1,
     1,  1,  1,
     1, -1,  1,
    -1, -1,  1,
     1,  1, -1,
    -1,  1, -1,
     1,  1,  1,
     1, -1, -1,
     1,  1,  1,
     1,  1,  1,
     1,  1,  1,
    -1,  1,  1,
     1, -1,  1
};

/*
 * Per-block working set, stored in g_step_ctx keyed by an opaque block tag.
 *
 * The *_mem pointers are the base allocations (made in ensure_step_ctx or
 * taken from the recycle pool); the std::array members hold per-field
 * pointers into those allocations, recomputed by ensure_step_ctx().
 * A default-constructed context owns nothing: all pointers null, all
 * capacities zero, all flags false, current_bank == -1.
 * NOTE(review): the comm buffers, host mirrors and the ready / zero-fast
 * flags are managed by code outside this part of the file.
 */
struct StepContext {
    /* Base allocations. */
    double *d_state0_mem = nullptr;
    double *d_accum_mem = nullptr;
    double *d_state_curr_mem = nullptr;   /* alias of the current resident bank */
    double *d_state_next_mem = nullptr;
    std::array<double *, BSSN_RESIDENT_BANK_COUNT> d_resident_mem;
    double *d_matter_mem = nullptr;
    double *d_em_source_mem = nullptr;
    double *d_comm_mem = nullptr;
    double *h_comm_mem = nullptr;

    /* Per-field views into the base allocations. */
    std::array<double *, BSSN_RESIDENT_STATE_CAPACITY> d_state0;
    std::array<double *, BSSN_RESIDENT_STATE_CAPACITY> d_accum;
    std::array<double *, BSSN_RESIDENT_STATE_CAPACITY> d_state_curr;
    std::array<double *, BSSN_RESIDENT_STATE_CAPACITY> d_state_next;
    std::array<std::array<double *, BSSN_RESIDENT_STATE_CAPACITY>, BSSN_RESIDENT_BANK_COUNT> d_resident;

    /* Per-bank bookkeeping. */
    std::array<std::array<double *, BSSN_RESIDENT_STATE_CAPACITY>, BSSN_RESIDENT_BANK_COUNT> resident_host;
    std::array<std::array<unsigned char, BSSN_RESIDENT_STATE_CAPACITY>, BSSN_RESIDENT_BANK_COUNT> resident_host_clean;
    std::array<unsigned long long, BSSN_RESIDENT_BANK_COUNT> resident_age;
    std::array<bool, BSSN_RESIDENT_BANK_COUNT> resident_valid;

    std::array<double *, BSSN_MATTER_COUNT> d_matter;
    std::array<double *, BSSN_EM_SOURCE_COUNT> d_em_source;
    std::array<double *, BSSN_EM_SOURCE_COUNT> em_source_host;

    /* Capacities (in elements) and status flags. */
    size_t cap_all = 0;
    size_t cap_comm = 0;
    bool h_comm_pinned = false;
    size_t cap_h_comm = 0;
    bool matter_ready = false;
    bool em_source_ready = false;
    bool em_zero_fast_known = false;
    bool em_zero_fast = false;
    bool state_ready = false;
    int current_bank = -1;
    unsigned long long resident_clock = 0;

    /* Scalars take their defaults from the member initializers above; the
     * array members are cleared here. */
    StepContext()
    {
        d_resident_mem.fill(nullptr);
        d_state0.fill(nullptr);
        d_accum.fill(nullptr);
        d_state_curr.fill(nullptr);
        d_state_next.fill(nullptr);
        for (auto &bank : d_resident) bank.fill(nullptr);
        for (auto &bank : resident_host) bank.fill(nullptr);
        for (auto &bank : resident_host_clean) bank.fill(0);
        resident_age.fill(0);
        resident_valid.fill(false);
        d_matter.fill(nullptr);
        d_em_source.fill(nullptr);
        em_source_host.fill(nullptr);
    }
};

/*
 * The set of buffers a StepContext owns, bundled so it can be moved between
 * a context and the recycle pool (g_step_pool). Field names mirror the
 * corresponding StepContext members. cap_all == 0 marks an empty
 * allocation (see has_step_allocation).
 */
struct StepAllocation {
    double *d_state0_mem;
    double *d_accum_mem;
    std::array<double *, BSSN_RESIDENT_BANK_COUNT> d_resident_mem;
    double *d_matter_mem;
    double *d_em_source_mem;
    double *d_comm_mem;
    double *h_comm_mem;
    size_t cap_all;        /* elements per field the state/matter buffers can hold */
    size_t cap_comm;
    bool h_comm_pinned;
    size_t cap_h_comm;
};

/* Per-block contexts, keyed by the opaque block tag passed to ensure_step_ctx(). */
static std::unordered_map<void *, StepContext> g_step_ctx;
/* Detached allocations available for reuse (see acquire_step_allocation). */
static std::vector<StepAllocation> g_step_pool;
/* NOTE(review): the remaining globals are managed by code outside this part
 * of the file; by name, a communication segment metadata buffer with its
 * capacity and an EM zero-detection flag — confirm. */
static int *g_comm_segment_meta = nullptr;
static size_t g_comm_segment_meta_cap = 0;
static int *g_em_zero_flag = nullptr;

/*
 * Return a StepAllocation that owns nothing: every pointer null, every
 * capacity zero, h_comm_pinned false.
 */
static StepAllocation empty_step_allocation()
{
    /* Empty-brace initialization zeroes every member of the aggregate,
     * including each element of d_resident_mem. */
    StepAllocation blank = {};
    return blank;
}

/* An allocation counts as populated exactly when its cap_all is non-zero. */
static bool has_step_allocation(const StepAllocation &alloc)
{
    const bool is_empty = (alloc.cap_all == 0);
    return !is_empty;
}

/*
 * Move the buffers owned by `ctx` into a StepAllocation and return it.
 * Afterwards `ctx` is in the same state as a freshly constructed
 * StepContext: no buffers, zero capacities, all flags false,
 * current_bank == -1, all per-field pointers and bank bookkeeping cleared.
 * Nothing is freed here; the caller owns the returned allocation.
 */
static StepAllocation detach_step_allocation(StepContext &ctx)
{
    StepAllocation taken = {};

    /* Device buffers. */
    taken.d_state0_mem = ctx.d_state0_mem;
    taken.d_accum_mem = ctx.d_accum_mem;
    taken.d_resident_mem = ctx.d_resident_mem;
    taken.d_matter_mem = ctx.d_matter_mem;
    taken.d_em_source_mem = ctx.d_em_source_mem;
    taken.cap_all = ctx.cap_all;

    /* Communication buffers. */
    taken.d_comm_mem = ctx.d_comm_mem;
    taken.cap_comm = ctx.cap_comm;
    taken.h_comm_mem = ctx.h_comm_mem;
    taken.h_comm_pinned = ctx.h_comm_pinned;
    taken.cap_h_comm = ctx.cap_h_comm;

    /* Every StepContext member is reset to its constructor default, so
     * assigning a default-constructed context performs the full reset. */
    ctx = StepContext();

    return taken;
}

/*
 * Install the buffers described by `alloc` into `ctx` and reset the
 * context's dynamic state: ready / zero-fast flags cleared, no current bank,
 * resident clock and per-bank bookkeeping zeroed, host mirrors forgotten.
 * The per-field device pointer arrays are not touched here; ensure_step_ctx()
 * recomputes them after attaching.
 */
static void attach_step_allocation(StepContext &ctx, const StepAllocation &alloc)
{
    /* Adopt the buffers and their capacities. */
    ctx.d_state0_mem = alloc.d_state0_mem;
    ctx.d_accum_mem = alloc.d_accum_mem;
    ctx.d_resident_mem = alloc.d_resident_mem;
    ctx.d_matter_mem = alloc.d_matter_mem;
    ctx.d_em_source_mem = alloc.d_em_source_mem;
    ctx.cap_all = alloc.cap_all;

    ctx.d_comm_mem = alloc.d_comm_mem;
    ctx.cap_comm = alloc.cap_comm;
    ctx.h_comm_mem = alloc.h_comm_mem;
    ctx.h_comm_pinned = alloc.h_comm_pinned;
    ctx.cap_h_comm = alloc.cap_h_comm;

    /* No bank is current until one is selected. */
    ctx.d_state_curr_mem = nullptr;
    ctx.d_state_next_mem = nullptr;
    ctx.current_bank = -1;
    ctx.resident_clock = 0;

    /* Nothing has been uploaded into the new buffers yet. */
    ctx.state_ready = false;
    ctx.matter_ready = false;
    ctx.em_source_ready = false;
    ctx.em_zero_fast_known = false;
    ctx.em_zero_fast = false;

    /* Clear per-bank bookkeeping and host mirrors. */
    for (auto &bank_hosts : ctx.resident_host) bank_hosts.fill(nullptr);
    for (auto &bank_clean : ctx.resident_host_clean) bank_clean.fill(0);
    ctx.resident_age.fill(0);
    ctx.resident_valid.fill(false);
    ctx.em_source_host.fill(nullptr);
}

/*
 * Hand a populated allocation to the recycle pool and leave `alloc` empty.
 * Allocations with cap_all == 0 are ignored and left untouched.
 * NOTE(review): an allocation with cap_all == 0 but non-null comm buffers
 * would not be pooled — confirm that combination cannot occur.
 */
static void recycle_step_allocation(StepAllocation &alloc)
{
    if (has_step_allocation(alloc)) {
        g_step_pool.push_back(alloc);
        alloc = empty_step_allocation();
    }
}

/*
 * Take the best-fitting allocation out of the recycle pool: among pooled
 * entries with cap_all >= all, the one with the smallest cap_all (the first
 * such entry on ties). Returns an empty allocation when none is large
 * enough. Removal swaps the last pool entry into the vacated position, so
 * pool order is not preserved.
 */
static StepAllocation acquire_step_allocation(size_t all)
{
    const size_t none = g_step_pool.size();
    size_t pick = none;

    for (size_t idx = 0; idx < none; ++idx) {
        const size_t cap = g_step_pool[idx].cap_all;
        const bool fits = (cap >= all);
        if (fits && (pick == none || cap < g_step_pool[pick].cap_all))
            pick = idx;
    }

    if (pick == none)
        return empty_step_allocation();

    StepAllocation taken = g_step_pool[pick];
    g_step_pool[pick] = g_step_pool.back();
    g_step_pool.pop_back();
    return taken;
}

/*
 * Make sure the global working buffers in g_buf can hold an nx*ny*nz grid
 * and point the slot table at it.
 *
 * On first use, or when any of the three required sizes exceeds its current
 * capacity, all device buffers and the host staging buffer are freed and
 * reallocated. Capacities never shrink: each new capacity is the larger of
 * the requested size and the previous capacity, so alternating grid shapes
 * do not cause repeated reallocation.
 *
 * The host staging buffer is pinned when cudaMallocHost succeeds and falls
 * back to malloc otherwise. Device allocation failure, or failure of both
 * host allocation paths, terminates the process.
 *
 * The slot pointers are always laid out with stride nx*ny*nz (the current
 * grid, not the capacity), and prev_nx/ny/nz record the current dimensions.
 */
static void ensure_gpu_buffers(int nx, int ny, int nz) {
    const size_t all = (size_t)nx * ny * nz;
    const size_t fh2_size = (size_t)(nx+2) * (ny+2) * (nz+2);
    const size_t fh3_size = (size_t)(nx+3) * (ny+3) * (nz+3);
    const bool need_grow = (!g_buf.initialized)
                        || (all > g_buf.cap_all)
                        || (fh2_size > g_buf.cap_fh2_size)
                        || (fh3_size > g_buf.cap_fh3_size);

    if (need_grow) {
        /* Keep any capacity that is still large enough. */
        const size_t new_cap_all = (all > g_buf.cap_all) ? all : g_buf.cap_all;
        const size_t new_cap_fh2 =
            (fh2_size > g_buf.cap_fh2_size) ? fh2_size : g_buf.cap_fh2_size;
        const size_t new_cap_fh3 =
            (fh3_size > g_buf.cap_fh3_size) ? fh3_size : g_buf.cap_fh3_size;

        if (g_buf.d_mem) { cudaFree(g_buf.d_mem); g_buf.d_mem = nullptr; }
        if (g_buf.d_fh2) { cudaFree(g_buf.d_fh2); g_buf.d_fh2 = nullptr; }
        if (g_buf.d_fh3) { cudaFree(g_buf.d_fh3); g_buf.d_fh3 = nullptr; }
        if (g_buf.h_stage) {
            if (g_buf.h_stage_pinned) cudaFreeHost(g_buf.h_stage);
            else free(g_buf.h_stage);
            g_buf.h_stage = nullptr;
            g_buf.h_stage_pinned = false;
        }

        CUDA_CHECK(cudaMalloc(&g_buf.d_mem,
                              (size_t)NUM_USED_SLOTS * new_cap_all * sizeof(double)));
        CUDA_CHECK(cudaMalloc(&g_buf.d_fh2, new_cap_fh2 * sizeof(double)));
        CUDA_CHECK(cudaMalloc(&g_buf.d_fh3, new_cap_fh3 * sizeof(double)));

        const size_t stage_bytes =
            (size_t)STAGE_SLOT_COUNT * new_cap_all * sizeof(double);
        cudaError_t stage_err = cudaMallocHost((void**)&g_buf.h_stage, stage_bytes);
        if (stage_err == cudaSuccess) {
            g_buf.h_stage_pinned = true;
        } else {
            /* The failed pinned allocation leaves its error recorded in the
             * runtime; reset it so a later cudaGetLastError() check does not
             * report this already-handled failure. */
            (void)cudaGetLastError();
            g_buf.h_stage = (double *)malloc(stage_bytes);
            g_buf.h_stage_pinned = false;
            if (!g_buf.h_stage) {
                fprintf(stderr, "Host stage allocation failed (%zu bytes)\n", stage_bytes);
                exit(EXIT_FAILURE);
            }
        }

        g_buf.cap_all = new_cap_all;
        g_buf.cap_fh2_size = new_cap_fh2;
        g_buf.cap_fh3_size = new_cap_fh3;
        g_buf.initialized = true;
    }

    /* Slots are packed with the current grid size as stride. */
    for (int s = 0; s < NUM_USED_SLOTS; ++s)
        g_buf.slot[s] = g_buf.d_mem + (size_t)s * all;

    g_buf.prev_nx = nx;
    g_buf.prev_ny = ny;
    g_buf.prev_nz = nz;
}

/*
 * Look up (creating on first use) the StepContext keyed by block_tag and make
 * sure its device storage covers `all` grid points per field.
 *
 * If the context is too small, its current allocation is detached and handed
 * to recycle_step_allocation(), then acquire_step_allocation(all) is asked
 * for a replacement; only when that returns nothing usable is fresh device
 * memory allocated. The per-field pointer tables are re-derived on every
 * call using `all` as the stride between consecutive fields.
 *
 * Returns a reference to the entry stored in g_step_ctx.
 */
static StepContext &ensure_step_ctx(void *block_tag, size_t all)
{
    StepContext &ctx = g_step_ctx[block_tag];

    const bool too_small = (ctx.cap_all < all);
    if (too_small) {
        /* Give the undersized storage back before asking for a larger one. */
        StepAllocation retired = detach_step_allocation(ctx);
        recycle_step_allocation(retired);

        StepAllocation fresh = acquire_step_allocation(all);
        if (!has_step_allocation(fresh)) {
            const size_t state_bytes = BSSN_RESIDENT_STATE_CAPACITY * all * sizeof(double);
            CUDA_CHECK(cudaMalloc(&fresh.d_state0_mem, state_bytes));
            CUDA_CHECK(cudaMalloc(&fresh.d_accum_mem, state_bytes));
            for (int bank = 0; bank < BSSN_RESIDENT_BANK_COUNT; ++bank) {
                CUDA_CHECK(cudaMalloc(&fresh.d_resident_mem[bank], state_bytes));
            }
            CUDA_CHECK(cudaMalloc(&fresh.d_matter_mem, BSSN_MATTER_COUNT * all * sizeof(double)));
            CUDA_CHECK(cudaMalloc(&fresh.d_em_source_mem, BSSN_EM_SOURCE_COUNT * all * sizeof(double)));
            fresh.cap_all = all;
        }
        attach_step_allocation(ctx, fresh);
    }

    /* Field f of every table starts f * all doubles into its backing buffer. */
    for (int f = 0; f < BSSN_RESIDENT_STATE_CAPACITY; ++f) {
        const size_t offset = (size_t)f * all;
        ctx.d_state0[f] = ctx.d_state0_mem + offset;
        ctx.d_accum[f] = ctx.d_accum_mem + offset;
    }
    for (int bank = 0; bank < BSSN_RESIDENT_BANK_COUNT; ++bank) {
        for (int f = 0; f < BSSN_RESIDENT_STATE_CAPACITY; ++f) {
            ctx.d_resident[bank][f] = ctx.d_resident_mem[bank] + (size_t)f * all;
        }
    }

    /* Keep the "current state" aliases on the selected bank, if one is set. */
    if (ctx.current_bank >= 0) {
        ctx.d_state_curr_mem = ctx.d_resident_mem[ctx.current_bank];
        ctx.d_state_curr = ctx.d_resident[ctx.current_bank];
    }

    for (int f = 0; f < BSSN_MATTER_COUNT; ++f) {
        ctx.d_matter[f] = ctx.d_matter_mem + (size_t)f * all;
    }
    for (int f = 0; f < BSSN_EM_SOURCE_COUNT; ++f) {
        ctx.d_em_source[f] = ctx.d_em_source_mem + (size_t)f * all;
    }
    return ctx;
}

/*
 * Drop the StepContext registered under block_tag, if any. Its allocation is
 * detached and passed to recycle_step_allocation() before the map entry is
 * erased. Unknown tags are ignored.
 */
static void release_step_ctx(void *block_tag)
{
    auto entry = g_step_ctx.find(block_tag);
    if (entry != g_step_ctx.end()) {
        StepAllocation freed = detach_step_allocation(entry->second);
        recycle_step_allocation(freed);
        g_step_ctx.erase(entry);
    }
}

/*
 * Return the context's device communication buffer, enlarged if necessary to
 * hold needed_doubles doubles. A request for zero doubles returns nullptr and
 * leaves the context untouched. Growing frees the old buffer first, so its
 * previous contents are not preserved.
 */
static double *ensure_step_comm_buffer(StepContext &ctx, size_t needed_doubles)
{
    if (needed_doubles == 0) return nullptr;

    /* Already large enough: reuse as is. */
    if (needed_doubles <= ctx.cap_comm) return ctx.d_comm_mem;

    if (ctx.d_comm_mem != nullptr) {
        CUDA_CHECK(cudaFree(ctx.d_comm_mem));
        ctx.d_comm_mem = nullptr;
    }
    CUDA_CHECK(cudaMalloc(&ctx.d_comm_mem, needed_doubles * sizeof(double)));
    ctx.cap_comm = needed_doubles;
    return ctx.d_comm_mem;
}

/*
 * Return the context's host-side communication buffer, enlarged if necessary
 * to hold needed_doubles doubles. A request for zero doubles returns nullptr.
 *
 * Pinned memory (cudaMallocHost) is preferred; if that fails the buffer falls
 * back to malloc and ctx.h_comm_pinned records which allocator owns it so the
 * matching release function is used later. Growing discards the old contents.
 * Exits the process if neither allocator can provide the memory.
 */
static double *ensure_step_host_comm_buffer(StepContext &ctx, size_t needed_doubles)
{
    if (needed_doubles == 0) return nullptr;
    if (ctx.cap_h_comm < needed_doubles) {
        if (ctx.h_comm_mem) {
            if (ctx.h_comm_pinned) {
                CUDA_CHECK(cudaFreeHost(ctx.h_comm_mem));
            } else {
                free(ctx.h_comm_mem);
            }
            ctx.h_comm_mem = nullptr;
            ctx.h_comm_pinned = false;
            /* Keep capacity in step with the (now absent) buffer. */
            ctx.cap_h_comm = 0;
        }

        const size_t bytes = needed_doubles * sizeof(double);
        cudaError_t err = cudaMallocHost((void **)&ctx.h_comm_mem, bytes);
        if (err == cudaSuccess) {
            ctx.h_comm_pinned = true;
        } else {
            /* The pinned-allocation failure is handled by falling back to
               malloc; reset the runtime's last-error state so a later
               cudaGetLastError() does not report it as a new failure. */
            (void)cudaGetLastError();
            ctx.h_comm_mem = (double *)malloc(bytes);
            ctx.h_comm_pinned = false;
            if (!ctx.h_comm_mem) {
                fprintf(stderr, "Host comm allocation failed (%zu bytes)\n", bytes);
                exit(EXIT_FAILURE);
            }
        }
        ctx.cap_h_comm = needed_doubles;
    }
    return ctx.h_comm_mem;
}

/*
 * Return the shared device buffer g_comm_segment_meta, enlarged if necessary
 * to hold needed_ints ints. A request for zero ints returns nullptr. Growing
 * frees the old buffer first, so previous contents are lost.
 */
static int *ensure_comm_segment_meta_buffer(size_t needed_ints)
{
    if (needed_ints == 0) return nullptr;

    /* Current capacity suffices: hand back the existing buffer. */
    if (needed_ints <= g_comm_segment_meta_cap) return g_comm_segment_meta;

    if (g_comm_segment_meta != nullptr) {
        CUDA_CHECK(cudaFree(g_comm_segment_meta));
        g_comm_segment_meta = nullptr;
    }
    CUDA_CHECK(cudaMalloc(&g_comm_segment_meta, needed_ints * sizeof(int)));
    g_comm_segment_meta_cap = needed_ints;
    return g_comm_segment_meta;
}

/*
 * Copy per-field symmetry factors into the device symbol d_comm_state_soa.
 *
 * The table always holds three values for each of the
 * BSSN_RESIDENT_STATE_CAPACITY fields. Every entry defaults to 1.0; when
 * state_soa is non-null, the first min(state_count, capacity) triples are
 * taken from it. A null state_soa or a non-positive state_count uploads the
 * all-ones table.
 */
static void upload_comm_state_soa(const double *state_soa, int state_count)
{
    constexpr int kEntries = 3 * BSSN_RESIDENT_STATE_CAPACITY;
    double soa[kEntries];
    for (int e = 0; e < kEntries; ++e) {
        soa[e] = 1.0;
    }

    if (state_soa) {
        /* Never read more triples than the table can hold. */
        int fields = state_count;
        if (fields > BSSN_RESIDENT_STATE_CAPACITY) fields = BSSN_RESIDENT_STATE_CAPACITY;
        for (int e = 0; e < 3 * fields; ++e) {
            soa[e] = state_soa[e];
        }
    }
    CUDA_CHECK(cudaMemcpyToSymbol(d_comm_state_soa, soa, sizeof(soa)));
}

/*
 * Upload gp to the device symbol d_gp unless the host-side cache shows that
 * a byte-identical GridParams was uploaded last. The cache is refreshed
 * after every upload.
 */
static void upload_grid_params_if_needed(const GridParams &gp)
{
    const bool already_uploaded =
        g_gp_host_cache_valid &&
        std::memcmp(&g_gp_host_cache, &gp, sizeof(GridParams)) == 0;
    if (already_uploaded) return;

    CUDA_CHECK(cudaMemcpyToSymbol(d_gp, &gp, sizeof(GridParams)));
    g_gp_host_cache = gp;
    g_gp_host_cache_valid = true;
}

/* ================================================================== */
/*  A. Symmetry boundary kernels (ord=2 and ord=3)                     */
/* ================================================================== */

/*
 * Step 1 (ord=2): copy the unpadded field into the interior of the padded
 * array.
 *
 * func is indexed as i0 + j0*nx + k0*nx*ny over d_gp.all points; the point
 * (i0, j0, k0) lands at (i0+2, j0+2, k0+2) in fh, whose row length is
 * d_gp.fh2_nx and whose plane size is fh2_nx * fh2_ny. Ghost cells are not
 * written here. SoA0..SoA2 are accepted but not used by this kernel.
 * Works with any 1-D launch configuration.
 */
__global__ void kern_symbd_copy_interior_ord2(const double * __restrict__ func,
                                               double * __restrict__ fh,
                                               double SoA0, double SoA1, double SoA2)
{
    const int nx = d_gp.ex[0];
    const int ny = d_gp.ex[1];
    const int row = d_gp.fh2_nx;
    const int plane = row * d_gp.fh2_ny;

    for (int p = blockIdx.x * blockDim.x + threadIdx.x;
         p < d_gp.all;
         p += blockDim.x * gridDim.x)
    {
        const int i0 = p % nx;
        const int j0 = (p / nx) % ny;
        const int k0 = p / (nx * ny);
        fh[(i0 + 2) + (j0 + 2) * row + (k0 + 2) * plane] = func[p];
    }
}

/*
 * Fused symmetry pack (ord=2): fill every cell of the padded array fh from
 * the unpadded field func in a single pass.
 *
 * Each padded cell maps to a Fortran-style index iF = ii - 1 (likewise for
 * j and k). Cells with an index <= 0 are ghost cells: they read the mirrored
 * source index 1 - iF and are multiplied by the matching symmetry factor
 * (SoA0 for x, SoA1 for y, SoA2 for z; factors combine when several
 * directions are mirrored). All other cells are plain copies.
 * Works with any 1-D launch configuration.
 */
__global__ void kern_symbd_pack_ord2(const double * __restrict__ func,
                                     double * __restrict__ fh,
                                     double SoA0, double SoA1, double SoA2)
{
    const int nx = d_gp.ex[0], ny = d_gp.ex[1];
    const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny, fnz = d_gp.fh2_nz;
    const int total = fnx * fny * fnz;

    for (int cell = blockIdx.x * blockDim.x + threadIdx.x;
         cell < total;
         cell += blockDim.x * gridDim.x)
    {
        /* Fortran-style coordinates of this padded cell. */
        const int iF = (cell % fnx) - 1;
        const int jF = ((cell / fnx) % fny) - 1;
        const int kF = (cell / (fnx * fny)) - 1;

        const bool mirror_i = (iF <= 0);
        const bool mirror_j = (jF <= 0);
        const bool mirror_k = (kF <= 0);

        /* 1-based source coordinates after reflection. */
        const int si = mirror_i ? (1 - iF) : iF;
        const int sj = mirror_j ? (1 - jF) : jF;
        const int sk = mirror_k ? (1 - kF) : kF;

        double sign = 1.0;
        if (mirror_i) sign *= SoA0;
        if (mirror_j) sign *= SoA1;
        if (mirror_k) sign *= SoA2;

        fh[cell] = sign * func[(si - 1) + (sj - 1) * nx + (sk - 1) * nx * ny];
    }
}

/*
 * Step 2 (ord=2): fill the two x-direction ghost layers of the padded array.
 *
 * For layer = 0, 1 and every interior (j, k):
 *     fh(iF = -layer, j, k) = fh(iF = layer + 1, j, k) * SoA0
 * i.e. iF=0 mirrors iF=1 and iF=-1 mirrors iF=2. In storage terms the
 * x offset 1 - layer is written from x offset layer + 2.
 * Works with any 1-D launch configuration.
 */
__global__ void kern_symbd_ighost_ord2(double * __restrict__ fh, double SoA0)
{
    const int ny = d_gp.ex[1], nz = d_gp.ex[2];
    const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny;
    const int face = ny * nz;  /* interior (j, k) pairs per ghost layer */

    for (int t = blockIdx.x * blockDim.x + threadIdx.x;
         t < face * 2;
         t += blockDim.x * gridDim.x)
    {
        const int layer = t / face;  /* 0 or 1 */
        const int within = t % face;
        const int j0 = within % ny;
        const int k0 = within / ny;
        /* Offset of the x-line through interior (j0, k0). */
        const int line = (j0 + 2) * fnx + (k0 + 2) * fnx * fny;
        fh[(1 - layer) + line] = fh[(layer + 2) + line] * SoA0;
    }
}

/*
 * Step 3 (ord=2): fill the two y-direction ghost layers of the padded array.
 *
 * Covers all nx + 2 x positions (interior plus x ghosts) and every interior
 * k. For layer = 0, 1:
 *     fh(i, jF = -layer, k) = fh(i, jF = layer + 1, k) * SoA1
 * Because it reads x-ghost cells, it presumably has to run after the
 * x-ghost fill - confirm against the launch sequence.
 * Works with any 1-D launch configuration.
 */
__global__ void kern_symbd_jghost_ord2(double * __restrict__ fh, double SoA1)
{
    const int nx = d_gp.ex[0], nz = d_gp.ex[2];
    const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny;
    const int xspan = nx + 2;       /* x positions including ghosts */
    const int face = xspan * nz;    /* cells per ghost layer */

    for (int t = blockIdx.x * blockDim.x + threadIdx.x;
         t < face * 2;
         t += blockDim.x * gridDim.x)
    {
        const int layer = t / face;
        const int within = t % face;
        const int ii = within % xspan;  /* storage x offset */
        const int k0 = within / xspan;
        const int zoff = (k0 + 2) * fnx * fny;
        fh[ii + (1 - layer) * fnx + zoff] =
            fh[ii + (layer + 2) * fnx + zoff] * SoA1;
    }
}

/*
 * Step 4 (ord=2): fill the two z-direction ghost layers of the padded array.
 *
 * Covers all nx + 2 x positions and all ny + 2 y positions (interior plus
 * ghosts). For layer = 0, 1:
 *     fh(i, j, kF = -layer) = fh(i, j, kF = layer + 1) * SoA2
 * Because it reads x- and y-ghost cells, it presumably has to run after
 * those fills - confirm against the launch sequence.
 * Works with any 1-D launch configuration.
 */
__global__ void kern_symbd_kghost_ord2(double * __restrict__ fh, double SoA2)
{
    const int nx = d_gp.ex[0], ny = d_gp.ex[1];
    const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny;
    const int xspan = nx + 2;
    const int yspan = ny + 2;
    const int face = xspan * yspan;  /* cells per ghost layer */

    for (int t = blockIdx.x * blockDim.x + threadIdx.x;
         t < face * 2;
         t += blockDim.x * gridDim.x)
    {
        const int layer = t / face;
        const int within = t % face;
        const int ii = within % xspan;  /* storage x offset */
        const int jj = within / xspan;  /* storage y offset */
        const int col = ii + jj * fnx;
        fh[col + (1 - layer) * fnx * fny] =
            fh[col + (layer + 2) * fnx * fny] * SoA2;
    }
}

/* ---- ord=3 variants (for lopsided / kodis) ---- */

/*
 * Step 1 (ord=3): copy the unpadded field into the interior of the padded
 * array.
 *
 * func is indexed as i0 + j0*nx + k0*nx*ny over d_gp.all points; the point
 * (i0, j0, k0) lands at (i0+3, j0+3, k0+3) in fh, whose row length is
 * d_gp.fh3_nx and whose plane size is fh3_nx * fh3_ny. Ghost cells are not
 * written here. Works with any 1-D launch configuration.
 */
__global__ void kern_symbd_copy_interior_ord3(const double * __restrict__ func,
                                               double * __restrict__ fh)
{
    const int nx = d_gp.ex[0];
    const int ny = d_gp.ex[1];
    const int row = d_gp.fh3_nx;
    const int plane = row * d_gp.fh3_ny;

    for (int p = blockIdx.x * blockDim.x + threadIdx.x;
         p < d_gp.all;
         p += blockDim.x * gridDim.x)
    {
        const int i0 = p % nx;
        const int j0 = (p / nx) % ny;
        const int k0 = p / (nx * ny);
        fh[(i0 + 3) + (j0 + 3) * row + (k0 + 3) * plane] = func[p];
    }
}

/*
 * Fused symmetry pack (ord=3): fill every cell of the padded array fh from
 * the unpadded field func in a single pass.
 *
 * Each padded cell maps to a Fortran-style index iF = ii - 2 (likewise for
 * j and k). Cells with an index <= 0 are ghost cells: they read the mirrored
 * source index 1 - iF and are multiplied by the matching symmetry factor
 * (SoA0 for x, SoA1 for y, SoA2 for z; factors combine when several
 * directions are mirrored). All other cells are plain copies.
 * Works with any 1-D launch configuration.
 */
__global__ void kern_symbd_pack_ord3(const double * __restrict__ func,
                                     double * __restrict__ fh,
                                     double SoA0, double SoA1, double SoA2)
{
    const int nx = d_gp.ex[0], ny = d_gp.ex[1];
    const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny, fnz = d_gp.fh3_nz;
    const int total = fnx * fny * fnz;

    for (int cell = blockIdx.x * blockDim.x + threadIdx.x;
         cell < total;
         cell += blockDim.x * gridDim.x)
    {
        /* Fortran-style coordinates of this padded cell. */
        const int iF = (cell % fnx) - 2;
        const int jF = ((cell / fnx) % fny) - 2;
        const int kF = (cell / (fnx * fny)) - 2;

        const bool mirror_i = (iF <= 0);
        const bool mirror_j = (jF <= 0);
        const bool mirror_k = (kF <= 0);

        /* 1-based source coordinates after reflection. */
        const int si = mirror_i ? (1 - iF) : iF;
        const int sj = mirror_j ? (1 - jF) : jF;
        const int sk = mirror_k ? (1 - kF) : kF;

        double sign = 1.0;
        if (mirror_i) sign *= SoA0;
        if (mirror_j) sign *= SoA1;
        if (mirror_k) sign *= SoA2;

        fh[cell] = sign * func[(si - 1) + (sj - 1) * nx + (sk - 1) * nx * ny];
    }
}

/*
 * Step 2 (ord=3): fill the three x-direction ghost layers of the padded
 * array.
 *
 * For layer = 0, 1, 2 and every interior (j, k):
 *     fh(iF = -layer, j, k) = fh(iF = layer + 1, j, k) * SoA0
 * In storage terms the x offset 2 - layer is written from x offset
 * layer + 3. Works with any 1-D launch configuration.
 */
__global__ void kern_symbd_ighost_ord3(double * __restrict__ fh, double SoA0)
{
    const int ny = d_gp.ex[1], nz = d_gp.ex[2];
    const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny;
    const int face = ny * nz;  /* interior (j, k) pairs per ghost layer */

    for (int t = blockIdx.x * blockDim.x + threadIdx.x;
         t < face * 3;
         t += blockDim.x * gridDim.x)
    {
        const int layer = t / face;  /* 0, 1 or 2 */
        const int within = t % face;
        const int j0 = within % ny;
        const int k0 = within / ny;
        /* Offset of the x-line through interior (j0, k0). */
        const int line = (j0 + 3) * fnx + (k0 + 3) * fnx * fny;
        fh[(2 - layer) + line] = fh[(layer + 3) + line] * SoA0;
    }
}

/*
 * Step 3 (ord=3): fill the three y-direction ghost layers of the padded
 * array.
 *
 * Covers all nx + 3 x positions (interior plus x ghosts) and every interior
 * k. For layer = 0, 1, 2:
 *     fh(i, jF = -layer, k) = fh(i, jF = layer + 1, k) * SoA1
 * Because it reads x-ghost cells, it presumably has to run after the
 * x-ghost fill - confirm against the launch sequence.
 * Works with any 1-D launch configuration.
 */
__global__ void kern_symbd_jghost_ord3(double * __restrict__ fh, double SoA1)
{
    const int nx = d_gp.ex[0], nz = d_gp.ex[2];
    const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny;
    const int xspan = nx + 3;       /* x positions including ghosts */
    const int face = xspan * nz;    /* cells per ghost layer */

    for (int t = blockIdx.x * blockDim.x + threadIdx.x;
         t < face * 3;
         t += blockDim.x * gridDim.x)
    {
        const int layer = t / face;
        const int within = t % face;
        const int ii = within % xspan;  /* storage x offset */
        const int k0 = within / xspan;
        const int zoff = (k0 + 3) * fnx * fny;
        fh[ii + (2 - layer) * fnx + zoff] =
            fh[ii + (layer + 3) * fnx + zoff] * SoA1;
    }
}

/*
 * Step 4 (ord=3): fill the three z-direction ghost layers of the padded
 * array.
 *
 * Covers all nx + 3 x positions and all ny + 3 y positions (interior plus
 * ghosts). For layer = 0, 1, 2:
 *     fh(i, j, kF = -layer) = fh(i, j, kF = layer + 1) * SoA2
 * Because it reads x- and y-ghost cells, it presumably has to run after
 * those fills - confirm against the launch sequence.
 * Works with any 1-D launch configuration.
 */
__global__ void kern_symbd_kghost_ord3(double * __restrict__ fh, double SoA2)
{
    const int nx = d_gp.ex[0], ny = d_gp.ex[1];
    const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny;
    const int xspan = nx + 3;
    const int yspan = ny + 3;
    const int face = xspan * yspan;  /* cells per ghost layer */

    for (int t = blockIdx.x * blockDim.x + threadIdx.x;
         t < face * 3;
         t += blockDim.x * gridDim.x)
    {
        const int layer = t / face;
        const int within = t % face;
        const int ii = within % xspan;  /* storage x offset */
        const int jj = within / xspan;  /* storage y offset */
        const int col = ii + jj * fnx;
        fh[col + (2 - layer) * fnx * fny] =
            fh[col + (layer + 3) * fnx * fny] * SoA2;
    }
}

/* ================================================================== */
/*  B. Stencil kernels                                                 */
/* ================================================================== */

/* ---- First derivatives (ord=2 padded input, wide or narrow central stencil) ---- */
/*
 * Writes fx, fy, fz for every one of the d_gp.all grid points.
 *
 * fh is the padded array addressed through idx_fh2() with Fortran-style
 * indices iF = i0 + 1 etc. Per point:
 *   - last index in any direction, or the +-1 stencil does not fit inside
 *     [iminF..imaxF] x [jminF..jmaxF] x [kminF..kmaxF]: outputs are 0;
 *   - the +-2 stencil fits in all three directions: weights (1, -8, 8, -1)
 *     scaled by d_gp.d12dx / d12dy / d12dz;
 *   - otherwise: weights (-1, 1) on the +-1 neighbours scaled by
 *     d_gp.d2dx / d2dy / d2dz.
 * Works with any 1-D launch configuration of at most 128 threads per block.
 */
__global__ __launch_bounds__(128, 4)
void kern_fderivs(const double * __restrict__ fh,
                  double * __restrict__ fx,
                  double * __restrict__ fy,
                  double * __restrict__ fz)
{
    const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
    const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF;
    const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;

    for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
         tid < d_gp.all;
         tid += blockDim.x * gridDim.x)
    {
        const int i0 = tid % nx;
        const int j0 = (tid / nx) % ny;
        const int k0 = tid / (nx * ny);
        const int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1;

        /* Last index in any direction is never differentiated. */
        const bool last_plane = (i0 > nx - 2) || (j0 > ny - 2) || (k0 > nz - 2);

        /* Does a stencil of half-width 1 / 2 stay inside the index bounds? */
        const bool reach1 = (iF+1) <= imaxF && (iF-1) >= iminF &&
                            (jF+1) <= jmaxF && (jF-1) >= jminF &&
                            (kF+1) <= kmaxF && (kF-1) >= kminF;
        const bool reach2 = (iF+2) <= imaxF && (iF-2) >= iminF &&
                            (jF+2) <= jmaxF && (jF-2) >= jminF &&
                            (kF+2) <= kmaxF && (kF-2) >= kminF;

        if (last_plane || !reach1) {
            /* reach2 implies reach1, so no stencil fits here. */
            fx[tid] = 0.0; fy[tid] = 0.0; fz[tid] = 0.0;
        }
        else if (reach2) {
            fx[tid] = d_gp.d12dx * (
                fh[idx_fh2(iF-2,jF,kF)] - 8.0*fh[idx_fh2(iF-1,jF,kF)]
              + 8.0*fh[idx_fh2(iF+1,jF,kF)] - fh[idx_fh2(iF+2,jF,kF)]);
            fy[tid] = d_gp.d12dy * (
                fh[idx_fh2(iF,jF-2,kF)] - 8.0*fh[idx_fh2(iF,jF-1,kF)]
              + 8.0*fh[idx_fh2(iF,jF+1,kF)] - fh[idx_fh2(iF,jF+2,kF)]);
            fz[tid] = d_gp.d12dz * (
                fh[idx_fh2(iF,jF,kF-2)] - 8.0*fh[idx_fh2(iF,jF,kF-1)]
              + 8.0*fh[idx_fh2(iF,jF,kF+1)] - fh[idx_fh2(iF,jF,kF+2)]);
        }
        else {
            fx[tid] = d_gp.d2dx * (
                -fh[idx_fh2(iF-1,jF,kF)] + fh[idx_fh2(iF+1,jF,kF)]);
            fy[tid] = d_gp.d2dy * (
                -fh[idx_fh2(iF,jF-1,kF)] + fh[idx_fh2(iF,jF+1,kF)]);
            fz[tid] = d_gp.d2dz * (
                -fh[idx_fh2(iF,jF,kF-1)] + fh[idx_fh2(iF,jF,kF+1)]);
        }
    }
}

/* ---- Second derivatives (ord=2, 4th/2nd order) ---- */
/*
 * Writes the six second-derivative outputs (fxx, fxy, fxz, fyy, fyz, fzz)
 * for every one of the d_gp.all grid points.
 *
 * fh is the padded input addressed through idx_fh2() with Fortran-style
 * indices iF = i0 + 1 etc. Per point:
 *   - last index in any direction: all outputs are 0;
 *   - the +-2 stencil fits inside the [imin..imax] bounds in all three
 *     directions: wide stencils scaled by d_gp.Fdxdx .. d_gp.Fdydz;
 *   - else the +-1 stencil fits: narrow stencils scaled by
 *     d_gp.Sdxdx .. d_gp.Sdydz;
 *   - otherwise: all outputs are 0.
 * Iterates with a stride of blockDim.x * gridDim.x, so any 1-D launch of at
 * most 128 threads per block covers the whole grid.
 */
__global__ __launch_bounds__(128, 4)
void kern_fdderivs(const double * __restrict__ fh,
                   double * __restrict__ fxx, double * __restrict__ fxy,
                   double * __restrict__ fxz, double * __restrict__ fyy,
                   double * __restrict__ fyz, double * __restrict__ fzz)
{
    const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
    const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
    const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF;

    for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
         tid < d_gp.all;
         tid += blockDim.x * gridDim.x)
    {
        /* Decompose the linear index; i0 runs fastest. */
        int i0 = tid % nx;
        int j0 = (tid / nx) % ny;
        int k0 = tid / (nx * ny);

        /* The last index in each direction is never differentiated. */
        if (i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2) {
            fxx[tid]=0; fxy[tid]=0; fxz[tid]=0;
            fyy[tid]=0; fyz[tid]=0; fzz[tid]=0;
            continue;
        }

        /* Fortran-style (1-based) indices expected by idx_fh2(). */
        int iF = i0+1, jF = j0+1, kF = k0+1;

        if ((iF+2)<=imaxF && (iF-2)>=iminF &&
            (jF+2)<=jmaxF && (jF-2)>=jminF &&
            (kF+2)<=kmaxF && (kF-2)>=kminF)
        {
            /* 4th-order diagonal: weights (-1, 16, -30, 16, -1) */
            double c = fh[idx_fh2(iF,jF,kF)];
            fxx[tid] = d_gp.Fdxdx*(
                -fh[idx_fh2(iF-2,jF,kF)] + 16.0*fh[idx_fh2(iF-1,jF,kF)]
                -30.0*c + 16.0*fh[idx_fh2(iF+1,jF,kF)] - fh[idx_fh2(iF+2,jF,kF)]);
            fyy[tid] = d_gp.Fdydy*(
                -fh[idx_fh2(iF,jF-2,kF)] + 16.0*fh[idx_fh2(iF,jF-1,kF)]
                -30.0*c + 16.0*fh[idx_fh2(iF,jF+1,kF)] - fh[idx_fh2(iF,jF+2,kF)]);
            fzz[tid] = d_gp.Fdzdz*(
                -fh[idx_fh2(iF,jF,kF-2)] + 16.0*fh[idx_fh2(iF,jF,kF-1)]
                -30.0*c + 16.0*fh[idx_fh2(iF,jF,kF+1)] - fh[idx_fh2(iF,jF,kF+2)]);

            /* 4th-order cross: fxy.
               Each t_* is the (1, -8, 8, -1) x-stencil evaluated on one
               neighbouring j row; the same weights then combine the rows. */
            {
                double t_jm2 = fh[idx_fh2(iF-2,jF-2,kF)] - 8.0*fh[idx_fh2(iF-1,jF-2,kF)]
                             + 8.0*fh[idx_fh2(iF+1,jF-2,kF)] - fh[idx_fh2(iF+2,jF-2,kF)];
                double t_jm1 = fh[idx_fh2(iF-2,jF-1,kF)] - 8.0*fh[idx_fh2(iF-1,jF-1,kF)]
                             + 8.0*fh[idx_fh2(iF+1,jF-1,kF)] - fh[idx_fh2(iF+2,jF-1,kF)];
                double t_jp1 = fh[idx_fh2(iF-2,jF+1,kF)] - 8.0*fh[idx_fh2(iF-1,jF+1,kF)]
                             + 8.0*fh[idx_fh2(iF+1,jF+1,kF)] - fh[idx_fh2(iF+2,jF+1,kF)];
                double t_jp2 = fh[idx_fh2(iF-2,jF+2,kF)] - 8.0*fh[idx_fh2(iF-1,jF+2,kF)]
                             + 8.0*fh[idx_fh2(iF+1,jF+2,kF)] - fh[idx_fh2(iF+2,jF+2,kF)];
                fxy[tid] = d_gp.Fdxdy*(t_jm2 - 8.0*t_jm1 + 8.0*t_jp1 - t_jp2);
            }
            /* 4th-order cross: fxz (x-stencil on neighbouring k planes) */
            {
                double t_km2 = fh[idx_fh2(iF-2,jF,kF-2)] - 8.0*fh[idx_fh2(iF-1,jF,kF-2)]
                             + 8.0*fh[idx_fh2(iF+1,jF,kF-2)] - fh[idx_fh2(iF+2,jF,kF-2)];
                double t_km1 = fh[idx_fh2(iF-2,jF,kF-1)] - 8.0*fh[idx_fh2(iF-1,jF,kF-1)]
                             + 8.0*fh[idx_fh2(iF+1,jF,kF-1)] - fh[idx_fh2(iF+2,jF,kF-1)];
                double t_kp1 = fh[idx_fh2(iF-2,jF,kF+1)] - 8.0*fh[idx_fh2(iF-1,jF,kF+1)]
                             + 8.0*fh[idx_fh2(iF+1,jF,kF+1)] - fh[idx_fh2(iF+2,jF,kF+1)];
                double t_kp2 = fh[idx_fh2(iF-2,jF,kF+2)] - 8.0*fh[idx_fh2(iF-1,jF,kF+2)]
                             + 8.0*fh[idx_fh2(iF+1,jF,kF+2)] - fh[idx_fh2(iF+2,jF,kF+2)];
                fxz[tid] = d_gp.Fdxdz*(t_km2 - 8.0*t_km1 + 8.0*t_kp1 - t_kp2);
            }
            /* 4th-order cross: fyz (y-stencil on neighbouring k planes) */
            {
                double t_km2 = fh[idx_fh2(iF,jF-2,kF-2)] - 8.0*fh[idx_fh2(iF,jF-1,kF-2)]
                             + 8.0*fh[idx_fh2(iF,jF+1,kF-2)] - fh[idx_fh2(iF,jF+2,kF-2)];
                double t_km1 = fh[idx_fh2(iF,jF-2,kF-1)] - 8.0*fh[idx_fh2(iF,jF-1,kF-1)]
                             + 8.0*fh[idx_fh2(iF,jF+1,kF-1)] - fh[idx_fh2(iF,jF+2,kF-1)];
                double t_kp1 = fh[idx_fh2(iF,jF-2,kF+1)] - 8.0*fh[idx_fh2(iF,jF-1,kF+1)]
                             + 8.0*fh[idx_fh2(iF,jF+1,kF+1)] - fh[idx_fh2(iF,jF+2,kF+1)];
                double t_kp2 = fh[idx_fh2(iF,jF-2,kF+2)] - 8.0*fh[idx_fh2(iF,jF-1,kF+2)]
                             + 8.0*fh[idx_fh2(iF,jF+1,kF+2)] - fh[idx_fh2(iF,jF+2,kF+2)];
                fyz[tid] = d_gp.Fdydz*(t_km2 - 8.0*t_km1 + 8.0*t_kp1 - t_kp2);
            }
        }
        else if ((iF+1)<=imaxF && (iF-1)>=iminF &&
                 (jF+1)<=jmaxF && (jF-1)>=jminF &&
                 (kF+1)<=kmaxF && (kF-1)>=kminF)
        {
            /* Narrow fallback: (1, -2, 1) on the diagonal and the four
               corner neighbours for the mixed derivatives. */
            double c = fh[idx_fh2(iF,jF,kF)];
            fxx[tid] = d_gp.Sdxdx*(fh[idx_fh2(iF-1,jF,kF)] - 2.0*c + fh[idx_fh2(iF+1,jF,kF)]);
            fyy[tid] = d_gp.Sdydy*(fh[idx_fh2(iF,jF-1,kF)] - 2.0*c + fh[idx_fh2(iF,jF+1,kF)]);
            fzz[tid] = d_gp.Sdzdz*(fh[idx_fh2(iF,jF,kF-1)] - 2.0*c + fh[idx_fh2(iF,jF,kF+1)]);
            fxy[tid] = d_gp.Sdxdy*(fh[idx_fh2(iF-1,jF-1,kF)] - fh[idx_fh2(iF+1,jF-1,kF)]
                                   -fh[idx_fh2(iF-1,jF+1,kF)] + fh[idx_fh2(iF+1,jF+1,kF)]);
            fxz[tid] = d_gp.Sdxdz*(fh[idx_fh2(iF-1,jF,kF-1)] - fh[idx_fh2(iF+1,jF,kF-1)]
                                   -fh[idx_fh2(iF-1,jF,kF+1)] + fh[idx_fh2(iF+1,jF,kF+1)]);
            fyz[tid] = d_gp.Sdydz*(fh[idx_fh2(iF,jF-1,kF-1)] - fh[idx_fh2(iF,jF+1,kF-1)]
                                   -fh[idx_fh2(iF,jF-1,kF+1)] + fh[idx_fh2(iF,jF+1,kF+1)]);
        }
        else {
            /* No stencil fits: leave the derivatives at zero. */
            fxx[tid]=0; fxy[tid]=0; fxz[tid]=0;
            fyy[tid]=0; fyz[tid]=0; fzz[tid]=0;
        }
    }
}

/* ---- Lopsided (upwind advection) kernel ---- */
/*
 * Adds Sfx*d/dx + Sfy*d/dy + Sfz*d/dz of the padded field fh to f_rhs.
 *
 * fh is addressed through idx_fh3() with Fortran-style indices iF = i0 + 1
 * etc. For each direction the stencil is chosen from the sign of the
 * advecting component at the point and from how far the point sits from the
 * relevant bound:
 *   - component > 0: stencil biased towards larger indices when three
 *     points are available above, else the centred (1, -8, 8, -1) stencil,
 *     else the mirror-image biased stencil (applied with opposite sign);
 *   - component < 0: the same ladder mirrored, tested against the lower
 *     bounds d_gp.iminF3 / jminF3 / kminF3;
 *   - component == 0: that direction contributes nothing.
 * All stencils are scaled by d_gp.d12dx / d12dy / d12dz.
 *
 * Points on the last index of any direction are skipped and f_rhs is left
 * untouched there. f_rhs is accumulated into, not overwritten.
 * Iterates with a stride of blockDim.x * gridDim.x, so any 1-D launch of at
 * most 128 threads per block covers the whole grid.
 */
__global__ __launch_bounds__(128, 4)
void kern_lopsided(const double * __restrict__ fh,
                   double * __restrict__ f_rhs,
                   const double * __restrict__ Sfx,
                   const double * __restrict__ Sfy,
                   const double * __restrict__ Sfz)
{
    const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
    const int iminF = d_gp.iminF3, jminF = d_gp.jminF3, kminF = d_gp.kminF3;

    for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
         tid < d_gp.all;
         tid += blockDim.x * gridDim.x)
    {
        /* Decompose the linear index; i0 runs fastest. */
        int i0 = tid % nx;
        int j0 = (tid / nx) % ny;
        int k0 = tid / (nx * ny);

        /* The last index in each direction is left alone. */
        if (i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2) continue;

        int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1;
        double val = 0.0;  /* sum of the three directional contributions */

        /* --- x direction --- */
        double sfx = Sfx[tid];
        if (sfx > 0.0) {
            /* i0 <= nx-4 is iF+3 <= nx: three points available above. */
            if (i0 <= nx - 4) {
                val += sfx * d_gp.d12dx * (
                    -3.0*fh[idx_fh3(iF-1,jF,kF)] - 10.0*fh[idx_fh3(iF,jF,kF)]
                    +18.0*fh[idx_fh3(iF+1,jF,kF)] - 6.0*fh[idx_fh3(iF+2,jF,kF)]
                    + fh[idx_fh3(iF+3,jF,kF)]);
            } else if (i0 <= nx - 3) {
                val += sfx * d_gp.d12dx * (
                    fh[idx_fh3(iF-2,jF,kF)] - 8.0*fh[idx_fh3(iF-1,jF,kF)]
                    +8.0*fh[idx_fh3(iF+1,jF,kF)] - fh[idx_fh3(iF+2,jF,kF)]);
            } else if (i0 <= nx - 2) {
                val -= sfx * d_gp.d12dx * (
                    -3.0*fh[idx_fh3(iF+1,jF,kF)] - 10.0*fh[idx_fh3(iF,jF,kF)]
                    +18.0*fh[idx_fh3(iF-1,jF,kF)] - 6.0*fh[idx_fh3(iF-2,jF,kF)]
                    + fh[idx_fh3(iF-3,jF,kF)]);
            }
        } else if (sfx < 0.0) {
            /* i0-2 >= iminF is iF-3 >= iminF: three points available below. */
            if ((i0 - 2) >= iminF) {
                val -= sfx * d_gp.d12dx * (
                    -3.0*fh[idx_fh3(iF+1,jF,kF)] - 10.0*fh[idx_fh3(iF,jF,kF)]
                    +18.0*fh[idx_fh3(iF-1,jF,kF)] - 6.0*fh[idx_fh3(iF-2,jF,kF)]
                    + fh[idx_fh3(iF-3,jF,kF)]);
            } else if ((i0 - 1) >= iminF) {
                val += sfx * d_gp.d12dx * (
                    fh[idx_fh3(iF-2,jF,kF)] - 8.0*fh[idx_fh3(iF-1,jF,kF)]
                    +8.0*fh[idx_fh3(iF+1,jF,kF)] - fh[idx_fh3(iF+2,jF,kF)]);
            } else if (i0 >= iminF) {
                val += sfx * d_gp.d12dx * (
                    -3.0*fh[idx_fh3(iF-1,jF,kF)] - 10.0*fh[idx_fh3(iF,jF,kF)]
                    +18.0*fh[idx_fh3(iF+1,jF,kF)] - 6.0*fh[idx_fh3(iF+2,jF,kF)]
                    + fh[idx_fh3(iF+3,jF,kF)]);
            }
        }

        /* --- y direction (same ladder as x, along j) --- */
        double sfy = Sfy[tid];
        if (sfy > 0.0) {
            if (j0 <= ny - 4) {
                val += sfy * d_gp.d12dy * (
                    -3.0*fh[idx_fh3(iF,jF-1,kF)] - 10.0*fh[idx_fh3(iF,jF,kF)]
                    +18.0*fh[idx_fh3(iF,jF+1,kF)] - 6.0*fh[idx_fh3(iF,jF+2,kF)]
                    + fh[idx_fh3(iF,jF+3,kF)]);
            } else if (j0 <= ny - 3) {
                val += sfy * d_gp.d12dy * (
                    fh[idx_fh3(iF,jF-2,kF)] - 8.0*fh[idx_fh3(iF,jF-1,kF)]
                    +8.0*fh[idx_fh3(iF,jF+1,kF)] - fh[idx_fh3(iF,jF+2,kF)]);
            } else if (j0 <= ny - 2) {
                val -= sfy * d_gp.d12dy * (
                    -3.0*fh[idx_fh3(iF,jF+1,kF)] - 10.0*fh[idx_fh3(iF,jF,kF)]
                    +18.0*fh[idx_fh3(iF,jF-1,kF)] - 6.0*fh[idx_fh3(iF,jF-2,kF)]
                    + fh[idx_fh3(iF,jF-3,kF)]);
            }
        } else if (sfy < 0.0) {
            if ((j0 - 2) >= jminF) {
                val -= sfy * d_gp.d12dy * (
                    -3.0*fh[idx_fh3(iF,jF+1,kF)] - 10.0*fh[idx_fh3(iF,jF,kF)]
                    +18.0*fh[idx_fh3(iF,jF-1,kF)] - 6.0*fh[idx_fh3(iF,jF-2,kF)]
                    + fh[idx_fh3(iF,jF-3,kF)]);
            } else if ((j0 - 1) >= jminF) {
                val += sfy * d_gp.d12dy * (
                    fh[idx_fh3(iF,jF-2,kF)] - 8.0*fh[idx_fh3(iF,jF-1,kF)]
                    +8.0*fh[idx_fh3(iF,jF+1,kF)] - fh[idx_fh3(iF,jF+2,kF)]);
            } else if (j0 >= jminF) {
                val += sfy * d_gp.d12dy * (
                    -3.0*fh[idx_fh3(iF,jF-1,kF)] - 10.0*fh[idx_fh3(iF,jF,kF)]
                    +18.0*fh[idx_fh3(iF,jF+1,kF)] - 6.0*fh[idx_fh3(iF,jF+2,kF)]
                    + fh[idx_fh3(iF,jF+3,kF)]);
            }
        }

        /* --- z direction (same ladder as x, along k) --- */
        double sfz = Sfz[tid];
        if (sfz > 0.0) {
            if (k0 <= nz - 4) {
                val += sfz * d_gp.d12dz * (
                    -3.0*fh[idx_fh3(iF,jF,kF-1)] - 10.0*fh[idx_fh3(iF,jF,kF)]
                    +18.0*fh[idx_fh3(iF,jF,kF+1)] - 6.0*fh[idx_fh3(iF,jF,kF+2)]
                    + fh[idx_fh3(iF,jF,kF+3)]);
            } else if (k0 <= nz - 3) {
                val += sfz * d_gp.d12dz * (
                    fh[idx_fh3(iF,jF,kF-2)] - 8.0*fh[idx_fh3(iF,jF,kF-1)]
                    +8.0*fh[idx_fh3(iF,jF,kF+1)] - fh[idx_fh3(iF,jF,kF+2)]);
            } else if (k0 <= nz - 2) {
                val -= sfz * d_gp.d12dz * (
                    -3.0*fh[idx_fh3(iF,jF,kF+1)] - 10.0*fh[idx_fh3(iF,jF,kF)]
                    +18.0*fh[idx_fh3(iF,jF,kF-1)] - 6.0*fh[idx_fh3(iF,jF,kF-2)]
                    + fh[idx_fh3(iF,jF,kF-3)]);
            }
        } else if (sfz < 0.0) {
            if ((k0 - 2) >= kminF) {
                val -= sfz * d_gp.d12dz * (
                    -3.0*fh[idx_fh3(iF,jF,kF+1)] - 10.0*fh[idx_fh3(iF,jF,kF)]
                    +18.0*fh[idx_fh3(iF,jF,kF-1)] - 6.0*fh[idx_fh3(iF,jF,kF-2)]
                    + fh[idx_fh3(iF,jF,kF-3)]);
            } else if ((k0 - 1) >= kminF) {
                val += sfz * d_gp.d12dz * (
                    fh[idx_fh3(iF,jF,kF-2)] - 8.0*fh[idx_fh3(iF,jF,kF-1)]
                    +8.0*fh[idx_fh3(iF,jF,kF+1)] - fh[idx_fh3(iF,jF,kF+2)]);
            } else if (k0 >= kminF) {
                val += sfz * d_gp.d12dz * (
                    -3.0*fh[idx_fh3(iF,jF,kF-1)] - 10.0*fh[idx_fh3(iF,jF,kF)]
                    +18.0*fh[idx_fh3(iF,jF,kF+1)] - 6.0*fh[idx_fh3(iF,jF,kF+2)]
                    + fh[idx_fh3(iF,jF,kF+3)]);
            }
        }

        /* Accumulate: the caller's existing right-hand side is kept. */
        f_rhs[tid] += val;
    }
}

/* ---- KO dissipation kernel (ord=3 padded input, seven-point stencil) ---- */
/*
 * Adds a dissipation term to f_rhs at every grid point whose +-3 stencil
 * fits inside the index bounds in all three directions; other points are
 * left untouched.
 *
 * fh is addressed through idx_fh3() with Fortran-style indices iF = i0 + 1
 * etc. Lower bounds come from d_gp.iminF3 / jminF3 / kminF3, upper bounds
 * from d_gp.imaxF / jmaxF / kmaxF. Per direction the stencil weights are
 * (1, -6, 15, -20, 15, -6, 1), divided by the grid spacing d_gp.dX / dY /
 * dZ; the sum of the three is scaled by eps_val / 64.
 * Works with any 1-D launch configuration of at most 128 threads per block.
 */
__global__ __launch_bounds__(128, 4)
void kern_kodis(const double * __restrict__ fh,
                double * __restrict__ f_rhs,
                double eps_val)
{
    const int nx = d_gp.ex[0], ny = d_gp.ex[1];
    const int iminF = d_gp.iminF3, jminF = d_gp.jminF3, kminF = d_gp.kminF3;
    const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
    const double cof = 64.0;

    for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
         tid < d_gp.all;
         tid += blockDim.x * gridDim.x)
    {
        const int iF = (tid % nx) + 1;
        const int jF = ((tid / nx) % ny) + 1;
        const int kF = (tid / (nx * ny)) + 1;

        /* Skip points whose seven-point stencil would leave the bounds. */
        if ((iF-3) < iminF || (iF+3) > imaxF ||
            (jF-3) < jminF || (jF+3) > jmaxF ||
            (kF-3) < kminF || (kF+3) > kmaxF)
        {
            continue;
        }

        double Dx = (fh[idx_fh3(iF-3,jF,kF)] + fh[idx_fh3(iF+3,jF,kF)])
            - 6.0*(fh[idx_fh3(iF-2,jF,kF)] + fh[idx_fh3(iF+2,jF,kF)])
            +15.0*(fh[idx_fh3(iF-1,jF,kF)] + fh[idx_fh3(iF+1,jF,kF)])
            -20.0* fh[idx_fh3(iF,jF,kF)];
        Dx /= d_gp.dX;

        double Dy = (fh[idx_fh3(iF,jF-3,kF)] + fh[idx_fh3(iF,jF+3,kF)])
            - 6.0*(fh[idx_fh3(iF,jF-2,kF)] + fh[idx_fh3(iF,jF+2,kF)])
            +15.0*(fh[idx_fh3(iF,jF-1,kF)] + fh[idx_fh3(iF,jF+1,kF)])
            -20.0* fh[idx_fh3(iF,jF,kF)];
        Dy /= d_gp.dY;

        double Dz = (fh[idx_fh3(iF,jF,kF-3)] + fh[idx_fh3(iF,jF,kF+3)])
            - 6.0*(fh[idx_fh3(iF,jF,kF-2)] + fh[idx_fh3(iF,jF,kF+2)])
            +15.0*(fh[idx_fh3(iF,jF,kF-1)] + fh[idx_fh3(iF,jF,kF+1)])
            -20.0* fh[idx_fh3(iF,jF,kF)];
        Dz /= d_gp.dZ;

        f_rhs[tid] += (eps_val / cof) * (Dx + Dy + Dz);
    }
}

/* ================================================================== */
/*  Host wrapper helpers                                               */
/* ================================================================== */
/*
 * Per-field pointer tables with room for BSSN_EM_LK_FIELD_COUNT fields.
 * NOTE(review): judging by the name and the sibling *Tables structs, this is
 * presumably passed by value to a batched lopsided/KO kernel - confirm
 * against the kernel that consumes it.
 */
struct LopsidedKodisTables {
    const double *adv_fields[BSSN_EM_LK_FIELD_COUNT];  /* presumably inputs to the advection stencil - TODO confirm */
    const double *ko_fields[BSSN_EM_LK_FIELD_COUNT];   /* presumably inputs to the dissipation stencil - TODO confirm */
    double *rhs_fields[BSSN_EM_LK_FIELD_COUNT];        /* writable per-field outputs */
    int soa_signs[3 * BSSN_EM_LK_FIELD_COUNT];         /* three ints per field; presumably x/y/z symmetry signs - TODO confirm */
};

/*
 * Argument bundle for kern_fderivs_batched, passed by value. Entry `field`
 * of each array describes one field; the kernel selects it with blockIdx.y.
 * Holds up to BSSN_STATE_COUNT fields.
 */
struct FDerivTables {
    const double *src_fields[BSSN_STATE_COUNT];  /* input field handed to fd_compute_first3 */
    double *fx_fields[BSSN_STATE_COUNT];         /* output: first of the three derivative results */
    double *fy_fields[BSSN_STATE_COUNT];         /* output: second derivative result */
    double *fz_fields[BSSN_STATE_COUNT];         /* output: third derivative result */
    int soa_signs[3 * BSSN_STATE_COUNT];         /* field f uses entries 3f, 3f+1, 3f+2 (SoA0, SoA1, SoA2) */
};

/*
 * Argument bundle for kern_fdderivs_batched, passed by value. Entry `field`
 * of each array describes one field; the kernel selects it with blockIdx.y.
 * Holds up to BSSN_STATE_COUNT fields.
 */
struct FDDerivTables {
    const double *src_fields[BSSN_STATE_COUNT];  /* input field */
    double *fxx_fields[BSSN_STATE_COUNT];        /* outputs: one array per second-derivative component */
    double *fxy_fields[BSSN_STATE_COUNT];
    double *fxz_fields[BSSN_STATE_COUNT];
    double *fyy_fields[BSSN_STATE_COUNT];
    double *fyz_fields[BSSN_STATE_COUNT];
    double *fzz_fields[BSSN_STATE_COUNT];
    int soa_signs[3 * BSSN_STATE_COUNT];         /* three ints per field, read starting at index 3 * field */
};

/* Number of entries in each Phase10RicciTables array. */
static constexpr int PHASE10_METRIC_FIELD_COUNT = 6;

/*
 * Per-field source/destination pointer tables with
 * PHASE10_METRIC_FIELD_COUNT entries.
 * NOTE(review): the consumer is not visible in this part of the file;
 * presumably a batched kernel taking this struct by value - confirm.
 */
struct Phase10RicciTables {
    const double *src_fields[PHASE10_METRIC_FIELD_COUNT];  /* read-only inputs, one per field */
    double *dst_fields[PHASE10_METRIC_FIELD_COUNT];        /* writable outputs, one per field */
    int soa_signs[3 * PHASE10_METRIC_FIELD_COUNT];         /* three ints per field; presumably x/y/z symmetry signs - TODO confirm */
};

/*
 * Per-field pointer tables with room for BSSN_EM_STATE_COUNT fields.
 * NOTE(review): presumably consumed by an RK4 finalize kernel, going by the
 * name - the consumer is not visible here; confirm.
 */
struct Rk4FinalizeTables {
    const double *f0_fields[BSSN_EM_STATE_COUNT];  /* read-only inputs, one per field */
    double *rhs_fields[BSSN_EM_STATE_COUNT];       /* writable, one per field */
    double *accum_fields[BSSN_EM_STATE_COUNT];     /* writable, one per field */
};

/*
 * Per-field source/destination pointer tables with room for
 * BSSN_EM_STATE_COUNT fields.
 * NOTE(review): presumably used by a patch-boundary kernel, going by the
 * name - the consumer is not visible here; confirm.
 */
struct PatchBoundaryTables {
    const double *src_fields[BSSN_EM_STATE_COUNT];  /* read-only inputs, one per field */
    double *dst_fields[BSSN_EM_STATE_COUNT];        /* writable outputs, one per field */
};

/*
 * Per-field input/output pointer tables with room for BSSN_EM_STATE_COUNT
 * fields.
 * NOTE(review): presumably used by a scalar-field boundary kernel, going by
 * the name - the consumer is not visible here; confirm.
 */
struct EScalarBoundaryTables {
    const double *f0_fields[BSSN_EM_STATE_COUNT];  /* read-only inputs, one per field */
    double *out_fields[BSSN_EM_STATE_COUNT];       /* writable outputs, one per field */
};

/* Threads per block used when sizing 1-D launches. */
static const int BLK = 128;

/*
 * Number of BLK-sized blocks needed to cover n work items: ceil(n / BLK),
 * at least 1 (also for n == 0) and at most 2147483647.
 */
static inline int grid(size_t n) {
    const size_t max_blocks = 2147483647u;
    if (n == 0) return 1;
    const size_t blocks = (n + BLK - 1) / BLK;
    return (int)((blocks > max_blocks) ? max_blocks : blocks);
}

/*
 * Batched first-derivative kernel.
 *
 * Launch layout (see gpu_fderivs_batch): the x dimension of the grid covers
 * the d_gp.all flat point indices, blockIdx.y picks the field slot.
 *
 *   tables      - per-field source pointer, three output pointers and three
 *                 integer signs (slot f uses soa_signs[3 * f + 0..2])
 *   field_count - number of valid slots; blocks with blockIdx.y beyond it
 *                 exit without touching memory
 *
 * Points lying on the last index of any axis receive zero in all three
 * outputs; every other point is handed to fd_compute_first3 using indices
 * shifted by +1.
 */
__global__ __launch_bounds__(128, 4)
void kern_fderivs_batched(FDerivTables tables, int field_count)
{
    const int slot = blockIdx.y;
    if (slot >= field_count) return;

    const int p = blockIdx.x * blockDim.x + threadIdx.x;
    if (p >= d_gp.all) return;

    const int ex0 = d_gp.ex[0];
    const int ex1 = d_gp.ex[1];
    const int ex2 = d_gp.ex[2];

    /* Flat index -> (ix, iy, iz), x running fastest. */
    const int ix = p % ex0;
    const int iy = (p / ex0) % ex1;
    const int iz = p / (ex0 * ex1);

    double *out_x = tables.fx_fields[slot];
    double *out_y = tables.fy_fields[slot];
    double *out_z = tables.fz_fields[slot];

    const bool on_last_index = (ix >= ex0 - 1) || (iy >= ex1 - 1) || (iz >= ex2 - 1);
    if (on_last_index) {
        out_x[p] = 0.0;
        out_y[p] = 0.0;
        out_z[p] = 0.0;
        return;
    }

    const double *in = tables.src_fields[slot];
    const int sign_x = tables.soa_signs[3 * slot + 0];
    const int sign_y = tables.soa_signs[3 * slot + 1];
    const int sign_z = tables.soa_signs[3 * slot + 2];

    const int lo_i = d_gp.iminF, lo_j = d_gp.jminF, lo_k = d_gp.kminF;
    const int hi_i = d_gp.imaxF, hi_j = d_gp.jmaxF, hi_k = d_gp.kmaxF;

    /* Shifted (+1) indices as used by the finite-difference helper. */
    const int i1 = ix + 1;
    const int j1 = iy + 1;
    const int k1 = iz + 1;

    fd_compute_first3(in, i1, j1, k1,
                      lo_i, lo_j, lo_k, hi_i, hi_j, hi_k,
                      sign_x, sign_y, sign_z,
                      out_x[p], out_y[p], out_z[p]);
}

/*
 * Batched second-derivative kernel: blockIdx.y selects the field slot, the
 * flat x-thread index selects the grid point.
 *
 *   tables      - per-field source pointer, six output pointers
 *                 (fxx, fxy, fxz, fyy, fyz, fzz) and three integer signs
 *   field_count - number of valid slots in tables; blocks whose blockIdx.y
 *                 is >= field_count exit immediately
 *
 * Threads with tid >= d_gp.all exit without writing.  Points on the last
 * index of any axis get all six outputs set to zero.
 *
 * With ghost_width != 3 the stencil work is delegated to fd_compute_second6.
 * With ghost_width == 3 the stencils are evaluated inline: a five-point form
 * when the indices +/-2 stay inside [iminF..imaxF] x [jminF..jmaxF] x
 * [kminF..kmaxF], a three-point form when only +/-1 do, zeros otherwise.
 *
 * NOTE(review): fetch_sym_ord2_direct is defined outside this section;
 * presumably it reads src at the given indices and applies the SoA sign for
 * indices that fall outside the stored array - confirm.
 */
__global__ __launch_bounds__(128, 4)
void kern_fdderivs_batched(FDDerivTables tables, int field_count)
{
    const int field = blockIdx.y;
    if (field >= field_count) return;

    /* Per-field pointers and signs for the slot handled by this block row. */
    const double *src = tables.src_fields[field];
    double *fxx = tables.fxx_fields[field];
    double *fxy = tables.fxy_fields[field];
    double *fxz = tables.fxz_fields[field];
    double *fyy = tables.fyy_fields[field];
    double *fyz = tables.fyz_fields[field];
    double *fzz = tables.fzz_fields[field];
    const int SoA0 = tables.soa_signs[3 * field + 0];
    const int SoA1 = tables.soa_signs[3 * field + 1];
    const int SoA2 = tables.soa_signs[3 * field + 2];
    const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
    const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
    const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF;

    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= d_gp.all) return;

    /* Flat index -> (i0, j0, k0), x running fastest. */
    const int i0 = tid % nx;
    const int j0 = (tid / nx) % ny;
    const int k0 = tid / (nx * ny);

    /* Last index along any axis: no derivative is computed, write zeros. */
    if (i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2) {
        fxx[tid] = 0.0; fxy[tid] = 0.0; fxz[tid] = 0.0;
        fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0;
        return;
    }

    /* Indices shifted by +1, the convention compared against iminF..imaxF etc. */
    const int iF = i0 + 1;
    const int jF = j0 + 1;
    const int kF = k0 + 1;

#if ghost_width != 3
    fd_compute_second6(src, iF, jF, kF,
                       iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
                       SoA0, SoA1, SoA2,
                       fxx[tid], fxy[tid], fxz[tid], fyy[tid], fyz[tid], fzz[tid]);
#else
    /* Wide branch: every +/-2 neighbour index is inside the allowed range. */
    if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
        (jF + 2) <= jmaxF && (jF - 2) >= jminF &&
        (kF + 2) <= kmaxF && (kF - 2) >= kminF)
    {
        /* Pure second derivatives: weights (-1, 16, -30, 16, -1) times d_gp.Fd?d?. */
        const double c = fetch_sym_ord2_direct(src, iF, jF, kF, SoA0, SoA1, SoA2);
        fxx[tid] = d_gp.Fdxdx * (
            -fetch_sym_ord2_direct(src, iF - 2, jF, kF, SoA0, SoA1, SoA2)
            +16.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
            -30.0 * c
            +16.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2)
            -       fetch_sym_ord2_direct(src, iF + 2, jF, kF, SoA0, SoA1, SoA2));
        fyy[tid] = d_gp.Fdydy * (
            -fetch_sym_ord2_direct(src, iF, jF - 2, kF, SoA0, SoA1, SoA2)
            +16.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
            -30.0 * c
            +16.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2)
            -       fetch_sym_ord2_direct(src, iF, jF + 2, kF, SoA0, SoA1, SoA2));
        fzz[tid] = d_gp.Fdzdz * (
            -fetch_sym_ord2_direct(src, iF, jF, kF - 2, SoA0, SoA1, SoA2)
            +16.0 * fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
            -30.0 * c
            +16.0 * fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2)
            -       fetch_sym_ord2_direct(src, iF, jF, kF + 2, SoA0, SoA1, SoA2));

        /* xy: x-differences (1, -8, 8, -1) on rows j-2, j-1, j+1, j+2,
           then the same weights applied across those rows. */
        const double t_jm2 =
            fetch_sym_ord2_direct(src, iF - 2, jF - 2, kF, SoA0, SoA1, SoA2)
          - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF - 2, kF, SoA0, SoA1, SoA2)
          + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF - 2, kF, SoA0, SoA1, SoA2)
          -       fetch_sym_ord2_direct(src, iF + 2, jF - 2, kF, SoA0, SoA1, SoA2);
        const double t_jm1 =
            fetch_sym_ord2_direct(src, iF - 2, jF - 1, kF, SoA0, SoA1, SoA2)
          - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF - 1, kF, SoA0, SoA1, SoA2)
          + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF - 1, kF, SoA0, SoA1, SoA2)
          -       fetch_sym_ord2_direct(src, iF + 2, jF - 1, kF, SoA0, SoA1, SoA2);
        const double t_jp1 =
            fetch_sym_ord2_direct(src, iF - 2, jF + 1, kF, SoA0, SoA1, SoA2)
          - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF + 1, kF, SoA0, SoA1, SoA2)
          + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF + 1, kF, SoA0, SoA1, SoA2)
          -       fetch_sym_ord2_direct(src, iF + 2, jF + 1, kF, SoA0, SoA1, SoA2);
        const double t_jp2 =
            fetch_sym_ord2_direct(src, iF - 2, jF + 2, kF, SoA0, SoA1, SoA2)
          - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF + 2, kF, SoA0, SoA1, SoA2)
          + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF + 2, kF, SoA0, SoA1, SoA2)
          -       fetch_sym_ord2_direct(src, iF + 2, jF + 2, kF, SoA0, SoA1, SoA2);
        fxy[tid] = d_gp.Fdxdy * (t_jm2 - 8.0 * t_jm1 + 8.0 * t_jp1 - t_jp2);

        /* xz: x-differences on planes k-2, k-1, k+1, k+2, combined across k. */
        const double t_km2_x =
            fetch_sym_ord2_direct(src, iF - 2, jF, kF - 2, SoA0, SoA1, SoA2)
          - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF - 2, SoA0, SoA1, SoA2)
          + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF - 2, SoA0, SoA1, SoA2)
          -       fetch_sym_ord2_direct(src, iF + 2, jF, kF - 2, SoA0, SoA1, SoA2);
        const double t_km1_x =
            fetch_sym_ord2_direct(src, iF - 2, jF, kF - 1, SoA0, SoA1, SoA2)
          - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF - 1, SoA0, SoA1, SoA2)
          + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF - 1, SoA0, SoA1, SoA2)
          -       fetch_sym_ord2_direct(src, iF + 2, jF, kF - 1, SoA0, SoA1, SoA2);
        const double t_kp1_x =
            fetch_sym_ord2_direct(src, iF - 2, jF, kF + 1, SoA0, SoA1, SoA2)
          - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF + 1, SoA0, SoA1, SoA2)
          + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF + 1, SoA0, SoA1, SoA2)
          -       fetch_sym_ord2_direct(src, iF + 2, jF, kF + 1, SoA0, SoA1, SoA2);
        const double t_kp2_x =
            fetch_sym_ord2_direct(src, iF - 2, jF, kF + 2, SoA0, SoA1, SoA2)
          - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF + 2, SoA0, SoA1, SoA2)
          + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF + 2, SoA0, SoA1, SoA2)
          -       fetch_sym_ord2_direct(src, iF + 2, jF, kF + 2, SoA0, SoA1, SoA2);
        fxz[tid] = d_gp.Fdxdz * (t_km2_x - 8.0 * t_km1_x + 8.0 * t_kp1_x - t_kp2_x);

        /* yz: y-differences on planes k-2, k-1, k+1, k+2, combined across k. */
        const double t_km2_y =
            fetch_sym_ord2_direct(src, iF, jF - 2, kF - 2, SoA0, SoA1, SoA2)
          - 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF - 2, SoA0, SoA1, SoA2)
          + 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF - 2, SoA0, SoA1, SoA2)
          -       fetch_sym_ord2_direct(src, iF, jF + 2, kF - 2, SoA0, SoA1, SoA2);
        const double t_km1_y =
            fetch_sym_ord2_direct(src, iF, jF - 2, kF - 1, SoA0, SoA1, SoA2)
          - 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF - 1, SoA0, SoA1, SoA2)
          + 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF - 1, SoA0, SoA1, SoA2)
          -       fetch_sym_ord2_direct(src, iF, jF + 2, kF - 1, SoA0, SoA1, SoA2);
        const double t_kp1_y =
            fetch_sym_ord2_direct(src, iF, jF - 2, kF + 1, SoA0, SoA1, SoA2)
          - 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF + 1, SoA0, SoA1, SoA2)
          + 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF + 1, SoA0, SoA1, SoA2)
          -       fetch_sym_ord2_direct(src, iF, jF + 2, kF + 1, SoA0, SoA1, SoA2);
        const double t_kp2_y =
            fetch_sym_ord2_direct(src, iF, jF - 2, kF + 2, SoA0, SoA1, SoA2)
          - 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF + 2, SoA0, SoA1, SoA2)
          + 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF + 2, SoA0, SoA1, SoA2)
          -       fetch_sym_ord2_direct(src, iF, jF + 2, kF + 2, SoA0, SoA1, SoA2);
        fyz[tid] = d_gp.Fdydz * (t_km2_y - 8.0 * t_km1_y + 8.0 * t_kp1_y - t_kp2_y);
    }
    /* Narrow branch: only the +/-1 neighbours are inside the allowed range. */
    else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
             (jF + 1) <= jmaxF && (jF - 1) >= jminF &&
             (kF + 1) <= kmaxF && (kF - 1) >= kminF)
    {
        /* Three-point (1, -2, 1) pure terms and four-corner mixed terms,
           scaled by the d_gp.Sd?d? factors. */
        const double c = fetch_sym_ord2_direct(src, iF, jF, kF, SoA0, SoA1, SoA2);
        fxx[tid] = d_gp.Sdxdx * (
            fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
          - 2.0 * c
          + fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2));
        fyy[tid] = d_gp.Sdydy * (
            fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
          - 2.0 * c
          + fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2));
        fzz[tid] = d_gp.Sdzdz * (
            fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
          - 2.0 * c
          + fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2));
        fxy[tid] = d_gp.Sdxdy * (
            fetch_sym_ord2_direct(src, iF - 1, jF - 1, kF, SoA0, SoA1, SoA2)
          - fetch_sym_ord2_direct(src, iF + 1, jF - 1, kF, SoA0, SoA1, SoA2)
          - fetch_sym_ord2_direct(src, iF - 1, jF + 1, kF, SoA0, SoA1, SoA2)
          + fetch_sym_ord2_direct(src, iF + 1, jF + 1, kF, SoA0, SoA1, SoA2));
        fxz[tid] = d_gp.Sdxdz * (
            fetch_sym_ord2_direct(src, iF - 1, jF, kF - 1, SoA0, SoA1, SoA2)
          - fetch_sym_ord2_direct(src, iF + 1, jF, kF - 1, SoA0, SoA1, SoA2)
          - fetch_sym_ord2_direct(src, iF - 1, jF, kF + 1, SoA0, SoA1, SoA2)
          + fetch_sym_ord2_direct(src, iF + 1, jF, kF + 1, SoA0, SoA1, SoA2));
        fyz[tid] = d_gp.Sdydz * (
            fetch_sym_ord2_direct(src, iF, jF - 1, kF - 1, SoA0, SoA1, SoA2)
          - fetch_sym_ord2_direct(src, iF, jF + 1, kF - 1, SoA0, SoA1, SoA2)
          - fetch_sym_ord2_direct(src, iF, jF - 1, kF + 1, SoA0, SoA1, SoA2)
          + fetch_sym_ord2_direct(src, iF, jF + 1, kF + 1, SoA0, SoA1, SoA2));
    }
    /* Neither stencil fits: all six outputs are zero. */
    else {
        fxx[tid] = 0.0; fxy[tid] = 0.0; fxz[tid] = 0.0;
        fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0;
    }
#endif
}

/*
 * Forward declarations.  gpu_fderivs and gpu_fdderivs below call the batch
 * wrappers before their definitions appear, so they are declared here.
 * gpu_lopsided_kodis_single_batch is defined outside this section.
 */
static void gpu_fderivs_batch(int field_count,
                              double *const *src_fields,
                              double *const *fx_fields,
                              double *const *fy_fields,
                              double *const *fz_fields,
                              const int *soa_signs,
                              int all);
static void gpu_fdderivs_batch(int field_count,
                               double *const *src_fields,
                               double *const *fxx_fields,
                               double *const *fxy_fields,
                               double *const *fxz_fields,
                               double *const *fyy_fields,
                               double *const *fyz_fields,
                               double *const *fzz_fields,
                               const int *soa_signs,
                               int all);
static void gpu_lopsided_kodis_single_batch(double *d_f_adv, double *d_f_ko, double *d_f_rhs,
                                            double *d_Sfx, double *d_Sfy, double *d_Sfz,
                                            double SoA0, double SoA1, double SoA2,
                                            double eps_val, int all);

/*
 * First derivatives of a single device field.
 *
 * ghost_width != 3: forwarded to gpu_fderivs_batch as a batch of one, with
 *                   the three SoA values truncated to int.
 * ghost_width == 3: kern_symbd_pack_ord2 fills the scratch buffer
 *                   g_buf.d_fh2 (sized (nx+2)*(ny+2)*(nz+2) from the cached
 *                   g_buf.prev_* extents), then kern_fderivs reads it and
 *                   writes d_fx / d_fy / d_fz.
 */
static void gpu_fderivs(double *d_f, double *d_fx, double *d_fy, double *d_fz,
                        double SoA0, double SoA1, double SoA2, int all)
{
#if ghost_width != 3
    /* One-element tables so the batched wrapper can be reused as-is. */
    double *in_list[1] = {d_f};
    double *dx_list[1] = {d_fx};
    double *dy_list[1] = {d_fy};
    double *dz_list[1] = {d_fz};
    const int signs[3] = {(int)SoA0, (int)SoA1, (int)SoA2};
    gpu_fderivs_batch(1, in_list, dx_list, dy_list, dz_list, signs, all);
#else
    const size_t padded_x = (size_t)g_buf.prev_nx + 2ull;
    const size_t padded_y = (size_t)g_buf.prev_ny + 2ull;
    const size_t padded_z = (size_t)g_buf.prev_nz + 2ull;
    const size_t pack_points = padded_x * padded_y * padded_z;
    double *scratch = g_buf.d_fh2;

    /* Pack first, differentiate second; both launches use the default stream. */
    kern_symbd_pack_ord2<<<grid(pack_points), BLK>>>(d_f, scratch, SoA0, SoA1, SoA2);
    kern_fderivs<<<grid(all), BLK>>>(scratch, d_fx, d_fy, d_fz);
#endif
}

/*
 * Second derivatives of a single device field.
 *
 * ghost_width != 3: forwarded to gpu_fdderivs_batch as a batch of one, with
 *                   the three SoA values truncated to int.
 * ghost_width == 3: kern_symbd_pack_ord2 fills the scratch buffer
 *                   g_buf.d_fh2 (sized (nx+2)*(ny+2)*(nz+2) from the cached
 *                   g_buf.prev_* extents), then kern_fdderivs reads it and
 *                   writes the six outputs.
 */
static void gpu_fdderivs(double *d_f,
                         double *d_fxx, double *d_fxy, double *d_fxz,
                         double *d_fyy, double *d_fyz, double *d_fzz,
                         double SoA0, double SoA1, double SoA2, int all)
{
#if ghost_width != 3
    /* One-element tables so the batched wrapper can be reused as-is. */
    double *in_list[1] = {d_f};
    double *xx_list[1] = {d_fxx};
    double *xy_list[1] = {d_fxy};
    double *xz_list[1] = {d_fxz};
    double *yy_list[1] = {d_fyy};
    double *yz_list[1] = {d_fyz};
    double *zz_list[1] = {d_fzz};
    const int signs[3] = {(int)SoA0, (int)SoA1, (int)SoA2};
    gpu_fdderivs_batch(1, in_list, xx_list, xy_list, xz_list,
                       yy_list, yz_list, zz_list, signs, all);
#else
    const size_t padded_x = (size_t)g_buf.prev_nx + 2ull;
    const size_t padded_y = (size_t)g_buf.prev_ny + 2ull;
    const size_t padded_z = (size_t)g_buf.prev_nz + 2ull;
    const size_t pack_points = padded_x * padded_y * padded_z;
    double *scratch = g_buf.d_fh2;

    /* Pack first, differentiate second; both launches use the default stream. */
    kern_symbd_pack_ord2<<<grid(pack_points), BLK>>>(d_f, scratch, SoA0, SoA1, SoA2);
    kern_fdderivs<<<grid(all), BLK>>>(scratch, d_fxx, d_fxy, d_fxz, d_fyy, d_fyz, d_fzz);
#endif
}

/*
 * Launch kern_fderivs_batched for field_count fields.
 *
 *   field_count - number of entries in each pointer array; <= 0 is a no-op
 *   src_fields  - device pointers to the fields to differentiate
 *   fx_fields,
 *   fy_fields,
 *   fz_fields   - device pointers receiving the three first derivatives
 *   soa_signs   - 3 * field_count ints, field f at [3 * f + 0..2]
 *   all         - point count used to size the x dimension of the grid (the
 *                 kernel itself bounds threads with d_gp.all)
 *
 * FDerivTables has room for BSSN_STATE_COUNT fields only.  Larger batches
 * used to write past the end of the table arrays; they are now split into
 * consecutive launches of at most BSSN_STATE_COUNT fields each.  Batches that
 * fit produce exactly one launch, as before.
 */
static void gpu_fderivs_batch(int field_count,
                              double *const *src_fields,
                              double *const *fx_fields,
                              double *const *fy_fields,
                              double *const *fz_fields,
                              const int *soa_signs,
                              int all)
{
    if (field_count <= 0) return;

    const int table_capacity = (int)BSSN_STATE_COUNT;
    const unsigned int blocks_x = (unsigned int)grid((size_t)all);

    int base = 0;
    while (base < field_count) {
        const int remaining = field_count - base;
        const int chunk = remaining < table_capacity ? remaining : table_capacity;

        /* Slot i of this launch corresponds to caller field base + i. */
        FDerivTables tables = {};
        for (int i = 0; i < chunk; ++i) {
            const int f = base + i;
            tables.src_fields[i] = src_fields[f];
            tables.fx_fields[i] = fx_fields[f];
            tables.fy_fields[i] = fy_fields[f];
            tables.fz_fields[i] = fz_fields[f];
            tables.soa_signs[3 * i + 0] = soa_signs[3 * f + 0];
            tables.soa_signs[3 * i + 1] = soa_signs[3 * f + 1];
            tables.soa_signs[3 * i + 2] = soa_signs[3 * f + 2];
        }

        dim3 launch_grid(blocks_x, (unsigned int)chunk);
        kern_fderivs_batched<<<launch_grid, BLK>>>(tables, chunk);

        base += chunk;
    }
}

/*
 * Launch kern_fdderivs_batched for field_count fields.
 *
 *   field_count - number of entries in each pointer array; <= 0 is a no-op
 *   src_fields  - device pointers to the fields to differentiate
 *   fxx_fields .. fzz_fields
 *               - device pointers receiving the six second derivatives
 *   soa_signs   - 3 * field_count ints, field f at [3 * f + 0..2]
 *   all         - point count used to size the x dimension of the grid (the
 *                 kernel itself bounds threads with d_gp.all)
 *
 * FDDerivTables has room for BSSN_STATE_COUNT fields only.  Larger batches
 * used to write past the end of the table arrays; they are now split into
 * consecutive launches of at most BSSN_STATE_COUNT fields each.  Batches that
 * fit produce exactly one launch, as before.
 */
static void gpu_fdderivs_batch(int field_count,
                               double *const *src_fields,
                               double *const *fxx_fields,
                               double *const *fxy_fields,
                               double *const *fxz_fields,
                               double *const *fyy_fields,
                               double *const *fyz_fields,
                               double *const *fzz_fields,
                               const int *soa_signs,
                               int all)
{
    if (field_count <= 0) return;

    const int table_capacity = (int)BSSN_STATE_COUNT;
    const unsigned int blocks_x = (unsigned int)grid((size_t)all);

    int base = 0;
    while (base < field_count) {
        const int remaining = field_count - base;
        const int chunk = remaining < table_capacity ? remaining : table_capacity;

        /* Slot i of this launch corresponds to caller field base + i. */
        FDDerivTables tables = {};
        for (int i = 0; i < chunk; ++i) {
            const int f = base + i;
            tables.src_fields[i] = src_fields[f];
            tables.fxx_fields[i] = fxx_fields[f];
            tables.fxy_fields[i] = fxy_fields[f];
            tables.fxz_fields[i] = fxz_fields[f];
            tables.fyy_fields[i] = fyy_fields[f];
            tables.fyz_fields[i] = fyz_fields[f];
            tables.fzz_fields[i] = fzz_fields[f];
            tables.soa_signs[3 * i + 0] = soa_signs[3 * f + 0];
            tables.soa_signs[3 * i + 1] = soa_signs[3 * f + 1];
            tables.soa_signs[3 * i + 2] = soa_signs[3 * f + 2];
        }

        dim3 launch_grid(blocks_x, (unsigned int)chunk);
        kern_fdderivs_batched<<<launch_grid, BLK>>>(tables, chunk);

        base += chunk;
    }
}

/*
 * For each of PHASE10_METRIC_FIELD_COUNT fields (blockIdx.y) and each grid
 * point (flat x-thread index), compute the six second derivatives of
 * tables.src_fields[field] and store their contraction with the six gup*
 * arrays in tables.dst_fields[field]:
 *
 *   dst = gupxx*fxx + gupyy*fyy + gupzz*fzz
 *       + 2*(gupxy*fxy + gupxz*fxz + gupyz*fyz)
 *
 * Threads with tid >= d_gp.all exit without writing.  Points on the last
 * index of any axis, and (ghost_width == 3 only) points where neither the
 * +/-2 nor the +/-1 stencil fits inside the iminF..imaxF etc. range, keep
 * zero derivatives, so dst is built from zeros there.
 *
 * NOTE(review): gup* are presumably the inverse conformal metric components
 * and fetch_sym_ord2_direct presumably applies the SoA signs to indices
 * outside the stored array; both are defined outside this section - confirm.
 */
__global__ __launch_bounds__(128, 4)
void kern_phase10_ricci_batched(const double * __restrict__ gupxx,
                                const double * __restrict__ gupxy,
                                const double * __restrict__ gupxz,
                                const double * __restrict__ gupyy,
                                const double * __restrict__ gupyz,
                                const double * __restrict__ gupzz,
                                Phase10RicciTables tables)
{
    const int field = blockIdx.y;
    if (field >= PHASE10_METRIC_FIELD_COUNT) return;

    /* Per-field pointers and signs for the slot handled by this block row. */
    const double *src = tables.src_fields[field];
    double *dst = tables.dst_fields[field];
    const int SoA0 = tables.soa_signs[3 * field + 0];
    const int SoA1 = tables.soa_signs[3 * field + 1];
    const int SoA2 = tables.soa_signs[3 * field + 2];
    const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
    const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
    const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF;
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= d_gp.all) return;

    /* Flat index -> (i0, j0, k0), x running fastest. */
    const int i0 = tid % nx;
    const int j0 = (tid / nx) % ny;
    const int k0 = tid / (nx * ny);

    /* Derivatives default to zero and stay zero wherever no stencil is applied. */
    double fxx = 0.0, fxy = 0.0, fxz = 0.0;
    double fyy = 0.0, fyz = 0.0, fzz = 0.0;

    /* Skip the last index along every axis. */
    if (!(i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2)) {
        /* Indices shifted by +1, the convention compared against iminF..imaxF etc. */
        const int iF = i0 + 1;
        const int jF = j0 + 1;
        const int kF = k0 + 1;

#if ghost_width != 3
        fd_compute_second6(src, iF, jF, kF,
                           iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
                           SoA0, SoA1, SoA2,
                           fxx, fxy, fxz, fyy, fyz, fzz);
#else
        /* Wide branch: every +/-2 neighbour index is inside the allowed range. */
        if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
            (jF + 2) <= jmaxF && (jF - 2) >= jminF &&
            (kF + 2) <= kmaxF && (kF - 2) >= kminF)
        {
            /* Pure second derivatives: weights (-1, 16, -30, 16, -1) times d_gp.Fd?d?. */
            const double c = fetch_sym_ord2_direct(src, iF, jF, kF, SoA0, SoA1, SoA2);
            fxx = d_gp.Fdxdx * (
                -fetch_sym_ord2_direct(src, iF - 2, jF, kF, SoA0, SoA1, SoA2)
                +16.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
                -30.0 * c
                +16.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2)
                -       fetch_sym_ord2_direct(src, iF + 2, jF, kF, SoA0, SoA1, SoA2));
            fyy = d_gp.Fdydy * (
                -fetch_sym_ord2_direct(src, iF, jF - 2, kF, SoA0, SoA1, SoA2)
                +16.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
                -30.0 * c
                +16.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2)
                -       fetch_sym_ord2_direct(src, iF, jF + 2, kF, SoA0, SoA1, SoA2));
            fzz = d_gp.Fdzdz * (
                -fetch_sym_ord2_direct(src, iF, jF, kF - 2, SoA0, SoA1, SoA2)
                +16.0 * fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
                -30.0 * c
                +16.0 * fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2)
                -       fetch_sym_ord2_direct(src, iF, jF, kF + 2, SoA0, SoA1, SoA2));

            /* xy: x-differences (1, -8, 8, -1) on rows j-2, j-1, j+1, j+2,
               then the same weights applied across those rows. */
            const double t_jm2 =
                fetch_sym_ord2_direct(src, iF - 2, jF - 2, kF, SoA0, SoA1, SoA2)
              - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF - 2, kF, SoA0, SoA1, SoA2)
              + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF - 2, kF, SoA0, SoA1, SoA2)
              -       fetch_sym_ord2_direct(src, iF + 2, jF - 2, kF, SoA0, SoA1, SoA2);
            const double t_jm1 =
                fetch_sym_ord2_direct(src, iF - 2, jF - 1, kF, SoA0, SoA1, SoA2)
              - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF - 1, kF, SoA0, SoA1, SoA2)
              + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF - 1, kF, SoA0, SoA1, SoA2)
              -       fetch_sym_ord2_direct(src, iF + 2, jF - 1, kF, SoA0, SoA1, SoA2);
            const double t_jp1 =
                fetch_sym_ord2_direct(src, iF - 2, jF + 1, kF, SoA0, SoA1, SoA2)
              - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF + 1, kF, SoA0, SoA1, SoA2)
              + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF + 1, kF, SoA0, SoA1, SoA2)
              -       fetch_sym_ord2_direct(src, iF + 2, jF + 1, kF, SoA0, SoA1, SoA2);
            const double t_jp2 =
                fetch_sym_ord2_direct(src, iF - 2, jF + 2, kF, SoA0, SoA1, SoA2)
              - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF + 2, kF, SoA0, SoA1, SoA2)
              + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF + 2, kF, SoA0, SoA1, SoA2)
              -       fetch_sym_ord2_direct(src, iF + 2, jF + 2, kF, SoA0, SoA1, SoA2);
            fxy = d_gp.Fdxdy * (t_jm2 - 8.0 * t_jm1 + 8.0 * t_jp1 - t_jp2);

            /* xz: x-differences on planes k-2, k-1, k+1, k+2, combined across k. */
            const double t_km2_x =
                fetch_sym_ord2_direct(src, iF - 2, jF, kF - 2, SoA0, SoA1, SoA2)
              - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF - 2, SoA0, SoA1, SoA2)
              + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF - 2, SoA0, SoA1, SoA2)
              -       fetch_sym_ord2_direct(src, iF + 2, jF, kF - 2, SoA0, SoA1, SoA2);
            const double t_km1_x =
                fetch_sym_ord2_direct(src, iF - 2, jF, kF - 1, SoA0, SoA1, SoA2)
              - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF - 1, SoA0, SoA1, SoA2)
              + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF - 1, SoA0, SoA1, SoA2)
              -       fetch_sym_ord2_direct(src, iF + 2, jF, kF - 1, SoA0, SoA1, SoA2);
            const double t_kp1_x =
                fetch_sym_ord2_direct(src, iF - 2, jF, kF + 1, SoA0, SoA1, SoA2)
              - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF + 1, SoA0, SoA1, SoA2)
              + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF + 1, SoA0, SoA1, SoA2)
              -       fetch_sym_ord2_direct(src, iF + 2, jF, kF + 1, SoA0, SoA1, SoA2);
            const double t_kp2_x =
                fetch_sym_ord2_direct(src, iF - 2, jF, kF + 2, SoA0, SoA1, SoA2)
              - 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF + 2, SoA0, SoA1, SoA2)
              + 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF + 2, SoA0, SoA1, SoA2)
              -       fetch_sym_ord2_direct(src, iF + 2, jF, kF + 2, SoA0, SoA1, SoA2);
            fxz = d_gp.Fdxdz * (t_km2_x - 8.0 * t_km1_x + 8.0 * t_kp1_x - t_kp2_x);

            /* yz: y-differences on planes k-2, k-1, k+1, k+2, combined across k. */
            const double t_km2_y =
                fetch_sym_ord2_direct(src, iF, jF - 2, kF - 2, SoA0, SoA1, SoA2)
              - 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF - 2, SoA0, SoA1, SoA2)
              + 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF - 2, SoA0, SoA1, SoA2)
              -       fetch_sym_ord2_direct(src, iF, jF + 2, kF - 2, SoA0, SoA1, SoA2);
            const double t_km1_y =
                fetch_sym_ord2_direct(src, iF, jF - 2, kF - 1, SoA0, SoA1, SoA2)
              - 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF - 1, SoA0, SoA1, SoA2)
              + 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF - 1, SoA0, SoA1, SoA2)
              -       fetch_sym_ord2_direct(src, iF, jF + 2, kF - 1, SoA0, SoA1, SoA2);
            const double t_kp1_y =
                fetch_sym_ord2_direct(src, iF, jF - 2, kF + 1, SoA0, SoA1, SoA2)
              - 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF + 1, SoA0, SoA1, SoA2)
              + 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF + 1, SoA0, SoA1, SoA2)
              -       fetch_sym_ord2_direct(src, iF, jF + 2, kF + 1, SoA0, SoA1, SoA2);
            const double t_kp2_y =
                fetch_sym_ord2_direct(src, iF, jF - 2, kF + 2, SoA0, SoA1, SoA2)
              - 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF + 2, SoA0, SoA1, SoA2)
              + 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF + 2, SoA0, SoA1, SoA2)
              -       fetch_sym_ord2_direct(src, iF, jF + 2, kF + 2, SoA0, SoA1, SoA2);
            fyz = d_gp.Fdydz * (t_km2_y - 8.0 * t_km1_y + 8.0 * t_kp1_y - t_kp2_y);
        }
        /* Narrow branch: only the +/-1 neighbours are inside the allowed range. */
        else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
                 (jF + 1) <= jmaxF && (jF - 1) >= jminF &&
                 (kF + 1) <= kmaxF && (kF - 1) >= kminF)
        {
            /* Three-point (1, -2, 1) pure terms and four-corner mixed terms,
               scaled by the d_gp.Sd?d? factors. */
            const double c = fetch_sym_ord2_direct(src, iF, jF, kF, SoA0, SoA1, SoA2);
            fxx = d_gp.Sdxdx * (
                fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
              - 2.0 * c
              + fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2));
            fyy = d_gp.Sdydy * (
                fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
              - 2.0 * c
              + fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2));
            fzz = d_gp.Sdzdz * (
                fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
              - 2.0 * c
              + fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2));
            fxy = d_gp.Sdxdy * (
                fetch_sym_ord2_direct(src, iF - 1, jF - 1, kF, SoA0, SoA1, SoA2)
              - fetch_sym_ord2_direct(src, iF + 1, jF - 1, kF, SoA0, SoA1, SoA2)
              - fetch_sym_ord2_direct(src, iF - 1, jF + 1, kF, SoA0, SoA1, SoA2)
              + fetch_sym_ord2_direct(src, iF + 1, jF + 1, kF, SoA0, SoA1, SoA2));
            fxz = d_gp.Sdxdz * (
                fetch_sym_ord2_direct(src, iF - 1, jF, kF - 1, SoA0, SoA1, SoA2)
              - fetch_sym_ord2_direct(src, iF + 1, jF, kF - 1, SoA0, SoA1, SoA2)
              - fetch_sym_ord2_direct(src, iF - 1, jF, kF + 1, SoA0, SoA1, SoA2)
              + fetch_sym_ord2_direct(src, iF + 1, jF, kF + 1, SoA0, SoA1, SoA2));
            fyz = d_gp.Sdydz * (
                fetch_sym_ord2_direct(src, iF, jF - 1, kF - 1, SoA0, SoA1, SoA2)
              - fetch_sym_ord2_direct(src, iF, jF + 1, kF - 1, SoA0, SoA1, SoA2)
              - fetch_sym_ord2_direct(src, iF, jF - 1, kF + 1, SoA0, SoA1, SoA2)
              + fetch_sym_ord2_direct(src, iF, jF + 1, kF + 1, SoA0, SoA1, SoA2));
        }
#endif
    }

    /* Contract with the gup* arrays; the three mixed terms are counted twice. */
    dst[tid] = gupxx[tid] * fxx + gupyy[tid] * fyy + gupzz[tid] * fzz
             + 2.0 * (gupxy[tid] * fxy + gupxz[tid] * fxz + gupyz[tid] * fyz);
}

/*
 * Host-side launcher for kern_phase10_ricci_batched.
 *
 * Copies the first PHASE10_METRIC_FIELD_COUNT entries of src_fields and
 * dst_fields, plus three signs per field from soa_signs, into a
 * Phase10RicciTables value and launches one grid row per field.  `all` sizes
 * the x dimension of the grid; the six gup* pointers are passed through.
 */
static void gpu_phase10_ricci_batch(const double *gupxx,
                                    const double *gupxy,
                                    const double *gupxz,
                                    const double *gupyy,
                                    const double *gupyz,
                                    const double *gupzz,
                                    double *const *src_fields,
                                    double *const *dst_fields,
                                    const int *soa_signs,
                                    int all)
{
    Phase10RicciTables launch_tables = {};
    for (int f = 0; f < PHASE10_METRIC_FIELD_COUNT; ++f) {
        launch_tables.src_fields[f] = src_fields[f];
        launch_tables.dst_fields[f] = dst_fields[f];
        for (int axis = 0; axis < 3; ++axis) {
            const int slot = 3 * f + axis;
            launch_tables.soa_signs[slot] = soa_signs[slot];
        }
    }

    const unsigned int blocks_x = (unsigned int)grid((size_t)all);
    const unsigned int blocks_y = (unsigned int)PHASE10_METRIC_FIELD_COUNT;
    const dim3 launch_grid(blocks_x, blocks_y);

    kern_phase10_ricci_batched<<<launch_grid, BLK>>>(
        gupxx, gupxy, gupxz, gupyy, gupyz, gupzz, launch_tables);
}

/*
 * kern_phase14_lap_chi_derivs
 *
 * For every grid point, writes the six second derivatives of Lap
 * (fxx, fxy, fxz, fyy, fyz, fzz) and the three first derivatives of chi
 * (chix_out, chiy_out, chiz_out).
 *
 * Launch layout: 1-D grid, one thread per point; threads with
 * tid >= d_gp.all exit immediately.  The flat index is decoded with x as
 * the fastest-varying direction (i0 = tid % nx).  __launch_bounds__(128, 4)
 * declares a maximum of 128 threads per block.
 *
 * Points lying on the last plane of any direction (i0 > nx-2, etc.) get
 * zeros in all nine outputs.
 *
 * Stencil selection:
 *   - ghost_width != 3: delegated to fd_compute_second6 / fd_compute_first3.
 *   - ghost_width == 3: a 5-point stencil per axis when the +/-2 neighbours
 *     lie inside [iminF..imaxF] x [jminF..jmaxF] x [kminF..kmaxF]; otherwise
 *     a 3-point stencil when the +/-1 neighbours do; otherwise zeros.
 *
 * Every sample is read through fetch_sym_ord2_direct with the trailing
 * arguments (1, 1, 1) -- presumably the symmetry signs for the three axes
 * (TODO confirm against the helper's definition).
 */
__global__ __launch_bounds__(128, 4)
void kern_phase14_lap_chi_derivs(const double * __restrict__ Lap,
                                 const double * __restrict__ chi,
                                 double * __restrict__ fxx,
                                 double * __restrict__ fxy,
                                 double * __restrict__ fxz,
                                 double * __restrict__ fyy,
                                 double * __restrict__ fyz,
                                 double * __restrict__ fzz,
                                 double * __restrict__ chix_out,
                                 double * __restrict__ chiy_out,
                                 double * __restrict__ chiz_out)
{
    const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
    const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
    const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF;
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= d_gp.all) return;

    /* Decode the flat index into 0-based (i0, j0, k0); x varies fastest. */
    const int i0 = tid % nx;
    const int j0 = (tid / nx) % ny;
    const int k0 = tid / (nx * ny);

    /* The last plane in each direction is not differentiated: emit zeros. */
    if (i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2) {
        fxx[tid] = 0.0; fxy[tid] = 0.0; fxz[tid] = 0.0;
        fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0;
        chix_out[tid] = 0.0; chiy_out[tid] = 0.0; chiz_out[tid] = 0.0;
        return;
    }

    /* 1-based indices used by the fetch helpers and the *minF / *maxF bounds. */
    const int iF = i0 + 1;
    const int jF = j0 + 1;
    const int kF = k0 + 1;

#if ghost_width != 3
    /* Generic path: the shared helpers write straight into the output slots. */
    fd_compute_second6(Lap, iF, jF, kF,
                       iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
                       1, 1, 1,
                       fxx[tid], fxy[tid], fxz[tid], fyy[tid], fyz[tid], fzz[tid]);
    fd_compute_first3(chi, iF, jF, kF,
                      iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
                      1, 1, 1,
                      chix_out[tid], chiy_out[tid], chiz_out[tid]);
#else
    /* Wide stencil: requires two neighbours on each side in all directions. */
    if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
        (jF + 2) <= jmaxF && (jF - 2) >= jminF &&
        (kF + 2) <= kmaxF && (kF - 2) >= kminF)
    {
        /* Unmixed second derivatives: weights (-1, 16, -30, 16, -1) scaled by
         * d_gp.Fd?d?; the centre sample is shared by all three axes. */
        const double lap_c = fetch_sym_ord2_direct(Lap, iF, jF, kF, 1, 1, 1);
        fxx[tid] = d_gp.Fdxdx * (
            -fetch_sym_ord2_direct(Lap, iF - 2, jF, kF, 1, 1, 1)
            +16.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF, kF, 1, 1, 1)
            -30.0 * lap_c
            +16.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF, kF, 1, 1, 1)
            -       fetch_sym_ord2_direct(Lap, iF + 2, jF, kF, 1, 1, 1));
        fyy[tid] = d_gp.Fdydy * (
            -fetch_sym_ord2_direct(Lap, iF, jF - 2, kF, 1, 1, 1)
            +16.0 * fetch_sym_ord2_direct(Lap, iF, jF - 1, kF, 1, 1, 1)
            -30.0 * lap_c
            +16.0 * fetch_sym_ord2_direct(Lap, iF, jF + 1, kF, 1, 1, 1)
            -       fetch_sym_ord2_direct(Lap, iF, jF + 2, kF, 1, 1, 1));
        fzz[tid] = d_gp.Fdzdz * (
            -fetch_sym_ord2_direct(Lap, iF, jF, kF - 2, 1, 1, 1)
            +16.0 * fetch_sym_ord2_direct(Lap, iF, jF, kF - 1, 1, 1, 1)
            -30.0 * lap_c
            +16.0 * fetch_sym_ord2_direct(Lap, iF, jF, kF + 1, 1, 1, 1)
            -       fetch_sym_ord2_direct(Lap, iF, jF, kF + 2, 1, 1, 1));

        /* Mixed xy derivative: the (1, -8, 8, -1) x-stencil is evaluated on
         * the rows j-2, j-1, j+1, j+2 and the four row results are combined
         * with the same (1, -8, 8, -1) weights in y. */
        const double t_jm2 =
            fetch_sym_ord2_direct(Lap, iF - 2, jF - 2, kF, 1, 1, 1)
          - 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF - 2, kF, 1, 1, 1)
          + 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF - 2, kF, 1, 1, 1)
          -       fetch_sym_ord2_direct(Lap, iF + 2, jF - 2, kF, 1, 1, 1);
        const double t_jm1 =
            fetch_sym_ord2_direct(Lap, iF - 2, jF - 1, kF, 1, 1, 1)
          - 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF - 1, kF, 1, 1, 1)
          + 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF - 1, kF, 1, 1, 1)
          -       fetch_sym_ord2_direct(Lap, iF + 2, jF - 1, kF, 1, 1, 1);
        const double t_jp1 =
            fetch_sym_ord2_direct(Lap, iF - 2, jF + 1, kF, 1, 1, 1)
          - 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF + 1, kF, 1, 1, 1)
          + 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF + 1, kF, 1, 1, 1)
          -       fetch_sym_ord2_direct(Lap, iF + 2, jF + 1, kF, 1, 1, 1);
        const double t_jp2 =
            fetch_sym_ord2_direct(Lap, iF - 2, jF + 2, kF, 1, 1, 1)
          - 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF + 2, kF, 1, 1, 1)
          + 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF + 2, kF, 1, 1, 1)
          -       fetch_sym_ord2_direct(Lap, iF + 2, jF + 2, kF, 1, 1, 1);
        fxy[tid] = d_gp.Fdxdy * (t_jm2 - 8.0 * t_jm1 + 8.0 * t_jp1 - t_jp2);

        /* Mixed xz derivative: x-stencil on planes k-2, k-1, k+1, k+2. */
        const double t_km2_x =
            fetch_sym_ord2_direct(Lap, iF - 2, jF, kF - 2, 1, 1, 1)
          - 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF, kF - 2, 1, 1, 1)
          + 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF, kF - 2, 1, 1, 1)
          -       fetch_sym_ord2_direct(Lap, iF + 2, jF, kF - 2, 1, 1, 1);
        const double t_km1_x =
            fetch_sym_ord2_direct(Lap, iF - 2, jF, kF - 1, 1, 1, 1)
          - 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF, kF - 1, 1, 1, 1)
          + 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF, kF - 1, 1, 1, 1)
          -       fetch_sym_ord2_direct(Lap, iF + 2, jF, kF - 1, 1, 1, 1);
        const double t_kp1_x =
            fetch_sym_ord2_direct(Lap, iF - 2, jF, kF + 1, 1, 1, 1)
          - 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF, kF + 1, 1, 1, 1)
          + 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF, kF + 1, 1, 1, 1)
          -       fetch_sym_ord2_direct(Lap, iF + 2, jF, kF + 1, 1, 1, 1);
        const double t_kp2_x =
            fetch_sym_ord2_direct(Lap, iF - 2, jF, kF + 2, 1, 1, 1)
          - 8.0 * fetch_sym_ord2_direct(Lap, iF - 1, jF, kF + 2, 1, 1, 1)
          + 8.0 * fetch_sym_ord2_direct(Lap, iF + 1, jF, kF + 2, 1, 1, 1)
          -       fetch_sym_ord2_direct(Lap, iF + 2, jF, kF + 2, 1, 1, 1);
        fxz[tid] = d_gp.Fdxdz * (t_km2_x - 8.0 * t_km1_x + 8.0 * t_kp1_x - t_kp2_x);

        /* Mixed yz derivative: y-stencil on planes k-2, k-1, k+1, k+2. */
        const double t_km2_y =
            fetch_sym_ord2_direct(Lap, iF, jF - 2, kF - 2, 1, 1, 1)
          - 8.0 * fetch_sym_ord2_direct(Lap, iF, jF - 1, kF - 2, 1, 1, 1)
          + 8.0 * fetch_sym_ord2_direct(Lap, iF, jF + 1, kF - 2, 1, 1, 1)
          -       fetch_sym_ord2_direct(Lap, iF, jF + 2, kF - 2, 1, 1, 1);
        const double t_km1_y =
            fetch_sym_ord2_direct(Lap, iF, jF - 2, kF - 1, 1, 1, 1)
          - 8.0 * fetch_sym_ord2_direct(Lap, iF, jF - 1, kF - 1, 1, 1, 1)
          + 8.0 * fetch_sym_ord2_direct(Lap, iF, jF + 1, kF - 1, 1, 1, 1)
          -       fetch_sym_ord2_direct(Lap, iF, jF + 2, kF - 1, 1, 1, 1);
        const double t_kp1_y =
            fetch_sym_ord2_direct(Lap, iF, jF - 2, kF + 1, 1, 1, 1)
          - 8.0 * fetch_sym_ord2_direct(Lap, iF, jF - 1, kF + 1, 1, 1, 1)
          + 8.0 * fetch_sym_ord2_direct(Lap, iF, jF + 1, kF + 1, 1, 1, 1)
          -       fetch_sym_ord2_direct(Lap, iF, jF + 2, kF + 1, 1, 1, 1);
        const double t_kp2_y =
            fetch_sym_ord2_direct(Lap, iF, jF - 2, kF + 2, 1, 1, 1)
          - 8.0 * fetch_sym_ord2_direct(Lap, iF, jF - 1, kF + 2, 1, 1, 1)
          + 8.0 * fetch_sym_ord2_direct(Lap, iF, jF + 1, kF + 2, 1, 1, 1)
          -       fetch_sym_ord2_direct(Lap, iF, jF + 2, kF + 2, 1, 1, 1);
        fyz[tid] = d_gp.Fdydz * (t_km2_y - 8.0 * t_km1_y + 8.0 * t_kp1_y - t_kp2_y);

        /* First derivatives of chi: weights (1, -8, 8, -1) scaled by d_gp.d12d?. */
        chix_out[tid] = d_gp.d12dx * (
            fetch_sym_ord2_direct(chi, iF - 2, jF, kF, 1, 1, 1)
          - 8.0 * fetch_sym_ord2_direct(chi, iF - 1, jF, kF, 1, 1, 1)
          + 8.0 * fetch_sym_ord2_direct(chi, iF + 1, jF, kF, 1, 1, 1)
          -       fetch_sym_ord2_direct(chi, iF + 2, jF, kF, 1, 1, 1));
        chiy_out[tid] = d_gp.d12dy * (
            fetch_sym_ord2_direct(chi, iF, jF - 2, kF, 1, 1, 1)
          - 8.0 * fetch_sym_ord2_direct(chi, iF, jF - 1, kF, 1, 1, 1)
          + 8.0 * fetch_sym_ord2_direct(chi, iF, jF + 1, kF, 1, 1, 1)
          -       fetch_sym_ord2_direct(chi, iF, jF + 2, kF, 1, 1, 1));
        chiz_out[tid] = d_gp.d12dz * (
            fetch_sym_ord2_direct(chi, iF, jF, kF - 2, 1, 1, 1)
          - 8.0 * fetch_sym_ord2_direct(chi, iF, jF, kF - 1, 1, 1, 1)
          + 8.0 * fetch_sym_ord2_direct(chi, iF, jF, kF + 1, 1, 1, 1)
          -       fetch_sym_ord2_direct(chi, iF, jF, kF + 2, 1, 1, 1));
    }
    /* Narrow fallback: only one neighbour on each side is required. */
    else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
             (jF + 1) <= jmaxF && (jF - 1) >= jminF &&
             (kF + 1) <= kmaxF && (kF - 1) >= kminF)
    {
        /* Unmixed second derivatives: weights (1, -2, 1) scaled by d_gp.Sd?d?. */
        const double lap_c = fetch_sym_ord2_direct(Lap, iF, jF, kF, 1, 1, 1);
        fxx[tid] = d_gp.Sdxdx * (
            fetch_sym_ord2_direct(Lap, iF - 1, jF, kF, 1, 1, 1)
          - 2.0 * lap_c
          + fetch_sym_ord2_direct(Lap, iF + 1, jF, kF, 1, 1, 1));
        fyy[tid] = d_gp.Sdydy * (
            fetch_sym_ord2_direct(Lap, iF, jF - 1, kF, 1, 1, 1)
          - 2.0 * lap_c
          + fetch_sym_ord2_direct(Lap, iF, jF + 1, kF, 1, 1, 1));
        fzz[tid] = d_gp.Sdzdz * (
            fetch_sym_ord2_direct(Lap, iF, jF, kF - 1, 1, 1, 1)
          - 2.0 * lap_c
          + fetch_sym_ord2_direct(Lap, iF, jF, kF + 1, 1, 1, 1));
        /* Mixed derivatives: signed sum of the four diagonal neighbours. */
        fxy[tid] = d_gp.Sdxdy * (
            fetch_sym_ord2_direct(Lap, iF - 1, jF - 1, kF, 1, 1, 1)
          - fetch_sym_ord2_direct(Lap, iF + 1, jF - 1, kF, 1, 1, 1)
          - fetch_sym_ord2_direct(Lap, iF - 1, jF + 1, kF, 1, 1, 1)
          + fetch_sym_ord2_direct(Lap, iF + 1, jF + 1, kF, 1, 1, 1));
        fxz[tid] = d_gp.Sdxdz * (
            fetch_sym_ord2_direct(Lap, iF - 1, jF, kF - 1, 1, 1, 1)
          - fetch_sym_ord2_direct(Lap, iF + 1, jF, kF - 1, 1, 1, 1)
          - fetch_sym_ord2_direct(Lap, iF - 1, jF, kF + 1, 1, 1, 1)
          + fetch_sym_ord2_direct(Lap, iF + 1, jF, kF + 1, 1, 1, 1));
        fyz[tid] = d_gp.Sdydz * (
            fetch_sym_ord2_direct(Lap, iF, jF - 1, kF - 1, 1, 1, 1)
          - fetch_sym_ord2_direct(Lap, iF, jF + 1, kF - 1, 1, 1, 1)
          - fetch_sym_ord2_direct(Lap, iF, jF - 1, kF + 1, 1, 1, 1)
          + fetch_sym_ord2_direct(Lap, iF, jF + 1, kF + 1, 1, 1, 1));
        /* First derivatives of chi: two-point difference scaled by d_gp.d2d?. */
        chix_out[tid] = d_gp.d2dx * (
            -fetch_sym_ord2_direct(chi, iF - 1, jF, kF, 1, 1, 1)
            +fetch_sym_ord2_direct(chi, iF + 1, jF, kF, 1, 1, 1));
        chiy_out[tid] = d_gp.d2dy * (
            -fetch_sym_ord2_direct(chi, iF, jF - 1, kF, 1, 1, 1)
            +fetch_sym_ord2_direct(chi, iF, jF + 1, kF, 1, 1, 1));
        chiz_out[tid] = d_gp.d2dz * (
            -fetch_sym_ord2_direct(chi, iF, jF, kF - 1, 1, 1, 1)
            +fetch_sym_ord2_direct(chi, iF, jF, kF + 1, 1, 1, 1));
    }
    /* Neither stencil fits: all outputs are zero at this point. */
    else {
        fxx[tid] = 0.0; fxy[tid] = 0.0; fxz[tid] = 0.0;
        fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0;
        chix_out[tid] = 0.0; chiy_out[tid] = 0.0; chiz_out[tid] = 0.0;
    }
#endif
}

/* Advection plus Kreiss-Oliger (KO) dissipation for a single field.
 *
 *   d_f_adv        field sampled by the advection stencil
 *   d_f_ko         field sampled by the KO stencil (may equal d_f_adv)
 *   d_f_rhs        right-hand side handed to the update kernels
 *   d_Sfx/y/z      per-point inputs forwarded to the advection kernel
 *   SoA0..SoA2     symmetry signs forwarded to the packing kernel
 *   eps_val        KO strength; the KO stage is skipped unless eps_val > 0
 *   all            point count used to size the per-point launches
 *
 * ghost_width == 3: the advection source is packed into the scratch buffer
 * g_buf.d_fh3 and consumed by kern_lopsided.  The packed data is reused by
 * kern_kodis when both stages read the same field; only a different KO
 * source triggers a second pack.  All launches are issued without an
 * explicit stream.
 *
 * Any other ghost_width: forwarded to gpu_lopsided_kodis_single_batch.
 */
static void gpu_lopsided_kodis(double *d_f_adv, double *d_f_ko, double *d_f_rhs,
                               double *d_Sfx, double *d_Sfy, double *d_Sfz,
                               double SoA0, double SoA1, double SoA2,
                               double eps_val, int all)
{
#if ghost_width == 3
    double *const packed = g_buf.d_fh3;

    /* The packing kernel covers (n + 3) entries along each direction. */
    const size_t padded_x = (size_t)g_buf.prev_nx + 3ull;
    const size_t padded_y = (size_t)g_buf.prev_ny + 3ull;
    const size_t padded_z = (size_t)g_buf.prev_nz + 3ull;
    const size_t pack_count = padded_x * padded_y * padded_z;

    /* Stage 1: pack the advection source, then apply the advection stencil. */
    kern_symbd_pack_ord3<<<grid(pack_count), BLK>>>(d_f_adv, packed, SoA0, SoA1, SoA2);
    kern_lopsided<<<grid(all), BLK>>>(packed, d_f_rhs, d_Sfx, d_Sfy, d_Sfz);

    /* Stage 2 is optional: nothing more to do without a positive strength. */
    if (!(eps_val > 0.0)) {
        return;
    }

    /* Repack only when the KO stage reads a different field. */
    if (d_f_ko != d_f_adv) {
        kern_symbd_pack_ord3<<<grid(pack_count), BLK>>>(d_f_ko, packed, SoA0, SoA1, SoA2);
    }
    kern_kodis<<<grid(all), BLK>>>(packed, d_f_rhs, eps_val);
#else
    gpu_lopsided_kodis_single_batch(d_f_adv, d_f_ko, d_f_rhs,
                                    d_Sfx, d_Sfy, d_Sfz,
                                    SoA0, SoA1, SoA2, eps_val, all);
#endif
}

/*
 * kern_lopsided_kodis_batched
 *
 * Adds an advection term and a Kreiss-Oliger style dissipation term to the
 * right-hand side of one or more fields in a single launch.
 *
 * Launch layout: blockIdx.x / threadIdx.x enumerate grid points (one thread
 * per point, threads with tid >= d_gp.all exit); blockIdx.y selects the
 * field.  For field f the kernel reads tables.adv_fields[f] (advection
 * source), tables.ko_fields[f] (dissipation source), accumulates into
 * tables.rhs_fields[f], and uses tables.soa_signs[3f .. 3f+2] as the three
 * signs passed to the fetch helpers.
 *
 * Sfx / Sfy / Sfz are per-point values that multiply the x / y / z
 * advection stencils; their sign chooses which biased stencil is used.
 * eps_val scales the dissipation term.
 *
 * ghost_width != 3 delegates to fd_lopsided_axis / fd_ko_term.
 * ghost_width == 3 uses the explicit stencils below.
 *
 * NOTE(review): the lower bounds come from d_gp.iminF3 / jminF3 / kminF3
 * while the upper bounds come from d_gp.imaxF / jmaxF / kmaxF -- presumably
 * the lower limits account for the wider symmetry padding; confirm against
 * the definition of d_gp.
 */
__global__ __launch_bounds__(128, 4)
void kern_lopsided_kodis_batched(const double * __restrict__ Sfx,
                                 const double * __restrict__ Sfy,
                                 const double * __restrict__ Sfz,
                                 LopsidedKodisTables tables,
                                 double eps_val)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= d_gp.all) return;

    /* One y-slice of the launch grid per field. */
    const int field = blockIdx.y;
    const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
    const int iminF = d_gp.iminF3, jminF = d_gp.jminF3, kminF = d_gp.kminF3;
    const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
    const int SoA0 = tables.soa_signs[3 * field + 0];
    const int SoA1 = tables.soa_signs[3 * field + 1];
    const int SoA2 = tables.soa_signs[3 * field + 2];
    const double *adv_src = tables.adv_fields[field];
    const double *ko_src = tables.ko_fields[field];
    double *rhs = tables.rhs_fields[field];

    /* 0-based (i0, j0, k0) with x fastest, and their 1-based counterparts. */
    const int i0 = tid % nx;
    const int j0 = (tid / nx) % ny;
    const int k0 = tid / (nx * ny);
    const int iF = i0 + 1;
    const int jF = j0 + 1;
    const int kF = k0 + 1;

#if ghost_width != 3
    /* Generic path: advection is skipped on the last plane of each direction,
     * the dissipation helper is invoked for every point. */
    if (i0 <= nx - 2 && j0 <= ny - 2 && k0 <= nz - 2) {
        const double val =
            fd_lopsided_axis(adv_src, iF, jF, kF, 0, Sfx[tid], iF, iminF, imaxF,
                             d_gp.dX, SoA0, SoA1, SoA2)
          + fd_lopsided_axis(adv_src, iF, jF, kF, 1, Sfy[tid], jF, jminF, jmaxF,
                             d_gp.dY, SoA0, SoA1, SoA2)
          + fd_lopsided_axis(adv_src, iF, jF, kF, 2, Sfz[tid], kF, kminF, kmaxF,
                             d_gp.dZ, SoA0, SoA1, SoA2);
        rhs[tid] += val;
    }

    rhs[tid] += fd_ko_term(ko_src, iF, jF, kF,
                           iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
                           eps_val, SoA0, SoA1, SoA2);
#else
    /* Advection: skipped on the last plane of each direction. */
    if (i0 <= nx - 2 && j0 <= ny - 2 && k0 <= nz - 2) {
        double val = 0.0;

        /* ---- x direction ----
         * Positive Sfx: stencil biased towards +x (points i-1 .. i+3) when
         * three points are available on that side, the centred stencil
         * (i-2 .. i+2) when only two are, otherwise the mirrored stencil
         * (i-3 .. i+1) applied with the opposite sign.
         * Negative Sfx: the same cascade mirrored towards -x, bounded by
         * iminF; if none of the three conditions holds nothing is added.
         * A value of exactly zero contributes nothing. */
        const double sfx = Sfx[tid];
        if (sfx > 0.0) {
            if (i0 <= nx - 4) {
                val += sfx * d_gp.d12dx * (
                    -3.0 * fetch_sym_ord3_direct(adv_src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
                    -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2)
                    +18.0 * fetch_sym_ord3_direct(adv_src, iF + 1, jF, kF, SoA0, SoA1, SoA2)
                    - 6.0 * fetch_sym_ord3_direct(adv_src, iF + 2, jF, kF, SoA0, SoA1, SoA2)
                    +       fetch_sym_ord3_direct(adv_src, iF + 3, jF, kF, SoA0, SoA1, SoA2));
            } else if (i0 <= nx - 3) {
                val += sfx * d_gp.d12dx * (
                    fetch_sym_ord3_direct(adv_src, iF - 2, jF, kF, SoA0, SoA1, SoA2)
                    - 8.0 * fetch_sym_ord3_direct(adv_src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
                    + 8.0 * fetch_sym_ord3_direct(adv_src, iF + 1, jF, kF, SoA0, SoA1, SoA2)
                    -       fetch_sym_ord3_direct(adv_src, iF + 2, jF, kF, SoA0, SoA1, SoA2));
            } else {
                val -= sfx * d_gp.d12dx * (
                    -3.0 * fetch_sym_ord3_direct(adv_src, iF + 1, jF, kF, SoA0, SoA1, SoA2)
                    -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2)
                    +18.0 * fetch_sym_ord3_direct(adv_src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
                    - 6.0 * fetch_sym_ord3_direct(adv_src, iF - 2, jF, kF, SoA0, SoA1, SoA2)
                    +       fetch_sym_ord3_direct(adv_src, iF - 3, jF, kF, SoA0, SoA1, SoA2));
            }
        } else if (sfx < 0.0) {
            if ((i0 - 2) >= iminF) {
                val -= sfx * d_gp.d12dx * (
                    -3.0 * fetch_sym_ord3_direct(adv_src, iF + 1, jF, kF, SoA0, SoA1, SoA2)
                    -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2)
                    +18.0 * fetch_sym_ord3_direct(adv_src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
                    - 6.0 * fetch_sym_ord3_direct(adv_src, iF - 2, jF, kF, SoA0, SoA1, SoA2)
                    +       fetch_sym_ord3_direct(adv_src, iF - 3, jF, kF, SoA0, SoA1, SoA2));
            } else if ((i0 - 1) >= iminF) {
                val += sfx * d_gp.d12dx * (
                    fetch_sym_ord3_direct(adv_src, iF - 2, jF, kF, SoA0, SoA1, SoA2)
                    - 8.0 * fetch_sym_ord3_direct(adv_src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
                    + 8.0 * fetch_sym_ord3_direct(adv_src, iF + 1, jF, kF, SoA0, SoA1, SoA2)
                    -       fetch_sym_ord3_direct(adv_src, iF + 2, jF, kF, SoA0, SoA1, SoA2));
            } else if (i0 >= iminF) {
                val += sfx * d_gp.d12dx * (
                    -3.0 * fetch_sym_ord3_direct(adv_src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
                    -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2)
                    +18.0 * fetch_sym_ord3_direct(adv_src, iF + 1, jF, kF, SoA0, SoA1, SoA2)
                    - 6.0 * fetch_sym_ord3_direct(adv_src, iF + 2, jF, kF, SoA0, SoA1, SoA2)
                    +       fetch_sym_ord3_direct(adv_src, iF + 3, jF, kF, SoA0, SoA1, SoA2));
            }
        }

        /* ---- y direction: same cascade as x, driven by Sfy and jminF / ny. ---- */
        const double sfy = Sfy[tid];
        if (sfy > 0.0) {
            if (j0 <= ny - 4) {
                val += sfy * d_gp.d12dy * (
                    -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
                    -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2)
                    +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 1, kF, SoA0, SoA1, SoA2)
                    - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 2, kF, SoA0, SoA1, SoA2)
                    +       fetch_sym_ord3_direct(adv_src, iF, jF + 3, kF, SoA0, SoA1, SoA2));
            } else if (j0 <= ny - 3) {
                val += sfy * d_gp.d12dy * (
                    fetch_sym_ord3_direct(adv_src, iF, jF - 2, kF, SoA0, SoA1, SoA2)
                    - 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
                    + 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 1, kF, SoA0, SoA1, SoA2)
                    -       fetch_sym_ord3_direct(adv_src, iF, jF + 2, kF, SoA0, SoA1, SoA2));
            } else {
                val -= sfy * d_gp.d12dy * (
                    -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 1, kF, SoA0, SoA1, SoA2)
                    -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2)
                    +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
                    - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 2, kF, SoA0, SoA1, SoA2)
                    +       fetch_sym_ord3_direct(adv_src, iF, jF - 3, kF, SoA0, SoA1, SoA2));
            }
        } else if (sfy < 0.0) {
            if ((j0 - 2) >= jminF) {
                val -= sfy * d_gp.d12dy * (
                    -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 1, kF, SoA0, SoA1, SoA2)
                    -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2)
                    +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
                    - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 2, kF, SoA0, SoA1, SoA2)
                    +       fetch_sym_ord3_direct(adv_src, iF, jF - 3, kF, SoA0, SoA1, SoA2));
            } else if ((j0 - 1) >= jminF) {
                val += sfy * d_gp.d12dy * (
                    fetch_sym_ord3_direct(adv_src, iF, jF - 2, kF, SoA0, SoA1, SoA2)
                    - 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
                    + 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 1, kF, SoA0, SoA1, SoA2)
                    -       fetch_sym_ord3_direct(adv_src, iF, jF + 2, kF, SoA0, SoA1, SoA2));
            } else if (j0 >= jminF) {
                val += sfy * d_gp.d12dy * (
                    -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
                    -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2)
                    +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 1, kF, SoA0, SoA1, SoA2)
                    - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 2, kF, SoA0, SoA1, SoA2)
                    +       fetch_sym_ord3_direct(adv_src, iF, jF + 3, kF, SoA0, SoA1, SoA2));
            }
        }

        /* ---- z direction: same cascade as x, driven by Sfz and kminF / nz. ---- */
        const double sfz = Sfz[tid];
        if (sfz > 0.0) {
            if (k0 <= nz - 4) {
                val += sfz * d_gp.d12dz * (
                    -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
                    -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2)
                    +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 1, SoA0, SoA1, SoA2)
                    - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 2, SoA0, SoA1, SoA2)
                    +       fetch_sym_ord3_direct(adv_src, iF, jF, kF + 3, SoA0, SoA1, SoA2));
            } else if (k0 <= nz - 3) {
                val += sfz * d_gp.d12dz * (
                    fetch_sym_ord3_direct(adv_src, iF, jF, kF - 2, SoA0, SoA1, SoA2)
                    - 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
                    + 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 1, SoA0, SoA1, SoA2)
                    -       fetch_sym_ord3_direct(adv_src, iF, jF, kF + 2, SoA0, SoA1, SoA2));
            } else {
                val -= sfz * d_gp.d12dz * (
                    -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 1, SoA0, SoA1, SoA2)
                    -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2)
                    +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
                    - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 2, SoA0, SoA1, SoA2)
                    +       fetch_sym_ord3_direct(adv_src, iF, jF, kF - 3, SoA0, SoA1, SoA2));
            }
        } else if (sfz < 0.0) {
            if ((k0 - 2) >= kminF) {
                val -= sfz * d_gp.d12dz * (
                    -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 1, SoA0, SoA1, SoA2)
                    -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2)
                    +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
                    - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 2, SoA0, SoA1, SoA2)
                    +       fetch_sym_ord3_direct(adv_src, iF, jF, kF - 3, SoA0, SoA1, SoA2));
            } else if ((k0 - 1) >= kminF) {
                val += sfz * d_gp.d12dz * (
                    fetch_sym_ord3_direct(adv_src, iF, jF, kF - 2, SoA0, SoA1, SoA2)
                    - 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
                    + 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 1, SoA0, SoA1, SoA2)
                    -       fetch_sym_ord3_direct(adv_src, iF, jF, kF + 2, SoA0, SoA1, SoA2));
            } else if (k0 >= kminF) {
                val += sfz * d_gp.d12dz * (
                    -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
                    -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2)
                    +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 1, SoA0, SoA1, SoA2)
                    - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 2, SoA0, SoA1, SoA2)
                    +       fetch_sym_ord3_direct(adv_src, iF, jF, kF + 3, SoA0, SoA1, SoA2));
            }
        }

        /* Single accumulation of the three directional contributions. */
        rhs[tid] += val;
    }

    /* Dissipation: applied only for a positive strength and only where the
     * full +/-3 stencil fits inside the index bounds in all three directions.
     * Each axis uses the weights (1, -6, 15, -20, 15, -6, 1). */
    if (eps_val > 0.0 &&
        (iF - 3) >= iminF && (iF + 3) <= imaxF &&
        (jF - 3) >= jminF && (jF + 3) <= jmaxF &&
        (kF - 3) >= kminF && (kF + 3) <= kmaxF)
    {
        const double cof = 64.0;
        const double Dx =
            (fetch_sym_ord3_direct(ko_src, iF - 3, jF, kF, SoA0, SoA1, SoA2)
           + fetch_sym_ord3_direct(ko_src, iF + 3, jF, kF, SoA0, SoA1, SoA2))
          - 6.0 * (fetch_sym_ord3_direct(ko_src, iF - 2, jF, kF, SoA0, SoA1, SoA2)
                 + fetch_sym_ord3_direct(ko_src, iF + 2, jF, kF, SoA0, SoA1, SoA2))
          + 15.0 * (fetch_sym_ord3_direct(ko_src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
                  + fetch_sym_ord3_direct(ko_src, iF + 1, jF, kF, SoA0, SoA1, SoA2))
          - 20.0 * fetch_sym_ord3_direct(ko_src, iF, jF, kF, SoA0, SoA1, SoA2);

        const double Dy =
            (fetch_sym_ord3_direct(ko_src, iF, jF - 3, kF, SoA0, SoA1, SoA2)
           + fetch_sym_ord3_direct(ko_src, iF, jF + 3, kF, SoA0, SoA1, SoA2))
          - 6.0 * (fetch_sym_ord3_direct(ko_src, iF, jF - 2, kF, SoA0, SoA1, SoA2)
                 + fetch_sym_ord3_direct(ko_src, iF, jF + 2, kF, SoA0, SoA1, SoA2))
          + 15.0 * (fetch_sym_ord3_direct(ko_src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
                  + fetch_sym_ord3_direct(ko_src, iF, jF + 1, kF, SoA0, SoA1, SoA2))
          - 20.0 * fetch_sym_ord3_direct(ko_src, iF, jF, kF, SoA0, SoA1, SoA2);

        const double Dz =
            (fetch_sym_ord3_direct(ko_src, iF, jF, kF - 3, SoA0, SoA1, SoA2)
           + fetch_sym_ord3_direct(ko_src, iF, jF, kF + 3, SoA0, SoA1, SoA2))
          - 6.0 * (fetch_sym_ord3_direct(ko_src, iF, jF, kF - 2, SoA0, SoA1, SoA2)
                 + fetch_sym_ord3_direct(ko_src, iF, jF, kF + 2, SoA0, SoA1, SoA2))
          + 15.0 * (fetch_sym_ord3_direct(ko_src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
                  + fetch_sym_ord3_direct(ko_src, iF, jF, kF + 1, SoA0, SoA1, SoA2))
          - 20.0 * fetch_sym_ord3_direct(ko_src, iF, jF, kF, SoA0, SoA1, SoA2);

        /* Each directional sum is divided by its own grid spacing. */
        rhs[tid] += (eps_val / cof) * (Dx / d_gp.dX + Dy / d_gp.dY + Dz / d_gp.dZ);
    }
#endif
}

/* Runs kern_lopsided_kodis_batched for exactly one field.
 *
 * A zero-initialised table is filled at index 0 with the advection source,
 * the dissipation source and the right-hand side; the three symmetry signs
 * are truncated from double to int.  The launch grid has a y-extent of 1,
 * so the kernel only ever sees field 0.
 */
static void gpu_lopsided_kodis_single_batch(double *d_f_adv, double *d_f_ko, double *d_f_rhs,
                                            double *d_Sfx, double *d_Sfy, double *d_Sfz,
                                            double SoA0, double SoA1, double SoA2,
                                            double eps_val, int all)
{
    LopsidedKodisTables one_field = {};
    one_field.adv_fields[0] = d_f_adv;
    one_field.ko_fields[0] = d_f_ko;
    one_field.rhs_fields[0] = d_f_rhs;

    /* Signs arrive as doubles; the table stores them as integers. */
    const double sign_in[3] = {SoA0, SoA1, SoA2};
    for (int axis = 0; axis < 3; ++axis) {
        one_field.soa_signs[axis] = (int)sign_in[axis];
    }

    const unsigned int point_blocks = (unsigned int)grid((size_t)all);
    const dim3 blocks(point_blocks, 1u);
    kern_lopsided_kodis_batched<<<blocks, BLK>>>(
        d_Sfx, d_Sfy, d_Sfz, one_field, eps_val);
}

/* Runs kern_lopsided_kodis_batched over the whole set of state fields.
 *
 * The first BSSN_LK_FIELD_COUNT table entries come from the slot lists
 * k_lk_adv_slots / k_lk_ko_slots / k_lk_rhs_slots and the sign list
 * k_lk_soa_signs.  With include_escalar set, two more entries are appended
 * (S_Sphi -> S_Sphi_rhs and S_Spi -> S_Spi_rhs), each using the same field
 * for advection and dissipation and the signs (1, 1, 1).
 *
 * The launch grid's y-extent equals the number of table entries; the
 * S_betax / S_betay / S_betaz slots are passed as the kernel's Sfx / Sfy /
 * Sfz inputs.
 */
static void gpu_lopsided_kodis_state_batch(double eps_val, int all, bool include_escalar = false)
{
    LopsidedKodisTables lk = {};

    /* Standard fields, driven by the file-level slot lists. */
    for (int f = 0; f < BSSN_LK_FIELD_COUNT; ++f) {
        lk.adv_fields[f] = g_buf.slot[k_lk_adv_slots[f]];
        lk.ko_fields[f] = g_buf.slot[k_lk_ko_slots[f]];
        lk.rhs_fields[f] = g_buf.slot[k_lk_rhs_slots[f]];
    }
    std::memcpy(lk.soa_signs, k_lk_soa_signs, sizeof(k_lk_soa_signs));

    int n_fields = BSSN_LK_FIELD_COUNT;

    /* Appends one entry whose advection and dissipation source coincide. */
    auto append_scalar = [&lk, &n_fields](double *field, double *field_rhs) {
        lk.adv_fields[n_fields] = field;
        lk.ko_fields[n_fields] = field;
        lk.rhs_fields[n_fields] = field_rhs;
        for (int axis = 0; axis < 3; ++axis) {
            lk.soa_signs[3 * n_fields + axis] = 1;
        }
        ++n_fields;
    };

    if (include_escalar) {
        append_scalar(g_buf.slot[S_Sphi], g_buf.slot[S_Sphi_rhs]);
        append_scalar(g_buf.slot[S_Spi], g_buf.slot[S_Spi_rhs]);
    }

    const dim3 blocks((unsigned int)grid((size_t)all), (unsigned int)n_fields);
    kern_lopsided_kodis_batched<<<blocks, BLK>>>(
        g_buf.slot[S_betax], g_buf.slot[S_betay], g_buf.slot[S_betaz], lk, eps_val);
}

/*
 * kern_escalar_sources
 *
 * Point-wise evaluation of the scalar-field right-hand sides (Sphi_rhs,
 * Spi_rhs) and of the matter source terms (rho, Sx..Sz, Sxx..Szz).
 *
 * Launch layout: 1-D grid-stride loop over d_gp.all points, so any grid
 * size is valid.  No neighbouring points are read; every input is sampled
 * at the point being written.
 *
 * Inputs named Kx/Ky/Kz and fxx..fzz are generic per-point arrays: the host
 * wrapper gpu_escalar_sources in this file passes the S_Sphi_x/y/z and
 * S_Sphi_xx..zz slots for them.
 *
 * With k = sqrt(pi/3) and phi = Sphi:
 *   V  = exp(-8 k phi) * (1 - exp(4 k phi))^2 / (32 pi * escalar_a2)
 *   dV = (1 / escalar_a2 / 12) * sqrt(3/pi) * exp(-8 k phi) * (exp(4 k phi) - 1)
 * (dV is the analytic derivative of V with respect to phi.)
 */
__global__ __launch_bounds__(128, 4)
void kern_escalar_sources(
    const double * __restrict__ Sphi,
    const double * __restrict__ Spi,
    const double * __restrict__ alpn1,
    const double * __restrict__ chin1,
    const double * __restrict__ gxx,
    const double * __restrict__ gxy,
    const double * __restrict__ gxz,
    const double * __restrict__ gyy,
    const double * __restrict__ gyz,
    const double * __restrict__ gzz,
    const double * __restrict__ gupxx,
    const double * __restrict__ gupxy,
    const double * __restrict__ gupxz,
    const double * __restrict__ gupyy,
    const double * __restrict__ gupyz,
    const double * __restrict__ gupzz,
    const double * __restrict__ chix,
    const double * __restrict__ chiy,
    const double * __restrict__ chiz,
    const double * __restrict__ Lapx,
    const double * __restrict__ Lapy,
    const double * __restrict__ Lapz,
    const double * __restrict__ trK,
    const double * __restrict__ Gamx,
    const double * __restrict__ Gamy,
    const double * __restrict__ Gamz,
    const double * __restrict__ Kx,
    const double * __restrict__ Ky,
    const double * __restrict__ Kz,
    const double * __restrict__ fxx,
    const double * __restrict__ fxy,
    const double * __restrict__ fxz,
    const double * __restrict__ fyy,
    const double * __restrict__ fyz,
    const double * __restrict__ fzz,
    double * __restrict__ Sphi_rhs,
    double * __restrict__ Spi_rhs,
    double * __restrict__ rho,
    double * __restrict__ Sx,
    double * __restrict__ Sy,
    double * __restrict__ Sz,
    double * __restrict__ Sxx,
    double * __restrict__ Sxy,
    double * __restrict__ Sxz,
    double * __restrict__ Syy,
    double * __restrict__ Syz,
    double * __restrict__ Szz,
    double escalar_a2)
{
    constexpr double kPi = 3.141592653589793238462643383279502884;

    /* Same for every point, so evaluated once per thread. */
    const double k_phi = sqrt(kPi / 3.0);
    const unsigned int stride = blockDim.x * gridDim.x;

    int p = blockIdx.x * blockDim.x + threadIdx.x;
    while (p < d_gp.all) {
        const double chi_v = chin1[p];
        const double lapse = alpn1[p];
        const double dphx = Kx[p];
        const double dphy = Ky[p];
        const double dphz = Kz[p];
        const double mom = Spi[p];
        const double phi = Sphi[p];

        const double ixx = gupxx[p];
        const double ixy = gupxy[p];
        const double ixz = gupxz[p];
        const double iyy = gupyy[p];
        const double iyz = gupyz[p];
        const double izz = gupzz[p];

        const double cx = chix[p];
        const double cy = chiy[p];
        const double cz = chiz[p];
        const double ax = Lapx[p];
        const double ay = Lapy[p];
        const double az = Lapz[p];

        /* Potential and its derivative with respect to phi. */
        const double e_p4 = exp(4.0 * k_phi * phi);
        const double e_m8 = exp(-8.0 * k_phi * phi);
        const double pot = e_m8 * (1.0 - e_p4) * (1.0 - e_p4) / (32.0 * kPi * escalar_a2);
        const double dpot = (1.0 / escalar_a2 / 12.0) * sqrt(3.0 / kPi) * e_m8 * (-1.0 + e_p4);

        Sphi_rhs[p] = lapse * mom;

        /* Inverse metric contracted with the second-derivative inputs. */
        const double second = ixx * fxx[p] + iyy * fyy[p] + izz * fzz[p]
                            + 2.0 * (ixy * fxy[p] + ixz * fxz[p] + iyz * fyz[p]);
        /* (Gam^i + gup^ij chi_j / (2 chi)) contracted with the first derivatives. */
        const double conn = (Gamx[p] + (ixx * cx + ixy * cy + ixz * cz) / (2.0 * chi_v)) * dphx
                          + (Gamy[p] + (ixy * cx + iyy * cy + iyz * cz) / (2.0 * chi_v)) * dphy
                          + (Gamz[p] + (ixz * cx + iyz * cy + izz * cz) / (2.0 * chi_v)) * dphz;
        /* Inverse metric contracted with the Lap gradient and the first derivatives. */
        const double cross = (ixx * ax * dphx + ixy * ax * dphy + ixz * ax * dphz
                           +  ixy * ay * dphx + iyy * ay * dphy + iyz * ay * dphz
                           +  ixz * az * dphx + iyz * az * dphy + izz * az * dphz);
        const double pi_acc = (second - conn) * lapse + cross;
        Spi_rhs[p] = pi_acc * chi_v + lapse * (trK[p] * mom - dpot);

        /* Source terms. */
        const double kin = 0.5 * (ixx * dphx * dphx + iyy * dphy * dphy + izz * dphz * dphz)
                         + ixy * dphx * dphy + ixz * dphx * dphz + iyz * dphy * dphz;
        const double energy = chi_v * kin + 0.5 * mom * mom + pot;
        rho[p] = energy;
        Sx[p] = -mom * dphx;
        Sy[p] = -mom * dphy;
        Sz[p] = -mom * dphz;

        const double trace_w = (energy - mom * mom) / chi_v;
        Sxx[p] = dphx * dphx - trace_w * gxx[p];
        Sxy[p] = dphx * dphy - trace_w * gxy[p];
        Sxz[p] = dphx * dphz - trace_w * gxz[p];
        Syy[p] = dphy * dphy - trace_w * gyy[p];
        Syz[p] = dphy * dphz - trace_w * gyz[p];
        Szz[p] = dphz * dphz - trace_w * gzz[p];

        p += stride;
    }
}

/* Host driver for the scalar-field source terms.
 *
 * 1. gpu_fderivs fills the S_Sphi_x/y/z slots from S_Sphi, with symmetry
 *    arguments (1, 1, 1).
 * 2. gpu_fdderivs fills the S_Sphi_xx..zz slots from S_Sphi, same arguments.
 * 3. kern_escalar_sources consumes those derivative slots (as its Kx/Ky/Kz
 *    and fxx..fzz inputs) together with the metric, gauge and connection
 *    slots, and writes S_Sphi_rhs, S_Spi_rhs, S_rho, S_Sx..S_Sz and
 *    S_Sxx..S_Szz.
 *
 * `all` sizes the launches; `escalar_a2` is passed through to the kernel.
 */
static void gpu_escalar_sources(int all, double escalar_a2)
{
    auto &slot = g_buf.slot;

    /* First and second derivatives of Sphi. */
    gpu_fderivs(slot[S_Sphi],
                slot[S_Sphi_x], slot[S_Sphi_y], slot[S_Sphi_z],
                1.0, 1.0, 1.0, all);
    gpu_fdderivs(slot[S_Sphi],
                 slot[S_Sphi_xx], slot[S_Sphi_xy], slot[S_Sphi_xz],
                 slot[S_Sphi_yy], slot[S_Sphi_yz], slot[S_Sphi_zz],
                 1.0, 1.0, 1.0, all);

    kern_escalar_sources<<<grid((size_t)all), BLK>>>(
        /* scalar field and its conjugate */
        slot[S_Sphi], slot[S_Spi],
        slot[S_alpn1], slot[S_chin1],
        /* metric and inverse metric */
        slot[S_gxx], slot[S_gxy], slot[S_gxz],
        slot[S_gyy], slot[S_gyz], slot[S_gzz],
        slot[S_gupxx], slot[S_gupxy], slot[S_gupxz],
        slot[S_gupyy], slot[S_gupyz], slot[S_gupzz],
        /* chi and Lap gradients */
        slot[S_chix], slot[S_chiy], slot[S_chiz],
        slot[S_Lapx], slot[S_Lapy], slot[S_Lapz],
        slot[S_trK],
        slot[S_Gamx], slot[S_Gamy], slot[S_Gamz],
        /* derivatives of Sphi computed above */
        slot[S_Sphi_x], slot[S_Sphi_y], slot[S_Sphi_z],
        slot[S_Sphi_xx], slot[S_Sphi_xy], slot[S_Sphi_xz],
        slot[S_Sphi_yy], slot[S_Sphi_yz], slot[S_Sphi_zz],
        /* outputs */
        slot[S_Sphi_rhs], slot[S_Spi_rhs],
        slot[S_rho],
        slot[S_Sx], slot[S_Sy], slot[S_Sz],
        slot[S_Sxx], slot[S_Sxy], slot[S_Sxz],
        slot[S_Syy], slot[S_Syz], slot[S_Szz],
        escalar_a2);
}

/*
 * Point-wise evaluation of the electromagnetic right-hand sides and the
 * electromagnetic matter sources.
 *
 * Launch layout: 1-D; each thread handles the single index
 * i = blockIdx.x * blockDim.x + threadIdx.x and exits if i >= d_gp.all.
 * Compiled with __launch_bounds__(128, 3), so the block size must not
 * exceed 128 threads.
 *
 * Inputs (read-only): chi, the six metric arrays dxx..dzz, Lap, the shift
 * components, trK, E and B components, Kpsi/Kphi, the current J, qchar,
 * and per-direction arrays for chi, the metric, Lap, the shift, Kpsi/Kphi,
 * E and B.  In gpu_em_rhs_sources these per-direction arrays are the
 * fx/fy/fz outputs of gpu_fderivs_batch for the matching field (e.g.
 * betaxy is the "fy" output for betax) -- presumably first spatial
 * derivatives; confirm against gpu_fderivs_batch.
 *
 * Outputs: Kpsi_rhs, Kphi_rhs, E*_rhs, B*_rhs, rho, Sx..Sz, Sxx..Szz.
 */
__global__ __launch_bounds__(128, 3)
void kern_em_rhs_sources(
    const double * __restrict__ chi,
    const double * __restrict__ dxx, const double * __restrict__ dxy, const double * __restrict__ dxz,
    const double * __restrict__ dyy, const double * __restrict__ dyz, const double * __restrict__ dzz,
    const double * __restrict__ Lap,
    const double * __restrict__ betax, const double * __restrict__ betay, const double * __restrict__ betaz,
    const double * __restrict__ trK,
    const double * __restrict__ Ex, const double * __restrict__ Ey, const double * __restrict__ Ez,
    const double * __restrict__ Bx, const double * __restrict__ By, const double * __restrict__ Bz,
    const double * __restrict__ Kpsi, const double * __restrict__ Kphi,
    const double * __restrict__ Jx, const double * __restrict__ Jy, const double * __restrict__ Jz,
    const double * __restrict__ qchar,
    const double * __restrict__ chix, const double * __restrict__ chiy, const double * __restrict__ chiz,
    const double * __restrict__ gxxx, const double * __restrict__ gxyx, const double * __restrict__ gxzx,
    const double * __restrict__ gyyx, const double * __restrict__ gyzx, const double * __restrict__ gzzx,
    const double * __restrict__ gxxy, const double * __restrict__ gxyy, const double * __restrict__ gxzy,
    const double * __restrict__ gyyy, const double * __restrict__ gyzy, const double * __restrict__ gzzy,
    const double * __restrict__ gxxz, const double * __restrict__ gxyz, const double * __restrict__ gxzz,
    const double * __restrict__ gyyz, const double * __restrict__ gyzz, const double * __restrict__ gzzz,
    const double * __restrict__ Lapx, const double * __restrict__ Lapy, const double * __restrict__ Lapz,
    const double * __restrict__ betaxx, const double * __restrict__ betaxy, const double * __restrict__ betaxz,
    const double * __restrict__ betayx, const double * __restrict__ betayy, const double * __restrict__ betayz,
    const double * __restrict__ betazx, const double * __restrict__ betazy, const double * __restrict__ betazz,
    const double * __restrict__ Kpsix, const double * __restrict__ Kpsiy, const double * __restrict__ Kpsiz,
    const double * __restrict__ Kphix, const double * __restrict__ Kphiy, const double * __restrict__ Kphiz,
    const double * __restrict__ Exx, const double * __restrict__ Exy, const double * __restrict__ Exz,
    const double * __restrict__ Eyx, const double * __restrict__ Eyy, const double * __restrict__ Eyz,
    const double * __restrict__ Ezx, const double * __restrict__ Ezy, const double * __restrict__ Ezz,
    const double * __restrict__ Bxx, const double * __restrict__ Bxy, const double * __restrict__ Bxz,
    const double * __restrict__ Byx, const double * __restrict__ Byy, const double * __restrict__ Byz,
    const double * __restrict__ Bzx, const double * __restrict__ Bzy, const double * __restrict__ Bzz,
    double * __restrict__ Kpsi_rhs, double * __restrict__ Kphi_rhs,
    double * __restrict__ Ex_rhs, double * __restrict__ Ey_rhs, double * __restrict__ Ez_rhs,
    double * __restrict__ Bx_rhs, double * __restrict__ By_rhs, double * __restrict__ Bz_rhs,
    double * __restrict__ rho, double * __restrict__ Sx, double * __restrict__ Sy, double * __restrict__ Sz,
    double * __restrict__ Sxx, double * __restrict__ Sxy, double * __restrict__ Sxz,
    double * __restrict__ Syy, double * __restrict__ Syz, double * __restrict__ Szz)
{
    /* One thread per point; surplus threads in the last block do nothing. */
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= d_gp.all) return;

    /* kappa multiplies the Kpsi/Kphi damping terms below and is fixed to 1 here. */
    constexpr double ONE = 1.0, TWO = 2.0, FOUR = 4.0, F3o2 = 1.5, EIT = 8.0, kappa = 1.0;
    constexpr double PI = 3.141592653589793238462643383279502884;
    /* Lap and chi are stored offset by one; chi3o2 = chin1^(3/2). */
    const double alpn1 = Lap[i] + ONE;
    const double chin1 = chi[i] + ONE;
    const double sqc = sqrt(chin1);
    const double chi3o2 = sqc * sqc * sqc;
    /* Metric components divided by chin1 (diagonal inputs are stored minus one). */
    const double gxxv = (dxx[i] + ONE) / chin1;
    const double gxyv = dxy[i] / chin1;
    const double gxzv = dxz[i] / chin1;
    const double gyyv = (dyy[i] + ONE) / chin1;
    const double gyzv = dyz[i] / chin1;
    const double gzzv = (dzz[i] + ONE) / chin1;
    /* Quotient-rule form (g_d - g * chi_d) / chin1 for each component and
       direction d -- i.e. the derivative of the rescaled metric above, assuming
       the g??d / chi? inputs are derivatives of the stored fields (TODO confirm). */
    const double gxxxv = (gxxx[i] - gxxv * chix[i]) / chin1;
    const double gxxyv = (gxxy[i] - gxxv * chiy[i]) / chin1;
    const double gxxzv = (gxxz[i] - gxxv * chiz[i]) / chin1;
    const double gxyxv = (gxyx[i] - gxyv * chix[i]) / chin1;
    const double gxyyv = (gxyy[i] - gxyv * chiy[i]) / chin1;
    const double gxyzv = (gxyz[i] - gxyv * chiz[i]) / chin1;
    const double gxzxv = (gxzx[i] - gxzv * chix[i]) / chin1;
    const double gxzyv = (gxzy[i] - gxzv * chiy[i]) / chin1;
    const double gxzzv = (gxzz[i] - gxzv * chiz[i]) / chin1;
    const double gyyxv = (gyyx[i] - gyyv * chix[i]) / chin1;
    const double gyyyv = (gyyy[i] - gyyv * chiy[i]) / chin1;
    const double gyyzv = (gyyz[i] - gyyv * chiz[i]) / chin1;
    const double gyzxv = (gyzx[i] - gyzv * chix[i]) / chin1;
    const double gyzyv = (gyzy[i] - gyzv * chiy[i]) / chin1;
    const double gyzzv = (gyzz[i] - gyzv * chiz[i]) / chin1;
    const double gzzxv = (gzzx[i] - gzzv * chix[i]) / chin1;
    const double gzzyv = (gzzy[i] - gzzv * chiy[i]) / chin1;
    const double gzzzv = (gzzz[i] - gzzv * chiz[i]) / chin1;
    /* Determinant and cofactor inverse of the symmetric 3x3 matrix g??v. */
    const double det = gxxv*gyyv*gzzv + gxyv*gyzv*gxzv + gxzv*gxyv*gyzv - gxzv*gyyv*gxzv - gxyv*gxyv*gzzv - gxxv*gyzv*gyzv;
    const double gupxx = (gyyv*gzzv - gyzv*gyzv) / det;
    const double gupxy = -(gxyv*gzzv - gyzv*gxzv) / det;
    const double gupxz = (gxyv*gyzv - gyyv*gxzv) / det;
    const double gupyy = (gxxv*gzzv - gxzv*gxzv) / det;
    const double gupyz = -(gxxv*gyzv - gxyv*gxzv) / det;
    const double gupzz = (gxxv*gyyv - gxyv*gxyv) / det;

    /* E-field RHS: trK term, shift-derivative term, current term, gup * Kpsi
       derivative term, plus chi3o2 times a curl-like difference built from
       (g??v * B) scaled by alpn1, expanded with the product rule. */
    Ex_rhs[i] = alpn1*trK[i]*Ex[i]-(Ex[i]*betaxx[i]+Ey[i]*betaxy[i]+Ez[i]*betaxz[i])-FOUR*PI*alpn1*Jx[i]-alpn1*(gupxx*Kpsix[i]+gupxy*Kpsiy[i]+gupxz*Kpsiz[i])
        + chi3o2*(((gxzv*Bx[i]+gyzv*By[i]+gzzv*Bz[i])*Lapy[i]+alpn1*(gxzv*Bxy[i]+gyzv*Byy[i]+gzzv*Bzy[i])+alpn1*(Bx[i]*gxzyv+By[i]*gyzyv+Bz[i]*gzzyv))-((gxyv*Bx[i]+gyyv*By[i]+gyzv*Bz[i])*Lapz[i]+alpn1*(gxyv*Bxz[i]+gyyv*Byz[i]+gyzv*Bzz[i])+alpn1*(Bx[i]*gxyzv+By[i]*gyyzv+Bz[i]*gyzzv)));
    Ey_rhs[i] = alpn1*trK[i]*Ey[i]-(Ex[i]*betayx[i]+Ey[i]*betayy[i]+Ez[i]*betayz[i])-FOUR*PI*alpn1*Jy[i]-alpn1*(gupxy*Kpsix[i]+gupyy*Kpsiy[i]+gupyz*Kpsiz[i])
        + chi3o2*(((gxxv*Bx[i]+gxyv*By[i]+gxzv*Bz[i])*Lapz[i]+alpn1*(gxxv*Bxz[i]+gxyv*Byz[i]+gxzv*Bzz[i])+alpn1*(Bx[i]*gxxzv+By[i]*gxyzv+Bz[i]*gxzzv))-((gxzv*Bx[i]+gyzv*By[i]+gzzv*Bz[i])*Lapx[i]+alpn1*(gxzv*Bxx[i]+gyzv*Byx[i]+gzzv*Bzx[i])+alpn1*(Bx[i]*gxzxv+By[i]*gyzxv+Bz[i]*gzzxv)));
    Ez_rhs[i] = alpn1*trK[i]*Ez[i]-(Ex[i]*betazx[i]+Ey[i]*betazy[i]+Ez[i]*betazz[i])-FOUR*PI*alpn1*Jz[i]-alpn1*(gupxz*Kpsix[i]+gupyz*Kpsiy[i]+gupzz*Kpsiz[i])
        + chi3o2*(((gxyv*Bx[i]+gyyv*By[i]+gyzv*Bz[i])*Lapx[i]+alpn1*(gxyv*Bxx[i]+gyyv*Byx[i]+gyzv*Bzx[i])+alpn1*(Bx[i]*gxyxv+By[i]*gyyxv+Bz[i]*gyzxv))-((gxxv*Bx[i]+gxyv*By[i]+gxzv*Bz[i])*Lapy[i]+alpn1*(gxxv*Bxy[i]+gxyv*Byy[i]+gxzv*Bzy[i])+alpn1*(Bx[i]*gxxyv+By[i]*gxyyv+Bz[i]*gxzyv)));
    /* B-field RHS: same structure with E and B swapped, Kphi in place of Kpsi,
       no current term, and the chi3o2 curl-like term subtracted instead of added. */
    Bx_rhs[i] = alpn1*trK[i]*Bx[i]-(Bx[i]*betaxx[i]+By[i]*betaxy[i]+Bz[i]*betaxz[i])-alpn1*(gupxx*Kphix[i]+gupxy*Kphiy[i]+gupxz*Kphiz[i])
        - chi3o2*(((gxzv*Ex[i]+gyzv*Ey[i]+gzzv*Ez[i])*Lapy[i]+alpn1*(gxzv*Exy[i]+gyzv*Eyy[i]+gzzv*Ezy[i])+alpn1*(Ex[i]*gxzyv+Ey[i]*gyzyv+Ez[i]*gzzyv))-((gxyv*Ex[i]+gyyv*Ey[i]+gyzv*Ez[i])*Lapz[i]+alpn1*(gxyv*Exz[i]+gyyv*Eyz[i]+gyzv*Ezz[i])+alpn1*(Ex[i]*gxyzv+Ey[i]*gyyzv+Ez[i]*gyzzv)));
    By_rhs[i] = alpn1*trK[i]*By[i]-(Bx[i]*betayx[i]+By[i]*betayy[i]+Bz[i]*betayz[i])-alpn1*(gupxy*Kphix[i]+gupyy*Kphiy[i]+gupyz*Kphiz[i])
        - chi3o2*(((gxxv*Ex[i]+gxyv*Ey[i]+gxzv*Ez[i])*Lapz[i]+alpn1*(gxxv*Exz[i]+gxyv*Eyz[i]+gxzv*Ezz[i])+alpn1*(Ex[i]*gxxzv+Ey[i]*gxyzv+Ez[i]*gxzzv))-((gxzv*Ex[i]+gyzv*Ey[i]+gzzv*Ez[i])*Lapx[i]+alpn1*(gxzv*Exx[i]+gyzv*Eyx[i]+gzzv*Ezx[i])+alpn1*(Ex[i]*gxzxv+Ey[i]*gyzxv+Ez[i]*gzzxv)));
    Bz_rhs[i] = alpn1*trK[i]*Bz[i]-(Bx[i]*betazx[i]+By[i]*betazy[i]+Bz[i]*betazz[i])-alpn1*(gupxz*Kphix[i]+gupyz*Kphiy[i]+gupzz*Kphiz[i])
        - chi3o2*(((gxyv*Ex[i]+gyyv*Ey[i]+gyzv*Ez[i])*Lapx[i]+alpn1*(gxyv*Exx[i]+gyyv*Eyx[i]+gyzv*Ezx[i])+alpn1*(Ex[i]*gxyxv+Ey[i]*gyyxv+Ez[i]*gyzxv))-((gxxv*Ex[i]+gxyv*Ey[i]+gxzv*Ez[i])*Lapy[i]+alpn1*(gxxv*Exy[i]+gxyv*Eyy[i]+gxzv*Ezy[i])+alpn1*(Ex[i]*gxxyv+Ey[i]*gxyyv+Ez[i]*gxzyv)));
    /* Kpsi/Kphi RHS: charge term (Kpsi only), -alpn1*kappa damping, and
       -alpn1 times (Exx+Eyy+Ezz resp. Bxx+Byy+Bzz) corrected by
       1.5/chin1 * (chi? . field). */
    Kpsi_rhs[i] = FOUR*PI*alpn1*qchar[i]-alpn1*kappa*Kpsi[i]-alpn1*(Exx[i]+Eyy[i]+Ezz[i]-F3o2/chin1*(chix[i]*Ex[i]+chiy[i]*Ey[i]+chiz[i]*Ez[i]));
    Kphi_rhs[i] = -alpn1*kappa*Kphi[i]-alpn1*(Bxx[i]+Byy[i]+Bzz[i]-F3o2/chin1*(chix[i]*Bx[i]+chiy[i]*By[i]+chiz[i]*Bz[i]));

    /* Sources: rho = g??v-contracted (E.E + B.B) / (8 pi);
       S_i = (E x B)_i / (4 pi) / chi3o2. */
    const double lrho = (gxxv*(Ex[i]*Ex[i]+Bx[i]*Bx[i])+gyyv*(Ey[i]*Ey[i]+By[i]*By[i])+gzzv*(Ez[i]*Ez[i]+Bz[i]*Bz[i])+TWO*(gxyv*(Ex[i]*Ey[i]+Bx[i]*By[i])+gxzv*(Ex[i]*Ez[i]+Bx[i]*Bz[i])+gyzv*(Ey[i]*Ez[i]+By[i]*Bz[i])))/EIT/PI;
    rho[i] = lrho;
    Sx[i] = (Ey[i]*Bz[i]-Ez[i]*By[i])/FOUR/PI/chi3o2;
    Sy[i] = (Ez[i]*Bx[i]-Ex[i]*Bz[i])/FOUR/PI/chi3o2;
    Sz[i] = (Ex[i]*By[i]-Ey[i]*Bx[i])/FOUR/PI/chi3o2;
    /* lE?/lB? are E and B multiplied by the g??v matrix (index-lowered forms). */
    const double lEx = gxxv*Ex[i]+gxyv*Ey[i]+gxzv*Ez[i], lEy = gxyv*Ex[i]+gyyv*Ey[i]+gyzv*Ez[i], lEz = gxzv*Ex[i]+gyzv*Ey[i]+gzzv*Ez[i];
    const double lBx = gxxv*Bx[i]+gxyv*By[i]+gxzv*Bz[i], lBy = gxyv*Bx[i]+gyyv*By[i]+gyzv*Bz[i], lBz = gxzv*Bx[i]+gyzv*By[i]+gzzv*Bz[i];
    /* S_ij = rho * g_ij - (lE_i lE_j + lB_i lB_j) / (4 pi). */
    Sxx[i] = lrho*gxxv-(lEx*lEx+lBx*lBx)/FOUR/PI;
    Sxy[i] = lrho*gxyv-(lEx*lEy+lBx*lBy)/FOUR/PI;
    Sxz[i] = lrho*gxzv-(lEx*lEz+lBx*lBz)/FOUR/PI;
    Syy[i] = lrho*gyyv-(lEy*lEy+lBy*lBy)/FOUR/PI;
    Syz[i] = lrho*gyzv-(lEy*lEz+lBy*lBz)/FOUR/PI;
    Szz[i] = lrho*gzzv-(lEz*lEz+lBz*lBz)/FOUR/PI;
}

/*
 * Launches kern_lopsided_kodis_batched for the eight electromagnetic
 * fields (Kpsi, Kphi, Ex, Ey, Ez, Bx, By, Bz) in one 2-D launch whose
 * grid y-extent is the field count.
 *
 * For every field the same slot is registered as both the advection and
 * the KO input, its *_rhs slot as the output, and three per-axis sign
 * factors are copied into the table.
 *
 *   eps_val  forwarded to the kernel unchanged
 *   all      point count used to size the x-extent of the launch grid
 */
static void gpu_lopsided_kodis_em_batch(double eps_val, int all)
{
    constexpr int kEmFields = 8;

    const int field_slot[kEmFields] = {
        S_EM_Kpsi, S_EM_Kphi,
        S_EM_Ex, S_EM_Ey, S_EM_Ez,
        S_EM_Bx, S_EM_By, S_EM_Bz};
    const int out_slot[kEmFields] = {
        S_EM_Kpsi_rhs, S_EM_Kphi_rhs,
        S_EM_Ex_rhs, S_EM_Ey_rhs, S_EM_Ez_rhs,
        S_EM_Bx_rhs, S_EM_By_rhs, S_EM_Bz_rhs};
    /* Three sign factors per field, in the same field order as above. */
    const int parity[3 * kEmFields] = {
        1, 1, 1,   1, 1, 1,
       -1, 1, 1,   1,-1, 1,   1, 1,-1,
        1,-1,-1,  -1, 1,-1,  -1,-1, 1
    };

    LopsidedKodisTables tables = {};
    for (int f = 0; f < kEmFields; ++f) {
        double *const field = g_buf.slot[field_slot[f]];
        tables.adv_fields[f] = field;
        tables.ko_fields[f] = field;
        tables.rhs_fields[f] = g_buf.slot[out_slot[f]];
        for (int axis = 0; axis < 3; ++axis) {
            tables.soa_signs[3 * f + axis] = parity[3 * f + axis];
        }
    }

    const dim3 launch_grid((unsigned int)grid((size_t)all), (unsigned int)kEmFields);
    kern_lopsided_kodis_batched<<<launch_grid, BLK>>>(
        g_buf.slot[S_betax], g_buf.slot[S_betay], g_buf.slot[S_betaz], tables, eps_val);
}

/*
 * Host-side driver for the electromagnetic RHS step.
 *
 *  1. gpu_fderivs_batch fills the x/y/z output slots for 19 fields
 *     (lapse, shift, chi, six metric components, Kpsi, Kphi, E, B),
 *     each with its own triple of sign factors.
 *  2. kern_em_rhs_sources computes the EM right-hand sides and the
 *     matter sources point by point.
 *  3. gpu_lopsided_kodis_em_batch is run on the EM fields with `eps`.
 *
 *   all  number of grid points
 *   eps  forwarded to gpu_lopsided_kodis_em_batch
 */
static void gpu_em_rhs_sources(int all, double eps)
{
    auto &buf = g_buf.slot;
    constexpr int kNumFields = 19;

    /* Fields handed to the batched derivative helper. */
    double *fields[] = {
        buf[S_Lap], buf[S_betax], buf[S_betay], buf[S_betaz], buf[S_chi],
        buf[S_dxx], buf[S_gxy], buf[S_gxz], buf[S_dyy], buf[S_gyz], buf[S_dzz],
        buf[S_EM_Kpsi], buf[S_EM_Kphi],
        buf[S_EM_Ex], buf[S_EM_Ey], buf[S_EM_Ez],
        buf[S_EM_Bx], buf[S_EM_By], buf[S_EM_Bz]};
    /* x-direction outputs, one per entry of fields[]. */
    double *ddx[] = {
        buf[S_Lapx], buf[S_betaxx], buf[S_betayx], buf[S_betazx], buf[S_chix],
        buf[S_gxxx], buf[S_gxyx], buf[S_gxzx], buf[S_gyyx], buf[S_gyzx], buf[S_gzzx],
        buf[S_EM_Kpsix], buf[S_EM_Kphix],
        buf[S_EM_Exx], buf[S_EM_Eyx], buf[S_EM_Ezx],
        buf[S_EM_Bxx], buf[S_EM_Byx], buf[S_EM_Bzx]};
    /* y-direction outputs. */
    double *ddy[] = {
        buf[S_Lapy], buf[S_betaxy], buf[S_betayy], buf[S_betazy], buf[S_chiy],
        buf[S_gxxy], buf[S_gxyy], buf[S_gxzy], buf[S_gyyy], buf[S_gyzy], buf[S_gzzy],
        buf[S_EM_Kpsiy], buf[S_EM_Kphiy],
        buf[S_EM_Exy], buf[S_EM_Eyy], buf[S_EM_Ezy],
        buf[S_EM_Bxy], buf[S_EM_Byy], buf[S_EM_Bzy]};
    /* z-direction outputs. */
    double *ddz[] = {
        buf[S_Lapz], buf[S_betaxz], buf[S_betayz], buf[S_betazz], buf[S_chiz],
        buf[S_gxxz], buf[S_gxyz], buf[S_gxzz], buf[S_gyyz], buf[S_gyzz], buf[S_gzzz],
        buf[S_EM_Kpsiz], buf[S_EM_Kphiz],
        buf[S_EM_Exz], buf[S_EM_Eyz], buf[S_EM_Ezz],
        buf[S_EM_Bxz], buf[S_EM_Byz], buf[S_EM_Bzz]};
    /* Three sign factors per field, same order as fields[]. */
    const int parity[] = {
        /* Lap, betax, betay, betaz, chi */
        1,1,1,  -1,1,1,  1,-1,1,  1,1,-1,  1,1,1,
        /* dxx, gxy, gxz, dyy, gyz, dzz */
        1,1,1,  -1,-1,1, -1,1,-1, 1,1,1, 1,-1,-1, 1,1,1,
        /* Kpsi, Kphi */
        1,1,1,  1,1,1,
        /* Ex, Ey, Ez, Bx, By, Bz */
        -1,1,1, 1,-1,1, 1,1,-1, 1,-1,-1, -1,1,-1, -1,-1,1
    };
    static_assert(sizeof(fields) / sizeof(fields[0]) == (size_t)kNumFields,
                  "fields[] must list kNumFields entries");
    static_assert(sizeof(parity) / sizeof(parity[0]) == (size_t)(3 * kNumFields),
                  "parity[] must list three entries per field");

    gpu_fderivs_batch(kNumFields, fields, ddx, ddy, ddz, parity, all);

    kern_em_rhs_sources<<<grid((size_t)all), BLK>>>(
        /* chi and metric */
        buf[S_chi],
        buf[S_dxx], buf[S_gxy], buf[S_gxz], buf[S_dyy], buf[S_gyz], buf[S_dzz],
        /* gauge and trK */
        buf[S_Lap], buf[S_betax], buf[S_betay], buf[S_betaz], buf[S_trK],
        /* EM fields and charge/current */
        buf[S_EM_Ex], buf[S_EM_Ey], buf[S_EM_Ez],
        buf[S_EM_Bx], buf[S_EM_By], buf[S_EM_Bz],
        buf[S_EM_Kpsi], buf[S_EM_Kphi],
        buf[S_EM_Jx], buf[S_EM_Jy], buf[S_EM_Jz], buf[S_EM_qchar],
        /* per-direction arrays of chi and the metric */
        buf[S_chix], buf[S_chiy], buf[S_chiz],
        buf[S_gxxx], buf[S_gxyx], buf[S_gxzx], buf[S_gyyx], buf[S_gyzx], buf[S_gzzx],
        buf[S_gxxy], buf[S_gxyy], buf[S_gxzy], buf[S_gyyy], buf[S_gyzy], buf[S_gzzy],
        buf[S_gxxz], buf[S_gxyz], buf[S_gxzz], buf[S_gyyz], buf[S_gyzz], buf[S_gzzz],
        /* per-direction arrays of lapse and shift */
        buf[S_Lapx], buf[S_Lapy], buf[S_Lapz],
        buf[S_betaxx], buf[S_betaxy], buf[S_betaxz],
        buf[S_betayx], buf[S_betayy], buf[S_betayz],
        buf[S_betazx], buf[S_betazy], buf[S_betazz],
        /* per-direction arrays of Kpsi, Kphi, E, B */
        buf[S_EM_Kpsix], buf[S_EM_Kpsiy], buf[S_EM_Kpsiz],
        buf[S_EM_Kphix], buf[S_EM_Kphiy], buf[S_EM_Kphiz],
        buf[S_EM_Exx], buf[S_EM_Exy], buf[S_EM_Exz],
        buf[S_EM_Eyx], buf[S_EM_Eyy], buf[S_EM_Eyz],
        buf[S_EM_Ezx], buf[S_EM_Ezy], buf[S_EM_Ezz],
        buf[S_EM_Bxx], buf[S_EM_Bxy], buf[S_EM_Bxz],
        buf[S_EM_Byx], buf[S_EM_Byy], buf[S_EM_Byz],
        buf[S_EM_Bzx], buf[S_EM_Bzy], buf[S_EM_Bzz],
        /* outputs: EM right-hand sides */
        buf[S_EM_Kpsi_rhs], buf[S_EM_Kphi_rhs],
        buf[S_EM_Ex_rhs], buf[S_EM_Ey_rhs], buf[S_EM_Ez_rhs],
        buf[S_EM_Bx_rhs], buf[S_EM_By_rhs], buf[S_EM_Bz_rhs],
        /* outputs: matter sources */
        buf[S_rho], buf[S_Sx], buf[S_Sy], buf[S_Sz],
        buf[S_Sxx], buf[S_Sxy], buf[S_Sxz], buf[S_Syy], buf[S_Syz], buf[S_Szz]);

    gpu_lopsided_kodis_em_batch(eps, all);
}

/*
 * Single-field RK4 stage update, written as a grid-stride loop over
 * d_gp.all points.
 *
 * On entry frhs holds the stage derivative k; on exit it holds the next
 * field value and accum holds the running weighted sum of derivatives:
 *
 *   stage 0   : accum  = k        ; frhs = f0 + dT/2 * k
 *   stage 1   : accum += 2k       ; frhs = f0 + dT/2 * k
 *   stage 2   : accum += 2k       ; frhs = f0 + dT   * k
 *   otherwise : frhs = f0 + dT/6 * (accum + k)   (accum left untouched)
 */
__global__ void kern_rk4_finalize(const double * __restrict__ f0,
                                  double * __restrict__ frhs,
                                  double * __restrict__ accum,
                                  double dT,
                                  int rk4_stage)
{
    int n = blockIdx.x * blockDim.x + threadIdx.x;
    while (n < d_gp.all) {
        const double k = frhs[n];

        if (rk4_stage == 0) {
            accum[n] = k;
            frhs[n] = f0[n] + 0.5 * dT * k;
        } else if (rk4_stage == 1) {
            accum[n] += 2.0 * k;
            frhs[n] = f0[n] + 0.5 * dT * k;
        } else if (rk4_stage == 2) {
            accum[n] += 2.0 * k;
            frhs[n] = f0[n] + dT * k;
        } else {
            /* Final combination step. */
            frhs[n] = f0[n] + (dT / 6.0) * (accum[n] + k);
        }

        n += blockDim.x * gridDim.x;
    }
}

/*
 * Multi-field RK4 stage update.
 *
 * Launch layout: 2-D grid; x covers the points (one thread per point,
 * guarded by d_gp.all), blockIdx.y selects the field in `tables`.
 * Block size must not exceed 128 (__launch_bounds__).
 *
 * Per point the update is the same as in kern_rk4_finalize:
 *   stage 0   : accum  = k  ; out = f0 + dT/2 * k
 *   stage 1   : accum += 2k ; out = f0 + dT/2 * k
 *   stage 2   : accum += 2k ; out = f0 + dT   * k
 *   otherwise : out = f0 + dT/6 * (accum + k)
 * For field 0 only, the result is raised to `chitiny` when it falls
 * below it.  The result overwrites the rhs array of the field.
 */
__global__ __launch_bounds__(128, 4)
void kern_rk4_finalize_batched(Rk4FinalizeTables tables,
                               double dT,
                               int rk4_stage,
                               double chitiny)
{
    const int pt = blockIdx.x * blockDim.x + threadIdx.x;
    if (pt >= d_gp.all) return;

    const int which = blockIdx.y;
    const double *base = tables.f0_fields[which];
    double *work = tables.rhs_fields[which];
    double *sum = tables.accum_fields[which];

    const double k = work[pt];
    double next;
    if (rk4_stage == 0) {
        sum[pt] = k;
        next = base[pt] + 0.5 * dT * k;
    } else if (rk4_stage == 1) {
        sum[pt] += 2.0 * k;
        next = base[pt] + 0.5 * dT * k;
    } else if (rk4_stage == 2) {
        sum[pt] += 2.0 * k;
        next = base[pt] + dT * k;
    } else {
        next = base[pt] + (dT / 6.0) * (sum[pt] + k);
    }

    /* Lower bound applies to field 0 only. */
    if (which == 0 && next < chitiny) next = chitiny;

    work[pt] = next;
}

/*
 * Launches kern_rk4_finalize_batched for the first BSSN_STATE_COUNT
 * state fields.
 *
 * For field f the kernel sees ctx.d_state0[f] as the base state, the slot
 * named by k_state_rhs_slots[f] as the in/out rhs array and
 * ctx.d_accum[f] as the accumulator.
 *
 *   all        number of grid points (sizes the x-extent of the grid)
 *   dT         time step
 *   rk4_stage  stage selector forwarded to the kernel
 *   chitiny    lower bound forwarded to the kernel
 */
static void gpu_rk4_finalize_batch(const StepContext &ctx,
                                   size_t all,
                                   double dT,
                                   int rk4_stage,
                                   double chitiny)
{
    Rk4FinalizeTables tables = {};
    int f = 0;
    while (f < BSSN_STATE_COUNT) {
        tables.f0_fields[f] = ctx.d_state0[f];
        tables.accum_fields[f] = ctx.d_accum[f];
        tables.rhs_fields[f] = g_buf.slot[k_state_rhs_slots[f]];
        ++f;
    }

    const unsigned int blocks_x = (unsigned int)grid(all);
    const unsigned int blocks_y = (unsigned int)BSSN_STATE_COUNT;
    const dim3 launch_grid(blocks_x, blocks_y);
    kern_rk4_finalize_batched<<<launch_grid, BLK>>>(tables, dT, rk4_stage, chitiny);
}

/*
 * RK4 stage update for the scalar-field configuration.
 *
 * The first BSSN_STATE_COUNT fields go through the shared batched path
 * (gpu_rk4_finalize_batch); the two scalar fields are then updated with
 * one kern_rk4_finalize launch each, using state/accumulator indices 24
 * and 25 paired with the S_Sphi_rhs and S_Spi_rhs slots.
 *
 *   all        number of grid points
 *   dT         time step
 *   rk4_stage  stage selector forwarded to the kernels
 *   chitiny    lower bound used by the batched kernel only
 */
static void gpu_escalar_rk4_finalize_batch(const StepContext &ctx,
                                           size_t all,
                                           double dT,
                                           int rk4_stage,
                                           double chitiny)
{
    /* Common BSSN fields: identical table setup and launch. */
    gpu_rk4_finalize_batch(ctx, all, dT, rk4_stage, chitiny);

    /* Scalar fields: (state index, rhs slot) pairs, launched in order. */
    const int scalar_state[2] = {24, 25};
    const int scalar_slot[2] = {S_Sphi_rhs, S_Spi_rhs};
    for (int s = 0; s < 2; ++s) {
        const int idx = scalar_state[s];
        kern_rk4_finalize<<<grid(all), BLK>>>(ctx.d_state0[idx],
                                              g_buf.slot[scalar_slot[s]],
                                              ctx.d_accum[idx], dT, rk4_stage);
    }
}

/*
 * Launches kern_rk4_finalize_batched for the BSSN_EM_STATE_COUNT fields
 * of the electromagnetic configuration.
 *
 * Field f uses ctx.d_state0[f] as base state, the slot named by
 * k_em_state_rhs_slots[f] as the in/out rhs array and ctx.d_accum[f] as
 * the accumulator.
 *
 *   all        number of grid points (sizes the x-extent of the grid)
 *   dT         time step
 *   rk4_stage  stage selector forwarded to the kernel
 *   chitiny    lower bound forwarded to the kernel
 */
static void gpu_em_rk4_finalize_batch(const StepContext &ctx,
                                      size_t all,
                                      double dT,
                                      int rk4_stage,
                                      double chitiny)
{
    Rk4FinalizeTables tables = {};
    int f = 0;
    while (f < BSSN_EM_STATE_COUNT) {
        tables.f0_fields[f] = ctx.d_state0[f];
        tables.accum_fields[f] = ctx.d_accum[f];
        tables.rhs_fields[f] = g_buf.slot[k_em_state_rhs_slots[f]];
        ++f;
    }

    const unsigned int blocks_x = (unsigned int)grid(all);
    const unsigned int blocks_y = (unsigned int)BSSN_EM_STATE_COUNT;
    const dim3 launch_grid(blocks_x, blocks_y);
    kern_rk4_finalize_batched<<<launch_grid, BLK>>>(tables, dT, rk4_stage, chitiny);
}

/*
 * Copies src -> dst on selected outer faces of the patch, for several
 * fields at once.
 *
 * Launch layout: 2-D grid; x covers the d_gp.all points (one thread per
 * point), blockIdx.y selects the field.  Block size must not exceed 128.
 *
 * The flat index is decomposed with x fastest (nx = d_gp.ex[0], then
 * ny = d_gp.ex[1]).  A point is copied when it lies on the first or last
 * plane of an axis whose corresponding touch_* flag is non-zero.
 */
__global__ __launch_bounds__(128, 4)
void kern_copy_patch_boundary_batched(PatchBoundaryTables tables,
                                      int touch_xmin, int touch_xmax,
                                      int touch_ymin, int touch_ymax,
                                      int touch_zmin, int touch_zmax)
{
    const int cell = blockIdx.x * blockDim.x + threadIdx.x;
    if (cell >= d_gp.all) return;

    const int nx = d_gp.ex[0];
    const int ny = d_gp.ex[1];
    const int nz = d_gp.ex[2];

    const int ix = cell % nx;
    const int iy = (cell / nx) % ny;
    const int iz = cell / (nx * ny);

    const bool hit_x = (touch_xmin && ix == 0) || (touch_xmax && ix == nx - 1);
    const bool hit_y = (touch_ymin && iy == 0) || (touch_ymax && iy == ny - 1);
    const bool hit_z = (touch_zmin && iz == 0) || (touch_zmax && iz == nz - 1);

    if (hit_x || hit_y || hit_z) {
        const int which = blockIdx.y;
        tables.dst_fields[which][cell] = tables.src_fields[which][cell];
    }
}

/*
 * Launches kern_copy_patch_boundary_batched for the BSSN_STATE_COUNT
 * state fields, copying each input slot (k_state_input_slots) into the
 * matching rhs slot (k_state_rhs_slots) on the flagged faces.
 *
 * Does nothing when all six touch_* flags are zero.
 *
 *   all      number of grid points (sizes the x-extent of the grid)
 *   touch_*  non-zero to select the corresponding face
 */
static void gpu_copy_patch_boundary_batch(int all,
                                          int touch_xmin, int touch_xmax,
                                          int touch_ymin, int touch_ymax,
                                          int touch_zmin, int touch_zmax)
{
    const bool any_face = touch_xmin || touch_xmax ||
                          touch_ymin || touch_ymax ||
                          touch_zmin || touch_zmax;
    if (!any_face) return;

    PatchBoundaryTables tables = {};
    int f = 0;
    while (f < BSSN_STATE_COUNT) {
        tables.dst_fields[f] = g_buf.slot[k_state_rhs_slots[f]];
        tables.src_fields[f] = g_buf.slot[k_state_input_slots[f]];
        ++f;
    }

    const unsigned int blocks_x = (unsigned int)grid((size_t)all);
    const unsigned int blocks_y = (unsigned int)BSSN_STATE_COUNT;
    const dim3 launch_grid(blocks_x, blocks_y);
    kern_copy_patch_boundary_batched<<<launch_grid, BLK>>>(
        tables,
        touch_xmin, touch_xmax,
        touch_ymin, touch_ymax,
        touch_zmin, touch_zmax);
}

/*
 * Overwrites out_fields with f0_fields on selected outer faces of the
 * patch, for several fields at once.
 *
 * Launch layout: 2-D grid; x covers the d_gp.all points (one thread per
 * point), blockIdx.y selects the field.  Block size must not exceed 128.
 *
 * The flat index is decomposed with x fastest (nx = d_gp.ex[0], then
 * ny = d_gp.ex[1]).  A point is restored when it lies on the first or
 * last plane of an axis whose corresponding touch_* flag is non-zero.
 */
__global__ __launch_bounds__(128, 4)
void kern_escalar_restore_patch_boundary_batched(EScalarBoundaryTables tables,
                                                 int touch_xmin, int touch_xmax,
                                                 int touch_ymin, int touch_ymax,
                                                 int touch_zmin, int touch_zmax)
{
    const int cell = blockIdx.x * blockDim.x + threadIdx.x;
    if (cell >= d_gp.all) return;

    const int nx = d_gp.ex[0];
    const int ny = d_gp.ex[1];
    const int nz = d_gp.ex[2];

    const int ix = cell % nx;
    const int iy = (cell / nx) % ny;
    const int iz = cell / (nx * ny);

    const bool hit_x = (touch_xmin && ix == 0) || (touch_xmax && ix == nx - 1);
    const bool hit_y = (touch_ymin && iy == 0) || (touch_ymax && iy == ny - 1);
    const bool hit_z = (touch_zmin && iz == 0) || (touch_zmax && iz == nz - 1);

    if (hit_x || hit_y || hit_z) {
        const int which = blockIdx.y;
        tables.out_fields[which][cell] = tables.f0_fields[which][cell];
    }
}

/*
 * Launches kern_escalar_restore_patch_boundary_batched for the
 * BSSN_ESCALAR_STATE_COUNT fields of the scalar-field configuration:
 * on each flagged face, the rhs slot named by k_escalar_state_rhs_slots
 * is overwritten with ctx.d_state0 of the same field.
 *
 * Does nothing when all six touch_* flags are zero.
 *
 *   all      number of grid points (sizes the x-extent of the grid)
 *   touch_*  non-zero to select the corresponding face
 */
static void gpu_escalar_restore_patch_boundary_batch(const StepContext &ctx,
                                                     int all,
                                                     int touch_xmin, int touch_xmax,
                                                     int touch_ymin, int touch_ymax,
                                                     int touch_zmin, int touch_zmax)
{
    const bool any_face = touch_xmin || touch_xmax ||
                          touch_ymax || touch_ymin ||
                          touch_zmin || touch_zmax;
    if (!any_face) return;

    EScalarBoundaryTables tables = {};
    int f = 0;
    while (f < BSSN_ESCALAR_STATE_COUNT) {
        tables.out_fields[f] = g_buf.slot[k_escalar_state_rhs_slots[f]];
        tables.f0_fields[f] = ctx.d_state0[f];
        ++f;
    }

    const unsigned int blocks_x = (unsigned int)grid((size_t)all);
    const unsigned int blocks_y = (unsigned int)BSSN_ESCALAR_STATE_COUNT;
    const dim3 launch_grid(blocks_x, blocks_y);
    kern_escalar_restore_patch_boundary_batched<<<launch_grid, BLK>>>(
        tables,
        touch_xmin, touch_xmax, touch_ymin, touch_ymax, touch_zmin, touch_zmax);
}

/*
 * Launches kern_escalar_restore_patch_boundary_batched for the
 * BSSN_EM_STATE_COUNT fields of the electromagnetic configuration:
 * on each flagged face, the rhs slot named by k_em_state_rhs_slots is
 * overwritten with ctx.d_state0 of the same field.
 *
 * Does nothing when all six touch_* flags are zero.
 *
 * NOTE(review): this fills an EScalarBoundaryTables with
 * BSSN_EM_STATE_COUNT entries; confirm the table arrays are sized for at
 * least that many fields.
 *
 *   all      number of grid points (sizes the x-extent of the grid)
 *   touch_*  non-zero to select the corresponding face
 */
static void gpu_em_restore_patch_boundary_batch(const StepContext &ctx,
                                                int all,
                                                int touch_xmin, int touch_xmax,
                                                int touch_ymin, int touch_ymax,
                                                int touch_zmin, int touch_zmax)
{
    const bool any_face = touch_xmin || touch_xmax ||
                          touch_ymax || touch_ymin ||
                          touch_zmin || touch_zmax;
    if (!any_face) return;

    EScalarBoundaryTables tables = {};
    int f = 0;
    while (f < BSSN_EM_STATE_COUNT) {
        tables.out_fields[f] = g_buf.slot[k_em_state_rhs_slots[f]];
        tables.f0_fields[f] = ctx.d_state0[f];
        ++f;
    }

    const unsigned int blocks_x = (unsigned int)grid((size_t)all);
    const unsigned int blocks_y = (unsigned int)BSSN_EM_STATE_COUNT;
    const dim3 launch_grid(blocks_x, blocks_y);
    kern_escalar_restore_patch_boundary_batched<<<launch_grid, BLK>>>(
        tables,
        touch_xmin, touch_xmax, touch_ymin, touch_ymax, touch_zmin, touch_zmax);
}

/*
 * Enforces the algebraic constraints on the metric and A_ij in place,
 * as a grid-stride loop over d_gp.all points.
 *
 * Per point:
 *  1. Build the full metric (diagonal inputs are stored minus one) and
 *     rescale all six components by det^(-1/3), so the rescaled matrix has
 *     unit determinant up to rounding; store it back (diagonals minus one).
 *  2. Form the cofactor matrix of the rescaled metric (its inverse, given
 *     unit determinant) and use it to take the trace of A_ij.
 *  3. Subtract one third of (metric * trace) from every A_ij component.
 */
__global__ void kern_enforce_ga_cuda(double * __restrict__ dxx,
                                     double * __restrict__ gxy,
                                     double * __restrict__ gxz,
                                     double * __restrict__ dyy,
                                     double * __restrict__ gyz,
                                     double * __restrict__ dzz,
                                     double * __restrict__ Axx,
                                     double * __restrict__ Axy,
                                     double * __restrict__ Axz,
                                     double * __restrict__ Ayy,
                                     double * __restrict__ Ayz,
                                     double * __restrict__ Azz)
{
    constexpr double ONE = 1.0;
    constexpr double TWO = 2.0;
    constexpr double F1O3 = 1.0 / 3.0;

    int n = blockIdx.x * blockDim.x + threadIdx.x;
    while (n < d_gp.all) {
        /* Full metric at this point. */
        double hxx = dxx[n] + ONE;
        double hyy = dyy[n] + ONE;
        double hzz = dzz[n] + ONE;
        double hxy = gxy[n];
        double hxz = gxz[n];
        double hyz = gyz[n];

        const double det = hxx * hyy * hzz
                         + hxy * hyz * hxz
                         + hxz * hxy * hyz
                         - hxz * hyy * hxz
                         - hxy * hxy * hzz
                         - hxx * hyz * hyz;
        const double rescale = ONE / cbrt(det);

        hxx *= rescale;
        hxy *= rescale;
        hxz *= rescale;
        hyy *= rescale;
        hyz *= rescale;
        hzz *= rescale;

        dxx[n] = hxx - ONE;
        gxy[n] = hxy;
        gxz[n] = hxz;
        dyy[n] = hyy - ONE;
        gyz[n] = hyz;
        dzz[n] = hzz - ONE;

        /* Cofactors of the rescaled metric. */
        const double cxx =   (hyy * hzz - hyz * hyz);
        const double cxy = - (hxy * hzz - hyz * hxz);
        const double cxz =   (hxy * hyz - hyy * hxz);
        const double cyy =   (hxx * hzz - hxz * hxz);
        const double cyz = - (hxx * hyz - hxy * hxz);
        const double czz =   (hxx * hyy - hxy * hxy);

        const double axx = Axx[n];
        const double axy = Axy[n];
        const double axz = Axz[n];
        const double ayy = Ayy[n];
        const double ayz = Ayz[n];
        const double azz = Azz[n];

        const double trace = cxx * axx + cyy * ayy + czz * azz
                           + TWO * (cxy * axy + cxz * axz + cyz * ayz);

        /* Remove the trace part. */
        Axx[n] = axx - F1O3 * hxx * trace;
        Axy[n] = axy - F1O3 * hxy * trace;
        Axz[n] = axz - F1O3 * hxz * trace;
        Ayy[n] = ayy - F1O3 * hyy * trace;
        Ayz[n] = ayz - F1O3 * hyz * trace;
        Azz[n] = azz - F1O3 * hzz * trace;

        n += blockDim.x * gridDim.x;
    }
}

/*
 * Raises every entry of chi that is below `tinny` up to `tinny`.
 * Grid-stride loop over d_gp.all points; entries that are not below the
 * bound (including NaN, for which the comparison is false) are left
 * untouched.
 */
__global__ void kern_lowerboundset_cuda(double * __restrict__ chi, double tinny)
{
    int n = blockIdx.x * blockDim.x + threadIdx.x;
    while (n < d_gp.all) {
        const double value = chi[n];
        if (value < tinny) {
            chi[n] = tinny;
        }
        n += blockDim.x * gridDim.x;
    }
}

/*
 * Face selector passed to kern_sommerfeld_face_bam.
 * Upper faces take values 0..2 (x, y, z); lower faces take 3..5 (x, y, z).
 */
enum SommerFace {
    FACE_XMAX = 0,  /* 0: i = nx - 1 plane */
    FACE_YMAX,      /* 1: j = ny - 1 plane */
    FACE_ZMAX,      /* 2: k = nz - 1 plane */
    FACE_XMIN,      /* 3: i = 0 plane */
    FACE_YMIN,      /* 4: j = 0 plane */
    FACE_ZMIN       /* 5: k = 0 plane */
};

/*
 * Writes the outgoing-wave (Sommerfeld-type) boundary RHS on one face of
 * the patch.
 *
 * Launch layout: 1-D grid-stride loop over the points of the selected
 * face plane (ny*nz, nx*nz or nx*ny points depending on `face`).
 *
 *   fh        read-only padded copy of the field, addressed through
 *             idx_fh2 with indices offset by +1 from the 0-based point
 *             indices (presumably 1-based with ghost layers below --
 *             confirm against idx_fh2 / kern_symbd_pack_ord2)
 *   f_rhs     output; only entries on the selected face are written
 *   face      SommerFace value selecting the plane
 *   velocity  wave speed factor
 *   x0,y0,z0  coordinates of point (0,0,0); point (i,j,k) is taken to be
 *             at x0 + i*dX, y0 + j*dY, z0 + k*dZ
 *
 * For each face point the kernel forms w = velocity * (x,y,z) / r, picks
 * a difference stencil per axis from the sign of the matching w component
 * and the index range still available, and stores
 *   f_rhs = -velocity * (fx*x + fy*y + fz*z + f) / r.
 * Points with r == 0 are skipped.
 */
__global__ void kern_sommerfeld_face_bam(const double * __restrict__ fh,
                                         double * __restrict__ f_rhs,
                                         int face,
                                         double velocity,
                                         double x0,
                                         double y0,
                                         double z0)
{
    const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
    /* Lowest usable stencil index per axis: 0 instead of 1 when the symmetry
       setting applies to that axis and the first coordinate lies within one
       grid spacing of zero. */
    const int imin = (d_gp.Symmetry > 1 && fabs(x0) < d_gp.dX) ? 0 : 1;
    const int jmin = (d_gp.Symmetry > 1 && fabs(y0) < d_gp.dY) ? 0 : 1;
    const int kmin = (d_gp.Symmetry > 0 && fabs(z0) < d_gp.dZ) ? 0 : 1;
    const int imax = nx, jmax = ny, kmax = nz;
    /* Number of points on the selected face. */
    const int plane_count =
        (face == FACE_XMAX || face == FACE_XMIN) ? ny * nz :
        (face == FACE_YMAX || face == FACE_YMIN) ? nx * nz :
                                                   nx * ny;

    for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
         tid < plane_count;
         tid += blockDim.x * gridDim.x)
    {
        /* Map the flat face index to 0-based (i0, j0, k0); the index normal to
           the face is pinned to its first or last plane. */
        int i0 = 0, j0 = 0, k0 = 0;
        if (face == FACE_XMAX || face == FACE_XMIN) {
            j0 = tid % ny;
            k0 = tid / ny;
            i0 = (face == FACE_XMAX) ? (nx - 1) : 0;
        } else if (face == FACE_YMAX || face == FACE_YMIN) {
            i0 = tid % nx;
            k0 = tid / nx;
            j0 = (face == FACE_YMAX) ? (ny - 1) : 0;
        } else {
            i0 = tid % nx;
            j0 = tid / nx;
            k0 = (face == FACE_ZMAX) ? (nz - 1) : 0;
        }

        /* iF/jF/kF: indices used with idx_fh2 (offset by +1); p: output index. */
        const int iF = i0 + 1;
        const int jF = j0 + 1;
        const int kF = k0 + 1;
        const int p = idx_ex_d(i0, j0, k0);

        const double x = x0 + i0 * d_gp.dX;
        const double y = y0 + j0 * d_gp.dY;
        const double z = z0 + k0 * d_gp.dZ;
        const double r = sqrt(x * x + y * y + z * z);
        /* Avoid dividing by zero at the origin; f_rhs is left untouched there. */
        if (r == 0.0) continue;

        const double wx = velocity * x / r;
        const double wy = velocity * y / r;
        const double wz = velocity * z / r;

        /* Per axis: w > 0 prefers the three-point stencil on the lower-index
           side, w < 0 the one on the higher-index side; each falls back to the
           two-point centered stencil and then to the opposite one-sided stencil
           when the preferred points are outside [min, max].  w == 0 leaves the
           derivative at zero.  d2dx/d2dy/d2dz are the stencil scale factors
           (presumably 1/(2*spacing) -- confirm where d_gp is filled). */
        double fx = 0.0, fy = 0.0, fz = 0.0;
        if (wx > 0.0) {
            if (iF - 2 >= imin) fx = d_gp.d2dx * (3.0 * fh[idx_fh2(iF, jF, kF)] - 4.0 * fh[idx_fh2(iF - 1, jF, kF)] + fh[idx_fh2(iF - 2, jF, kF)]);
            else if (iF - 1 >= imin) fx = d_gp.d2dx * (-fh[idx_fh2(iF - 1, jF, kF)] + fh[idx_fh2(iF + 1, jF, kF)]);
            else fx = d_gp.d2dx * (-fh[idx_fh2(iF + 2, jF, kF)] + 4.0 * fh[idx_fh2(iF + 1, jF, kF)] - 3.0 * fh[idx_fh2(iF, jF, kF)]);
        } else if (wx < 0.0) {
            if (iF + 2 <= imax) fx = d_gp.d2dx * (-fh[idx_fh2(iF + 2, jF, kF)] + 4.0 * fh[idx_fh2(iF + 1, jF, kF)] - 3.0 * fh[idx_fh2(iF, jF, kF)]);
            else if (iF + 1 <= imax) fx = d_gp.d2dx * (-fh[idx_fh2(iF - 1, jF, kF)] + fh[idx_fh2(iF + 1, jF, kF)]);
            else fx = d_gp.d2dx * (3.0 * fh[idx_fh2(iF, jF, kF)] - 4.0 * fh[idx_fh2(iF - 1, jF, kF)] + fh[idx_fh2(iF - 2, jF, kF)]);
        }

        if (wy > 0.0) {
            if (jF - 2 >= jmin) fy = d_gp.d2dy * (3.0 * fh[idx_fh2(iF, jF, kF)] - 4.0 * fh[idx_fh2(iF, jF - 1, kF)] + fh[idx_fh2(iF, jF - 2, kF)]);
            else if (jF - 1 >= jmin) fy = d_gp.d2dy * (-fh[idx_fh2(iF, jF - 1, kF)] + fh[idx_fh2(iF, jF + 1, kF)]);
            else fy = d_gp.d2dy * (-fh[idx_fh2(iF, jF + 2, kF)] + 4.0 * fh[idx_fh2(iF, jF + 1, kF)] - 3.0 * fh[idx_fh2(iF, jF, kF)]);
        } else if (wy < 0.0) {
            if (jF + 2 <= jmax) fy = d_gp.d2dy * (-fh[idx_fh2(iF, jF + 2, kF)] + 4.0 * fh[idx_fh2(iF, jF + 1, kF)] - 3.0 * fh[idx_fh2(iF, jF, kF)]);
            else if (jF + 1 <= jmax) fy = d_gp.d2dy * (-fh[idx_fh2(iF, jF - 1, kF)] + fh[idx_fh2(iF, jF + 1, kF)]);
            else fy = d_gp.d2dy * (3.0 * fh[idx_fh2(iF, jF, kF)] - 4.0 * fh[idx_fh2(iF, jF - 1, kF)] + fh[idx_fh2(iF, jF - 2, kF)]);
        }

        if (wz > 0.0) {
            if (kF - 2 >= kmin) fz = d_gp.d2dz * (3.0 * fh[idx_fh2(iF, jF, kF)] - 4.0 * fh[idx_fh2(iF, jF, kF - 1)] + fh[idx_fh2(iF, jF, kF - 2)]);
            else if (kF - 1 >= kmin) fz = d_gp.d2dz * (-fh[idx_fh2(iF, jF, kF - 1)] + fh[idx_fh2(iF, jF, kF + 1)]);
            else fz = d_gp.d2dz * (-fh[idx_fh2(iF, jF, kF + 2)] + 4.0 * fh[idx_fh2(iF, jF, kF + 1)] - 3.0 * fh[idx_fh2(iF, jF, kF)]);
        } else if (wz < 0.0) {
            if (kF + 2 <= kmax) fz = d_gp.d2dz * (-fh[idx_fh2(iF, jF, kF + 2)] + 4.0 * fh[idx_fh2(iF, jF, kF + 1)] - 3.0 * fh[idx_fh2(iF, jF, kF)]);
            else if (kF + 1 <= kmax) fz = d_gp.d2dz * (-fh[idx_fh2(iF, jF, kF - 1)] + fh[idx_fh2(iF, jF, kF + 1)]);
            else fz = d_gp.d2dz * (3.0 * fh[idx_fh2(iF, jF, kF)] - 4.0 * fh[idx_fh2(iF, jF, kF - 1)] + fh[idx_fh2(iF, jF, kF - 2)]);
        }

        /* Radial-derivative-plus-field form of the boundary condition. */
        f_rhs[p] = -velocity * (fx * x + fy * y + fz * z + fh[idx_fh2(iF, jF, kF)]) / r;
    }
}

/*
 * gpu_sommerfeld_routbam — queue the outer-boundary kernels for one field.
 *
 * Sequence:
 *   1. kern_symbd_pack_ord2 copies d_f0 into the scratch buffer g_buf.d_fh2,
 *      launched over (nx+2)*(ny+2)*(nz+2) work items (presumably a one-cell
 *      padded copy filled according to the SoA parities — confirm against the
 *      kernel itself).
 *   2. kern_sommerfeld_face_bam is launched once per patch face that lies on
 *      the outer boundary and writes d_f_rhs there.
 *
 * Parameters:
 *   d_f0, d_f_rhs  field and RHS buffers; only forwarded to kernels here
 *                  (the d_ prefix suggests device memory — confirm at call sites).
 *   velocity       forwarded to the face kernel; exactly 0.0 skips everything.
 *   SoA0..SoA2     forwarded unchanged to kern_symbd_pack_ord2.
 *   X, Y, Z        coordinate arrays, dereferenced on the host; need at least
 *                  two entries each because the spacing is taken as X[1]-X[0].
 *   bbox           read as {xmin, ymin, zmin, xmax, ymax, zmax}.
 *   Symmetry       > 0 suppresses the z-min face when bbox zmin sits on z = 0;
 *                  == 2 additionally suppresses x-min / y-min faces on x = 0 / y = 0.
 *
 * The patch extents come from g_buf.prev_nx/ny/nz, not from an argument
 * (NOTE(review): assumes g_buf was prepared for the same patch — confirm).
 *
 * All launches go to the default stream and are not synchronised here.  The
 * launch status is inspected once at the end with cudaPeekAtLastError(), which
 * does not clear the error, so any caller-side check still sees it; failures
 * are reported on stderr and execution continues as before.
 */
static void gpu_sommerfeld_routbam(double *d_f0, double *d_f_rhs,
                                   double velocity,
                                   double SoA0, double SoA1, double SoA2,
                                   double *X, double *Y, double *Z,
                                   const double *bbox,
                                   int Symmetry)
{
    /* Zero velocity disables the boundary update entirely. */
    if (velocity == 0.0) return;

    const int nx = g_buf.prev_nx;
    const int ny = g_buf.prev_ny;
    const int nz = g_buf.prev_nz;

    /* Grid spacing from the first two nodes of each coordinate array. */
    const double dX = X[1] - X[0];
    const double dY = Y[1] - Y[0];
    const double dZ = Z[1] - Z[0];

    /* A face is on the outer boundary when the patch edge is within one cell of bbox. */
    const bool touch_xmax = fabs(X[nx - 1] - bbox[3]) < dX;
    const bool touch_ymax = fabs(Y[ny - 1] - bbox[4]) < dY;
    const bool touch_zmax = fabs(Z[nz - 1] - bbox[5]) < dZ;

    /* Lower faces are skipped when they coincide with a symmetry plane at 0. */
    const bool touch_xmin = fabs(X[0] - bbox[0]) < dX &&
                            !(Symmetry == 2 && fabs(bbox[0]) < dX * 0.5);
    const bool touch_ymin = fabs(Y[0] - bbox[1]) < dY &&
                            !(Symmetry == 2 && fabs(bbox[1]) < dY * 0.5);
    const bool touch_zmin = fabs(Z[0] - bbox[2]) < dZ &&
                            !(Symmetry > 0 && fabs(bbox[2]) < dZ * 0.5);

    /* Work sizes: padded volume for the pack step, one thread per face point otherwise. */
    const size_t w_pack = (size_t)(nx + 2) * (ny + 2) * (nz + 2);
    const size_t n_yz = (size_t)ny * nz;
    const size_t n_xz = (size_t)nx * nz;
    const size_t n_xy = (size_t)nx * ny;

    /* Coordinates of the first grid node, passed by value to the face kernel. */
    const double x0 = X[0];
    const double y0 = Y[0];
    const double z0 = Z[0];

    kern_symbd_pack_ord2<<<grid(w_pack), BLK>>>(d_f0, g_buf.d_fh2, SoA0, SoA1, SoA2);

    /* Launch order (max faces first, then min faces) is kept as in the original. */
    if (touch_xmax) kern_sommerfeld_face_bam<<<grid(n_yz), BLK>>>(g_buf.d_fh2, d_f_rhs, FACE_XMAX, velocity, x0, y0, z0);
    if (touch_ymax) kern_sommerfeld_face_bam<<<grid(n_xz), BLK>>>(g_buf.d_fh2, d_f_rhs, FACE_YMAX, velocity, x0, y0, z0);
    if (touch_zmax) kern_sommerfeld_face_bam<<<grid(n_xy), BLK>>>(g_buf.d_fh2, d_f_rhs, FACE_ZMAX, velocity, x0, y0, z0);
    if (touch_xmin) kern_sommerfeld_face_bam<<<grid(n_yz), BLK>>>(g_buf.d_fh2, d_f_rhs, FACE_XMIN, velocity, x0, y0, z0);
    if (touch_ymin) kern_sommerfeld_face_bam<<<grid(n_xz), BLK>>>(g_buf.d_fh2, d_f_rhs, FACE_YMIN, velocity, x0, y0, z0);
    if (touch_zmin) kern_sommerfeld_face_bam<<<grid(n_xy), BLK>>>(g_buf.d_fh2, d_f_rhs, FACE_ZMIN, velocity, x0, y0, z0);

    /* Kernel launches return nothing; surface launch failures instead of
     * silently leaving the boundary RHS untouched.  Peek (not Get) so the
     * error state stays visible to any check the caller performs later. */
    const cudaError_t launch_err = cudaPeekAtLastError();
    if (launch_err != cudaSuccess) {
        fprintf(stderr,
                "gpu_sommerfeld_routbam: CUDA error pending after boundary kernel launches: %s\n",
                cudaGetErrorString(launch_err));
    }
}

/* ================================================================== */
/*  C. Point-wise computation kernels                                  */
/* ================================================================== */

/* Phase 1: point-wise preparation.
 *   alpn1 = Lap + 1,  chin1 = chi + 1,
 *   gxx = dxx + 1,    gyy = dyy + 1,    gzz = dzz + 1
 * (presumably the inputs are stored as offsets from 1 — confirm with caller).
 * Processes indices [0, d_gp.all) with a grid-stride loop, so any 1-D launch
 * configuration covers the whole range. */
__global__ void kern_phase1_prep(
    const double* __restrict__ Lap, const double* __restrict__ chi,
    const double* __restrict__ dxx, const double* __restrict__ dyy,
    const double* __restrict__ dzz,
    double* __restrict__ alpn1, double* __restrict__ chin1,
    double* __restrict__ gxx, double* __restrict__ gyy, double* __restrict__ gzz)
{
    const double ONE = 1.0;
    const unsigned int stride = blockDim.x * gridDim.x;
    for (int p = blockIdx.x * blockDim.x + threadIdx.x; p < d_gp.all; p += stride) {
        /* Read every input first, then write the shifted values. */
        const double lap_p = Lap[p];
        const double chi_p = chi[p];
        const double dxx_p = dxx[p];
        const double dyy_p = dyy[p];
        const double dzz_p = dzz[p];

        alpn1[p] = lap_p + ONE;
        chin1[p] = chi_p + ONE;
        gxx[p]   = dxx_p + ONE;
        gyy[p]   = dyy_p + ONE;
        gzz[p]   = dzz_p + ONE;
    }
}

/* Phase 2a: right-hand sides of chi and the six metric components.
 *
 * With div_b = betaxx + betayy + betazz the kernel evaluates, per point,
 *   chi_rhs = 2/3 * chin1 * (alpn1 * trK - div_b)
 *   gij_rhs = -2 * alpn1 * Aij  (-2/3 on the diagonal, +1/3 off it) * gij * div_b
 *             + products of the metric with the betaAB inputs.
 * (betaAB is presumably the B-derivative of shift component A — confirm.)
 * All operands are read once into locals; the arithmetic order of every
 * expression is the same as in the reference formulation.
 * Grid-stride loop over [0, d_gp.all). */
__global__ void kern_phase2_metric_rhs(
    const double* __restrict__ alpn1, const double* __restrict__ chin1,
    const double* __restrict__ gxx, const double* __restrict__ gxy,
    const double* __restrict__ gxz, const double* __restrict__ gyy,
    const double* __restrict__ gyz, const double* __restrict__ gzz,
    const double* __restrict__ trK,
    const double* __restrict__ Axx, const double* __restrict__ Axy,
    const double* __restrict__ Axz, const double* __restrict__ Ayy,
    const double* __restrict__ Ayz, const double* __restrict__ Azz,
    const double* __restrict__ betaxx, const double* __restrict__ betaxy,
    const double* __restrict__ betaxz, const double* __restrict__ betayx,
    const double* __restrict__ betayy, const double* __restrict__ betayz,
    const double* __restrict__ betazx, const double* __restrict__ betazy,
    const double* __restrict__ betazz,
    double* __restrict__ chi_rhs, double* __restrict__ gxx_rhs,
    double* __restrict__ gyy_rhs, double* __restrict__ gzz_rhs,
    double* __restrict__ gxy_rhs, double* __restrict__ gyz_rhs,
    double* __restrict__ gxz_rhs)
{
    const double TWO = 2.0, F1o3 = 1.0/3.0, F2o3 = 2.0/3.0;
    const unsigned int stride = blockDim.x * gridDim.x;
    for (int p = blockIdx.x * blockDim.x + threadIdx.x; p < d_gp.all; p += stride) {
        const double a = alpn1[p];

        /* metric components */
        const double hxx = gxx[p], hxy = gxy[p], hxz = gxz[p];
        const double hyy = gyy[p], hyz = gyz[p], hzz = gzz[p];

        /* betaAB inputs, local name bAB */
        const double bxx = betaxx[p], bxy = betaxy[p], bxz = betaxz[p];
        const double byx = betayx[p], byy = betayy[p], byz = betayz[p];
        const double bzx = betazx[p], bzy = betazy[p], bzz = betazz[p];

        const double div_b = bxx + byy + bzz;

        chi_rhs[p] = F2o3 * chin1[p] * (a * trK[p] - div_b);

        /* diagonal components */
        gxx_rhs[p] = -TWO * a * Axx[p] - F2o3 * hxx * div_b
                   + TWO * (hxx * bxx + hxy * byx + hxz * bzx);
        gyy_rhs[p] = -TWO * a * Ayy[p] - F2o3 * hyy * div_b
                   + TWO * (hxy * bxy + hyy * byy + hyz * bzy);
        gzz_rhs[p] = -TWO * a * Azz[p] - F2o3 * hzz * div_b
                   + TWO * (hxz * bxz + hyz * byz + hzz * bzz);

        /* off-diagonal components */
        gxy_rhs[p] = -TWO * a * Axy[p] + F1o3 * hxy * div_b
                   + hxx * bxy + hxz * bzy + hyy * byx
                   + hyz * bzx - hxy * bzz;
        gyz_rhs[p] = -TWO * a * Ayz[p] + F1o3 * hyz * div_b
                   + hxy * bxz + hyy * byz + hxz * bxy
                   + hzz * bzy - hyz * bxx;
        gxz_rhs[p] = -TWO * a * Axz[p] + F1o3 * hxz * div_b
                   + hxx * bxz + hxy * byz + hyz * byx
                   + hzz * bzx - hxz * byy;
    }
}

/* Phase 2b: inverse of the symmetric 3x3 metric at every point.
 *
 * The determinant is expanded explicitly and each output is the matching
 * cofactor times 1/det.  No guard is applied for det == 0.
 * Grid-stride loop over [0, d_gp.all). */
__global__ void kern_phase2_inverse(
    const double* __restrict__ gxx, const double* __restrict__ gxy,
    const double* __restrict__ gxz, const double* __restrict__ gyy,
    const double* __restrict__ gyz, const double* __restrict__ gzz,
    double* __restrict__ gupxx, double* __restrict__ gupxy,
    double* __restrict__ gupxz, double* __restrict__ gupyy,
    double* __restrict__ gupyz, double* __restrict__ gupzz)
{
    const unsigned int stride = blockDim.x * gridDim.x;
    for (int p = blockIdx.x * blockDim.x + threadIdx.x; p < d_gp.all; p += stride) {
        const double mxx = gxx[p], mxy = gxy[p], mxz = gxz[p];
        const double myy = gyy[p], myz = gyz[p], mzz = gzz[p];

        const double det = mxx*myy*mzz + mxy*myz*mxz + mxz*mxy*myz
                         - mxz*myy*mxz - mxy*mxy*mzz - mxx*myz*myz;
        const double inv_det = 1.0 / det;

        /* cofactors scaled by the reciprocal determinant */
        gupxx[p] =  (myy*mzz - myz*myz) * inv_det;
        gupxy[p] = -(mxy*mzz - myz*mxz) * inv_det;
        gupxz[p] =  (mxy*myz - myy*mxz) * inv_det;
        gupyy[p] =  (mxx*mzz - mxz*mxz) * inv_det;
        gupyz[p] = -(mxx*myz - mxy*mxz) * inv_det;
        gupzz[p] =  (mxx*myy - mxy*mxy) * inv_det;
    }
}

/* Phase 3: Gamma constraint residuals (original note: co==0 only).
 *
 * For i in {x, y, z}:
 *   Gmi_Res = Gami - sum_{a,b,c} gup^{ia} * gup^{cb} * gABc
 * reading gABc as the c-derivative of metric component AB (inferred from the
 * parameter naming — confirm against the caller).
 * The 6 inverse-metric entries and 18 derivative entries are read once into
 * locals; the nine-term sums keep their original term order.
 * Grid-stride loop over [0, d_gp.all). */
__global__ void kern_phase3_gamma_constraint(
    const double* __restrict__ Gamx, const double* __restrict__ Gamy,
    const double* __restrict__ Gamz,
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ gxxx, const double* __restrict__ gxyx,
    const double* __restrict__ gxzx, const double* __restrict__ gyyx,
    const double* __restrict__ gyzx, const double* __restrict__ gzzx,
    const double* __restrict__ gxxy, const double* __restrict__ gxyy,
    const double* __restrict__ gxzy, const double* __restrict__ gyyy,
    const double* __restrict__ gyzy, const double* __restrict__ gzzy,
    const double* __restrict__ gxxz, const double* __restrict__ gxyz,
    const double* __restrict__ gxzz, const double* __restrict__ gyyz,
    const double* __restrict__ gyzz, const double* __restrict__ gzzz,
    double* __restrict__ Gmx_Res, double* __restrict__ Gmy_Res,
    double* __restrict__ Gmz_Res)
{
    const unsigned int stride = blockDim.x * gridDim.x;
    for (int p = blockIdx.x * blockDim.x + threadIdx.x; p < d_gp.all; p += stride) {
        /* inverse metric */
        const double uxx = gupxx[p], uxy = gupxy[p], uxz = gupxz[p];
        const double uyy = gupyy[p], uyz = gupyz[p], uzz = gupzz[p];

        /* derivative inputs: local dABc holds gABc[p] */
        const double dxxx = gxxx[p], dxyx = gxyx[p], dxzx = gxzx[p];
        const double dyyx = gyyx[p], dyzx = gyzx[p], dzzx = gzzx[p];
        const double dxxy = gxxy[p], dxyy = gxyy[p], dxzy = gxzy[p];
        const double dyyy = gyyy[p], dyzy = gyzy[p], dzzy = gzzy[p];
        const double dxxz = gxxz[p], dxyz = gxyz[p], dxzz = gxzz[p];
        const double dyyz = gyyz[p], dyzz = gyzz[p], dzzz = gzzz[p];

        const double sum_x =
            uxx*(uxx*dxxx+uxy*dxyx+uxz*dxzx) +
            uxy*(uxx*dxyx+uxy*dyyx+uxz*dyzx) +
            uxz*(uxx*dxzx+uxy*dyzx+uxz*dzzx) +
            uxx*(uxy*dxxy+uyy*dxyy+uyz*dxzy) +
            uxy*(uxy*dxyy+uyy*dyyy+uyz*dyzy) +
            uxz*(uxy*dxzy+uyy*dyzy+uyz*dzzy) +
            uxx*(uxz*dxxz+uyz*dxyz+uzz*dxzz) +
            uxy*(uxz*dxyz+uyz*dyyz+uzz*dyzz) +
            uxz*(uxz*dxzz+uyz*dyzz+uzz*dzzz);
        Gmx_Res[p] = Gamx[p] - (sum_x);

        const double sum_y =
            uxx*(uxy*dxxx+uyy*dxyx+uyz*dxzx) +
            uxy*(uxy*dxyx+uyy*dyyx+uyz*dyzx) +
            uxz*(uxy*dxzx+uyy*dyzx+uyz*dzzx) +
            uxy*(uxy*dxxy+uyy*dxyy+uyz*dxzy) +
            uyy*(uxy*dxyy+uyy*dyyy+uyz*dyzy) +
            uyz*(uxy*dxzy+uyy*dyzy+uyz*dzzy) +
            uxy*(uxz*dxxz+uyz*dxyz+uzz*dxzz) +
            uyy*(uxz*dxyz+uyz*dyyz+uzz*dyzz) +
            uyz*(uxz*dxzz+uyz*dyzz+uzz*dzzz);
        Gmy_Res[p] = Gamy[p] - (sum_y);

        const double sum_z =
            uxx*(uxz*dxxx+uyz*dxyx+uzz*dxzx) +
            uxy*(uxz*dxyx+uyz*dyyx+uzz*dyzx) +
            uxz*(uxz*dxzx+uyz*dyzx+uzz*dzzx) +
            uxy*(uxz*dxxy+uyz*dxyy+uzz*dxzy) +
            uyy*(uxz*dxyy+uyz*dyyy+uzz*dyzy) +
            uyz*(uxz*dxzy+uyz*dyzy+uzz*dzzy) +
            uxz*(uxz*dxxz+uyz*dxyz+uzz*dxzz) +
            uyz*(uxz*dxyz+uyz*dyyz+uzz*dyzz) +
            uzz*(uxz*dxzz+uyz*dyzz+uzz*dzzz);
        Gmz_Res[p] = Gamz[p] - (sum_z);
    }
}

/* Phase 4: the 18 independent Christoffel symbols Gamma^a_{bc}.
 *
 * For each symmetric index pair (b,c) three bracket terms (cx, cy, cz) are
 * formed from the derivative inputs, then contracted with the x-, y- and
 * z-row of the inverse metric and halved:
 *   G{a}{bc} = 0.5 * (gup^{ax} * cx + gup^{ay} * cy + gup^{az} * cz)
 * gABc is read as the c-derivative of metric component AB (inferred from the
 * parameter naming — confirm against the caller).  The last output parameter
 * is named Gzzz_o because gzzz is already taken by an input.
 * Launch bounds: at most 128 threads per block, at least 4 blocks per SM.
 * Grid-stride loop over [0, d_gp.all). */
__global__ __launch_bounds__(128, 4)
void kern_phase4_christoffel(
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ gxxx, const double* __restrict__ gxyx,
    const double* __restrict__ gxzx, const double* __restrict__ gyyx,
    const double* __restrict__ gyzx, const double* __restrict__ gzzx,
    const double* __restrict__ gxxy, const double* __restrict__ gxyy,
    const double* __restrict__ gxzy, const double* __restrict__ gyyy,
    const double* __restrict__ gyzy, const double* __restrict__ gzzy,
    const double* __restrict__ gxxz, const double* __restrict__ gxyz,
    const double* __restrict__ gxzz, const double* __restrict__ gyyz,
    const double* __restrict__ gyzz, const double* __restrict__ gzzz,
    double* __restrict__ Gxxx, double* __restrict__ Gxxy, double* __restrict__ Gxxz,
    double* __restrict__ Gxyy, double* __restrict__ Gxyz, double* __restrict__ Gxzz,
    double* __restrict__ Gyxx, double* __restrict__ Gyxy, double* __restrict__ Gyxz,
    double* __restrict__ Gyyy, double* __restrict__ Gyyz, double* __restrict__ Gyzz,
    double* __restrict__ Gzxx, double* __restrict__ Gzxy, double* __restrict__ Gzxz,
    double* __restrict__ Gzyy, double* __restrict__ Gzyz, double* __restrict__ Gzzz_o)
{
    const double H = 0.5, TWO = 2.0;
    const unsigned int stride = blockDim.x * gridDim.x;
    for (int p = blockIdx.x * blockDim.x + threadIdx.x; p < d_gp.all; p += stride) {
        /* inverse metric */
        const double uxx = gupxx[p], uxy = gupxy[p], uxz = gupxz[p];
        const double uyy = gupyy[p], uyz = gupyz[p], uzz = gupzz[p];

        /* derivative inputs: local dABc holds gABc[p] */
        const double dxxx = gxxx[p], dxyx = gxyx[p], dxzx = gxzx[p];
        const double dyyx = gyyx[p], dyzx = gyzx[p], dzzx = gzzx[p];
        const double dxxy = gxxy[p], dxyy = gxyy[p], dxzy = gxzy[p];
        const double dyyy = gyyy[p], dyzy = gyzy[p], dzzy = gzzy[p];
        const double dxxz = gxxz[p], dxyz = gxyz[p], dxzz = gxzz[p];
        const double dyyz = gyyz[p], dyzz = gyzz[p], dzzz = gzzz[p];

        {   /* (b,c) = (x,x) */
            const double cx = dxxx;
            const double cy = TWO*dxyx - dxxy;
            const double cz = TWO*dxzx - dxxz;
            Gxxx[p] = H*(uxx*cx + uxy*cy + uxz*cz);
            Gyxx[p] = H*(uxy*cx + uyy*cy + uyz*cz);
            Gzxx[p] = H*(uxz*cx + uyz*cy + uzz*cz);
        }
        {   /* (b,c) = (y,y) */
            const double cx = TWO*dxyy - dyyx;
            const double cy = dyyy;
            const double cz = TWO*dyzy - dyyz;
            Gxyy[p] = H*(uxx*cx + uxy*cy + uxz*cz);
            Gyyy[p] = H*(uxy*cx + uyy*cy + uyz*cz);
            Gzyy[p] = H*(uxz*cx + uyz*cy + uzz*cz);
        }
        {   /* (b,c) = (z,z) */
            const double cx = TWO*dxzz - dzzx;
            const double cy = TWO*dyzz - dzzy;
            const double cz = dzzz;
            Gxzz[p]   = H*(uxx*cx + uxy*cy + uxz*cz);
            Gyzz[p]   = H*(uxy*cx + uyy*cy + uyz*cz);
            Gzzz_o[p] = H*(uxz*cx + uyz*cy + uzz*cz);
        }
        {   /* (b,c) = (x,y) */
            const double cx = dxxy;
            const double cy = dyyx;
            const double cz = dxzy + dyzx - dxyz;
            Gxxy[p] = H*(uxx*cx + uxy*cy + uxz*cz);
            Gyxy[p] = H*(uxy*cx + uyy*cy + uyz*cz);
            Gzxy[p] = H*(uxz*cx + uyz*cy + uzz*cz);
        }
        {   /* (b,c) = (x,z) */
            const double cx = dxxz;
            const double cy = dxyz + dyzx - dxzy;
            const double cz = dzzx;
            Gxxz[p] = H*(uxx*cx + uxy*cy + uxz*cz);
            Gyxz[p] = H*(uxy*cx + uyy*cy + uyz*cz);
            Gzxz[p] = H*(uxz*cx + uyz*cy + uzz*cz);
        }
        {   /* (b,c) = (y,z) */
            const double cx = dxyz + dxzy - dyzx;
            const double cy = dyyz;
            const double cz = dzzy;
            Gxyz[p] = H*(uxx*cx + uxy*cy + uxz*cz);
            Gyyz[p] = H*(uxy*cx + uyy*cy + uyz*cz);
            Gzyz[p] = H*(uxz*cx + uyz*cy + uzz*cz);
        }
    }
}

/* Phase 5: raise both indices of A_ab with the inverse metric,
 *   A^ij = gup^ia gup^jb A_ab,
 * and store the six independent components in the Rxx..Rzz buffers (used as
 * temporary storage at this stage, per the original note).
 * Inputs are read once into locals; each sum keeps its original term order.
 * Grid-stride loop over [0, d_gp.all). */
__global__ void kern_phase5_raise_A(
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ Axx, const double* __restrict__ Axy,
    const double* __restrict__ Axz, const double* __restrict__ Ayy,
    const double* __restrict__ Ayz, const double* __restrict__ Azz,
    double* __restrict__ Rxx, double* __restrict__ Rxy, double* __restrict__ Rxz,
    double* __restrict__ Ryy, double* __restrict__ Ryz, double* __restrict__ Rzz)
{
    const double TWO = 2.0;
    const unsigned int stride = blockDim.x * gridDim.x;
    for (int p = blockIdx.x * blockDim.x + threadIdx.x; p < d_gp.all; p += stride) {
        /* inverse metric */
        const double uxx = gupxx[p], uxy = gupxy[p], uxz = gupxz[p];
        const double uyy = gupyy[p], uyz = gupyz[p], uzz = gupzz[p];
        /* lower-index A */
        const double axx = Axx[p], axy = Axy[p], axz = Axz[p];
        const double ayy = Ayy[p], ayz = Ayz[p], azz = Azz[p];

        /* diagonal components */
        Rxx[p] = uxx*uxx*axx + uxy*uxy*ayy + uxz*uxz*azz
               + TWO*(uxx*uxy*axy + uxx*uxz*axz + uxy*uxz*ayz);
        Ryy[p] = uxy*uxy*axx + uyy*uyy*ayy + uyz*uyz*azz
               + TWO*(uxy*uyy*axy + uxy*uyz*axz + uyy*uyz*ayz);
        Rzz[p] = uxz*uxz*axx + uyz*uyz*ayy + uzz*uzz*azz
               + TWO*(uxz*uyz*axy + uxz*uzz*axz + uyz*uzz*ayz);

        /* off-diagonal components */
        Rxy[p] = uxx*uxy*axx + uxy*uyy*ayy + uxz*uyz*azz
               + (uxx*uyy + uxy*uxy)*axy
               + (uxx*uyz + uxz*uxy)*axz
               + (uxy*uyz + uxz*uyy)*ayz;
        Rxz[p] = uxx*uxz*axx + uxy*uyz*ayy + uxz*uzz*azz
               + (uxx*uyz + uxy*uxz)*axy
               + (uxx*uzz + uxz*uxz)*axz
               + (uxy*uzz + uxz*uyz)*ayz;
        Ryz[p] = uxy*uxz*axx + uyy*uyz*ayy + uyz*uzz*azz
               + (uxy*uyz + uyy*uxz)*axy
               + (uxy*uzz + uyz*uxz)*axz
               + (uyy*uzz + uyz*uyz)*ayz;
    }
}

/* Phase 6: first part of Gamx_rhs / Gamy_rhs / Gamz_rhs (the part evaluated
 * before fdderivs(beta) and fderivs(Gamma), per the original note).
 *
 * Rxx..Rzz are read-only here (presumably still holding the raised A^ij
 * written by kern_phase5_raise_A — confirm call order).  With
 *   m_k = 2/3 * K_k + 8*pi * S_k        (k = x, y, z)
 * each component i is assigned
 *   -2 * Lap_j A^ij
 *   + 2*alpn1 * ( -3/2 / chin1 * chi_j A^ij - gup^ij m_j + Gamma^i_jk A^jk ).
 * Sub-expressions shared by the three components (m_k, -3/2/chin1, 2*alpn1)
 * are named once; each has exactly the form it had inline.
 * Launch bounds: at most 128 threads per block, at least 4 blocks per SM.
 * Grid-stride loop over [0, d_gp.all). */
__global__ __launch_bounds__(128, 4)
void kern_phase6_gamma_rhs_part1(
    const double* __restrict__ Lapx, const double* __restrict__ Lapy,
    const double* __restrict__ Lapz,
    const double* __restrict__ alpn1, const double* __restrict__ chin1,
    const double* __restrict__ chix, const double* __restrict__ chiy,
    const double* __restrict__ chiz,
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ Kx, const double* __restrict__ Ky,
    const double* __restrict__ Kz,
    const double* __restrict__ Sx, const double* __restrict__ Sy,
    const double* __restrict__ Sz,
    const double* __restrict__ Rxx, const double* __restrict__ Rxy,
    const double* __restrict__ Rxz, const double* __restrict__ Ryy,
    const double* __restrict__ Ryz, const double* __restrict__ Rzz,
    const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
    const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
    const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
    const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
    const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
    const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
    const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
    const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
    const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
    double* __restrict__ Gamx_rhs, double* __restrict__ Gamy_rhs,
    double* __restrict__ Gamz_rhs)
{
    const double TWO=2.0, F3o2=1.5, F2o3=2.0/3.0, EIGHT=8.0;
    const double PI_V = 3.14159265358979323846;
    const unsigned int stride = blockDim.x * gridDim.x;
    for (int p = blockIdx.x * blockDim.x + threadIdx.x; p < d_gp.all; p += stride) {
        /* inverse metric */
        const double uxx = gupxx[p], uxy = gupxy[p], uxz = gupxz[p];
        const double uyy = gupyy[p], uyz = gupyz[p], uzz = gupzz[p];
        /* raised A^ij as stored in the R buffers */
        const double rxx = Rxx[p], rxy = Rxy[p], rxz = Rxz[p];
        const double ryy = Ryy[p], ryz = Ryz[p], rzz = Rzz[p];
        /* Lap* and chi* inputs */
        const double lx = Lapx[p], ly = Lapy[p], lz = Lapz[p];
        const double cx = chix[p], cy = chiy[p], cz = chiz[p];

        const double two_a   = TWO * alpn1[p];
        const double chi_fac = -F3o2 / chin1[p];

        /* K/S combination shared by all three components */
        const double mx = F2o3*Kx[p] + EIGHT*PI_V*Sx[p];
        const double my = F2o3*Ky[p] + EIGHT*PI_V*Sy[p];
        const double mz = F2o3*Kz[p] + EIGHT*PI_V*Sz[p];

        Gamx_rhs[p] = -TWO*(lx*rxx + ly*rxy + lz*rxz)
            + two_a*(
                chi_fac*(cx*rxx + cy*rxy + cz*rxz)
                - uxx*mx
                - uxy*my
                - uxz*mz
                + Gxxx[p]*rxx + Gxyy[p]*ryy + Gxzz[p]*rzz
                + TWO*(Gxxy[p]*rxy + Gxxz[p]*rxz + Gxyz[p]*ryz));

        Gamy_rhs[p] = -TWO*(lx*rxy + ly*ryy + lz*ryz)
            + two_a*(
                chi_fac*(cx*rxy + cy*ryy + cz*ryz)
                - uxy*mx
                - uyy*my
                - uyz*mz
                + Gyxx[p]*rxx + Gyyy[p]*ryy + Gyzz[p]*rzz
                + TWO*(Gyxy[p]*rxy + Gyxz[p]*rxz + Gyyz[p]*ryz));

        Gamz_rhs[p] = -TWO*(lx*rxz + ly*ryz + lz*rzz)
            + two_a*(
                chi_fac*(cx*rxz + cy*ryz + cz*rzz)
                - uxz*mx
                - uyz*my
                - uzz*mz
                + Gzxx[p]*rxx + Gzyy[p]*ryy + Gzzz[p]*rzz
                + TWO*(Gzxy[p]*rxy + Gzxz[p]*rxz + Gzyz[p]*ryz));
    }
}

/* Phase 5+6 fused: the raised tensor A^ij = gup^ia gup^jb A_ab is formed in
 * locals and consumed immediately by the first part of the Gamma rhs, so the
 * six A^ij components are never written to memory by this kernel.
 *
 * With m_k = 2/3 * K_k + 8*pi * S_k each component i is assigned
 *   -2 * Lap_j A^ij
 *   + 2*alpn1 * ( -3/2 / chin1 * chi_j A^ij - gup^ij m_j + Gamma^i_jk A^jk ).
 * Sub-expressions shared by the three components are named once; each has
 * exactly the form it had inline.
 * Launch bounds: at most 128 threads per block, at least 4 blocks per SM.
 * Grid-stride loop over [0, d_gp.all). */
__global__ __launch_bounds__(128, 4)
void kern_phase5_6_gamma_rhs_part1_fused(
    const double* __restrict__ Lapx, const double* __restrict__ Lapy,
    const double* __restrict__ Lapz,
    const double* __restrict__ alpn1, const double* __restrict__ chin1,
    const double* __restrict__ chix, const double* __restrict__ chiy,
    const double* __restrict__ chiz,
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ Axx, const double* __restrict__ Axy,
    const double* __restrict__ Axz, const double* __restrict__ Ayy,
    const double* __restrict__ Ayz, const double* __restrict__ Azz,
    const double* __restrict__ Kx, const double* __restrict__ Ky,
    const double* __restrict__ Kz,
    const double* __restrict__ Sx, const double* __restrict__ Sy,
    const double* __restrict__ Sz,
    const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
    const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
    const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
    const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
    const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
    const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
    const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
    const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
    const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
    double* __restrict__ Gamx_rhs, double* __restrict__ Gamy_rhs,
    double* __restrict__ Gamz_rhs)
{
    const double TWO = 2.0, F3o2 = 1.5, F2o3 = 2.0 / 3.0, EIGHT = 8.0;
    const double PI_V = 3.14159265358979323846;
    const unsigned int stride = blockDim.x * gridDim.x;
    for (int p = blockIdx.x * blockDim.x + threadIdx.x; p < d_gp.all; p += stride) {
        /* inverse metric */
        const double uxx = gupxx[p], uxy = gupxy[p], uxz = gupxz[p];
        const double uyy = gupyy[p], uyz = gupyz[p], uzz = gupzz[p];
        /* lower-index A */
        const double axx = Axx[p], axy = Axy[p], axz = Axz[p];
        const double ayy = Ayy[p], ayz = Ayz[p], azz = Azz[p];

        /* ---- stage 1: A^ij kept in locals ---- */
        const double rxx = uxx * uxx * axx + uxy * uxy * ayy + uxz * uxz * azz
                         + TWO * (uxx * uxy * axy + uxx * uxz * axz + uxy * uxz * ayz);
        const double ryy = uxy * uxy * axx + uyy * uyy * ayy + uyz * uyz * azz
                         + TWO * (uxy * uyy * axy + uxy * uyz * axz + uyy * uyz * ayz);
        const double rzz = uxz * uxz * axx + uyz * uyz * ayy + uzz * uzz * azz
                         + TWO * (uxz * uyz * axy + uxz * uzz * axz + uyz * uzz * ayz);
        const double rxy = uxx * uxy * axx + uxy * uyy * ayy + uxz * uyz * azz
                         + (uxx * uyy + uxy * uxy) * axy
                         + (uxx * uyz + uxz * uxy) * axz
                         + (uxy * uyz + uxz * uyy) * ayz;
        const double rxz = uxx * uxz * axx + uxy * uyz * ayy + uxz * uzz * azz
                         + (uxx * uyz + uxy * uxz) * axy
                         + (uxx * uzz + uxz * uxz) * axz
                         + (uxy * uzz + uxz * uyz) * ayz;
        const double ryz = uxy * uxz * axx + uyy * uyz * ayy + uyz * uzz * azz
                         + (uxy * uyz + uyy * uxz) * axy
                         + (uxy * uzz + uyz * uxz) * axz
                         + (uyy * uzz + uyz * uyz) * ayz;

        /* ---- stage 2: Gamma rhs, part 1 ---- */
        const double lx = Lapx[p], ly = Lapy[p], lz = Lapz[p];
        const double cx = chix[p], cy = chiy[p], cz = chiz[p];

        const double two_a   = TWO * alpn1[p];
        const double chi_fac = -F3o2 / chin1[p];

        /* K/S combination shared by all three components */
        const double mx = F2o3 * Kx[p] + EIGHT * PI_V * Sx[p];
        const double my = F2o3 * Ky[p] + EIGHT * PI_V * Sy[p];
        const double mz = F2o3 * Kz[p] + EIGHT * PI_V * Sz[p];

        Gamx_rhs[p] = -TWO * (lx * rxx + ly * rxy + lz * rxz)
            + two_a * (
                chi_fac * (cx * rxx + cy * rxy + cz * rxz)
                - uxx * mx
                - uxy * my
                - uxz * mz
                + Gxxx[p] * rxx + Gxyy[p] * ryy + Gxzz[p] * rzz
                + TWO * (Gxxy[p] * rxy + Gxxz[p] * rxz + Gxyz[p] * ryz));

        Gamy_rhs[p] = -TWO * (lx * rxy + ly * ryy + lz * ryz)
            + two_a * (
                chi_fac * (cx * rxy + cy * ryy + cz * ryz)
                - uxy * mx
                - uyy * my
                - uyz * mz
                + Gyxx[p] * rxx + Gyyy[p] * ryy + Gyzz[p] * rzz
                + TWO * (Gyxy[p] * rxy + Gyxz[p] * rxz + Gyyz[p] * ryz));

        Gamz_rhs[p] = -TWO * (lx * rxz + ly * ryz + lz * rzz)
            + two_a * (
                chi_fac * (cx * rxz + cy * ryz + cz * rzz)
                - uxz * mx
                - uyz * my
                - uzz * mz
                + Gzxx[p] * rxx + Gzyy[p] * ryy + Gzzz[p] * rzz
                + TWO * (Gzxy[p] * rxy + Gzxz[p] * rxz + Gzyz[p] * ryz));
    }
}

/* Phase 8: Gamma_rhs part 2 — runs after fdderivs(beta) and fderivs(Gamma).
 *
 * Per point the kernel
 *   1. forms div_x, div_y, div_z: sums of second-derivative inputs
 *      (bxx_xx + bxy_xy + bxz_xz, and likewise for the other two);
 *   2. contracts the Christoffel symbols with the inverse metric and stores
 *      the result in Gamxa_out / Gamya_out / Gamza_out;
 *   3. accumulates (+=) the remaining terms into Gamx_rhs / Gamy_rhs / Gamz_rhs.
 *
 * Per the original notes the bxx_*, bxy_*, bxz_* inputs are the fdderivs
 * results of betax, betay, betaz, held in the buffers otherwise named
 * gxxx..gzzz.  The nine fderivs(Gamma) pointers (Gamxx .. Gamzz_d) are part
 * of the signature but are not read anywhere in this body.
 * Launch bounds: at most 128 threads per block, at least 4 blocks per SM.
 * Grid-stride loop over [0, d_gp.all).
 */
__global__ __launch_bounds__(128, 4)
void kern_phase8_gamma_rhs_part2(
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    /* fdderivs(betax) -> gxxx,gxyx,gxzx,gyyx,gyzx,gzzx */
    const double* __restrict__ bxx_xx, const double* __restrict__ bxx_xy,
    const double* __restrict__ bxx_xz, const double* __restrict__ bxx_yy,
    const double* __restrict__ bxx_yz, const double* __restrict__ bxx_zz,
    /* fdderivs(betay) -> gxxy,gxyy,gxzy,gyyy,gyzy,gzzy */
    const double* __restrict__ bxy_xx, const double* __restrict__ bxy_xy,
    const double* __restrict__ bxy_xz, const double* __restrict__ bxy_yy,
    const double* __restrict__ bxy_yz, const double* __restrict__ bxy_zz,
    /* fdderivs(betaz) -> gxxz,gxyz,gxzz,gyyz,gyzz,gzzz */
    const double* __restrict__ bxz_xx, const double* __restrict__ bxz_xy,
    const double* __restrict__ bxz_xz, const double* __restrict__ bxz_yy,
    const double* __restrict__ bxz_yz, const double* __restrict__ bxz_zz,
    /* fderivs(Gamx) -> Gamxx,Gamxy,Gamxz */
    const double* __restrict__ Gamxx, const double* __restrict__ Gamxy,
    const double* __restrict__ Gamxz,
    /* fderivs(Gamy) -> Gamyx,Gamyy,Gamyz */
    const double* __restrict__ Gamyx, const double* __restrict__ Gamyy_d,
    const double* __restrict__ Gamyz_d,
    /* fderivs(Gamz) -> Gamzx,Gamzy,Gamzz */
    const double* __restrict__ Gamzx, const double* __restrict__ Gamzy,
    const double* __restrict__ Gamzz_d,
    /* Christoffel symbols */
    const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
    const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
    const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
    const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
    const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
    const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
    const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
    const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
    const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
    /* betaij first derivs */
    const double* __restrict__ betaxx, const double* __restrict__ betaxy,
    const double* __restrict__ betaxz, const double* __restrict__ betayx,
    const double* __restrict__ betayy, const double* __restrict__ betayz,
    const double* __restrict__ betazx, const double* __restrict__ betazy,
    const double* __restrict__ betazz,
    double* __restrict__ Gamx_rhs, double* __restrict__ Gamy_rhs,
    double* __restrict__ Gamz_rhs,
    double* __restrict__ Gamxa_out, double* __restrict__ Gamya_out,
    double* __restrict__ Gamza_out)
{
    const double TWO=2.0, F2o3=2.0/3.0, F1o3=1.0/3.0;
    const unsigned int stride = blockDim.x * gridDim.x;
    for (int p = blockIdx.x * blockDim.x + threadIdx.x; p < d_gp.all; p += stride) {
        /* inverse metric */
        const double uxx = gupxx[p], uxy = gupxy[p], uxz = gupxz[p];
        const double uyy = gupyy[p], uyz = gupyz[p], uzz = gupzz[p];

        /* second-derivative inputs: bx_* <- bxx_*, by_* <- bxy_*, bz_* <- bxz_* */
        const double bx_xx = bxx_xx[p], bx_xy = bxx_xy[p], bx_xz = bxx_xz[p];
        const double bx_yy = bxx_yy[p], bx_yz = bxx_yz[p], bx_zz = bxx_zz[p];
        const double by_xx = bxy_xx[p], by_xy = bxy_xy[p], by_xz = bxy_xz[p];
        const double by_yy = bxy_yy[p], by_yz = bxy_yz[p], by_zz = bxy_zz[p];
        const double bz_xx = bxz_xx[p], bz_xy = bxz_xy[p], bz_xz = bxz_xz[p];
        const double bz_yy = bxz_yy[p], bz_yz = bxz_yz[p], bz_zz = bxz_zz[p];

        /* first-derivative inputs: sAB <- betaAB */
        const double sxx = betaxx[p], sxy = betaxy[p], sxz = betaxz[p];
        const double syx = betayx[p], syy = betayy[p], syz = betayz[p];
        const double szx = betazx[p], szy = betazy[p], szz = betazz[p];

        /* step 1: sums of second derivatives */
        const double div_x = bx_xx + by_xy + bz_xz;
        const double div_y = bx_xy + by_yy + bz_yz;
        const double div_z = bx_xz + by_yz + bz_zz;

        /* step 2: Christoffel symbols contracted with the inverse metric */
        const double con_x = uxx*Gxxx[p] + uyy*Gxyy[p] + uzz*Gxzz[p]
                           + TWO*(uxy*Gxxy[p] + uxz*Gxxz[p] + uyz*Gxyz[p]);
        const double con_y = uxx*Gyxx[p] + uyy*Gyyy[p] + uzz*Gyzz[p]
                           + TWO*(uxy*Gyxy[p] + uxz*Gyxz[p] + uyz*Gyyz[p]);
        const double con_z = uxx*Gzxx[p] + uyy*Gzyy[p] + uzz*Gzzz[p]
                           + TWO*(uxy*Gzxy[p] + uxz*Gzxz[p] + uyz*Gzyz[p]);
        Gamxa_out[p] = con_x;
        Gamya_out[p] = con_y;
        Gamza_out[p] = con_z;

        /* step 3: accumulate into the rhs */
        const double div_s = sxx + syy + szz;

        Gamx_rhs[p] += F2o3*con_x*div_s
            - con_x*sxx - con_y*sxy - con_z*sxz
            + F1o3*(uxx*div_x + uxy*div_y + uxz*div_z)
            + uxx*bx_xx + uyy*bx_yy + uzz*bx_zz
            + TWO*(uxy*bx_xy + uxz*bx_xz + uyz*bx_yz);
        Gamy_rhs[p] += F2o3*con_y*div_s
            - con_x*syx - con_y*syy - con_z*syz
            + F1o3*(uxy*div_x + uyy*div_y + uyz*div_z)
            + uxx*by_xx + uyy*by_yy + uzz*by_zz
            + TWO*(uxy*by_xy + uxz*by_xz + uyz*by_yz);
        Gamz_rhs[p] += F2o3*con_z*div_s
            - con_x*szx - con_y*szy - con_z*szz
            + F1o3*(uxz*div_x + uyz*div_y + uzz*div_z)
            + uxx*bz_xx + uyy*bz_yy + uzz*bz_zz
            + TWO*(uxy*bz_xy + uxz*bz_xz + uyz*bz_yz);
    }
}

/* Phase 9: lower the upper index of every Christoffel symbol,
 *   o_g{bc}{i} = g_{ix} * Gx{bc} + g_{iy} * Gy{bc} + g_{iz} * Gz{bc},
 * for the six symmetric pairs (b,c) and i = x, y, z.
 * Per the original note the outputs are the gxxx..gzzz buffers, which are
 * overwritten with these products for the subsequent Ricci computation.
 * The work is grouped per (b,c) pair: the three symbols of a pair are read
 * once and the three rows of the metric are applied to them.
 * Launch bounds: at most 128 threads per block, at least 4 blocks per SM.
 * Grid-stride loop over [0, d_gp.all).
 */
__global__ __launch_bounds__(128, 4)
void kern_phase9_christoffel_contract(
    const double* __restrict__ gxx, const double* __restrict__ gxy,
    const double* __restrict__ gxz, const double* __restrict__ gyy,
    const double* __restrict__ gyz, const double* __restrict__ gzz,
    const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
    const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
    const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
    const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
    const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
    const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
    const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
    const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
    const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
    /* output: lowered products g_{ia} Gamma^a_{bc} */
    double* __restrict__ o_gxxx, double* __restrict__ o_gxyx,
    double* __restrict__ o_gxzx, double* __restrict__ o_gyyx,
    double* __restrict__ o_gyzx, double* __restrict__ o_gzzx,
    double* __restrict__ o_gxxy, double* __restrict__ o_gxyy,
    double* __restrict__ o_gxzy, double* __restrict__ o_gyyy,
    double* __restrict__ o_gyzy, double* __restrict__ o_gzzy,
    double* __restrict__ o_gxxz, double* __restrict__ o_gxyz,
    double* __restrict__ o_gxzz, double* __restrict__ o_gyyz,
    double* __restrict__ o_gyzz, double* __restrict__ o_gzzz)
{
    const unsigned int stride = blockDim.x * gridDim.x;
    for (int p = blockIdx.x * blockDim.x + threadIdx.x; p < d_gp.all; p += stride) {
        /* metric rows: x = (g11,g12,g13), y = (g12,g22,g23), z = (g13,g23,g33) */
        const double g11 = gxx[p], g12 = gxy[p], g13 = gxz[p];
        const double g22 = gyy[p], g23 = gyz[p], g33 = gzz[p];

        {   /* (b,c) = (x,x) */
            const double Gx = Gxxx[p], Gy = Gyxx[p], Gz = Gzxx[p];
            o_gxxx[p] = g11*Gx + g12*Gy + g13*Gz;
            o_gxxy[p] = g12*Gx + g22*Gy + g23*Gz;
            o_gxxz[p] = g13*Gx + g23*Gy + g33*Gz;
        }
        {   /* (b,c) = (x,y) */
            const double Gx = Gxxy[p], Gy = Gyxy[p], Gz = Gzxy[p];
            o_gxyx[p] = g11*Gx + g12*Gy + g13*Gz;
            o_gxyy[p] = g12*Gx + g22*Gy + g23*Gz;
            o_gxyz[p] = g13*Gx + g23*Gy + g33*Gz;
        }
        {   /* (b,c) = (x,z) */
            const double Gx = Gxxz[p], Gy = Gyxz[p], Gz = Gzxz[p];
            o_gxzx[p] = g11*Gx + g12*Gy + g13*Gz;
            o_gxzy[p] = g12*Gx + g22*Gy + g23*Gz;
            o_gxzz[p] = g13*Gx + g23*Gy + g33*Gz;
        }
        {   /* (b,c) = (y,y) */
            const double Gx = Gxyy[p], Gy = Gyyy[p], Gz = Gzyy[p];
            o_gyyx[p] = g11*Gx + g12*Gy + g13*Gz;
            o_gyyy[p] = g12*Gx + g22*Gy + g23*Gz;
            o_gyyz[p] = g13*Gx + g23*Gy + g33*Gz;
        }
        {   /* (b,c) = (y,z) */
            const double Gx = Gxyz[p], Gy = Gyyz[p], Gz = Gzyz[p];
            o_gyzx[p] = g11*Gx + g12*Gy + g13*Gz;
            o_gyzy[p] = g12*Gx + g22*Gy + g23*Gz;
            o_gyzz[p] = g13*Gx + g23*Gy + g33*Gz;
        }
        {   /* (b,c) = (z,z) */
            const double Gx = Gxzz[p], Gy = Gyzz[p], Gz = Gzzz[p];
            o_gzzx[p] = g11*Gx + g12*Gy + g13*Gz;
            o_gzzy[p] = g12*Gx + g22*Gy + g23*Gz;
            o_gzzz[p] = g13*Gx + g23*Gy + g33*Gz;
        }
    }
}

/* Phase 8+9 fused kernel: one pass per grid index that
 *   1. contracts the 18 G??? arrays with gup^{ij} and stores the three
 *      results in Gamxa_out / Gamya_out / Gamza_out,
 *   2. accumulates the remaining terms into Gamx_rhs / Gamy_rhs / Gamz_rhs
 *      (read-modify-write, the previous contents are kept and added to),
 *   3. stores the 18 products g_{a d} * G^d_{bc} into the o_g* arrays
 *      (last letter of the o_g* name = lowered index a).
 *
 * Launch: 1-D grid-stride loop bounded by d_gp.all, so any 1-D grid covers
 * all points; __launch_bounds__(128, 2) declares at most 128 threads/block.
 *
 * NOTE(review): bxx_*, bxy_*, bxz_* look like the six second derivatives of
 * the three shift components and beta{a}{b} like their first derivatives --
 * confirm at the launch site.
 * Every input is loaded before the o_g* stores, exactly as in the original
 * statement order; keep it that way if this kernel is edited.
 */
__global__ __launch_bounds__(128, 2)
void kern_phase8_9_gamma_rhs_contract_fused(
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ bxx_xx, const double* __restrict__ bxx_xy,
    const double* __restrict__ bxx_xz, const double* __restrict__ bxx_yy,
    const double* __restrict__ bxx_yz, const double* __restrict__ bxx_zz,
    const double* __restrict__ bxy_xx, const double* __restrict__ bxy_xy,
    const double* __restrict__ bxy_xz, const double* __restrict__ bxy_yy,
    const double* __restrict__ bxy_yz, const double* __restrict__ bxy_zz,
    const double* __restrict__ bxz_xx, const double* __restrict__ bxz_xy,
    const double* __restrict__ bxz_xz, const double* __restrict__ bxz_yy,
    const double* __restrict__ bxz_yz, const double* __restrict__ bxz_zz,
    const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
    const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
    const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
    const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
    const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
    const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
    const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
    const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
    const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
    const double* __restrict__ betaxx, const double* __restrict__ betaxy,
    const double* __restrict__ betaxz, const double* __restrict__ betayx,
    const double* __restrict__ betayy, const double* __restrict__ betayz,
    const double* __restrict__ betazx, const double* __restrict__ betazy,
    const double* __restrict__ betazz,
    const double* __restrict__ gxx, const double* __restrict__ gxy,
    const double* __restrict__ gxz, const double* __restrict__ gyy,
    const double* __restrict__ gyz, const double* __restrict__ gzz,
    double* __restrict__ Gamx_rhs, double* __restrict__ Gamy_rhs,
    double* __restrict__ Gamz_rhs,
    double* __restrict__ Gamxa_out, double* __restrict__ Gamya_out,
    double* __restrict__ Gamza_out,
    double* __restrict__ o_gxxx, double* __restrict__ o_gxyx,
    double* __restrict__ o_gxzx, double* __restrict__ o_gyyx,
    double* __restrict__ o_gyzx, double* __restrict__ o_gzzx,
    double* __restrict__ o_gxxy, double* __restrict__ o_gxyy,
    double* __restrict__ o_gxzy, double* __restrict__ o_gyyy,
    double* __restrict__ o_gyzy, double* __restrict__ o_gzzy,
    double* __restrict__ o_gxxz, double* __restrict__ o_gxyz,
    double* __restrict__ o_gxzz, double* __restrict__ o_gyyz,
    double* __restrict__ o_gyzz, double* __restrict__ o_gzzz)
{
    const double kTwo = 2.0, kTwoThirds = 2.0 / 3.0, kOneThird = 1.0 / 3.0;

    const int first = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int stride = blockDim.x * gridDim.x;

    for (int p = first; p < d_gp.all; p += stride) {
        /* gup^{ij} at this point */
        const double hxx = gupxx[p], hxy = gupxy[p], hxz = gupxz[p];
        const double hyy = gupyy[p], hyz = gupyz[p], hzz = gupzz[p];

        /* G^a_{bc}: C<a>_<bc> */
        const double Cx_xx = Gxxx[p], Cx_xy = Gxxy[p], Cx_xz = Gxxz[p];
        const double Cx_yy = Gxyy[p], Cx_yz = Gxyz[p], Cx_zz = Gxzz[p];
        const double Cy_xx = Gyxx[p], Cy_xy = Gyxy[p], Cy_xz = Gyxz[p];
        const double Cy_yy = Gyyy[p], Cy_yz = Gyyz[p], Cy_zz = Gyzz[p];
        const double Cz_xx = Gzxx[p], Cz_xy = Gzxy[p], Cz_xz = Gzxz[p];
        const double Cz_yy = Gzyy[p], Cz_yz = Gzyz[p], Cz_zz = Gzzz[p];

        /* the three groups of six b-arrays, read once */
        const double bx_xx = bxx_xx[p], bx_xy = bxx_xy[p], bx_xz = bxx_xz[p];
        const double bx_yy = bxx_yy[p], bx_yz = bxx_yz[p], bx_zz = bxx_zz[p];
        const double by_xx = bxy_xx[p], by_xy = bxy_xy[p], by_xz = bxy_xz[p];
        const double by_yy = bxy_yy[p], by_yz = bxy_yz[p], by_zz = bxy_zz[p];
        const double bz_xx = bxz_xx[p], bz_xy = bxz_xy[p], bz_xz = bxz_xz[p];
        const double bz_yy = bxz_yy[p], bz_yz = bxz_yz[p], bz_zz = bxz_zz[p];

        /* sums that feed the 1/3 term of the rhs update */
        const double fx = bx_xx + by_xy + bz_xz;
        const double fy = bx_xy + by_yy + bz_yz;
        const double fz = bx_xz + by_yz + bz_zz;

        /* gup^{ij} S_{ij} for a symmetric six-component S */
        auto contract_sym = [&](double sxx, double sxy, double sxz,
                                double syy, double syz, double szz) {
            return hxx * sxx + hyy * syy + hzz * szz
                 + kTwo * (hxy * sxy + hxz * sxz + hyz * syz);
        };

        const double gam_x = contract_sym(Cx_xx, Cx_xy, Cx_xz, Cx_yy, Cx_yz, Cx_zz);
        const double gam_y = contract_sym(Cy_xx, Cy_xy, Cy_xz, Cy_yy, Cy_yz, Cy_zz);
        const double gam_z = contract_sym(Cz_xx, Cz_xy, Cz_xz, Cz_yy, Cz_yz, Cz_zz);

        Gamxa_out[p] = gam_x;
        Gamya_out[p] = gam_y;
        Gamza_out[p] = gam_z;

        /* beta{a}{b} inputs and their trace */
        const double dbx_x = betaxx[p], dbx_y = betaxy[p], dbx_z = betaxz[p];
        const double dby_x = betayx[p], dby_y = betayy[p], dby_z = betayz[p];
        const double dbz_x = betazx[p], dbz_y = betazy[p], dbz_z = betazz[p];
        const double div_b = dbx_x + dby_y + dbz_z;

        /* in-place accumulation into the three rhs arrays */
        Gamx_rhs[p] += kTwoThirds * gam_x * div_b
            - gam_x * dbx_x - gam_y * dbx_y - gam_z * dbx_z
            + kOneThird * (hxx * fx + hxy * fy + hxz * fz)
            + hxx * bx_xx + hyy * bx_yy + hzz * bx_zz
            + kTwo * (hxy * bx_xy + hxz * bx_xz + hyz * bx_yz);
        Gamy_rhs[p] += kTwoThirds * gam_y * div_b
            - gam_x * dby_x - gam_y * dby_y - gam_z * dby_z
            + kOneThird * (hxy * fx + hyy * fy + hyz * fz)
            + hxx * by_xx + hyy * by_yy + hzz * by_zz
            + kTwo * (hxy * by_xy + hxz * by_xz + hyz * by_yz);
        Gamz_rhs[p] += kTwoThirds * gam_z * div_b
            - gam_x * dbz_x - gam_y * dbz_y - gam_z * dbz_z
            + kOneThird * (hxz * fx + hyz * fy + hzz * fz)
            + hxx * bz_xx + hyy * bz_yy + hzz * bz_zz
            + kTwo * (hxy * bz_xy + hxz * bz_xz + hyz * bz_yz);

        /* metric components, loaded only now as in the original ordering */
        const double m11 = gxx[p], m12 = gxy[p], m13 = gxz[p];
        const double m22 = gyy[p], m23 = gyz[p], m33 = gzz[p];

        /* one metric row (ga, gb, gc) applied to the (x, y, z) upper index */
        auto lower = [](double ga, double gb, double gc,
                        double vx, double vy, double vz) {
            return ga * vx + gb * vy + gc * vz;
        };

        /* row x: g_{x d} G^d_{bc} */
        o_gxxx[p] = lower(m11, m12, m13, Cx_xx, Cy_xx, Cz_xx);
        o_gxyx[p] = lower(m11, m12, m13, Cx_xy, Cy_xy, Cz_xy);
        o_gxzx[p] = lower(m11, m12, m13, Cx_xz, Cy_xz, Cz_xz);
        o_gyyx[p] = lower(m11, m12, m13, Cx_yy, Cy_yy, Cz_yy);
        o_gyzx[p] = lower(m11, m12, m13, Cx_yz, Cy_yz, Cz_yz);
        o_gzzx[p] = lower(m11, m12, m13, Cx_zz, Cy_zz, Cz_zz);

        /* row y: g_{y d} G^d_{bc} */
        o_gxxy[p] = lower(m12, m22, m23, Cx_xx, Cy_xx, Cz_xx);
        o_gxyy[p] = lower(m12, m22, m23, Cx_xy, Cy_xy, Cz_xy);
        o_gxzy[p] = lower(m12, m22, m23, Cx_xz, Cy_xz, Cz_xz);
        o_gyyy[p] = lower(m12, m22, m23, Cx_yy, Cy_yy, Cz_yy);
        o_gyzy[p] = lower(m12, m22, m23, Cx_yz, Cy_yz, Cz_yz);
        o_gzzy[p] = lower(m12, m22, m23, Cx_zz, Cy_zz, Cz_zz);

        /* row z: g_{z d} G^d_{bc} */
        o_gxxz[p] = lower(m13, m23, m33, Cx_xx, Cy_xx, Cz_xx);
        o_gxyz[p] = lower(m13, m23, m33, Cx_xy, Cy_xy, Cz_xy);
        o_gxzz[p] = lower(m13, m23, m33, Cx_xz, Cy_xz, Cz_xz);
        o_gyyz[p] = lower(m13, m23, m33, Cx_yy, Cy_yy, Cz_yy);
        o_gyzz[p] = lower(m13, m23, m33, Cx_yz, Cy_yz, Cz_yz);
        o_gzzz[p] = lower(m13, m23, m33, Cx_zz, Cy_zz, Cz_zz);
    }
}

/* Phase 10: contract six symmetric second-derivative arrays with gup^{ij}.
 *
 *   R_comp = gup^xx*fxx + gup^yy*fyy + gup^zz*fzz
 *          + 2*(gup^xy*fxy + gup^xz*fxz + gup^yz*fyz)
 *
 * Intended to run after fdderivs has filled fxx..fzz for one metric component.
 * R_comp is overwritten (its previous contents are not read).
 * Launch: 1-D grid-stride loop bounded by d_gp.all.
 */
__global__ void kern_phase10_ricci_contract(
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ fxx, const double* __restrict__ fxy,
    const double* __restrict__ fxz, const double* __restrict__ fyy,
    const double* __restrict__ fyz, const double* __restrict__ fzz,
    double* __restrict__ R_comp)
{
    const int first = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int stride = blockDim.x * gridDim.x;

    for (int p = first; p < d_gp.all; p += stride) {
        /* inverse-metric weights */
        const double hxx = gupxx[p], hyy = gupyy[p], hzz = gupzz[p];
        const double hxy = gupxy[p], hxz = gupxz[p], hyz = gupyz[p];
        /* second derivatives of the field being contracted */
        const double dxx = fxx[p], dyy = fyy[p], dzz = fzz[p];
        const double dxy = fxy[p], dxz = fxz[p], dyz = fyz[p];

        /* diagonal terms once, off-diagonal terms twice (symmetry) */
        R_comp[p] = hxx * dxx + hyy * dyy + hzz * dzz
                  + 2.0 * (hxy * dxy + hxz * dxz + hyz * dyz);
    }
}

/* Phase 11a: Ricci diagonal assembly (Rxx, Ryy, Rzz).
 *
 * For every index i in [0, d_gp.all) each output is updated IN PLACE:
 *
 *   R_aa[i] = -0.5 * R_aa[i]
 *           + (g_ax, g_ay, g_az) . (Gamx<a>, Gamy<a>, Gamz<a>)
 *           + (Gamxa, Gamya, Gamza) . (lx<a><a>, ly<a><a>, lz<a><a> family)
 *           + sum over the six gup components of G??? * l??? products,
 *             where each gup term is 2*(first group) + (second group(s)).
 *
 * Rxx/Ryy/Rzz are both input and output; every other array is only read.
 *
 * Launch: 1-D grid-stride loop, so any 1-D grid size covers all points.
 * __launch_bounds__(128, 4) declares at most 128 threads per block.
 *
 * NOTE(review): on entry R_aa presumably holds the phase-10 contraction
 * (gup^{kl} d_k d_l g_aa) -- confirm at the launch site.
 * NOTE(review): Gam<a><b> look like first derivatives d_b of the connection
 * function Gamma^a, and the l-arrays like the o_g* outputs of the lowering
 * step (l<bc><a> = g_{a d} G^d_{bc}, last letter = lowered index) -- confirm
 * against the caller's argument order.
 */
__global__ __launch_bounds__(128, 4)
void kern_phase11_ricci_diag(
    const double* __restrict__ gxx, const double* __restrict__ gxy,
    const double* __restrict__ gxz, const double* __restrict__ gyy,
    const double* __restrict__ gyz, const double* __restrict__ gzz,
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ Gamxa, const double* __restrict__ Gamya,
    const double* __restrict__ Gamza,
    const double* __restrict__ Gamxx, const double* __restrict__ Gamxy,
    const double* __restrict__ Gamxz,
    const double* __restrict__ Gamyx, const double* __restrict__ Gamyy_d,
    const double* __restrict__ Gamyz_d,
    const double* __restrict__ Gamzx, const double* __restrict__ Gamzy,
    const double* __restrict__ Gamzz_d,
    const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
    const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
    const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
    const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
    const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
    const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
    const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
    const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
    const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
    /* lowered Christoffel products */
    const double* __restrict__ lxxx, const double* __restrict__ lxyx,
    const double* __restrict__ lxzx, const double* __restrict__ lyyx,
    const double* __restrict__ lyzx, const double* __restrict__ lzzx,
    const double* __restrict__ lxxy, const double* __restrict__ lxyy,
    const double* __restrict__ lxzy, const double* __restrict__ lyyy,
    const double* __restrict__ lyzy, const double* __restrict__ lzzy,
    const double* __restrict__ lxxz, const double* __restrict__ lxyz,
    const double* __restrict__ lxzz, const double* __restrict__ lyyz,
    const double* __restrict__ lyzz, const double* __restrict__ lzzz,
    double* __restrict__ Rxx, double* __restrict__ Ryy, double* __restrict__ Rzz)
{
    const double H = 0.5, TWO = 2.0;
    /* grid-stride loop: each thread handles i, i + gridsize, ... below d_gp.all */
    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all; i += blockDim.x*gridDim.x) {
        /* the six gup components are the only values cached in locals;
         * everything else is indexed directly from the read-only arrays */
        double uxx=gupxx[i],uxy=gupxy[i],uxz=gupxz[i];
        double uyy=gupyy[i],uyz=gupyz[i],uzz=gupzz[i];
        /* Rxx: old value scaled by -1/2, then the metric/Gam and Gam*a/l dot
         * products, then one bracket per gup component */
        Rxx[i] = -H*Rxx[i]
            + gxx[i]*Gamxx[i]+gxy[i]*Gamyx[i]+gxz[i]*Gamzx[i]
            + Gamxa[i]*lxxx[i]+Gamya[i]*lxyx[i]+Gamza[i]*lxzx[i]
            + uxx*(TWO*(Gxxx[i]*lxxx[i]+Gyxx[i]*lxyx[i]+Gzxx[i]*lxzx[i])
                  +(Gxxx[i]*lxxx[i]+Gyxx[i]*lxxy[i]+Gzxx[i]*lxxz[i]))
            + uxy*(TWO*(Gxxx[i]*lxyx[i]+Gyxx[i]*lyyx[i]+Gzxx[i]*lyzx[i]
                       +Gxxy[i]*lxxx[i]+Gyxy[i]*lxyx[i]+Gzxy[i]*lxzx[i])
                  +(Gxxy[i]*lxxx[i]+Gyxy[i]*lxxy[i]+Gzxy[i]*lxxz[i])
                  +(Gxxx[i]*lxyx[i]+Gyxx[i]*lxyy[i]+Gzxx[i]*lxyz[i]))
            + uxz*(TWO*(Gxxx[i]*lxzx[i]+Gyxx[i]*lyzx[i]+Gzxx[i]*lzzx[i]
                       +Gxxz[i]*lxxx[i]+Gyxz[i]*lxyx[i]+Gzxz[i]*lxzx[i])
                  +(Gxxz[i]*lxxx[i]+Gyxz[i]*lxxy[i]+Gzxz[i]*lxxz[i])
                  +(Gxxx[i]*lxzx[i]+Gyxx[i]*lxzy[i]+Gzxx[i]*lxzz[i]))
            + uyy*(TWO*(Gxxy[i]*lxyx[i]+Gyxy[i]*lyyx[i]+Gzxy[i]*lyzx[i])
                  +(Gxxy[i]*lxyx[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxyz[i]))
            + uyz*(TWO*(Gxxy[i]*lxzx[i]+Gyxy[i]*lyzx[i]+Gzxy[i]*lzzx[i]
                       +Gxxz[i]*lxyx[i]+Gyxz[i]*lyyx[i]+Gzxz[i]*lyzx[i])
                  +(Gxxz[i]*lxyx[i]+Gyxz[i]*lxyy[i]+Gzxz[i]*lxyz[i])
                  +(Gxxy[i]*lxzx[i]+Gyxy[i]*lxzy[i]+Gzxy[i]*lxzz[i]))
            + uzz*(TWO*(Gxxz[i]*lxzx[i]+Gyxz[i]*lyzx[i]+Gzxz[i]*lzzx[i])
                  +(Gxxz[i]*lxzx[i]+Gyxz[i]*lxzy[i]+Gzxz[i]*lxzz[i]));

        /* Ryy: same structure as Rxx with the free index pair (y,y) */
        Ryy[i] = -H*Ryy[i]
            + gxy[i]*Gamxy[i]+gyy[i]*Gamyy_d[i]+gyz[i]*Gamzy[i]
            + Gamxa[i]*lxyy[i]+Gamya[i]*lyyy[i]+Gamza[i]*lyzy[i]
            + uxx*(TWO*(Gxxy[i]*lxxy[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxzy[i])
                  +(Gxxy[i]*lxyx[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxyz[i]))
            + uxy*(TWO*(Gxxy[i]*lxyy[i]+Gyxy[i]*lyyy[i]+Gzxy[i]*lyzy[i]
                       +Gxyy[i]*lxxy[i]+Gyyy[i]*lxyy[i]+Gzyy[i]*lxzy[i])
                  +(Gxyy[i]*lxyx[i]+Gyyy[i]*lxyy[i]+Gzyy[i]*lxyz[i])
                  +(Gxxy[i]*lyyx[i]+Gyxy[i]*lyyy[i]+Gzxy[i]*lyyz[i]))
            + uxz*(TWO*(Gxxy[i]*lxzy[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lzzy[i]
                       +Gxyz[i]*lxxy[i]+Gyyz[i]*lxyy[i]+Gzyz[i]*lxzy[i])
                  +(Gxyz[i]*lxyx[i]+Gyyz[i]*lxyy[i]+Gzyz[i]*lxyz[i])
                  +(Gxxy[i]*lyzx[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lyzz[i]))
            + uyy*(TWO*(Gxyy[i]*lxyy[i]+Gyyy[i]*lyyy[i]+Gzyy[i]*lyzy[i])
                  +(Gxyy[i]*lyyx[i]+Gyyy[i]*lyyy[i]+Gzyy[i]*lyyz[i]))
            + uyz*(TWO*(Gxyy[i]*lxzy[i]+Gyyy[i]*lyzy[i]+Gzyy[i]*lzzy[i]
                       +Gxyz[i]*lxyy[i]+Gyyz[i]*lyyy[i]+Gzyz[i]*lyzy[i])
                  +(Gxyz[i]*lyyx[i]+Gyyz[i]*lyyy[i]+Gzyz[i]*lyyz[i])
                  +(Gxyy[i]*lyzx[i]+Gyyy[i]*lyzy[i]+Gzyy[i]*lyzz[i]))
            + uzz*(TWO*(Gxyz[i]*lxzy[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lzzy[i])
                  +(Gxyz[i]*lyzx[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lyzz[i]));

        /* Rzz: same structure as Rxx with the free index pair (z,z) */
        Rzz[i] = -H*Rzz[i]
            + gxz[i]*Gamxz[i]+gyz[i]*Gamyz_d[i]+gzz[i]*Gamzz_d[i]
            + Gamxa[i]*lxzz[i]+Gamya[i]*lyzz[i]+Gamza[i]*lzzz[i]
            + uxx*(TWO*(Gxxz[i]*lxxz[i]+Gyxz[i]*lxyz[i]+Gzxz[i]*lxzz[i])
                  +(Gxxz[i]*lxzx[i]+Gyxz[i]*lxzy[i]+Gzxz[i]*lxzz[i]))
            + uxy*(TWO*(Gxxz[i]*lxyz[i]+Gyxz[i]*lyyz[i]+Gzxz[i]*lyzz[i]
                       +Gxyz[i]*lxxz[i]+Gyyz[i]*lxyz[i]+Gzyz[i]*lxzz[i])
                  +(Gxyz[i]*lxzx[i]+Gyyz[i]*lxzy[i]+Gzyz[i]*lxzz[i])
                  +(Gxxz[i]*lyzx[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lyzz[i]))
            + uxz*(TWO*(Gxxz[i]*lxzz[i]+Gyxz[i]*lyzz[i]+Gzxz[i]*lzzz[i]
                       +Gxzz[i]*lxxz[i]+Gyzz[i]*lxyz[i]+Gzzz[i]*lxzz[i])
                  +(Gxzz[i]*lxzx[i]+Gyzz[i]*lxzy[i]+Gzzz[i]*lxzz[i])
                  +(Gxxz[i]*lzzx[i]+Gyxz[i]*lzzy[i]+Gzxz[i]*lzzz[i]))
            + uyy*(TWO*(Gxyz[i]*lxyz[i]+Gyyz[i]*lyyz[i]+Gzyz[i]*lyzz[i])
                  +(Gxyz[i]*lyzx[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lyzz[i]))
            + uyz*(TWO*(Gxyz[i]*lxzz[i]+Gyyz[i]*lyzz[i]+Gzyz[i]*lzzz[i]
                       +Gxzz[i]*lxyz[i]+Gyzz[i]*lyyz[i]+Gzzz[i]*lyzz[i])
                  +(Gxzz[i]*lyzx[i]+Gyzz[i]*lyzy[i]+Gzzz[i]*lyzz[i])
                  +(Gxyz[i]*lzzx[i]+Gyyz[i]*lzzy[i]+Gzyz[i]*lzzz[i]))
            + uzz*(TWO*(Gxzz[i]*lxzz[i]+Gyzz[i]*lyzz[i]+Gzzz[i]*lzzz[i])
                  +(Gxzz[i]*lzzx[i]+Gyzz[i]*lzzy[i]+Gzzz[i]*lzzz[i]));
    }
}

/* Phase 11b: Ricci off-diagonal assembly (Rxy, Rxz, Ryz).
 *
 * For every index i in [0, d_gp.all) each output is updated IN PLACE:
 *
 *   R_ab[i] = 0.5 * ( -R_ab[i]
 *                     + g_a. . Gam.<b>  +  g_b. . Gam.<a>
 *                     + Gam*a . l(..<a> family) + Gam*a . l(..<b> family) )
 *           + sum over the six gup components of G??? * l??? products.
 *
 * Note the 0.5 factor covers only the first parenthesised group; the
 * gup-weighted brackets are added with unit weight (diagonal gup terms carry
 * three product triples, off-diagonal gup terms carry six).
 *
 * Rxy/Rxz/Ryz are both input and output; every other array is only read.
 *
 * Launch: 1-D grid-stride loop, so any 1-D grid size covers all points.
 * __launch_bounds__(128, 4) declares at most 128 threads per block.
 *
 * NOTE(review): on entry R_ab presumably holds the phase-10 contraction
 * (gup^{kl} d_k d_l g_ab) -- confirm at the launch site.
 * NOTE(review): Gam<a><b> look like first derivatives d_b of the connection
 * function Gamma^a, and the l-arrays like the o_g* outputs of the lowering
 * step (l<bc><a> = g_{a d} G^d_{bc}, last letter = lowered index) -- confirm
 * against the caller's argument order.
 */
__global__ __launch_bounds__(128, 4)
void kern_phase11_ricci_offdiag(
    const double* __restrict__ gxx, const double* __restrict__ gxy,
    const double* __restrict__ gxz, const double* __restrict__ gyy,
    const double* __restrict__ gyz, const double* __restrict__ gzz,
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ Gamxa, const double* __restrict__ Gamya,
    const double* __restrict__ Gamza,
    const double* __restrict__ Gamxx, const double* __restrict__ Gamxy,
    const double* __restrict__ Gamxz,
    const double* __restrict__ Gamyx, const double* __restrict__ Gamyy_d,
    const double* __restrict__ Gamyz_d,
    const double* __restrict__ Gamzx, const double* __restrict__ Gamzy,
    const double* __restrict__ Gamzz_d,
    const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
    const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
    const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
    const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
    const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
    const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
    const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
    const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
    const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
    const double* __restrict__ lxxx, const double* __restrict__ lxyx,
    const double* __restrict__ lxzx, const double* __restrict__ lyyx,
    const double* __restrict__ lyzx, const double* __restrict__ lzzx,
    const double* __restrict__ lxxy, const double* __restrict__ lxyy,
    const double* __restrict__ lxzy, const double* __restrict__ lyyy,
    const double* __restrict__ lyzy, const double* __restrict__ lzzy,
    const double* __restrict__ lxxz, const double* __restrict__ lxyz,
    const double* __restrict__ lxzz, const double* __restrict__ lyyz,
    const double* __restrict__ lyzz, const double* __restrict__ lzzz,
    double* __restrict__ Rxy, double* __restrict__ Rxz, double* __restrict__ Ryz)
{
    /* TWO is declared but not referenced in this kernel's formulas */
    const double H = 0.5, TWO = 2.0;
    /* grid-stride loop: each thread handles i, i + gridsize, ... below d_gp.all */
    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all; i += blockDim.x*gridDim.x) {
        /* the six gup components are the only values cached in locals */
        double uxx=gupxx[i],uxy=gupxy[i],uxz=gupxz[i];
        double uyy=gupyy[i],uyz=gupyz[i],uzz=gupzz[i];

        /* Rxy: H*(...) closes after the Gam*a lines; gup brackets are outside it */
        Rxy[i] = H*(
            -Rxy[i]
            +gxx[i]*Gamxy[i]+gxy[i]*Gamyy_d[i]+gxz[i]*Gamzy[i]
            +gxy[i]*Gamxx[i]+gyy[i]*Gamyx[i]+gyz[i]*Gamzx[i]
            +Gamxa[i]*lxyx[i]+Gamya[i]*lyyx[i]+Gamza[i]*lyzx[i]
            +Gamxa[i]*lxxy[i]+Gamya[i]*lxyy[i]+Gamza[i]*lxzy[i])
            +uxx*(Gxxx[i]*lxxy[i]+Gyxx[i]*lxyy[i]+Gzxx[i]*lxzy[i]
                  +Gxxy[i]*lxxx[i]+Gyxy[i]*lxyx[i]+Gzxy[i]*lxzx[i]
                  +Gxxx[i]*lxyx[i]+Gyxx[i]*lxyy[i]+Gzxx[i]*lxyz[i])
            +uxy*(Gxxx[i]*lxyy[i]+Gyxx[i]*lyyy[i]+Gzxx[i]*lyzy[i]
                  +Gxxy[i]*lxyx[i]+Gyxy[i]*lyyx[i]+Gzxy[i]*lyzx[i]
                  +Gxxy[i]*lxyx[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxyz[i]
                  +Gxxy[i]*lxxy[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxzy[i]
                  +Gxyy[i]*lxxx[i]+Gyyy[i]*lxyx[i]+Gzyy[i]*lxzx[i]
                  +Gxxx[i]*lyyx[i]+Gyxx[i]*lyyy[i]+Gzxx[i]*lyyz[i])
            +uxz*(Gxxx[i]*lxzy[i]+Gyxx[i]*lyzy[i]+Gzxx[i]*lzzy[i]
                  +Gxxy[i]*lxzx[i]+Gyxy[i]*lyzx[i]+Gzxy[i]*lzzx[i]
                  +Gxxz[i]*lxyx[i]+Gyxz[i]*lxyy[i]+Gzxz[i]*lxyz[i]
                  +Gxxz[i]*lxxy[i]+Gyxz[i]*lxyy[i]+Gzxz[i]*lxzy[i]
                  +Gxyz[i]*lxxx[i]+Gyyz[i]*lxyx[i]+Gzyz[i]*lxzx[i]
                  +Gxxx[i]*lyzx[i]+Gyxx[i]*lyzy[i]+Gzxx[i]*lyzz[i])
            +uyy*(Gxxy[i]*lxyy[i]+Gyxy[i]*lyyy[i]+Gzxy[i]*lyzy[i]
                  +Gxyy[i]*lxyx[i]+Gyyy[i]*lyyx[i]+Gzyy[i]*lyzx[i]
                  +Gxxy[i]*lyyx[i]+Gyxy[i]*lyyy[i]+Gzxy[i]*lyyz[i])
            +uyz*(Gxxy[i]*lxzy[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lzzy[i]
                  +Gxyy[i]*lxzx[i]+Gyyy[i]*lyzx[i]+Gzyy[i]*lzzx[i]
                  +Gxxz[i]*lyyx[i]+Gyxz[i]*lyyy[i]+Gzxz[i]*lyyz[i]
                  +Gxxz[i]*lxyy[i]+Gyxz[i]*lyyy[i]+Gzxz[i]*lyzy[i]
                  +Gxyz[i]*lxyx[i]+Gyyz[i]*lyyx[i]+Gzyz[i]*lyzx[i]
                  +Gxxy[i]*lyzx[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lyzz[i])
            +uzz*(Gxxz[i]*lxzy[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lzzy[i]
                  +Gxyz[i]*lxzx[i]+Gyyz[i]*lyzx[i]+Gzyz[i]*lzzx[i]
                  +Gxxz[i]*lyzx[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lyzz[i]);

        /* Rxz: same structure as Rxy with the free index pair (x,z) */
        Rxz[i] = H*(
            -Rxz[i]
            +gxx[i]*Gamxz[i]+gxy[i]*Gamyz_d[i]+gxz[i]*Gamzz_d[i]
            +gxz[i]*Gamxx[i]+gyz[i]*Gamyx[i]+gzz[i]*Gamzx[i]
            +Gamxa[i]*lxzx[i]+Gamya[i]*lyzx[i]+Gamza[i]*lzzx[i]
            +Gamxa[i]*lxxz[i]+Gamya[i]*lxyz[i]+Gamza[i]*lxzz[i])
            +uxx*(Gxxx[i]*lxxz[i]+Gyxx[i]*lxyz[i]+Gzxx[i]*lxzz[i]
                  +Gxxz[i]*lxxx[i]+Gyxz[i]*lxyx[i]+Gzxz[i]*lxzx[i]
                  +Gxxx[i]*lxzx[i]+Gyxx[i]*lxzy[i]+Gzxx[i]*lxzz[i])
            +uxy*(Gxxx[i]*lxyz[i]+Gyxx[i]*lyyz[i]+Gzxx[i]*lyzz[i]
                  +Gxxz[i]*lxyx[i]+Gyxz[i]*lyyx[i]+Gzxz[i]*lyzx[i]
                  +Gxxy[i]*lxzx[i]+Gyxy[i]*lxzy[i]+Gzxy[i]*lxzz[i]
                  +Gxxy[i]*lxxz[i]+Gyxy[i]*lxyz[i]+Gzxy[i]*lxzz[i]
                  +Gxyz[i]*lxxx[i]+Gyyz[i]*lxyx[i]+Gzyz[i]*lxzx[i]
                  +Gxxx[i]*lyzx[i]+Gyxx[i]*lyzy[i]+Gzxx[i]*lyzz[i])
            +uxz*(Gxxx[i]*lxzz[i]+Gyxx[i]*lyzz[i]+Gzxx[i]*lzzz[i]
                  +Gxxz[i]*lxzx[i]+Gyxz[i]*lyzx[i]+Gzxz[i]*lzzx[i]
                  +Gxxz[i]*lxzx[i]+Gyxz[i]*lxzy[i]+Gzxz[i]*lxzz[i]
                  +Gxxz[i]*lxxz[i]+Gyxz[i]*lxyz[i]+Gzxz[i]*lxzz[i]
                  +Gxzz[i]*lxxx[i]+Gyzz[i]*lxyx[i]+Gzzz[i]*lxzx[i]
                  +Gxxx[i]*lzzx[i]+Gyxx[i]*lzzy[i]+Gzxx[i]*lzzz[i])
            +uyy*(Gxxy[i]*lxyz[i]+Gyxy[i]*lyyz[i]+Gzxy[i]*lyzz[i]
                  +Gxyz[i]*lxyx[i]+Gyyz[i]*lyyx[i]+Gzyz[i]*lyzx[i]
                  +Gxxy[i]*lyzx[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lyzz[i])
            +uyz*(Gxxy[i]*lxzz[i]+Gyxy[i]*lyzz[i]+Gzxy[i]*lzzz[i]
                  +Gxyz[i]*lxzx[i]+Gyyz[i]*lyzx[i]+Gzyz[i]*lzzx[i]
                  +Gxxz[i]*lyzx[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lyzz[i]
                  +Gxxz[i]*lxyz[i]+Gyxz[i]*lyyz[i]+Gzxz[i]*lyzz[i]
                  +Gxzz[i]*lxyx[i]+Gyzz[i]*lyyx[i]+Gzzz[i]*lyzx[i]
                  +Gxxy[i]*lzzx[i]+Gyxy[i]*lzzy[i]+Gzxy[i]*lzzz[i])
            +uzz*(Gxxz[i]*lxzz[i]+Gyxz[i]*lyzz[i]+Gzxz[i]*lzzz[i]
                  +Gxzz[i]*lxzx[i]+Gyzz[i]*lyzx[i]+Gzzz[i]*lzzx[i]
                  +Gxxz[i]*lzzx[i]+Gyxz[i]*lzzy[i]+Gzxz[i]*lzzz[i]);

        /* Ryz: same structure as Rxy with the free index pair (y,z) */
        Ryz[i] = H*(
            -Ryz[i]
            +gxy[i]*Gamxz[i]+gyy[i]*Gamyz_d[i]+gyz[i]*Gamzz_d[i]
            +gxz[i]*Gamxy[i]+gyz[i]*Gamyy_d[i]+gzz[i]*Gamzy[i]
            +Gamxa[i]*lxzy[i]+Gamya[i]*lyzy[i]+Gamza[i]*lzzy[i]
            +Gamxa[i]*lxyz[i]+Gamya[i]*lyyz[i]+Gamza[i]*lyzz[i])
            +uxx*(Gxxy[i]*lxxz[i]+Gyxy[i]*lxyz[i]+Gzxy[i]*lxzz[i]
                  +Gxxz[i]*lxxy[i]+Gyxz[i]*lxyy[i]+Gzxz[i]*lxzy[i]
                  +Gxxy[i]*lxzx[i]+Gyxy[i]*lxzy[i]+Gzxy[i]*lxzz[i])
            +uxy*(Gxxy[i]*lxyz[i]+Gyxy[i]*lyyz[i]+Gzxy[i]*lyzz[i]
                  +Gxxz[i]*lxyy[i]+Gyxz[i]*lyyy[i]+Gzxz[i]*lyzy[i]
                  +Gxyy[i]*lxzx[i]+Gyyy[i]*lxzy[i]+Gzyy[i]*lxzz[i]
                  +Gxyy[i]*lxxz[i]+Gyyy[i]*lxyz[i]+Gzyy[i]*lxzz[i]
                  +Gxyz[i]*lxxy[i]+Gyyz[i]*lxyy[i]+Gzyz[i]*lxzy[i]
                  +Gxxy[i]*lyzx[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lyzz[i])
            +uxz*(Gxxy[i]*lxzz[i]+Gyxy[i]*lyzz[i]+Gzxy[i]*lzzz[i]
                  +Gxxz[i]*lxzy[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lzzy[i]
                  +Gxyz[i]*lxzx[i]+Gyyz[i]*lxzy[i]+Gzyz[i]*lxzz[i]
                  +Gxyz[i]*lxxz[i]+Gyyz[i]*lxyz[i]+Gzyz[i]*lxzz[i]
                  +Gxzz[i]*lxxy[i]+Gyzz[i]*lxyy[i]+Gzzz[i]*lxzy[i]
                  +Gxxy[i]*lzzx[i]+Gyxy[i]*lzzy[i]+Gzxy[i]*lzzz[i])
            +uyy*(Gxyy[i]*lxyz[i]+Gyyy[i]*lyyz[i]+Gzyy[i]*lyzz[i]
                  +Gxyz[i]*lxyy[i]+Gyyz[i]*lyyy[i]+Gzyz[i]*lyzy[i]
                  +Gxyy[i]*lyzx[i]+Gyyy[i]*lyzy[i]+Gzyy[i]*lyzz[i])
            +uyz*(Gxyy[i]*lxzz[i]+Gyyy[i]*lyzz[i]+Gzyy[i]*lzzz[i]
                  +Gxyz[i]*lxzy[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lzzy[i]
                  +Gxyz[i]*lyzx[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lyzz[i]
                  +Gxyz[i]*lxyz[i]+Gyyz[i]*lyyz[i]+Gzyz[i]*lyzz[i]
                  +Gxzz[i]*lxyy[i]+Gyzz[i]*lyyy[i]+Gzzz[i]*lyzy[i]
                  +Gxyy[i]*lzzx[i]+Gyyy[i]*lzzy[i]+Gzyy[i]*lzzz[i])
            +uzz*(Gxyz[i]*lxzz[i]+Gyyz[i]*lyzz[i]+Gzyz[i]*lzzz[i]
                  +Gxzz[i]*lxzy[i]+Gyzz[i]*lyzy[i]+Gzzz[i]*lzzy[i]
                  +Gxyz[i]*lzzx[i]+Gyyz[i]*lzzy[i]+Gzyz[i]*lzzz[i]);
    }
}

/* Phase 11: fused Ricci assembly (diag + off-diag) */
__global__ __launch_bounds__(128, 2)
void kern_phase11_ricci_fused(
    const double* __restrict__ gxx, const double* __restrict__ gxy,
    const double* __restrict__ gxz, const double* __restrict__ gyy,
    const double* __restrict__ gyz, const double* __restrict__ gzz,
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ Gamxa, const double* __restrict__ Gamya,
    const double* __restrict__ Gamza,
    const double* __restrict__ Gamxx, const double* __restrict__ Gamxy,
    const double* __restrict__ Gamxz,
    const double* __restrict__ Gamyx, const double* __restrict__ Gamyy_d,
    const double* __restrict__ Gamyz_d,
    const double* __restrict__ Gamzx, const double* __restrict__ Gamzy,
    const double* __restrict__ Gamzz_d,
    const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
    const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
    const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
    const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
    const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
    const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
    const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
    const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
    const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
    const double* __restrict__ lxxx, const double* __restrict__ lxyx,
    const double* __restrict__ lxzx, const double* __restrict__ lyyx,
    const double* __restrict__ lyzx, const double* __restrict__ lzzx,
    const double* __restrict__ lxxy, const double* __restrict__ lxyy,
    const double* __restrict__ lxzy, const double* __restrict__ lyyy,
    const double* __restrict__ lyzy, const double* __restrict__ lzzy,
    const double* __restrict__ lxxz, const double* __restrict__ lxyz,
    const double* __restrict__ lxzz, const double* __restrict__ lyyz,
    const double* __restrict__ lyzz, const double* __restrict__ lzzz,
    double* __restrict__ Rxx, double* __restrict__ Rxy, double* __restrict__ Rxz,
    double* __restrict__ Ryy, double* __restrict__ Ryz, double* __restrict__ Rzz)
{
    const double H = 0.5, TWO = 2.0;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < d_gp.all; i += blockDim.x * gridDim.x) {
        double uxx = gupxx[i], uxy = gupxy[i], uxz = gupxz[i];
        double uyy = gupyy[i], uyz = gupyz[i], uzz = gupzz[i];

        Rxx[i] = -H*Rxx[i]
            + gxx[i]*Gamxx[i]+gxy[i]*Gamyx[i]+gxz[i]*Gamzx[i]
            + Gamxa[i]*lxxx[i]+Gamya[i]*lxyx[i]+Gamza[i]*lxzx[i]
            + uxx*(TWO*(Gxxx[i]*lxxx[i]+Gyxx[i]*lxyx[i]+Gzxx[i]*lxzx[i])
                  +(Gxxx[i]*lxxx[i]+Gyxx[i]*lxxy[i]+Gzxx[i]*lxxz[i]))
            + uxy*(TWO*(Gxxx[i]*lxyx[i]+Gyxx[i]*lyyx[i]+Gzxx[i]*lyzx[i]
                       +Gxxy[i]*lxxx[i]+Gyxy[i]*lxyx[i]+Gzxy[i]*lxzx[i])
                  +(Gxxy[i]*lxxx[i]+Gyxy[i]*lxxy[i]+Gzxy[i]*lxxz[i])
                  +(Gxxx[i]*lxyx[i]+Gyxx[i]*lxyy[i]+Gzxx[i]*lxyz[i]))
            + uxz*(TWO*(Gxxx[i]*lxzx[i]+Gyxx[i]*lyzx[i]+Gzxx[i]*lzzx[i]
                       +Gxxz[i]*lxxx[i]+Gyxz[i]*lxyx[i]+Gzxz[i]*lxzx[i])
                  +(Gxxz[i]*lxxx[i]+Gyxz[i]*lxxy[i]+Gzxz[i]*lxxz[i])
                  +(Gxxx[i]*lxzx[i]+Gyxx[i]*lxzy[i]+Gzxx[i]*lxzz[i]))
            + uyy*(TWO*(Gxxy[i]*lxyx[i]+Gyxy[i]*lyyx[i]+Gzxy[i]*lyzx[i])
                  +(Gxxy[i]*lxyx[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxyz[i]))
            + uyz*(TWO*(Gxxy[i]*lxzx[i]+Gyxy[i]*lyzx[i]+Gzxy[i]*lzzx[i]
                       +Gxxz[i]*lxyx[i]+Gyxz[i]*lyyx[i]+Gzxz[i]*lyzx[i])
                  +(Gxxz[i]*lxyx[i]+Gyxz[i]*lxyy[i]+Gzxz[i]*lxyz[i])
                  +(Gxxy[i]*lxzx[i]+Gyxy[i]*lxzy[i]+Gzxy[i]*lxzz[i]))
            + uzz*(TWO*(Gxxz[i]*lxzx[i]+Gyxz[i]*lyzx[i]+Gzxz[i]*lzzx[i])
                  +(Gxxz[i]*lxzx[i]+Gyxz[i]*lxzy[i]+Gzxz[i]*lxzz[i]));

        Ryy[i] = -H*Ryy[i]
            + gxy[i]*Gamxy[i]+gyy[i]*Gamyy_d[i]+gyz[i]*Gamzy[i]
            + Gamxa[i]*lxyy[i]+Gamya[i]*lyyy[i]+Gamza[i]*lyzy[i]
            + uxx*(TWO*(Gxxy[i]*lxxy[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxzy[i])
                  +(Gxxy[i]*lxyx[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxyz[i]))
            + uxy*(TWO*(Gxxy[i]*lxyy[i]+Gyxy[i]*lyyy[i]+Gzxy[i]*lyzy[i]
                       +Gxyy[i]*lxxy[i]+Gyyy[i]*lxyy[i]+Gzyy[i]*lxzy[i])
                  +(Gxyy[i]*lxyx[i]+Gyyy[i]*lxyy[i]+Gzyy[i]*lxyz[i])
                  +(Gxxy[i]*lyyx[i]+Gyxy[i]*lyyy[i]+Gzxy[i]*lyyz[i]))
            + uxz*(TWO*(Gxxy[i]*lxzy[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lzzy[i]
                       +Gxyz[i]*lxxy[i]+Gyyz[i]*lxyy[i]+Gzyz[i]*lxzy[i])
                  +(Gxyz[i]*lxyx[i]+Gyyz[i]*lxyy[i]+Gzyz[i]*lxyz[i])
                  +(Gxxy[i]*lyzx[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lyzz[i]))
            + uyy*(TWO*(Gxyy[i]*lxyy[i]+Gyyy[i]*lyyy[i]+Gzyy[i]*lyzy[i])
                  +(Gxyy[i]*lyyx[i]+Gyyy[i]*lyyy[i]+Gzyy[i]*lyyz[i]))
            + uyz*(TWO*(Gxyy[i]*lxzy[i]+Gyyy[i]*lyzy[i]+Gzyy[i]*lzzy[i]
                       +Gxyz[i]*lxyy[i]+Gyyz[i]*lyyy[i]+Gzyz[i]*lyzy[i])
                  +(Gxyz[i]*lyyx[i]+Gyyz[i]*lyyy[i]+Gzyz[i]*lyyz[i])
                  +(Gxyy[i]*lyzx[i]+Gyyy[i]*lyzy[i]+Gzyy[i]*lyzz[i]))
            + uzz*(TWO*(Gxyz[i]*lxzy[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lzzy[i])
                  +(Gxyz[i]*lyzx[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lyzz[i]));

        Rzz[i] = -H*Rzz[i]
            + gxz[i]*Gamxz[i]+gyz[i]*Gamyz_d[i]+gzz[i]*Gamzz_d[i]
            + Gamxa[i]*lxzz[i]+Gamya[i]*lyzz[i]+Gamza[i]*lzzz[i]
            + uxx*(TWO*(Gxxz[i]*lxxz[i]+Gyxz[i]*lxyz[i]+Gzxz[i]*lxzz[i])
                  +(Gxxz[i]*lxzx[i]+Gyxz[i]*lxzy[i]+Gzxz[i]*lxzz[i]))
            + uxy*(TWO*(Gxxz[i]*lxyz[i]+Gyxz[i]*lyyz[i]+Gzxz[i]*lyzz[i]
                       +Gxyz[i]*lxxz[i]+Gyyz[i]*lxyz[i]+Gzyz[i]*lxzz[i])
                  +(Gxyz[i]*lxzx[i]+Gyyz[i]*lxzy[i]+Gzyz[i]*lxzz[i])
                  +(Gxxz[i]*lyzx[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lyzz[i]))
            + uxz*(TWO*(Gxxz[i]*lxzz[i]+Gyxz[i]*lyzz[i]+Gzxz[i]*lzzz[i]
                       +Gxzz[i]*lxxz[i]+Gyzz[i]*lxyz[i]+Gzzz[i]*lxzz[i])
                  +(Gxzz[i]*lxzx[i]+Gyzz[i]*lxzy[i]+Gzzz[i]*lxzz[i])
                  +(Gxxz[i]*lzzx[i]+Gyxz[i]*lzzy[i]+Gzxz[i]*lzzz[i]))
            + uyy*(TWO*(Gxyz[i]*lxyz[i]+Gyyz[i]*lyyz[i]+Gzyz[i]*lyzz[i])
                  +(Gxyz[i]*lyzx[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lyzz[i]))
            + uyz*(TWO*(Gxyz[i]*lxzz[i]+Gyyz[i]*lyzz[i]+Gzyz[i]*lzzz[i]
                       +Gxzz[i]*lxyz[i]+Gyzz[i]*lyyz[i]+Gzzz[i]*lyzz[i])
                  +(Gxzz[i]*lyzx[i]+Gyzz[i]*lyzy[i]+Gzzz[i]*lyzz[i])
                  +(Gxyz[i]*lzzx[i]+Gyyz[i]*lzzy[i]+Gzyz[i]*lzzz[i]))
            + uzz*(TWO*(Gxzz[i]*lxzz[i]+Gyzz[i]*lyzz[i]+Gzzz[i]*lzzz[i])
                  +(Gxzz[i]*lzzx[i]+Gyzz[i]*lzzy[i]+Gzzz[i]*lzzz[i]));

        Rxy[i] = H*(
            -Rxy[i]
            +gxx[i]*Gamxy[i]+gxy[i]*Gamyy_d[i]+gxz[i]*Gamzy[i]
            +gxy[i]*Gamxx[i]+gyy[i]*Gamyx[i]+gyz[i]*Gamzx[i]
            +Gamxa[i]*lxyx[i]+Gamya[i]*lyyx[i]+Gamza[i]*lyzx[i]
            +Gamxa[i]*lxxy[i]+Gamya[i]*lxyy[i]+Gamza[i]*lxzy[i])
            +uxx*(Gxxx[i]*lxxy[i]+Gyxx[i]*lxyy[i]+Gzxx[i]*lxzy[i]
                  +Gxxy[i]*lxxx[i]+Gyxy[i]*lxyx[i]+Gzxy[i]*lxzx[i]
                  +Gxxx[i]*lxyx[i]+Gyxx[i]*lxyy[i]+Gzxx[i]*lxyz[i])
            +uxy*(Gxxx[i]*lxyy[i]+Gyxx[i]*lyyy[i]+Gzxx[i]*lyzy[i]
                  +Gxxy[i]*lxyx[i]+Gyxy[i]*lyyx[i]+Gzxy[i]*lyzx[i]
                  +Gxxy[i]*lxyx[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxyz[i]
                  +Gxxy[i]*lxxy[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxzy[i]
                  +Gxyy[i]*lxxx[i]+Gyyy[i]*lxyx[i]+Gzyy[i]*lxzx[i]
                  +Gxxx[i]*lyyx[i]+Gyxx[i]*lyyy[i]+Gzxx[i]*lyyz[i])
            +uxz*(Gxxx[i]*lxzy[i]+Gyxx[i]*lyzy[i]+Gzxx[i]*lzzy[i]
                  +Gxxy[i]*lxzx[i]+Gyxy[i]*lyzx[i]+Gzxy[i]*lzzx[i]
                  +Gxxz[i]*lxyx[i]+Gyxz[i]*lxyy[i]+Gzxz[i]*lxyz[i]
                  +Gxxz[i]*lxxy[i]+Gyxz[i]*lxyy[i]+Gzxz[i]*lxzy[i]
                  +Gxyz[i]*lxxx[i]+Gyyz[i]*lxyx[i]+Gzyz[i]*lxzx[i]
                  +Gxxx[i]*lyzx[i]+Gyxx[i]*lyzy[i]+Gzxx[i]*lyzz[i])
            +uyy*(Gxxy[i]*lxyy[i]+Gyxy[i]*lyyy[i]+Gzxy[i]*lyzy[i]
                  +Gxyy[i]*lxyx[i]+Gyyy[i]*lyyx[i]+Gzyy[i]*lyzx[i]
                  +Gxxy[i]*lyyx[i]+Gyxy[i]*lyyy[i]+Gzxy[i]*lyyz[i])
            +uyz*(Gxxy[i]*lxzy[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lzzy[i]
                  +Gxyy[i]*lxzx[i]+Gyyy[i]*lyzx[i]+Gzyy[i]*lzzx[i]
                  +Gxxz[i]*lyyx[i]+Gyxz[i]*lyyy[i]+Gzxz[i]*lyyz[i]
                  +Gxxz[i]*lxyy[i]+Gyxz[i]*lyyy[i]+Gzxz[i]*lyzy[i]
                  +Gxyz[i]*lxyx[i]+Gyyz[i]*lyyx[i]+Gzyz[i]*lyzx[i]
                  +Gxxy[i]*lyzx[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lyzz[i])
            +uzz*(Gxxz[i]*lxzy[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lzzy[i]
                  +Gxyz[i]*lxzx[i]+Gyyz[i]*lyzx[i]+Gzyz[i]*lzzx[i]
                  +Gxxz[i]*lyzx[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lyzz[i]);

        Rxz[i] = H*(
            -Rxz[i]
            +gxx[i]*Gamxz[i]+gxy[i]*Gamyz_d[i]+gxz[i]*Gamzz_d[i]
            +gxz[i]*Gamxx[i]+gyz[i]*Gamyx[i]+gzz[i]*Gamzx[i]
            +Gamxa[i]*lxzx[i]+Gamya[i]*lyzx[i]+Gamza[i]*lzzx[i]
            +Gamxa[i]*lxxz[i]+Gamya[i]*lxyz[i]+Gamza[i]*lxzz[i])
            +uxx*(Gxxx[i]*lxxz[i]+Gyxx[i]*lxyz[i]+Gzxx[i]*lxzz[i]
                  +Gxxz[i]*lxxx[i]+Gyxz[i]*lxyx[i]+Gzxz[i]*lxzx[i]
                  +Gxxx[i]*lxzx[i]+Gyxx[i]*lxzy[i]+Gzxx[i]*lxzz[i])
            +uxy*(Gxxx[i]*lxyz[i]+Gyxx[i]*lyyz[i]+Gzxx[i]*lyzz[i]
                  +Gxxz[i]*lxyx[i]+Gyxz[i]*lyyx[i]+Gzxz[i]*lyzx[i]
                  +Gxxy[i]*lxzx[i]+Gyxy[i]*lxzy[i]+Gzxy[i]*lxzz[i]
                  +Gxxy[i]*lxxz[i]+Gyxy[i]*lxyz[i]+Gzxy[i]*lxzz[i]
                  +Gxyz[i]*lxxx[i]+Gyyz[i]*lxyx[i]+Gzyz[i]*lxzx[i]
                  +Gxxx[i]*lyzx[i]+Gyxx[i]*lyzy[i]+Gzxx[i]*lyzz[i])
            +uxz*(Gxxx[i]*lxzz[i]+Gyxx[i]*lyzz[i]+Gzxx[i]*lzzz[i]
                  +Gxxz[i]*lxzx[i]+Gyxz[i]*lyzx[i]+Gzxz[i]*lzzx[i]
                  +Gxxz[i]*lxzx[i]+Gyxz[i]*lxzy[i]+Gzxz[i]*lxzz[i]
                  +Gxxz[i]*lxxz[i]+Gyxz[i]*lxyz[i]+Gzxz[i]*lxzz[i]
                  +Gxzz[i]*lxxx[i]+Gyzz[i]*lxyx[i]+Gzzz[i]*lxzx[i]
                  +Gxxx[i]*lzzx[i]+Gyxx[i]*lzzy[i]+Gzxx[i]*lzzz[i])
            +uyy*(Gxxy[i]*lxyz[i]+Gyxy[i]*lyyz[i]+Gzxy[i]*lyzz[i]
                  +Gxyz[i]*lxyx[i]+Gyyz[i]*lyyx[i]+Gzyz[i]*lyzx[i]
                  +Gxxy[i]*lyzx[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lyzz[i])
            +uyz*(Gxxy[i]*lxzz[i]+Gyxy[i]*lyzz[i]+Gzxy[i]*lzzz[i]
                  +Gxyz[i]*lxzx[i]+Gyyz[i]*lyzx[i]+Gzyz[i]*lzzx[i]
                  +Gxxz[i]*lyzx[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lyzz[i]
                  +Gxxz[i]*lxyz[i]+Gyxz[i]*lyyz[i]+Gzxz[i]*lyzz[i]
                  +Gxzz[i]*lxyx[i]+Gyzz[i]*lyyx[i]+Gzzz[i]*lyzx[i]
                  +Gxxy[i]*lzzx[i]+Gyxy[i]*lzzy[i]+Gzxy[i]*lzzz[i])
            +uzz*(Gxxz[i]*lxzz[i]+Gyxz[i]*lyzz[i]+Gzxz[i]*lzzz[i]
                  +Gxzz[i]*lxzx[i]+Gyzz[i]*lyzx[i]+Gzzz[i]*lzzx[i]
                  +Gxxz[i]*lzzx[i]+Gyxz[i]*lzzy[i]+Gzxz[i]*lzzz[i]);

        Ryz[i] = H*(
            -Ryz[i]
            +gxy[i]*Gamxz[i]+gyy[i]*Gamyz_d[i]+gyz[i]*Gamzz_d[i]
            +gxz[i]*Gamxy[i]+gyz[i]*Gamyy_d[i]+gzz[i]*Gamzy[i]
            +Gamxa[i]*lxzy[i]+Gamya[i]*lyzy[i]+Gamza[i]*lzzy[i]
            +Gamxa[i]*lxyz[i]+Gamya[i]*lyyz[i]+Gamza[i]*lyzz[i])
            +uxx*(Gxxy[i]*lxxz[i]+Gyxy[i]*lxyz[i]+Gzxy[i]*lxzz[i]
                  +Gxxz[i]*lxxy[i]+Gyxz[i]*lxyy[i]+Gzxz[i]*lxzy[i]
                  +Gxxy[i]*lxzx[i]+Gyxy[i]*lxzy[i]+Gzxy[i]*lxzz[i])
            +uxy*(Gxxy[i]*lxyz[i]+Gyxy[i]*lyyz[i]+Gzxy[i]*lyzz[i]
                  +Gxxz[i]*lxyy[i]+Gyxz[i]*lyyy[i]+Gzxz[i]*lyzy[i]
                  +Gxyy[i]*lxzx[i]+Gyyy[i]*lxzy[i]+Gzyy[i]*lxzz[i]
                  +Gxyy[i]*lxxz[i]+Gyyy[i]*lxyz[i]+Gzyy[i]*lxzz[i]
                  +Gxyz[i]*lxxy[i]+Gyyz[i]*lxyy[i]+Gzyz[i]*lxzy[i]
                  +Gxxy[i]*lyzx[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lyzz[i])
            +uxz*(Gxxy[i]*lxzz[i]+Gyxy[i]*lyzz[i]+Gzxy[i]*lzzz[i]
                  +Gxxz[i]*lxzy[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lzzy[i]
                  +Gxyz[i]*lxzx[i]+Gyyz[i]*lxzy[i]+Gzyz[i]*lxzz[i]
                  +Gxyz[i]*lxxz[i]+Gyyz[i]*lxyz[i]+Gzyz[i]*lxzz[i]
                  +Gxzz[i]*lxxy[i]+Gyzz[i]*lxyy[i]+Gzzz[i]*lxzy[i]
                  +Gxxy[i]*lzzx[i]+Gyxy[i]*lzzy[i]+Gzxy[i]*lzzz[i])
            +uyy*(Gxyy[i]*lxyz[i]+Gyyy[i]*lyyz[i]+Gzyy[i]*lyzz[i]
                  +Gxyz[i]*lxyy[i]+Gyyz[i]*lyyy[i]+Gzyz[i]*lyzy[i]
                  +Gxyy[i]*lyzx[i]+Gyyy[i]*lyzy[i]+Gzyy[i]*lyzz[i])
            +uyz*(Gxyy[i]*lxzz[i]+Gyyy[i]*lyzz[i]+Gzyy[i]*lzzz[i]
                  +Gxyz[i]*lxzy[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lzzy[i]
                  +Gxyz[i]*lyzx[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lyzz[i]
                  +Gxyz[i]*lxyz[i]+Gyyz[i]*lyyz[i]+Gzyz[i]*lyzz[i]
                  +Gxzz[i]*lxyy[i]+Gyzz[i]*lyyy[i]+Gzzz[i]*lyzy[i]
                  +Gxyy[i]*lzzx[i]+Gyyy[i]*lzzy[i]+Gzyy[i]*lzzz[i])
            +uzz*(Gxyz[i]*lxzz[i]+Gyyz[i]*lyzz[i]+Gzyz[i]*lzzz[i]
                  +Gxzz[i]*lxzy[i]+Gyzz[i]*lyzy[i]+Gzzz[i]*lzzy[i]
                  +Gxyz[i]*lzzx[i]+Gyyz[i]*lzzy[i]+Gzyz[i]*lzzz[i]);
    }
}

/* Phases 12+13 (fused): second derivatives of chi and the chi correction to
 * the Ricci tensor, done in a single pass with the derivatives kept in
 * per-thread locals (no fxx..fzz arrays are read or written here).
 *
 * One thread handles one grid point; the flat index is decomposed with x
 * varying fastest.  Per point:
 *   1. Form the six second derivatives of chi.  Points with i0 > nx-2,
 *      j0 > ny-2 or k0 > nz-2 are skipped and keep zero derivatives.  In the
 *      ghost_width == 3 build a +-2 stencil is tried first, then a +-1
 *      stencil; if neither fits inside [iminF..imaxF] x [jminF..jmaxF] x
 *      [kminF..kmaxF] the derivatives also stay zero.
 *   2. Subtract Christoffel * chi_deriv from each component.
 *   3. Build f_val = gup^ij * (D_ij - 1.5/chin1 * chi_i * chi_j) and add
 *      (D_ij - chi_i*chi_j/(2*chin1) + g_ij*f_val) / (2*chin1) to R_ij.
 *
 * Launch: 1-D grid.  There is no striding loop, so the launch must supply at
 * least d_gp.all threads; surplus threads exit immediately.
 * NOTE(review): fetch_sym_ord2_direct is always called with (1, 1, 1) as its
 * trailing arguments -- presumably symmetry factors; confirm at its definition.
 */
__global__ __launch_bounds__(128, 4)
void kern_phase12_13_chi_correction_fused(
    const double* __restrict__ chi,
    const double* __restrict__ chin1,
    const double* __restrict__ chix, const double* __restrict__ chiy,
    const double* __restrict__ chiz,
    const double* __restrict__ gxx, const double* __restrict__ gxy,
    const double* __restrict__ gxz, const double* __restrict__ gyy,
    const double* __restrict__ gyz, const double* __restrict__ gzz,
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
    const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
    const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
    const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
    const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
    const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
    const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
    const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
    const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
    double* __restrict__ Rxx, double* __restrict__ Rxy,
    double* __restrict__ Rxz, double* __restrict__ Ryy,
    double* __restrict__ Ryz, double* __restrict__ Rzz)
{
    const double TWO = 2.0;
    const double F3o2 = 1.5;

    /* One point per thread; threads beyond the point count have nothing to do. */
    const int p = blockIdx.x * blockDim.x + threadIdx.x;
    if (p >= d_gp.all) return;

    /* Flat index -> 0-based (ix, iy, iz), x fastest. */
    const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
    const int ix = p % nx;
    const int iy = (p / nx) % ny;
    const int iz = p / (nx * ny);

    /* Second derivatives of chi; remain zero wherever no stencil is applied. */
    double hxx = 0.0, hxy = 0.0, hxz = 0.0;
    double hyy = 0.0, hyz = 0.0, hzz = 0.0;

    if (ix <= nx - 2 && iy <= ny - 2 && iz <= nz - 2) {
        /* 1-based indices, as expected by the fetch / stencil helpers. */
        const int iF = ix + 1;
        const int jF = iy + 1;
        const int kF = iz + 1;
        const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF;
        const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;

#if ghost_width != 3
        fd_compute_second6(chi, iF, jF, kF,
                           iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
                           1, 1, 1,
                           hxx, hxy, hxz, hyy, hyz, hzz);
#else
        /* chi sampled at an integer offset from the current point. */
        const auto chi_at = [&](int di, int dj, int dk) {
            return fetch_sym_ord2_direct(chi, iF + di, jF + dj, kF + dk, 1, 1, 1);
        };
        /* True when a stencil of half-width r stays inside the index bounds. */
        const auto fits = [&](int r) {
            return (iF + r) <= imaxF && (iF - r) >= iminF &&
                   (jF + r) <= jmaxF && (jF - r) >= jminF &&
                   (kF + r) <= kmaxF && (kF - r) >= kminF;
        };

        if (fits(2)) {
            const double ctr = chi_at(0, 0, 0);

            /* Five-point (-1, 16, -30, 16, -1) sum along the unit direction (di, dj, dk). */
            const auto axis5 = [&](int di, int dj, int dk) {
                return -chi_at(-2 * di, -2 * dj, -2 * dk)
                       + 16.0 * chi_at(-di, -dj, -dk)
                       - 30.0 * ctr
                       + 16.0 * chi_at(di, dj, dk)
                       -        chi_at(2 * di, 2 * dj, 2 * dk);
            };
            /* (1, -8, 8, -1) difference along x on the row shifted by (dj, dk). */
            const auto row_dx = [&](int dj, int dk) {
                return chi_at(-2, dj, dk)
                       - 8.0 * chi_at(-1, dj, dk)
                       + 8.0 * chi_at( 1, dj, dk)
                       -       chi_at( 2, dj, dk);
            };
            /* (1, -8, 8, -1) difference along y on the row shifted by dk in z. */
            const auto row_dy = [&](int dk) {
                return chi_at(0, -2, dk)
                       - 8.0 * chi_at(0, -1, dk)
                       + 8.0 * chi_at(0,  1, dk)
                       -       chi_at(0,  2, dk);
            };

            hxx = d_gp.Fdxdx * axis5(1, 0, 0);
            hyy = d_gp.Fdydy * axis5(0, 1, 0);
            hzz = d_gp.Fdzdz * axis5(0, 0, 1);

            /* Mixed derivatives: the same (1, -8, 8, -1) weights applied across rows. */
            hxy = d_gp.Fdxdy * (row_dx(-2, 0) - 8.0 * row_dx(-1, 0)
                                + 8.0 * row_dx(1, 0) - row_dx(2, 0));
            hxz = d_gp.Fdxdz * (row_dx(0, -2) - 8.0 * row_dx(0, -1)
                                + 8.0 * row_dx(0, 1) - row_dx(0, 2));
            hyz = d_gp.Fdydz * (row_dy(-2) - 8.0 * row_dy(-1)
                                + 8.0 * row_dy(1) - row_dy(2));
        }
        else if (fits(1)) {
            const double ctr = chi_at(0, 0, 0);

            /* Three-point (1, -2, 1) sum along the unit direction (di, dj, dk). */
            const auto axis3 = [&](int di, int dj, int dk) {
                return chi_at(-di, -dj, -dk)
                       - 2.0 * ctr
                       + chi_at(di, dj, dk);
            };

            hxx = d_gp.Sdxdx * axis3(1, 0, 0);
            hyy = d_gp.Sdydy * axis3(0, 1, 0);
            hzz = d_gp.Sdzdz * axis3(0, 0, 1);

            /* Mixed derivatives from the four diagonal neighbours in each plane. */
            hxy = d_gp.Sdxdy * (chi_at(-1, -1, 0) - chi_at(1, -1, 0)
                                - chi_at(-1, 1, 0) + chi_at(1, 1, 0));
            hxz = d_gp.Sdxdz * (chi_at(-1, 0, -1) - chi_at(1, 0, -1)
                                - chi_at(-1, 0, 1) + chi_at(1, 0, 1));
            hyz = d_gp.Sdydz * (chi_at(0, -1, -1) - chi_at(0, 1, -1)
                                - chi_at(0, -1, 1) + chi_at(0, 1, 1));
        }
#endif
    }

    /* First derivatives of chi and the chin1 value at this point. */
    const double chi_x = chix[p];
    const double chi_y = chiy[p];
    const double chi_z = chiz[p];
    const double chi1 = chin1[p];

    /* Subtract Christoffel * chi_deriv. */
    hxx -= Gxxx[p] * chi_x + Gyxx[p] * chi_y + Gzxx[p] * chi_z;
    hxy -= Gxxy[p] * chi_x + Gyxy[p] * chi_y + Gzxy[p] * chi_z;
    hxz -= Gxxz[p] * chi_x + Gyxz[p] * chi_y + Gzxz[p] * chi_z;
    hyy -= Gxyy[p] * chi_x + Gyyy[p] * chi_y + Gzyy[p] * chi_z;
    hyz -= Gxyz[p] * chi_x + Gyyz[p] * chi_y + Gzyz[p] * chi_z;
    hzz -= Gxzz[p] * chi_x + Gyzz[p] * chi_y + Gzzz[p] * chi_z;

    const double uxx = gupxx[p], uxy = gupxy[p], uxz = gupxz[p];
    const double uyy = gupyy[p], uyz = gupyz[p], uzz = gupzz[p];

    /* Contraction with the inverse metric; off-diagonal terms count twice. */
    const double w = F3o2 / chi1;
    const double f_val = uxx * (hxx - w * chi_x * chi_x)
                       + uyy * (hyy - w * chi_y * chi_y)
                       + uzz * (hzz - w * chi_z * chi_z)
                       + TWO * uxy * (hxy - w * chi_x * chi_y)
                       + TWO * uxz * (hxz - w * chi_x * chi_z)
                       + TWO * uyz * (hyz - w * chi_y * chi_z);

    /* Accumulate the chi contribution into the Ricci components. */
    const double inv2c = 1.0 / (chi1 * TWO);
    Rxx[p] += (hxx - chi_x * chi_x * inv2c + gxx[p] * f_val) * inv2c;
    Ryy[p] += (hyy - chi_y * chi_y * inv2c + gyy[p] * f_val) * inv2c;
    Rzz[p] += (hzz - chi_z * chi_z * inv2c + gzz[p] * f_val) * inv2c;
    Rxy[p] += (hxy - chi_x * chi_y * inv2c + gxy[p] * f_val) * inv2c;
    Rxz[p] += (hxz - chi_x * chi_z * inv2c + gxz[p] * f_val) * inv2c;
    Ryz[p] += (hyz - chi_y * chi_z * inv2c + gyz[p] * f_val) * inv2c;
}

/* Phase 13 (unfused): chi correction to the Ricci tensor, using second
 * derivatives of chi that are already stored in fxx..fzz.
 * NOTE(review): the producer of fxx..fzz is not visible here -- presumably an
 * earlier derivative pass over chi; confirm against the launcher.
 *
 * For every point of a grid-stride loop over [0, d_gp.all):
 *   - f_ij is updated in place: f_ij -= Christoffel * chi_deriv;
 *   - f_val = gup^ij * (f_ij - 1.5/chin1 * chi_i * chi_j) is formed;
 *   - (f_ij - chi_i*chi_j/(2*chin1) + g_ij*f_val) / (2*chin1) is added to R_ij.
 *
 * Outputs: fxx..fzz (overwritten with the corrected values) and Rxx..Rzz
 * (accumulated into).  All other arrays are read-only.
 */
__global__ __launch_bounds__(128, 4)
void kern_phase13_chi_correction(
    const double* __restrict__ chin1,
    const double* __restrict__ chix, const double* __restrict__ chiy,
    const double* __restrict__ chiz,
    const double* __restrict__ gxx, const double* __restrict__ gxy,
    const double* __restrict__ gxz, const double* __restrict__ gyy,
    const double* __restrict__ gyz, const double* __restrict__ gzz,
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
    const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
    const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
    const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
    const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
    const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
    const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
    const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
    const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
    double* __restrict__ fxx, double* __restrict__ fxy,
    double* __restrict__ fxz, double* __restrict__ fyy,
    double* __restrict__ fyz, double* __restrict__ fzz,
    double* __restrict__ Rxx, double* __restrict__ Rxy,
    double* __restrict__ Rxz, double* __restrict__ Ryy,
    double* __restrict__ Ryz, double* __restrict__ Rzz)
{
    const double TWO = 2.0;
    const double F3o2 = 1.5;

    /* Total number of threads in the launch = distance between a thread's points. */
    const unsigned int stride = blockDim.x * gridDim.x;

    for (int p = blockIdx.x * blockDim.x + threadIdx.x; p < d_gp.all; p += stride) {
        const double chi_x = chix[p];
        const double chi_y = chiy[p];
        const double chi_z = chiz[p];
        const double chi1 = chin1[p];

        /* Subtract Christoffel * chi_deriv; keep the results in locals for reuse below. */
        const double hxx = fxx[p] - (Gxxx[p] * chi_x + Gyxx[p] * chi_y + Gzxx[p] * chi_z);
        const double hxy = fxy[p] - (Gxxy[p] * chi_x + Gyxy[p] * chi_y + Gzxy[p] * chi_z);
        const double hxz = fxz[p] - (Gxxz[p] * chi_x + Gyxz[p] * chi_y + Gzxz[p] * chi_z);
        const double hyy = fyy[p] - (Gxyy[p] * chi_x + Gyyy[p] * chi_y + Gzyy[p] * chi_z);
        const double hyz = fyz[p] - (Gxyz[p] * chi_x + Gyyz[p] * chi_y + Gzyz[p] * chi_z);
        const double hzz = fzz[p] - (Gxzz[p] * chi_x + Gyzz[p] * chi_y + Gzzz[p] * chi_z);

        /* The corrected second derivatives are also an output of this kernel. */
        fxx[p] = hxx;
        fxy[p] = hxy;
        fxz[p] = hxz;
        fyy[p] = hyy;
        fyz[p] = hyz;
        fzz[p] = hzz;

        const double uxx = gupxx[p], uxy = gupxy[p], uxz = gupxz[p];
        const double uyy = gupyy[p], uyz = gupyz[p], uzz = gupzz[p];

        /* Contraction with the inverse metric; off-diagonal terms count twice. */
        const double w = F3o2 / chi1;
        const double f_val = uxx * (hxx - w * chi_x * chi_x)
                           + uyy * (hyy - w * chi_y * chi_y)
                           + uzz * (hzz - w * chi_z * chi_z)
                           + TWO * uxy * (hxy - w * chi_x * chi_y)
                           + TWO * uxz * (hxz - w * chi_x * chi_z)
                           + TWO * uyz * (hyz - w * chi_y * chi_z);

        /* Accumulate the chi contribution into the Ricci components. */
        const double inv2c = 1.0 / (chi1 * TWO);
        Rxx[p] += (hxx - chi_x * chi_x * inv2c + gxx[p] * f_val) * inv2c;
        Ryy[p] += (hyy - chi_y * chi_y * inv2c + gyy[p] * f_val) * inv2c;
        Rzz[p] += (hzz - chi_z * chi_z * inv2c + gzz[p] * f_val) * inv2c;
        Rxy[p] += (hxy - chi_x * chi_y * inv2c + gxy[p] * f_val) * inv2c;
        Rxz[p] += (hxz - chi_x * chi_z * inv2c + gxz[p] * f_val) * inv2c;
        Ryz[p] += (hyz - chi_y * chi_z * inv2c + gyz[p] * f_val) * inv2c;
    }
}

/* Phase 15: trK_rhs, Aij_rhs, gauge.
 * Also updates Christoffel with physical chi correction and computes Lap second derivatives on the fly.
 */
__global__ __launch_bounds__(128, 4)
void kern_phase15_trK_Aij_gauge(
    const double* __restrict__ alpn1, const double* __restrict__ chin1,
    const double* __restrict__ chix, const double* __restrict__ chiy,
    const double* __restrict__ chiz,
    const double* __restrict__ gxx, const double* __restrict__ gxy,
    const double* __restrict__ gxz, const double* __restrict__ gyy,
    const double* __restrict__ gyz, const double* __restrict__ gzz,
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ trK,
    const double* __restrict__ Axx, const double* __restrict__ Axy,
    const double* __restrict__ Axz, const double* __restrict__ Ayy,
    const double* __restrict__ Ayz, const double* __restrict__ Azz,
    const double* __restrict__ Lapx, const double* __restrict__ Lapy,
    const double* __restrict__ Lapz,
    const double* __restrict__ betaxx, const double* __restrict__ betaxy,
    const double* __restrict__ betaxz, const double* __restrict__ betayx,
    const double* __restrict__ betayy, const double* __restrict__ betayz,
    const double* __restrict__ betazx, const double* __restrict__ betazy,
    const double* __restrict__ betazz,
    const double* __restrict__ rho,
    const double* __restrict__ Sx_m, const double* __restrict__ Sy_m,
    const double* __restrict__ Sz_m,
    const double* __restrict__ Sxx_m, const double* __restrict__ Sxy_m,
    const double* __restrict__ Sxz_m, const double* __restrict__ Syy_m,
    const double* __restrict__ Syz_m, const double* __restrict__ Szz_m,
    const double* __restrict__ dtSfx, const double* __restrict__ dtSfy,
    const double* __restrict__ dtSfz,
    const double* __restrict__ Rxx, const double* __restrict__ Rxy,
    const double* __restrict__ Rxz, const double* __restrict__ Ryy,
    const double* __restrict__ Ryz, const double* __restrict__ Rzz,
    double* __restrict__ Gxxx, double* __restrict__ Gxxy,
    double* __restrict__ Gxxz, double* __restrict__ Gxyy,
    double* __restrict__ Gxyz_o, double* __restrict__ Gxzz,
    double* __restrict__ Gyxx, double* __restrict__ Gyxy,
    double* __restrict__ Gyxz, double* __restrict__ Gyyy,
    double* __restrict__ Gyyz, double* __restrict__ Gyzz,
    double* __restrict__ Gzxx, double* __restrict__ Gzxy,
    double* __restrict__ Gzxz, double* __restrict__ Gzyy,
    double* __restrict__ Gzyz, double* __restrict__ Gzzz,
    double* __restrict__ dtSfx_rhs, double* __restrict__ dtSfy_rhs,
    double* __restrict__ dtSfz_rhs,
    double* __restrict__ trK_rhs,
    double* __restrict__ Axx_rhs, double* __restrict__ Axy_rhs,
    double* __restrict__ Axz_rhs, double* __restrict__ Ayy_rhs,
    double* __restrict__ Ayz_rhs, double* __restrict__ Azz_rhs,
    double* __restrict__ Lap_rhs,
    double* __restrict__ betax_rhs, double* __restrict__ betay_rhs,
    double* __restrict__ betaz_rhs,
    double* __restrict__ Gamx_rhs, double* __restrict__ Gamy_rhs,
    double* __restrict__ Gamz_rhs,
    double* __restrict__ f_arr, double* __restrict__ S_arr)
{
    const double TWO=2.0, FOUR=4.0, EIGHT=8.0, H=0.5;
    const double F1o3=1.0/3.0, F2o3=2.0/3.0, F3o2=1.5;
    const double PI_V=3.14159265358979323846;
    const double F16=16.0, F8=8.0;
    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all; i += blockDim.x*gridDim.x) {
        const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
        const int i0 = i % nx;
        const int j0 = (i / nx) % ny;
        const int k0 = i / (nx * ny);
        const int iF = i0 + 1;
        const int jF = j0 + 1;
        const int kF = k0 + 1;
        const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
        const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF;
        double uxx=gupxx[i],uxy=gupxy[i],uxz=gupxz[i];
        double uyy=gupyy[i],uyz=gupyz[i],uzz=gupzz[i];
        double a=alpn1[i], c1=chin1[i];
        double cx=chix[i],cy=chiy[i],cz=chiz[i];
        double lx=Lapx[i],ly=Lapy[i],lz=Lapz[i];
        double fxx_v = 0.0, fxy_v = 0.0, fxz_v = 0.0;
        double fyy_v = 0.0, fyz_v = 0.0, fzz_v = 0.0;

        if (!(i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2)) {
#if ghost_width != 3
            fd_compute_second6(alpn1, iF, jF, kF,
                               iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
                               1, 1, 1,
                               fxx_v, fxy_v, fxz_v, fyy_v, fyz_v, fzz_v);
#else
            if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
                (jF + 2) <= jmaxF && (jF - 2) >= jminF &&
                (kF + 2) <= kmaxF && (kF - 2) >= kminF)
            {
                const double c = fetch_sym_ord2_direct(alpn1, iF, jF, kF, 1, 1, 1);
                fxx_v = d_gp.Fdxdx * (
                    -fetch_sym_ord2_direct(alpn1, iF - 2, jF, kF, 1, 1, 1)
                    +16.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF, 1, 1, 1)
                    -30.0 * c
                    +16.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF, 1, 1, 1)
                    -       fetch_sym_ord2_direct(alpn1, iF + 2, jF, kF, 1, 1, 1));
                fyy_v = d_gp.Fdydy * (
                    -fetch_sym_ord2_direct(alpn1, iF, jF - 2, kF, 1, 1, 1)
                    +16.0 * fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF, 1, 1, 1)
                    -30.0 * c
                    +16.0 * fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF, 1, 1, 1)
                    -       fetch_sym_ord2_direct(alpn1, iF, jF + 2, kF, 1, 1, 1));
                fzz_v = d_gp.Fdzdz * (
                    -fetch_sym_ord2_direct(alpn1, iF, jF, kF - 2, 1, 1, 1)
                    +16.0 * fetch_sym_ord2_direct(alpn1, iF, jF, kF - 1, 1, 1, 1)
                    -30.0 * c
                    +16.0 * fetch_sym_ord2_direct(alpn1, iF, jF, kF + 1, 1, 1, 1)
                    -       fetch_sym_ord2_direct(alpn1, iF, jF, kF + 2, 1, 1, 1));

                const double t_jm2 =
                    fetch_sym_ord2_direct(alpn1, iF - 2, jF - 2, kF, 1, 1, 1)
                  - 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF - 2, kF, 1, 1, 1)
                  + 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF - 2, kF, 1, 1, 1)
                  -       fetch_sym_ord2_direct(alpn1, iF + 2, jF - 2, kF, 1, 1, 1);
                const double t_jm1 =
                    fetch_sym_ord2_direct(alpn1, iF - 2, jF - 1, kF, 1, 1, 1)
                  - 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF - 1, kF, 1, 1, 1)
                  + 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF - 1, kF, 1, 1, 1)
                  -       fetch_sym_ord2_direct(alpn1, iF + 2, jF - 1, kF, 1, 1, 1);
                const double t_jp1 =
                    fetch_sym_ord2_direct(alpn1, iF - 2, jF + 1, kF, 1, 1, 1)
                  - 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF + 1, kF, 1, 1, 1)
                  + 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF + 1, kF, 1, 1, 1)
                  -       fetch_sym_ord2_direct(alpn1, iF + 2, jF + 1, kF, 1, 1, 1);
                const double t_jp2 =
                    fetch_sym_ord2_direct(alpn1, iF - 2, jF + 2, kF, 1, 1, 1)
                  - 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF + 2, kF, 1, 1, 1)
                  + 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF + 2, kF, 1, 1, 1)
                  -       fetch_sym_ord2_direct(alpn1, iF + 2, jF + 2, kF, 1, 1, 1);
                fxy_v = d_gp.Fdxdy * (t_jm2 - 8.0 * t_jm1 + 8.0 * t_jp1 - t_jp2);

                const double t_km2_x =
                    fetch_sym_ord2_direct(alpn1, iF - 2, jF, kF - 2, 1, 1, 1)
                  - 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF - 2, 1, 1, 1)
                  + 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF - 2, 1, 1, 1)
                  -       fetch_sym_ord2_direct(alpn1, iF + 2, jF, kF - 2, 1, 1, 1);
                const double t_km1_x =
                    fetch_sym_ord2_direct(alpn1, iF - 2, jF, kF - 1, 1, 1, 1)
                  - 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF - 1, 1, 1, 1)
                  + 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF - 1, 1, 1, 1)
                  -       fetch_sym_ord2_direct(alpn1, iF + 2, jF, kF - 1, 1, 1, 1);
                const double t_kp1_x =
                    fetch_sym_ord2_direct(alpn1, iF - 2, jF, kF + 1, 1, 1, 1)
                  - 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF + 1, 1, 1, 1)
                  + 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF + 1, 1, 1, 1)
                  -       fetch_sym_ord2_direct(alpn1, iF + 2, jF, kF + 1, 1, 1, 1);
                const double t_kp2_x =
                    fetch_sym_ord2_direct(alpn1, iF - 2, jF, kF + 2, 1, 1, 1)
                  - 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF + 2, 1, 1, 1)
                  + 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF + 2, 1, 1, 1)
                  -       fetch_sym_ord2_direct(alpn1, iF + 2, jF, kF + 2, 1, 1, 1);
                fxz_v = d_gp.Fdxdz * (t_km2_x - 8.0 * t_km1_x + 8.0 * t_kp1_x - t_kp2_x);

                const double t_km2_y =
                    fetch_sym_ord2_direct(alpn1, iF, jF - 2, kF - 2, 1, 1, 1)
                  - 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF - 2, 1, 1, 1)
                  + 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF - 2, 1, 1, 1)
                  -       fetch_sym_ord2_direct(alpn1, iF, jF + 2, kF - 2, 1, 1, 1);
                const double t_km1_y =
                    fetch_sym_ord2_direct(alpn1, iF, jF - 2, kF - 1, 1, 1, 1)
                  - 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF - 1, 1, 1, 1)
                  + 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF - 1, 1, 1, 1)
                  -       fetch_sym_ord2_direct(alpn1, iF, jF + 2, kF - 1, 1, 1, 1);
                const double t_kp1_y =
                    fetch_sym_ord2_direct(alpn1, iF, jF - 2, kF + 1, 1, 1, 1)
                  - 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF + 1, 1, 1, 1)
                  + 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF + 1, 1, 1, 1)
                  -       fetch_sym_ord2_direct(alpn1, iF, jF + 2, kF + 1, 1, 1, 1);
                const double t_kp2_y =
                    fetch_sym_ord2_direct(alpn1, iF, jF - 2, kF + 2, 1, 1, 1)
                  - 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF + 2, 1, 1, 1)
                  + 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF + 2, 1, 1, 1)
                  -       fetch_sym_ord2_direct(alpn1, iF, jF + 2, kF + 2, 1, 1, 1);
                fyz_v = d_gp.Fdydz * (t_km2_y - 8.0 * t_km1_y + 8.0 * t_kp1_y - t_kp2_y);
            }
            else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
                     (jF + 1) <= jmaxF && (jF - 1) >= jminF &&
                     (kF + 1) <= kmaxF && (kF - 1) >= kminF)
            {
                const double c = fetch_sym_ord2_direct(alpn1, iF, jF, kF, 1, 1, 1);
                fxx_v = d_gp.Sdxdx * (
                    fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF, 1, 1, 1)
                  - 2.0 * c
                  + fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF, 1, 1, 1));
                fyy_v = d_gp.Sdydy * (
                    fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF, 1, 1, 1)
                  - 2.0 * c
                  + fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF, 1, 1, 1));
                fzz_v = d_gp.Sdzdz * (
                    fetch_sym_ord2_direct(alpn1, iF, jF, kF - 1, 1, 1, 1)
                  - 2.0 * c
                  + fetch_sym_ord2_direct(alpn1, iF, jF, kF + 1, 1, 1, 1));
                fxy_v = d_gp.Sdxdy * (
                    fetch_sym_ord2_direct(alpn1, iF - 1, jF - 1, kF, 1, 1, 1)
                  - fetch_sym_ord2_direct(alpn1, iF + 1, jF - 1, kF, 1, 1, 1)
                  - fetch_sym_ord2_direct(alpn1, iF - 1, jF + 1, kF, 1, 1, 1)
                  + fetch_sym_ord2_direct(alpn1, iF + 1, jF + 1, kF, 1, 1, 1));
                fxz_v = d_gp.Sdxdz * (
                    fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF - 1, 1, 1, 1)
                  - fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF - 1, 1, 1, 1)
                  - fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF + 1, 1, 1, 1)
                  + fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF + 1, 1, 1, 1));
                fyz_v = d_gp.Sdydz * (
                    fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF - 1, 1, 1, 1)
                  - fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF - 1, 1, 1, 1)
                  - fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF + 1, 1, 1, 1)
                  + fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF + 1, 1, 1, 1));
            }
#endif
        }

        /* raised chi/chi */
        double gx=(uxx*cx+uxy*cy+uxz*cz)/c1;
        double gy=(uxy*cx+uyy*cy+uyz*cz)/c1;
        double gz=(uxz*cx+uyz*cy+uzz*cz)/c1;

        /* Christoffel physical correction */
        Gxxx[i]-=((cx+cx)/c1-gxx[i]*gx)*H;
        Gyxx[i]-=(0.0-gxx[i]*gy)*H;
        Gzxx[i]-=(0.0-gxx[i]*gz)*H;
        Gxyy[i]-=(0.0-gyy[i]*gx)*H;
        Gyyy[i]-=((cy+cy)/c1-gyy[i]*gy)*H;
        Gzyy[i]-=(0.0-gyy[i]*gz)*H;
        Gxzz[i]-=(0.0-gzz[i]*gx)*H;
        Gyzz[i]-=(0.0-gzz[i]*gy)*H;
        Gzzz[i]-=((cz+cz)/c1-gzz[i]*gz)*H;
        Gxxy[i]-=(cy/c1-gxy[i]*gx)*H;
        Gyxy[i]-=(cx/c1-gxy[i]*gy)*H;
        Gzxy[i]-=(0.0-gxy[i]*gz)*H;
        Gxxz[i]-=(cz/c1-gxz[i]*gx)*H;
        Gyxz[i]-=(0.0-gxz[i]*gy)*H;
        Gzxz[i]-=(cx/c1-gxz[i]*gz)*H;
        Gxyz_o[i]-=(0.0-gyz[i]*gx)*H;
        Gyyz[i]-=(cz/c1-gyz[i]*gy)*H;
        Gzyz[i]-=(cy/c1-gyz[i]*gz)*H;

        /* Lap second-derivative correction: subtract Gamma*Lap_deriv */
        fxx_v -= Gxxx[i]*lx+Gyxx[i]*ly+Gzxx[i]*lz;
        fyy_v -= Gxyy[i]*lx+Gyyy[i]*ly+Gzyy[i]*lz;
        fzz_v -= Gxzz[i]*lx+Gyzz[i]*ly+Gzzz[i]*lz;
        fxy_v -= Gxxy[i]*lx+Gyxy[i]*ly+Gzxy[i]*lz;
        fxz_v -= Gxxz[i]*lx+Gyxz[i]*ly+Gzxz[i]*lz;
        fyz_v -= Gxyz_o[i]*lx+Gyyz[i]*ly+Gzyz[i]*lz;

        /* D^i D_i alpha */
        double DDA = uxx*fxx_v+uyy*fyy_v+uzz*fzz_v
                   +TWO*(uxy*fxy_v+uxz*fxz_v+uyz*fyz_v);

        /* trace of S_ij (physical) */
        double S_v = c1*(uxx*Sxx_m[i]+uyy*Syy_m[i]+uzz*Szz_m[i]
                    +TWO*(uxy*Sxy_m[i]+uxz*Sxz_m[i]+uyz*Syz_m[i]));

        /* A^ij A_ij */
        double AijAij =
            uxx*(uxx*Axx[i]*Axx[i]+uyy*Axy[i]*Axy[i]+uzz*Axz[i]*Axz[i]
                +TWO*(uxy*Axx[i]*Axy[i]+uxz*Axx[i]*Axz[i]+uyz*Axy[i]*Axz[i]))
           +uyy*(uxx*Axy[i]*Axy[i]+uyy*Ayy[i]*Ayy[i]+uzz*Ayz[i]*Ayz[i]
                +TWO*(uxy*Axy[i]*Ayy[i]+uxz*Axy[i]*Ayz[i]+uyz*Ayy[i]*Ayz[i]))
           +uzz*(uxx*Axz[i]*Axz[i]+uyy*Ayz[i]*Ayz[i]+uzz*Azz[i]*Azz[i]
                +TWO*(uxy*Axz[i]*Ayz[i]+uxz*Axz[i]*Azz[i]+uyz*Ayz[i]*Azz[i]))
           +TWO*(
                uxy*(uxx*Axx[i]*Axy[i]+uyy*Axy[i]*Ayy[i]+uzz*Axz[i]*Ayz[i]
                    +uxy*(Axx[i]*Ayy[i]+Axy[i]*Axy[i])
                    +uxz*(Axx[i]*Ayz[i]+Axz[i]*Axy[i])
                    +uyz*(Axy[i]*Ayz[i]+Axz[i]*Ayy[i]))
               +uxz*(uxx*Axx[i]*Axz[i]+uyy*Axy[i]*Ayz[i]+uzz*Axz[i]*Azz[i]
                    +uxy*(Axx[i]*Ayz[i]+Axy[i]*Axz[i])
                    +uxz*(Axx[i]*Azz[i]+Axz[i]*Axz[i])
                    +uyz*(Axy[i]*Azz[i]+Axz[i]*Ayz[i]))
               +uyz*(uxx*Axy[i]*Axz[i]+uyy*Ayy[i]*Ayz[i]+uzz*Ayz[i]*Azz[i]
                    +uxy*(Axy[i]*Ayz[i]+Ayy[i]*Axz[i])
                    +uxz*(Axy[i]*Azz[i]+Ayz[i]*Axz[i])
                    +uyz*(Ayy[i]*Azz[i]+Ayz[i]*Ayz[i])));

        double trK_v = trK[i];
        double db = betaxx[i] + betayy[i] + betazz[i];

        /* trK_rhs step 1: store D^iD_i alpha * chin1 */
        trK_rhs[i] = c1 * DDA;

        /* f_arr = -(1/3) * (DDA + alpha/chi * (2/3*K^2 - AijAij - 16pi*rho + 8pi*S)) */
        double f_v = F2o3*trK_v*trK_v - AijAij - F16*PI_V*rho[i] + EIGHT*PI_V*S_v;
        f_arr[i] = -F1o3*(uxx*fxx_v+uyy*fyy_v+uzz*fzz_v
                         +TWO*(uxy*fxy_v+uxz*fxz_v+uyz*fyz_v)
                         +(a/c1)*f_v);

        /* fij = alpha*(Rij - 8pi*Sij) - D_iD_j alpha */
        double fij_xx=a*(Rxx[i]-EIGHT*PI_V*Sxx_m[i])-fxx_v;
        double fij_xy=a*(Rxy[i]-EIGHT*PI_V*Sxy_m[i])-fxy_v;
        double fij_xz=a*(Rxz[i]-EIGHT*PI_V*Sxz_m[i])-fxz_v;
        double fij_yy=a*(Ryy[i]-EIGHT*PI_V*Syy_m[i])-fyy_v;
        double fij_yz=a*(Ryz[i]-EIGHT*PI_V*Syz_m[i])-fyz_v;
        double fij_zz=a*(Rzz[i]-EIGHT*PI_V*Szz_m[i])-fzz_v;

        /* Aij_rhs = chi*(fij - gij*f) */
        Axx_rhs[i]=fij_xx-gxx[i]*f_arr[i];
        Ayy_rhs[i]=fij_yy-gyy[i]*f_arr[i];
        Azz_rhs[i]=fij_zz-gzz[i]*f_arr[i];
        Axy_rhs[i]=fij_xy-gxy[i]*f_arr[i];
        Axz_rhs[i]=fij_xz-gxz[i]*f_arr[i];
        Ayz_rhs[i]=fij_yz-gyz[i]*f_arr[i];

        /* A_il A^l_j */
        double AA_xx=uxx*Axx[i]*Axx[i]+uyy*Axy[i]*Axy[i]+uzz*Axz[i]*Axz[i]
                    +TWO*(uxy*Axx[i]*Axy[i]+uxz*Axx[i]*Axz[i]+uyz*Axy[i]*Axz[i]);
        double AA_yy=uxx*Axy[i]*Axy[i]+uyy*Ayy[i]*Ayy[i]+uzz*Ayz[i]*Ayz[i]
                    +TWO*(uxy*Axy[i]*Ayy[i]+uxz*Axy[i]*Ayz[i]+uyz*Ayy[i]*Ayz[i]);
        double AA_zz=uxx*Axz[i]*Axz[i]+uyy*Ayz[i]*Ayz[i]+uzz*Azz[i]*Azz[i]
                    +TWO*(uxy*Axz[i]*Ayz[i]+uxz*Axz[i]*Azz[i]+uyz*Ayz[i]*Azz[i]);
        double AA_xy=uxx*Axx[i]*Axy[i]+uyy*Axy[i]*Ayy[i]+uzz*Axz[i]*Ayz[i]
                    +uxy*(Axx[i]*Ayy[i]+Axy[i]*Axy[i])
                    +uxz*(Axx[i]*Ayz[i]+Axz[i]*Axy[i])
                    +uyz*(Axy[i]*Ayz[i]+Axz[i]*Ayy[i]);
        double AA_xz=uxx*Axx[i]*Axz[i]+uyy*Axy[i]*Ayz[i]+uzz*Axz[i]*Azz[i]
                    +uxy*(Axx[i]*Ayz[i]+Axy[i]*Axz[i])
                    +uxz*(Axx[i]*Azz[i]+Axz[i]*Axz[i])
                    +uyz*(Axy[i]*Azz[i]+Axz[i]*Ayz[i]);
        double AA_yz=uxx*Axy[i]*Axz[i]+uyy*Ayy[i]*Ayz[i]+uzz*Ayz[i]*Azz[i]
                    +uxy*(Axy[i]*Ayz[i]+Ayy[i]*Axz[i])
                    +uxz*(Axy[i]*Azz[i]+Ayz[i]*Axz[i])
                    +uyz*(Ayy[i]*Azz[i]+Ayz[i]*Ayz[i]);

        /* trK_rhs final */
        trK_rhs[i] = -trK_rhs[i]
            + a*(F1o3*trK_v*trK_v
                +uxx*AA_xx+uyy*AA_yy+uzz*AA_zz
                +TWO*(uxy*AA_xy+uxz*AA_xz+uyz*AA_yz)
                +FOUR*PI_V*(rho[i]+S_v));

        /* Aij_rhs final */
        Axx_rhs[i]=c1*Axx_rhs[i]+a*(trK_v*Axx[i]-TWO*AA_xx)
            +TWO*(Axx[i]*betaxx[i]+Axy[i]*betayx[i]+Axz[i]*betazx[i])-F2o3*Axx[i]*db;
        Ayy_rhs[i]=c1*Ayy_rhs[i]+a*(trK_v*Ayy[i]-TWO*AA_yy)
            +TWO*(Axy[i]*betaxy[i]+Ayy[i]*betayy[i]+Ayz[i]*betazy[i])-F2o3*Ayy[i]*db;
        Azz_rhs[i]=c1*Azz_rhs[i]+a*(trK_v*Azz[i]-TWO*AA_zz)
            +TWO*(Axz[i]*betaxz[i]+Ayz[i]*betayz[i]+Azz[i]*betazz[i])-F2o3*Azz[i]*db;
        Axy_rhs[i]=c1*Axy_rhs[i]+a*(trK_v*Axy[i]-TWO*AA_xy)
            +Axx[i]*betaxy[i]+Axz[i]*betazy[i]+Ayy[i]*betayx[i]
            +Ayz[i]*betazx[i]+F1o3*Axy[i]*db-Axy[i]*betazz[i];
        Ayz_rhs[i]=c1*Ayz_rhs[i]+a*(trK_v*Ayz[i]-TWO*AA_yz)
            +Axy[i]*betaxz[i]+Ayy[i]*betayz[i]+Axz[i]*betaxy[i]
            +Azz[i]*betazy[i]+F1o3*Ayz[i]*db-Ayz[i]*betaxx[i];
        Axz_rhs[i]=c1*Axz_rhs[i]+a*(trK_v*Axz[i]-TWO*AA_xz)
            +Axx[i]*betaxz[i]+Axy[i]*betayz[i]+Ayz[i]*betayx[i]
            +Azz[i]*betazx[i]+F1o3*Axz[i]*db-Axz[i]*betayy[i];

        /* gauge */
        Lap_rhs[i] = -TWO*a*trK_v;
        betax_rhs[i] = 0.75*dtSfx[i];
        betay_rhs[i] = 0.75*dtSfy[i];
        betaz_rhs[i] = 0.75*dtSfz[i];
#if (GAUGE == 0)
        dtSfx_rhs[i] = Gamx_rhs[i] - 2.0*dtSfx[i];
        dtSfy_rhs[i] = Gamy_rhs[i] - 2.0*dtSfy[i];
        dtSfz_rhs[i] = Gamz_rhs[i] - 2.0*dtSfz[i];
#endif
    }
}

/*
 * Phase 18: Hamiltonian & momentum constraints (co==0 only)
 *
 * For every point index i in [0, d_gp.all) this kernel writes four residuals:
 *   ham_Res  = chin1 * (gup^ab R_ab) + (2/3) trK^2 - A^ab A_ab - 16 pi rho
 *   mov?_Res = gup^ab * m_a(A_b?) - (2/3) K? - 8 pi S?_m      (? = x, y, z)
 * where m_a(A_bc) = dAbc_a - (Christoffel terms) - chi_a * A_bc / chin1.
 *
 * Inputs are read-only per-point arrays; the only arrays written are
 * ham_Res, movx_Res, movy_Res and movz_Res.  Each loop iteration touches
 * index i only, so no shared memory or synchronisation is involved.
 *
 * Launch: 1-D grid-stride loop over blockIdx.x / threadIdx.x, so any 1-D
 * grid size is valid.  __launch_bounds__(128, 4) caps the block size at
 * 128 threads and asks the compiler to allow at least 4 resident blocks
 * per multiprocessor.
 *
 * NOTE(review): d_gp is declared outside this chunk; presumably it is the
 * device copy of the GridParams record built by setup_grid_params() --
 * confirm.  Kx/Ky/Kz are presumably the spatial derivatives of trK and
 * G{u}{ab} the Christoffel symbols with upper index u -- confirm against
 * the caller.
 */
__global__ __launch_bounds__(128, 4)
void kern_phase18_constraints(
    const double* __restrict__ chin1,
    const double* __restrict__ chix, const double* __restrict__ chiy,
    const double* __restrict__ chiz,
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ trK,
    const double* __restrict__ Axx, const double* __restrict__ Axy,
    const double* __restrict__ Axz, const double* __restrict__ Ayy,
    const double* __restrict__ Ayz, const double* __restrict__ Azz,
    const double* __restrict__ Rxx, const double* __restrict__ Rxy,
    const double* __restrict__ Rxz, const double* __restrict__ Ryy,
    const double* __restrict__ Ryz, const double* __restrict__ Rzz,
    const double* __restrict__ rho,
    const double* __restrict__ Sx_m, const double* __restrict__ Sy_m,
    const double* __restrict__ Sz_m,
    const double* __restrict__ Kx, const double* __restrict__ Ky,
    const double* __restrict__ Kz,
    const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
    const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
    const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
    const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
    const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
    const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
    const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
    const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
    const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
    /* dA/dx arrays (fderivs of Aij): dAab_c holds the c-derivative of A_ab */
    const double* __restrict__ dAxx_x, const double* __restrict__ dAxx_y,
    const double* __restrict__ dAxx_z,
    const double* __restrict__ dAxy_x, const double* __restrict__ dAxy_y,
    const double* __restrict__ dAxy_z,
    const double* __restrict__ dAxz_x, const double* __restrict__ dAxz_y,
    const double* __restrict__ dAxz_z,
    const double* __restrict__ dAyy_x, const double* __restrict__ dAyy_y,
    const double* __restrict__ dAyy_z,
    const double* __restrict__ dAyz_x, const double* __restrict__ dAyz_y,
    const double* __restrict__ dAyz_z,
    const double* __restrict__ dAzz_x, const double* __restrict__ dAzz_y,
    const double* __restrict__ dAzz_z,
    double* __restrict__ ham_Res,
    double* __restrict__ movx_Res, double* __restrict__ movy_Res,
    double* __restrict__ movz_Res)
{
    const double TWO=2.0, F2o3=2.0/3.0, F8=8.0, F16=16.0;
    const double PI_V=3.14159265358979323846;
    /* Grid-stride loop: each thread handles i, i + (threads in grid), ... */
    for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all;
         i += blockDim.x*gridDim.x)
    {
        /* Cache the six independent inverse-metric components and chin1. */
        double uxx=gupxx[i],uxy=gupxy[i],uxz=gupxz[i];
        double uyy=gupyy[i],uyz=gupyz[i],uzz=gupzz[i];
        double c1=chin1[i];

        /* Hamiltonian constraint */
        /* R_sc = gup^ab R_ab; off-diagonal terms counted twice by symmetry. */
        double R_sc = uxx*Rxx[i]+uyy*Ryy[i]+uzz*Rzz[i]
                    +TWO*(uxy*Rxy[i]+uxz*Rxz[i]+uyz*Ryz[i]);
        /* AijAij (same as in phase15): full double contraction of A with gup */
        double AijAij =
            uxx*(uxx*Axx[i]*Axx[i]+uyy*Axy[i]*Axy[i]+uzz*Axz[i]*Axz[i]
                +TWO*(uxy*Axx[i]*Axy[i]+uxz*Axx[i]*Axz[i]+uyz*Axy[i]*Axz[i]))
           +uyy*(uxx*Axy[i]*Axy[i]+uyy*Ayy[i]*Ayy[i]+uzz*Ayz[i]*Ayz[i]
                +TWO*(uxy*Axy[i]*Ayy[i]+uxz*Axy[i]*Ayz[i]+uyz*Ayy[i]*Ayz[i]))
           +uzz*(uxx*Axz[i]*Axz[i]+uyy*Ayz[i]*Ayz[i]+uzz*Azz[i]*Azz[i]
                +TWO*(uxy*Axz[i]*Ayz[i]+uxz*Axz[i]*Azz[i]+uyz*Ayz[i]*Azz[i]))
           +TWO*(uxy*(uxx*Axx[i]*Axy[i]+uyy*Axy[i]*Ayy[i]+uzz*Axz[i]*Ayz[i]
                      +uxy*(Axx[i]*Ayy[i]+Axy[i]*Axy[i])
                      +uxz*(Axx[i]*Ayz[i]+Axz[i]*Axy[i])
                      +uyz*(Axy[i]*Ayz[i]+Axz[i]*Ayy[i]))
                 +uxz*(uxx*Axx[i]*Axz[i]+uyy*Axy[i]*Ayz[i]+uzz*Axz[i]*Azz[i]
                      +uxy*(Axx[i]*Ayz[i]+Axy[i]*Axz[i])
                      +uxz*(Axx[i]*Azz[i]+Axz[i]*Axz[i])
                      +uyz*(Axy[i]*Azz[i]+Axz[i]*Ayz[i]))
                 +uyz*(uxx*Axy[i]*Axz[i]+uyy*Ayy[i]*Ayz[i]+uzz*Ayz[i]*Azz[i]
                      +uxy*(Axy[i]*Ayz[i]+Ayy[i]*Axz[i])
                      +uxz*(Axy[i]*Azz[i]+Ayz[i]*Axz[i])
                      +uyz*(Ayy[i]*Azz[i]+Ayz[i]*Ayz[i])));

        /* The Ricci contraction is scaled by chin1 before entering the residual. */
        ham_Res[i] = c1*R_sc + F2o3*trK[i]*trK[i] - AijAij - F16*PI_V*rho[i];

        /* Momentum constraints: need covariant derivative of A */
        double cx=chix[i],cy=chiy[i],cz=chiz[i];
        /* D_j A^j_x etc: subtract Christoffel and chi terms */
        /* Pattern for m{c}_{ab}: dAab_c - (G^k_{ca} A_kb + G^k_{cb} A_ka) - chi_c*Aab/chin1,
           e.g. mx_xx = dAxx_x - 2*(Gxxx*Axx+Gyxx*Axy+Gzxx*Axz) - chix*Axx/chin1 */
        /* x-direction derivatives of the six A components */
        double mx_xx = dAxx_x[i]-(Gxxx[i]*Axx[i]+Gyxx[i]*Axy[i]+Gzxx[i]*Axz[i]
                                  +Gxxx[i]*Axx[i]+Gyxx[i]*Axy[i]+Gzxx[i]*Axz[i])-cx*Axx[i]/c1;
        double mx_xy = dAxy_x[i]-(Gxxy[i]*Axx[i]+Gyxy[i]*Axy[i]+Gzxy[i]*Axz[i]
                                  +Gxxx[i]*Axy[i]+Gyxx[i]*Ayy[i]+Gzxx[i]*Ayz[i])-cx*Axy[i]/c1;
        double mx_xz = dAxz_x[i]-(Gxxz[i]*Axx[i]+Gyxz[i]*Axy[i]+Gzxz[i]*Axz[i]
                                  +Gxxx[i]*Axz[i]+Gyxx[i]*Ayz[i]+Gzxx[i]*Azz[i])-cx*Axz[i]/c1;
        double mx_yy = dAyy_x[i]-(Gxxy[i]*Axy[i]+Gyxy[i]*Ayy[i]+Gzxy[i]*Ayz[i]
                                  +Gxxy[i]*Axy[i]+Gyxy[i]*Ayy[i]+Gzxy[i]*Ayz[i])-cx*Ayy[i]/c1;
        double mx_yz = dAyz_x[i]-(Gxxz[i]*Axy[i]+Gyxz[i]*Ayy[i]+Gzxz[i]*Ayz[i]
                                  +Gxxy[i]*Axz[i]+Gyxy[i]*Ayz[i]+Gzxy[i]*Azz[i])-cx*Ayz[i]/c1;
        double mx_zz = dAzz_x[i]-(Gxxz[i]*Axz[i]+Gyxz[i]*Ayz[i]+Gzxz[i]*Azz[i]
                                  +Gxxz[i]*Axz[i]+Gyxz[i]*Ayz[i]+Gzxz[i]*Azz[i])-cx*Azz[i]/c1;

        /* y-direction derivatives */
        double my_xx = dAxx_y[i]-(Gxxy[i]*Axx[i]+Gyxy[i]*Axy[i]+Gzxy[i]*Axz[i]
                                  +Gxxy[i]*Axx[i]+Gyxy[i]*Axy[i]+Gzxy[i]*Axz[i])-cy*Axx[i]/c1;
        double my_xy = dAxy_y[i]-(Gxyy[i]*Axx[i]+Gyyy[i]*Axy[i]+Gzyy[i]*Axz[i]
                                  +Gxxy[i]*Axy[i]+Gyxy[i]*Ayy[i]+Gzxy[i]*Ayz[i])-cy*Axy[i]/c1;
        double my_xz = dAxz_y[i]-(Gxyz[i]*Axx[i]+Gyyz[i]*Axy[i]+Gzyz[i]*Axz[i]
                                  +Gxxy[i]*Axz[i]+Gyxy[i]*Ayz[i]+Gzxy[i]*Azz[i])-cy*Axz[i]/c1;
        double my_yy = dAyy_y[i]-(Gxyy[i]*Axy[i]+Gyyy[i]*Ayy[i]+Gzyy[i]*Ayz[i]
                                  +Gxyy[i]*Axy[i]+Gyyy[i]*Ayy[i]+Gzyy[i]*Ayz[i])-cy*Ayy[i]/c1;
        double my_yz = dAyz_y[i]-(Gxyz[i]*Axy[i]+Gyyz[i]*Ayy[i]+Gzyz[i]*Ayz[i]
                                  +Gxyy[i]*Axz[i]+Gyyy[i]*Ayz[i]+Gzyy[i]*Azz[i])-cy*Ayz[i]/c1;
        double my_zz = dAzz_y[i]-(Gxyz[i]*Axz[i]+Gyyz[i]*Ayz[i]+Gzyz[i]*Azz[i]
                                  +Gxyz[i]*Axz[i]+Gyyz[i]*Ayz[i]+Gzyz[i]*Azz[i])-cy*Azz[i]/c1;

        /* z-direction derivatives */
        double mz_xx = dAxx_z[i]-(Gxxz[i]*Axx[i]+Gyxz[i]*Axy[i]+Gzxz[i]*Axz[i]
                                  +Gxxz[i]*Axx[i]+Gyxz[i]*Axy[i]+Gzxz[i]*Axz[i])-cz*Axx[i]/c1;
        double mz_xy = dAxy_z[i]-(Gxyz[i]*Axx[i]+Gyyz[i]*Axy[i]+Gzyz[i]*Axz[i]
                                  +Gxxz[i]*Axy[i]+Gyxz[i]*Ayy[i]+Gzxz[i]*Ayz[i])-cz*Axy[i]/c1;
        double mz_xz = dAxz_z[i]-(Gxzz[i]*Axx[i]+Gyzz[i]*Axy[i]+Gzzz[i]*Axz[i]
                                  +Gxxz[i]*Axz[i]+Gyxz[i]*Ayz[i]+Gzxz[i]*Azz[i])-cz*Axz[i]/c1;
        double mz_yy = dAyy_z[i]-(Gxyz[i]*Axy[i]+Gyyz[i]*Ayy[i]+Gzyz[i]*Ayz[i]
                                  +Gxyz[i]*Axy[i]+Gyyz[i]*Ayy[i]+Gzyz[i]*Ayz[i])-cz*Ayy[i]/c1;
        double mz_yz = dAyz_z[i]-(Gxzz[i]*Axy[i]+Gyzz[i]*Ayy[i]+Gzzz[i]*Ayz[i]
                                  +Gxyz[i]*Axz[i]+Gyyz[i]*Ayz[i]+Gzyz[i]*Azz[i])-cz*Ayz[i]/c1;
        double mz_zz = dAzz_z[i]-(Gxzz[i]*Axz[i]+Gyzz[i]*Ayz[i]+Gzzz[i]*Azz[i]
                                  +Gxzz[i]*Axz[i]+Gyzz[i]*Ayz[i]+Gzzz[i]*Azz[i])-cz*Azz[i]/c1;

        /* Contract with gup over the derivative index and one tensor index
           (all nine index pairs written out), then subtract the K and
           matter-source terms. */
        movx_Res[i] = uxx*mx_xx+uyy*my_xy+uzz*mz_xz
                     +uxy*mx_xy+uxz*mx_xz+uyz*my_xz
                     +uxy*my_xx+uxz*mz_xx+uyz*mz_xy
                     - F2o3*Kx[i] - F8*PI_V*Sx_m[i];
        movy_Res[i] = uxx*mx_xy+uyy*my_yy+uzz*mz_yz
                     +uxy*mx_yy+uxz*mx_yz+uyz*my_yz
                     +uxy*my_xy+uxz*mz_xy+uyz*mz_yy
                     - F2o3*Ky[i] - F8*PI_V*Sy_m[i];
        movz_Res[i] = uxx*mx_xz+uyy*my_yz+uzz*mz_zz
                     +uxy*mx_yz+uxz*mx_zz+uyz*my_zz
                     +uxy*my_xz+uxz*mz_xz+uyz*mz_yz
                     - F2o3*Kz[i] - F8*PI_V*Sz_m[i];
    }
}

/*
 * Per-point constraint residual for the scalar field:
 *   RR      = A^ab A_ab - (2/3) trK^2 - chin1 * (gup^ab R_ab)
 *             - 8 pi (3 rho - chin1 * gup^ab S_ab)
 *   fprim   = 1 + 2 * a2 * RR
 *   Cons_fR = Sphi - (1/4) sqrt(3/pi) * log(fprim)
 *
 * All array arguments except Cons_fR are read-only; a2 is a scalar passed
 * by value.  Each iteration reads and writes index i only.
 *
 * Launch: 1-D grid-stride loop bounded by d_gp.all, so any 1-D grid size
 * is valid; __launch_bounds__(128, 4) caps the block size at 128 threads.
 *
 * There is no guard on fprim: a non-positive value goes straight into
 * log() and the result is stored as-is.
 * NOTE(review): d_gp is declared outside this chunk -- presumably the
 * device-side GridParams; confirm.
 */
__global__ __launch_bounds__(128, 4)
void kern_escalar_constraint_fr(
    const double* __restrict__ chin1,
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ trK,
    const double* __restrict__ Axx, const double* __restrict__ Axy,
    const double* __restrict__ Axz, const double* __restrict__ Ayy,
    const double* __restrict__ Ayz, const double* __restrict__ Azz,
    const double* __restrict__ Rxx, const double* __restrict__ Rxy,
    const double* __restrict__ Rxz, const double* __restrict__ Ryy,
    const double* __restrict__ Ryz, const double* __restrict__ Rzz,
    const double* __restrict__ rho,
    const double* __restrict__ Sxx, const double* __restrict__ Sxy,
    const double* __restrict__ Sxz, const double* __restrict__ Syy,
    const double* __restrict__ Syz, const double* __restrict__ Szz,
    const double* __restrict__ Sphi,
    double a2,
    double* __restrict__ Cons_fR)
{
    const double TWO = 2.0;
    const double F2o3 = 2.0 / 3.0;
    const double F8 = 8.0;
    const double PI_V = 3.14159265358979323846;
    /* Prefactor of the logarithm: (1/4) * sqrt(3 / pi). */
    const double SQRT3OPI_OVER4 = 0.25 * sqrt(3.0 / PI_V);

    /* Grid-stride loop over all points of the patch. */
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < d_gp.all;
         i += blockDim.x * gridDim.x)
    {
        const double uxx = gupxx[i], uxy = gupxy[i], uxz = gupxz[i];
        const double uyy = gupyy[i], uyz = gupyz[i], uzz = gupzz[i];
        const double c1 = chin1[i];

        /* Full double contraction of A with the inverse metric. */
        const double AijAij =
            uxx * (uxx * Axx[i] * Axx[i] + uyy * Axy[i] * Axy[i] + uzz * Axz[i] * Axz[i]
                 + TWO * (uxy * Axx[i] * Axy[i] + uxz * Axx[i] * Axz[i] + uyz * Axy[i] * Axz[i]))
          + uyy * (uxx * Axy[i] * Axy[i] + uyy * Ayy[i] * Ayy[i] + uzz * Ayz[i] * Ayz[i]
                 + TWO * (uxy * Axy[i] * Ayy[i] + uxz * Axy[i] * Ayz[i] + uyz * Ayy[i] * Ayz[i]))
          + uzz * (uxx * Axz[i] * Axz[i] + uyy * Ayz[i] * Ayz[i] + uzz * Azz[i] * Azz[i]
                 + TWO * (uxy * Axz[i] * Ayz[i] + uxz * Axz[i] * Azz[i] + uyz * Ayz[i] * Azz[i]))
          + TWO * (uxy * (uxx * Axx[i] * Axy[i] + uyy * Axy[i] * Ayy[i] + uzz * Axz[i] * Ayz[i]
                         + uxy * (Axx[i] * Ayy[i] + Axy[i] * Axy[i])
                         + uxz * (Axx[i] * Ayz[i] + Axz[i] * Axy[i])
                         + uyz * (Axy[i] * Ayz[i] + Axz[i] * Ayy[i]))
                 + uxz * (uxx * Axx[i] * Axz[i] + uyy * Axy[i] * Ayz[i] + uzz * Axz[i] * Azz[i]
                         + uxy * (Axx[i] * Ayz[i] + Axy[i] * Axz[i])
                         + uxz * (Axx[i] * Azz[i] + Axz[i] * Axz[i])
                         + uyz * (Axy[i] * Azz[i] + Axz[i] * Ayz[i]))
                 + uyz * (uxx * Axy[i] * Axz[i] + uyy * Ayy[i] * Ayz[i] + uzz * Ayz[i] * Azz[i]
                         + uxy * (Axy[i] * Ayz[i] + Ayy[i] * Axz[i])
                         + uxz * (Axy[i] * Azz[i] + Ayz[i] * Axz[i])
                         + uyz * (Ayy[i] * Azz[i] + Ayz[i] * Ayz[i])));

        /* Traces of R_ab and S_ab with gup; both are multiplied by chin1 below. */
        const double R_sc = uxx * Rxx[i] + uyy * Ryy[i] + uzz * Rzz[i]
                          + TWO * (uxy * Rxy[i] + uxz * Rxz[i] + uyz * Ryz[i]);
        const double trS = uxx * Sxx[i] + uyy * Syy[i] + uzz * Szz[i]
                         + TWO * (uxy * Sxy[i] + uxz * Sxz[i] + uyz * Syz[i]);
        const double RR = AijAij - F2o3 * trK[i] * trK[i]
                        - R_sc * c1
                        - F8 * PI_V * (3.0 * rho[i] - trS * c1);
        const double fprim = 1.0 + 2.0 * a2 * RR;
        /* Residual: stored field minus the value implied by fprim. */
        Cons_fR[i] = Sphi[i] - SQRT3OPI_OVER4 * log(fprim);
    }
}

/*
 * Build the GridParams record for one patch and pass it to
 * upload_grid_params_if_needed().
 *
 *   ex        patch extents; ex[0..2] are read as nx, ny, nz
 *   X, Y, Z   coordinate arrays; only elements [0] and [1] are read here
 *             (spacing and position of the first point)
 *   Symmetry  values > 0 enable the lowered z index bound, values > 1
 *             additionally enable the lowered x and y bounds
 *   eps, co   copied into the record unchanged
 *
 * ensure_gpu_buffers(nx, ny, nz) is called before the record is filled.
 */
static void setup_grid_params(int *ex,
                              double *X, double *Y, double *Z,
                              int Symmetry, double eps, int co)
{
    const int kNoSymm = 0;
    const int kEqSymm = 1;

    const int n_i = ex[0];
    const int n_j = ex[1];
    const int n_k = ex[2];
    const double hx = X[1] - X[0];
    const double hy = Y[1] - Y[0];
    const double hz = Z[1] - Z[0];

    ensure_gpu_buffers(n_i, n_j, n_k);

    /* A direction gets a lowered index bound when the symmetry mode covers
       it and the first coordinate lies within one spacing of zero. */
    const bool low_z = (Symmetry > kNoSymm) && (fabs(Z[0]) < hz);
    const bool low_x = (Symmetry > kEqSymm) && (fabs(X[0]) < hx);
    const bool low_y = (Symmetry > kEqSymm) && (fabs(Y[0]) < hy);

    GridParams gp = {};

    /* Extents and point count. */
    gp.ex[0] = n_i;
    gp.ex[1] = n_j;
    gp.ex[2] = n_k;
    gp.all = n_i * n_j * n_k;

    /* Spacings. */
    gp.dX = hx;
    gp.dY = hy;
    gp.dZ = hz;

    /* First-derivative scale factors. */
    gp.d12dx = 1.0 / (12.0 * hx);
    gp.d12dy = 1.0 / (12.0 * hy);
    gp.d12dz = 1.0 / (12.0 * hz);
    gp.d2dx = 1.0 / (2.0 * hx);
    gp.d2dy = 1.0 / (2.0 * hy);
    gp.d2dz = 1.0 / (2.0 * hz);

    /* Pure second-derivative scale factors (F* and S* variants). */
    gp.Fdxdx = 1.0 / (12.0 * hx * hx);
    gp.Fdydy = 1.0 / (12.0 * hy * hy);
    gp.Fdzdz = 1.0 / (12.0 * hz * hz);
    gp.Sdxdx = 1.0 / (hx * hx);
    gp.Sdydy = 1.0 / (hy * hy);
    gp.Sdzdz = 1.0 / (hz * hz);

    /* Mixed second-derivative scale factors. */
    gp.Fdxdy = 1.0 / (144.0 * hx * hy);
    gp.Fdxdz = 1.0 / (144.0 * hx * hz);
    gp.Fdydz = 1.0 / (144.0 * hy * hz);
    gp.Sdxdy = 0.25 / (hx * hy);
    gp.Sdxdz = 0.25 / (hx * hz);
    gp.Sdydz = 0.25 / (hy * hz);

    /* Index bounds (1-based); the lower bound drops below 1 on flagged sides. */
    gp.iminF = low_x ? (2 - ghost_width) : 1;
    gp.jminF = low_y ? (2 - ghost_width) : 1;
    gp.kminF = low_z ? (2 - ghost_width) : 1;
    gp.imaxF = n_i;
    gp.jmaxF = n_j;
    gp.kmaxF = n_k;
    gp.iminF3 = low_x ? (1 - ghost_width) : 1;
    gp.jminF3 = low_y ? (1 - ghost_width) : 1;
    gp.kminF3 = low_z ? (1 - ghost_width) : 1;

    /* Scalars forwarded unchanged. */
    gp.Symmetry = Symmetry;
    gp.eps = eps;
    gp.co = co;

    /* Padded extents: +2 and +3 per direction. */
    gp.fh2_nx = n_i + 2;
    gp.fh2_ny = n_j + 2;
    gp.fh2_nz = n_k + 2;
    gp.fh3_nx = n_i + 3;
    gp.fh3_ny = n_j + 3;
    gp.fh3_nz = n_k + 3;

    upload_grid_params_if_needed(gp);
}

/*
 * Decide, for each of the six faces of a patch, whether its outermost
 * coordinate lies within one grid spacing of the matching bbox entry.
 *
 *   ex       patch extents (ex[0..2])
 *   X, Y, Z  coordinate arrays; the first, second and last entries are read
 *   bbox     six values: lower x, y, z bounds in [0..2], upper in [3..5]
 *   Symmetry a lower face is never flagged when the bbox lower bound sits
 *            within half a spacing of zero and the symmetry mode covers
 *            that direction (mode 2 for x and y, any mode > 0 for z)
 *   touch_*  outputs, each set to 1 or 0
 */
static void compute_patch_boundary_flags(int *ex,
                                         double *X, double *Y, double *Z,
                                         const double *bbox,
                                         int Symmetry,
                                         int &touch_xmin, int &touch_xmax,
                                         int &touch_ymin, int &touch_ymax,
                                         int &touch_zmin, int &touch_zmax)
{
    const double hx = X[1] - X[0];
    const double hy = Y[1] - Y[0];
    const double hz = Z[1] - Z[0];

    const bool octant_mode = (Symmetry == 2);
    const bool any_symmetry = (Symmetry > 0);

    /* Upper faces: plain proximity test against the upper bbox bounds. */
    touch_xmax = 0;
    if (std::fabs(X[ex[0] - 1] - bbox[3]) < hx) touch_xmax = 1;
    touch_ymax = 0;
    if (std::fabs(Y[ex[1] - 1] - bbox[4]) < hy) touch_ymax = 1;
    touch_zmax = 0;
    if (std::fabs(Z[ex[2] - 1] - bbox[5]) < hz) touch_zmax = 1;

    /* Lower faces: same proximity test, vetoed when the face is a symmetry plane. */
    touch_xmin = 0;
    if (std::fabs(X[0] - bbox[0]) < hx) {
        const bool on_mirror = octant_mode && std::fabs(bbox[0]) < hx / 2.0;
        touch_xmin = on_mirror ? 0 : 1;
    }
    touch_ymin = 0;
    if (std::fabs(Y[0] - bbox[1]) < hy) {
        const bool on_mirror = octant_mode && std::fabs(bbox[1]) < hy / 2.0;
        touch_ymin = on_mirror ? 0 : 1;
    }
    touch_zmin = 0;
    if (std::fabs(Z[0] - bbox[2]) < hz) {
        const bool on_mirror = any_symmetry && std::fabs(bbox[2]) < hz / 2.0;
        touch_zmin = on_mirror ? 0 : 1;
    }
}

/*
 * Copy BSSN_STATE_COUNT host arrays of `all` doubles each to the device.
 *
 * Two paths, chosen once per process from AMSS_CUDA_DIRECT_STATE_UPLOAD
 * (unset means direct; otherwise atoi() of the value, non-zero = direct):
 *   direct  - one cudaMemcpyAsync per array into its own input slot
 *   staged  - pack all arrays into g_buf.h_stage, then a single cudaMemcpy
 *             whose destination starts at g_buf.slot[S_chi]
 *
 * When profiling is enabled the elapsed time and byte volume are added to
 * the upload counters; the direct path calls cuda_profile_sync() first.
 */
static void upload_state_inputs(double **state_host, size_t all)
{
    const size_t bytes = all * sizeof(double);
    const bool profile = cuda_profile_enabled();
    const double t0 = profile ? cuda_profile_now_ms() : 0.0;

    /* -1 = not decided yet; afterwards 0 (staged) or 1 (direct). */
    static int direct_upload = -1;
    if (direct_upload < 0) {
        const char *env = getenv("AMSS_CUDA_DIRECT_STATE_UPLOAD");
        int choice = 1;
        if (env && atoi(env) == 0) choice = 0;
        direct_upload = choice;
    }

    if (direct_upload) {
        for (int s = 0; s < BSSN_STATE_COUNT; ++s) {
            CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[k_state_input_slots[s]], state_host[s],
                                       bytes, cudaMemcpyHostToDevice));
        }
        /* The copies were issued asynchronously; wait before reading the clock. */
        if (profile) cuda_profile_sync();
    } else {
        size_t offset = 0;
        for (int s = 0; s < BSSN_STATE_COUNT; ++s) {
            std::memcpy(g_buf.h_stage + offset, state_host[s], bytes);
            offset += all;
        }
        CUDA_CHECK(cudaMemcpy(g_buf.slot[S_chi], g_buf.h_stage,
                              (size_t)BSSN_STATE_COUNT * bytes,
                              cudaMemcpyHostToDevice));
    }

    if (!profile) return;

    CudaProfileStats &stats = cuda_profile_stats();
    stats.upload_calls++;
    stats.upload_ms += cuda_profile_now_ms() - t0;
    stats.upload_gb += (double)((size_t)BSSN_STATE_COUNT * bytes) / 1.0e9;
}

/*
 * Pack BSSN_MATTER_COUNT host arrays (each `all` doubles) back to back in
 * g_buf.h_stage, copy the packed buffer to ctx.d_matter_mem with one
 * blocking cudaMemcpy, and flag the matter cache as ready.
 */
static void upload_matter_cache(StepContext &ctx,
                                double **matter_host,
                                size_t all)
{
    const size_t bytes = all * sizeof(double);
    const size_t total_bytes = (size_t)BSSN_MATTER_COUNT * bytes;

    size_t offset = 0;
    for (int m = 0; m < BSSN_MATTER_COUNT; ++m) {
        std::memcpy(g_buf.h_stage + offset, matter_host[m], bytes);
        offset += all;
    }

    CUDA_CHECK(cudaMemcpy(ctx.d_matter_mem, g_buf.h_stage,
                          total_bytes, cudaMemcpyHostToDevice));
    ctx.matter_ready = true;
}

/*
 * Zero-fill the device matter buffer (BSSN_MATTER_COUNT arrays of `all`
 * doubles) and flag the matter cache as ready.
 */
static void zero_matter_cache(StepContext &ctx, size_t all)
{
    const size_t total_bytes = (size_t)BSSN_MATTER_COUNT * all * sizeof(double);
    CUDA_CHECK(cudaMemset(ctx.d_matter_mem, 0, total_bytes));
    ctx.matter_ready = true;
}

/* Point every matter slot of g_buf at the matching device array held in ctx. */
static void bind_matter_slots(const StepContext &ctx)
{
    int m = 0;
    while (m < BSSN_MATTER_COUNT) {
        const auto slot_id = k_matter_slots[m];
        g_buf.slot[slot_id] = ctx.d_matter[m];
        ++m;
    }
}

/* Redirect the BSSN state input slots of g_buf to the first BSSN_STATE_COUNT pointers in `state`. */
static void bind_state_input_slots(const std::array<double *, BSSN_RESIDENT_STATE_CAPACITY> &state)
{
    int s = 0;
    while (s < BSSN_STATE_COUNT) {
        const auto slot_id = k_state_input_slots[s];
        g_buf.slot[slot_id] = state[s];
        ++s;
    }
}

/* Redirect the BSSN state RHS slots of g_buf to the first BSSN_STATE_COUNT pointers in `state`. */
static void bind_state_output_slots(const std::array<double *, BSSN_RESIDENT_STATE_CAPACITY> &state)
{
    int s = 0;
    while (s < BSSN_STATE_COUNT) {
        const auto slot_id = k_state_rhs_slots[s];
        g_buf.slot[slot_id] = state[s];
        ++s;
    }
}

/* Redirect the escalar state input slots of g_buf to the first BSSN_ESCALAR_STATE_COUNT pointers in `state`. */
static void bind_escalar_state_input_slots(const std::array<double *, BSSN_RESIDENT_STATE_CAPACITY> &state)
{
    int s = 0;
    while (s < BSSN_ESCALAR_STATE_COUNT) {
        const auto slot_id = k_escalar_state_input_slots[s];
        g_buf.slot[slot_id] = state[s];
        ++s;
    }
}

/* Redirect the escalar state RHS slots of g_buf to the first BSSN_ESCALAR_STATE_COUNT pointers in `state`. */
static void bind_escalar_state_output_slots(const std::array<double *, BSSN_RESIDENT_STATE_CAPACITY> &state)
{
    int s = 0;
    while (s < BSSN_ESCALAR_STATE_COUNT) {
        const auto slot_id = k_escalar_state_rhs_slots[s];
        g_buf.slot[slot_id] = state[s];
        ++s;
    }
}

/* Redirect the EM state input slots of g_buf to the first BSSN_EM_STATE_COUNT pointers in `state`. */
static void bind_em_state_input_slots(const std::array<double *, BSSN_RESIDENT_STATE_CAPACITY> &state)
{
    int s = 0;
    while (s < BSSN_EM_STATE_COUNT) {
        const auto slot_id = k_em_state_input_slots[s];
        g_buf.slot[slot_id] = state[s];
        ++s;
    }
}

/* Redirect the EM state RHS slots of g_buf to the first BSSN_EM_STATE_COUNT pointers in `state`. */
static void bind_em_state_output_slots(const std::array<double *, BSSN_RESIDENT_STATE_CAPACITY> &state)
{
    int s = 0;
    while (s < BSSN_EM_STATE_COUNT) {
        const auto slot_id = k_em_state_rhs_slots[s];
        g_buf.slot[slot_id] = state[s];
        ++s;
    }
}

static void upload_escalar_state_inputs(double **state_host, size_t all);
static void upload_em_state_inputs(double **state_host, size_t all);

/*
 * True when the first state_count host pointers recorded for `bank` are
 * all non-null in host_key and equal to the corresponding host_key entries.
 * Returns false for a null key, an out-of-range bank, or a state_count
 * outside [1, BSSN_RESIDENT_STATE_CAPACITY].
 */
static bool resident_key_matches_count(const StepContext &ctx, int bank, double **host_key, int state_count)
{
    const bool bank_ok = (bank >= 0) && (bank < BSSN_RESIDENT_BANK_COUNT);
    const bool count_ok = (state_count > 0) && (state_count <= BSSN_RESIDENT_STATE_CAPACITY);
    if (!host_key || !bank_ok || !count_ok)
        return false;

    int s = 0;
    while (s < state_count) {
        double *wanted = host_key[s];
        if (!wanted)
            return false;
        if (ctx.resident_host[bank][s] != wanted)
            return false;
        ++s;
    }
    return true;
}

/* resident_key_matches_count() specialised to the BSSN_STATE_COUNT leading states. */
static bool resident_key_matches(const StepContext &ctx, int bank, double **host_key)
{
    const bool same_key = resident_key_matches_count(ctx, bank, host_key, BSSN_STATE_COUNT);
    return same_key;
}

/*
 * Search all resident banks for one whose first state_count host pointers
 * match host_key.  Among matching banks, the valid bank with the largest
 * age wins; if no matching bank is valid, the matching invalid bank with
 * the largest age is returned instead.  On equal ages the lower bank index
 * is kept.  Returns -1 when host_key is null or nothing matches.
 */
static int find_resident_bank_count(const StepContext &ctx, double **host_key, int state_count)
{
    if (!host_key) return -1;

    int pick_valid = -1;
    int pick_invalid = -1;
    unsigned long long pick_valid_age = 0;
    unsigned long long pick_invalid_age = 0;

    for (int bank = 0; bank < BSSN_RESIDENT_BANK_COUNT; ++bank) {
        if (resident_key_matches_count(ctx, bank, host_key, state_count)) {
            /* Update whichever running winner this bank competes for. */
            int &pick = ctx.resident_valid[bank] ? pick_valid : pick_invalid;
            unsigned long long &pick_age =
                ctx.resident_valid[bank] ? pick_valid_age : pick_invalid_age;
            if (pick < 0 || ctx.resident_age[bank] > pick_age) {
                pick = bank;
                pick_age = ctx.resident_age[bank];
            }
        }
    }

    if (pick_valid >= 0) return pick_valid;
    return pick_invalid;
}

/*
 * Like find_resident_bank_count(), but the key covers only a subset of the
 * bank's states: host_key[i] is compared with the bank's recorded host
 * pointer at position state_indices[i], for i in [0, subset_count).
 * A bank does not match if any index is outside
 * [0, BSSN_RESIDENT_STATE_CAPACITY), any key pointer is null, or any
 * pointer differs.  Selection among matches: largest-age valid bank, else
 * largest-age invalid bank, else -1.
 */
static int find_resident_bank_subset(const StepContext &ctx,
                                     double **host_key,
                                     const int *state_indices,
                                     int subset_count)
{
    if (!host_key || !state_indices || subset_count <= 0)
        return -1;

    int pick_valid = -1;
    int pick_invalid = -1;
    unsigned long long pick_valid_age = 0;
    unsigned long long pick_invalid_age = 0;

    for (int bank = 0; bank < BSSN_RESIDENT_BANK_COUNT; ++bank) {
        /* Count how many leading key entries agree with this bank. */
        int agreed = 0;
        while (agreed < subset_count) {
            const int idx = state_indices[agreed];
            const bool idx_ok = (idx >= 0) && (idx < BSSN_RESIDENT_STATE_CAPACITY);
            if (!idx_ok || !host_key[agreed] ||
                ctx.resident_host[bank][idx] != host_key[agreed])
                break;
            ++agreed;
        }
        if (agreed != subset_count)
            continue;

        if (!ctx.resident_valid[bank]) {
            if (pick_invalid < 0 || ctx.resident_age[bank] > pick_invalid_age) {
                pick_invalid = bank;
                pick_invalid_age = ctx.resident_age[bank];
            }
        } else if (pick_valid < 0 || ctx.resident_age[bank] > pick_valid_age) {
            pick_valid = bank;
            pick_valid_age = ctx.resident_age[bank];
        }
    }

    if (pick_valid >= 0) return pick_valid;
    return pick_invalid;
}

/* find_resident_bank_count() specialised to the BSSN_STATE_COUNT leading states. */
static int find_resident_bank(const StepContext &ctx, double **host_key)
{
    const int bank = find_resident_bank_count(ctx, host_key, BSSN_STATE_COUNT);
    return bank;
}

/*
 * True when host_key is non-null, state_count lies in
 * [1, BSSN_RESIDENT_STATE_CAPACITY], and the first state_count entries of
 * host_key are all non-null.
 */
static bool resident_key_usable_count(double **host_key, int state_count)
{
    const bool count_ok = (state_count > 0) && (state_count <= BSSN_RESIDENT_STATE_CAPACITY);
    if (!host_key || !count_ok)
        return false;

    /* Advance past leading non-null entries; usable iff we reach the end. */
    int s = 0;
    while (s < state_count && host_key[s])
        ++s;
    return s == state_count;
}

/* resident_key_usable_count() specialised to the BSSN_STATE_COUNT leading states. */
static bool resident_key_usable(double **host_key)
{
    const bool usable = resident_key_usable_count(host_key, BSSN_STATE_COUNT);
    return usable;
}

/* Set every host-clean flag of `bank` to 1 (clean) or 0; out-of-range banks are ignored. */
static void set_resident_host_clean(StepContext &ctx, int bank, bool clean)
{
    const bool bank_ok = (bank >= 0) && (bank < BSSN_RESIDENT_BANK_COUNT);
    if (bank_ok) {
        for (auto &flag : ctx.resident_host_clean[bank])
            flag = clean ? 1 : 0;
    }
}

/*
 * True when every selected state of `bank` has its host-clean flag set.
 * The selection is state_indices[0..subset_count) or, when state_indices
 * is null, simply 0..subset_count-1.  Returns false for an out-of-range
 * bank or any out-of-range state index; a non-positive subset_count with
 * a valid bank yields true.
 */
static bool resident_host_subset_clean(const StepContext &ctx,
                                       int bank,
                                       int subset_count,
                                       const int *state_indices)
{
    const bool bank_ok = (bank >= 0) && (bank < BSSN_RESIDENT_BANK_COUNT);
    if (!bank_ok) return false;

    for (int n = 0; n < subset_count; ++n) {
        int idx = n;
        if (state_indices) idx = state_indices[n];
        const bool idx_ok = (idx >= 0) && (idx < BSSN_RESIDENT_STATE_CAPACITY);
        /* Short-circuit keeps the flag lookup inside the array bounds. */
        if (!idx_ok || !ctx.resident_host_clean[bank][idx])
            return false;
    }
    return true;
}

/*
 * Write the host-clean flag (1 for clean, 0 otherwise) of the selected
 * states of `bank`.  The selection is state_indices[0..subset_count) or,
 * when state_indices is null, 0..subset_count-1.  Out-of-range state
 * indices are skipped and an out-of-range bank makes the call a no-op.
 */
static void mark_resident_host_subset_clean(StepContext &ctx,
                                            int bank,
                                            int subset_count,
                                            const int *state_indices,
                                            bool clean)
{
    const bool bank_ok = (bank >= 0) && (bank < BSSN_RESIDENT_BANK_COUNT);
    if (!bank_ok) return;

    for (int n = 0; n < subset_count; ++n) {
        int idx = n;
        if (state_indices) idx = state_indices[n];
        if (idx < 0 || idx >= BSSN_RESIDENT_STATE_CAPACITY)
            continue;
        ctx.resident_host_clean[bank][idx] = clean ? 1 : 0;
    }
}

/* Write one host-clean flag (1 or 0); does nothing if bank or state_index is out of range. */
static void mark_resident_host_state_clean(StepContext &ctx,
                                           int bank,
                                           int state_index,
                                           bool clean)
{
    const bool bank_ok = (bank >= 0) && (bank < BSSN_RESIDENT_BANK_COUNT);
    const bool state_ok = (state_index >= 0) && (state_index < BSSN_RESIDENT_STATE_CAPACITY);
    if (bank_ok && state_ok)
        ctx.resident_host_clean[bank][state_index] = clean ? 1 : 0;
}

/*
 * Make `bank` the current bank: record its index, alias the "current"
 * state pointers to that bank's device storage, and set state_ready from
 * the bank's valid flag.  Out-of-range banks are ignored.
 */
static void mark_resident_current_bank(StepContext &ctx, int bank)
{
    const bool bank_ok = (bank >= 0) && (bank < BSSN_RESIDENT_BANK_COUNT);
    if (bank_ok) {
        ctx.current_bank = bank;
        ctx.d_state_curr_mem = ctx.d_resident_mem[bank];
        ctx.d_state_curr = ctx.d_resident[bank];
        ctx.state_ready = ctx.resident_valid[bank];
    }
}

/*
 * Point the "next state" handles (d_state_next_mem / d_state_next) at the
 * d_resident_mem / d_resident entries of `bank`.  An out-of-range bank
 * leaves ctx untouched.
 */
static void mark_resident_next_bank(StepContext &ctx, int bank)
{
    const bool bank_ok = (bank >= 0) && (bank < BSSN_RESIDENT_BANK_COUNT);
    if (bank_ok) {
        ctx.d_state_next = ctx.d_resident[bank];
        ctx.d_state_next_mem = ctx.d_resident_mem[bank];
    }
}

/*
 * Return true when at least one resident bank has its resident_valid flag
 * set, false otherwise.
 */
static bool any_resident_bank_valid(const StepContext &ctx)
{
    int bank = 0;
    while (bank < BSSN_RESIDENT_BANK_COUNT && !ctx.resident_valid[bank])
        ++bank;
    /* The scan stopped early exactly when a valid bank was found. */
    return bank < BSSN_RESIDENT_BANK_COUNT;
}

/*
 * Recompute ctx.state_ready as "some resident bank is valid".
 */
static void update_state_ready(StepContext &ctx)
{
    const bool have_valid_bank = any_resident_bank_valid(ctx);
    ctx.state_ready = have_valid_bank;
}

/*
 * Copy the first state_count states of a valid resident bank from
 * ctx.d_resident[bank][i] back to the host pointers ctx.resident_host[bank][i]
 * (`all` doubles per state, device-to-host), wait for the device, and then
 * call set_resident_host_clean(ctx, bank, true).
 *
 * The call is a no-op when the bank index is out of range, the bank is not
 * valid, state_count is outside 1 .. BSSN_RESIDENT_STATE_CAPACITY, or any of
 * the first state_count host pointers is null.
 *
 * When cuda_aux_profile_enabled() is true, the elapsed time and transferred
 * volume are added to the auxiliary profile statistics.
 */
static void writeback_resident_bank_count(StepContext &ctx, int bank, size_t all, int state_count)
{
    if (bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT || !ctx.resident_valid[bank])
        return;
    if (state_count < 1 || BSSN_RESIDENT_STATE_CAPACITY < state_count)
        return;

    /* All-or-nothing: give up before copying anything if a host slot is missing. */
    int s = 0;
    while (s < state_count) {
        if (ctx.resident_host[bank][s] == nullptr) return;
        ++s;
    }

    const bool timing = cuda_aux_profile_enabled();
    double start_ms = 0.0;
    if (timing) start_ms = cuda_profile_now_ms();

    const size_t bytes = sizeof(double) * all;
    for (s = 0; s != state_count; ++s) {
        CUDA_CHECK(cudaMemcpyAsync(ctx.resident_host[bank][s],
                                   ctx.d_resident[bank][s],
                                   bytes, cudaMemcpyDeviceToHost));
    }
    /* The copies were issued asynchronously; wait before declaring the host copy clean. */
    CUDA_CHECK(cudaDeviceSynchronize());
    set_resident_host_clean(ctx, bank, true);

    if (!timing) return;

    CudaAuxProfileStats &stats = cuda_aux_profile_stats();
    stats.writeback_calls++;
    stats.writeback_ms += cuda_profile_now_ms() - start_ms;
    stats.writeback_gb += (double)((size_t)state_count * bytes) / 1.0e9;
    cuda_aux_profile_maybe_log();
}

/*
 * Write back a resident bank using the BSSN_STATE_COUNT state layout;
 * thin wrapper around writeback_resident_bank_count.
 */
static void writeback_resident_bank(StepContext &ctx, int bank, size_t all)
{
    const int bssn_states = BSSN_STATE_COUNT;
    writeback_resident_bank_count(ctx, bank, all, bssn_states);
}

/*
 * Pick a resident bank that may be overwritten (BSSN_STATE_COUNT layout).
 *
 * 1. If some bank other than avoid_bank is not valid, the lowest such index
 *    is returned and nothing in ctx is modified.
 * 2. Otherwise the non-avoided bank with the smallest resident_age is
 *    evicted (lowest index wins ties; bank 0 is used if every bank is
 *    excluded): it is written back via writeback_resident_bank, its valid
 *    flag, host key pointers, host-clean flags and age are cleared, the
 *    current-bank handles are reset if it was the current bank, and
 *    ctx.state_ready is refreshed.
 *
 * Returns the chosen bank index.
 */
static int choose_resident_bank_for_reuse(StepContext &ctx, int avoid_bank, size_t all)
{
    /* Pass 1: a bank that holds no valid data needs no eviction. */
    for (int cand = 0; cand < BSSN_RESIDENT_BANK_COUNT; ++cand) {
        if (cand == avoid_bank || ctx.resident_valid[cand]) continue;
        return cand;
    }

    /* Pass 2: smallest age stamp among the banks we are allowed to take. */
    int victim = -1;
    unsigned long long victim_age = 0;
    for (int cand = 0; cand < BSSN_RESIDENT_BANK_COUNT; ++cand) {
        if (cand == avoid_bank) continue;
        if (victim >= 0 && !(ctx.resident_age[cand] < victim_age)) continue;
        victim = cand;
        victim_age = ctx.resident_age[cand];
    }
    if (victim < 0) victim = 0;

    /* Write back first: the writeback is skipped once the bank is marked invalid. */
    writeback_resident_bank(ctx, victim, all);
    ctx.resident_valid[victim] = false;
    ctx.resident_host[victim].fill(nullptr);
    ctx.resident_host_clean[victim].fill(0);
    ctx.resident_age[victim] = 0;

    const bool was_current = (ctx.current_bank == victim);
    if (was_current) {
        ctx.current_bank = -1;
        ctx.d_state_curr_mem = nullptr;
        ctx.d_state_curr.fill(nullptr);
    }
    update_state_ready(ctx);
    return victim;
}

/*
 * Pick a resident bank that may be overwritten (BSSN_ESCALAR_STATE_COUNT
 * layout).
 *
 * A bank other than avoid_bank that is not valid is returned untouched
 * (lowest index first).  Otherwise the non-avoided bank with the smallest
 * resident_age is evicted (lowest index on ties; bank 0 if every bank is
 * excluded): BSSN_ESCALAR_STATE_COUNT states are written back, the bank's
 * valid flag, host key pointers, host-clean flags and age are cleared, the
 * current-bank handles are reset if needed, and ctx.state_ready is
 * refreshed.  Returns the chosen bank index.
 */
static int choose_escalar_resident_bank_for_reuse(StepContext &ctx, int avoid_bank, size_t all)
{
    /* Prefer a bank with no valid contents: nothing to evict. */
    for (int cand = 0; cand < BSSN_RESIDENT_BANK_COUNT; ++cand) {
        const bool usable = (cand != avoid_bank) && !ctx.resident_valid[cand];
        if (usable) return cand;
    }

    /* Otherwise take the bank carrying the smallest age stamp. */
    int victim = -1;
    unsigned long long victim_age = 0;
    for (int cand = 0; cand < BSSN_RESIDENT_BANK_COUNT; ++cand) {
        if (cand == avoid_bank) continue;
        if (victim >= 0 && !(ctx.resident_age[cand] < victim_age)) continue;
        victim = cand;
        victim_age = ctx.resident_age[cand];
    }
    if (victim < 0) victim = 0;

    /* Write back while the bank is still flagged valid, then wipe its bookkeeping. */
    writeback_resident_bank_count(ctx, victim, all, BSSN_ESCALAR_STATE_COUNT);
    ctx.resident_valid[victim] = false;
    ctx.resident_host[victim].fill(nullptr);
    ctx.resident_host_clean[victim].fill(0);
    ctx.resident_age[victim] = 0;

    if (victim == ctx.current_bank) {
        ctx.current_bank = -1;
        ctx.d_state_curr_mem = nullptr;
        ctx.d_state_curr.fill(nullptr);
    }
    update_state_ready(ctx);
    return victim;
}

/*
 * Pick a resident bank that may be overwritten (BSSN_EM_STATE_COUNT layout).
 *
 * A bank other than avoid_bank that is not valid is returned untouched
 * (lowest index first).  Otherwise the non-avoided bank with the smallest
 * resident_age is evicted (lowest index on ties; bank 0 if every bank is
 * excluded): BSSN_EM_STATE_COUNT states are written back, the bank's valid
 * flag, host key pointers, host-clean flags and age are cleared, the
 * current-bank handles are reset if needed, and ctx.state_ready is
 * refreshed.  Returns the chosen bank index.
 */
static int choose_em_resident_bank_for_reuse(StepContext &ctx, int avoid_bank, size_t all)
{
    /* Prefer a bank with no valid contents: nothing to evict. */
    for (int cand = 0; cand < BSSN_RESIDENT_BANK_COUNT; ++cand) {
        const bool usable = (cand != avoid_bank) && !ctx.resident_valid[cand];
        if (usable) return cand;
    }

    /* Otherwise take the bank carrying the smallest age stamp. */
    int victim = -1;
    unsigned long long victim_age = 0;
    for (int cand = 0; cand < BSSN_RESIDENT_BANK_COUNT; ++cand) {
        if (cand == avoid_bank) continue;
        if (victim >= 0 && !(ctx.resident_age[cand] < victim_age)) continue;
        victim = cand;
        victim_age = ctx.resident_age[cand];
    }
    if (victim < 0) victim = 0;

    /* Write back while the bank is still flagged valid, then wipe its bookkeeping. */
    writeback_resident_bank_count(ctx, victim, all, BSSN_EM_STATE_COUNT);
    ctx.resident_valid[victim] = false;
    ctx.resident_host[victim].fill(nullptr);
    ctx.resident_host_clean[victim].fill(0);
    ctx.resident_age[victim] = 0;

    if (victim == ctx.current_bank) {
        ctx.current_bank = -1;
        ctx.d_state_curr_mem = nullptr;
        ctx.d_state_curr.fill(nullptr);
    }
    update_state_ready(ctx);
    return victim;
}

/*
 * Record the first state_count host pointers of host_key as the key of
 * resident bank `bank`, call set_resident_host_clean(ctx, bank, false),
 * and stamp the bank with a fresh age taken from ctx.resident_clock.
 *
 * Defensive guards (in the same spirit as the mark_resident_* helpers in
 * this file, which ignore out-of-range indices):
 *   - an out-of-range bank is ignored;
 *   - a null host_key combined with a positive state_count is ignored;
 *   - state_count is clamped to BSSN_RESIDENT_STATE_CAPACITY so the copy
 *     cannot run past the per-bank key array.
 * For in-range arguments the behaviour is unchanged.
 *
 * Entries of ctx.resident_host[bank] at indices >= state_count are left
 * untouched.
 */
static void assign_resident_key_count(StepContext &ctx, int bank, double **host_key, int state_count)
{
    if (bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT) return;
    if (host_key == nullptr && state_count > 0) return;

    int copy_count = state_count;
    if (copy_count > BSSN_RESIDENT_STATE_CAPACITY)
        copy_count = BSSN_RESIDENT_STATE_CAPACITY;

    for (int i = 0; i < copy_count; ++i) {
        ctx.resident_host[bank][i] = host_key[i];
    }
    set_resident_host_clean(ctx, bank, false);
    ctx.resident_age[bank] = ++ctx.resident_clock;
}

/*
 * Assign a BSSN_STATE_COUNT-entry host key to a resident bank; thin wrapper
 * around assign_resident_key_count.
 */
static void assign_resident_key(StepContext &ctx, int bank, double **host_key)
{
    const int bssn_states = BSSN_STATE_COUNT;
    assign_resident_key_count(ctx, bank, host_key, bssn_states);
}

/*
 * Return the resident bank associated with host_key (BSSN state layout),
 * claiming one if necessary.
 *
 *  - If resident_key_usable(host_key) is false, the current bank is returned
 *    (or 0 when there is none) and nothing is modified.
 *  - If find_resident_bank locates a bank, its age is refreshed; when that
 *    bank is not valid and upload_if_missing is set, the state is uploaded
 *    into it and it is marked valid and host-clean.
 *  - Otherwise a bank is obtained from choose_resident_bank_for_reuse
 *    (skipping avoid_bank), keyed with host_key, and either uploaded and
 *    marked valid/clean (upload_if_missing) or left invalid/not clean.
 *
 * NOTE(review): the "bank found" path changes resident_valid without calling
 * update_state_ready(), unlike the "new bank" path -- confirm this is
 * intended.
 * NOTE(review): no explicit cudaDeviceSynchronize() follows
 * upload_state_inputs() here, whereas the escalar/EM variants synchronize
 * after their uploads -- confirm upload_state_inputs() does not need it.
 */
static int ensure_resident_bank(StepContext &ctx,
                                double **host_key,
                                size_t all,
                                bool upload_if_missing,
                                int avoid_bank = -1)
{
    if (!resident_key_usable(host_key))
        return (ctx.current_bank >= 0) ? ctx.current_bank : 0;

    const int known_bank = find_resident_bank(ctx, host_key);
    if (known_bank >= 0) {
        ctx.resident_age[known_bank] = ++ctx.resident_clock;
        const bool must_upload = upload_if_missing && !ctx.resident_valid[known_bank];
        if (must_upload) {
            bind_state_input_slots(ctx.d_resident[known_bank]);
            upload_state_inputs(host_key, all);
            ctx.resident_valid[known_bank] = true;
            set_resident_host_clean(ctx, known_bank, true);
        }
        return known_bank;
    }

    const int fresh_bank = choose_resident_bank_for_reuse(ctx, avoid_bank, all);
    assign_resident_key(ctx, fresh_bank, host_key);
    if (upload_if_missing) {
        bind_state_input_slots(ctx.d_resident[fresh_bank]);
        upload_state_inputs(host_key, all);
    }
    /* Valid and host-clean exactly when the data was just uploaded. */
    ctx.resident_valid[fresh_bank] = upload_if_missing;
    set_resident_host_clean(ctx, fresh_bank, upload_if_missing);
    update_state_ready(ctx);
    return fresh_bank;
}

/*
 * Return the resident bank associated with host_key for the
 * BSSN_ESCALAR_STATE_COUNT layout, claiming one if necessary.
 *
 *  - An unusable key (resident_key_usable_count is false) yields the current
 *    bank, or 0 when there is none, without modifying ctx.
 *  - A bank located by find_resident_bank_count has its age refreshed; when
 *    it is not valid and upload_if_missing is set, the state is uploaded,
 *    the device is synchronized, and the bank is marked valid and host-clean.
 *  - Otherwise a bank from choose_escalar_resident_bank_for_reuse (skipping
 *    avoid_bank) is keyed with host_key and either uploaded and marked
 *    valid/clean, or left invalid/not clean.
 *
 * NOTE(review): the "bank found" path does not call update_state_ready()
 * after changing resident_valid -- confirm this is intended.
 */
static int ensure_escalar_resident_bank(StepContext &ctx,
                                        double **host_key,
                                        size_t all,
                                        bool upload_if_missing,
                                        int avoid_bank = -1)
{
    if (!resident_key_usable_count(host_key, BSSN_ESCALAR_STATE_COUNT))
        return (ctx.current_bank >= 0) ? ctx.current_bank : 0;

    const int known_bank = find_resident_bank_count(ctx, host_key, BSSN_ESCALAR_STATE_COUNT);
    if (known_bank >= 0) {
        ctx.resident_age[known_bank] = ++ctx.resident_clock;
        const bool must_upload = upload_if_missing && !ctx.resident_valid[known_bank];
        if (must_upload) {
            bind_escalar_state_input_slots(ctx.d_resident[known_bank]);
            upload_escalar_state_inputs(host_key, all);
            CUDA_CHECK(cudaDeviceSynchronize());
            ctx.resident_valid[known_bank] = true;
            set_resident_host_clean(ctx, known_bank, true);
        }
        return known_bank;
    }

    const int fresh_bank = choose_escalar_resident_bank_for_reuse(ctx, avoid_bank, all);
    assign_resident_key_count(ctx, fresh_bank, host_key, BSSN_ESCALAR_STATE_COUNT);
    if (upload_if_missing) {
        bind_escalar_state_input_slots(ctx.d_resident[fresh_bank]);
        upload_escalar_state_inputs(host_key, all);
        CUDA_CHECK(cudaDeviceSynchronize());
    }
    /* Valid and host-clean exactly when the data was just uploaded. */
    ctx.resident_valid[fresh_bank] = upload_if_missing;
    set_resident_host_clean(ctx, fresh_bank, upload_if_missing);
    update_state_ready(ctx);
    return fresh_bank;
}

/*
 * Return the resident bank associated with host_key for the
 * BSSN_EM_STATE_COUNT layout, claiming one if necessary.
 *
 *  - An unusable key (resident_key_usable_count is false) yields the current
 *    bank, or 0 when there is none, without modifying ctx.
 *  - A bank located by find_resident_bank_count has its age refreshed; when
 *    it is not valid and upload_if_missing is set, the state is uploaded,
 *    the device is synchronized, and the bank is marked valid and host-clean.
 *  - Otherwise a bank from choose_em_resident_bank_for_reuse (skipping
 *    avoid_bank) is keyed with host_key and either uploaded and marked
 *    valid/clean, or left invalid/not clean.
 *
 * NOTE(review): the "bank found" path does not call update_state_ready()
 * after changing resident_valid -- confirm this is intended.
 */
static int ensure_em_resident_bank(StepContext &ctx,
                                   double **host_key,
                                   size_t all,
                                   bool upload_if_missing,
                                   int avoid_bank = -1)
{
    if (!resident_key_usable_count(host_key, BSSN_EM_STATE_COUNT))
        return (ctx.current_bank >= 0) ? ctx.current_bank : 0;

    const int known_bank = find_resident_bank_count(ctx, host_key, BSSN_EM_STATE_COUNT);
    if (known_bank >= 0) {
        ctx.resident_age[known_bank] = ++ctx.resident_clock;
        const bool must_upload = upload_if_missing && !ctx.resident_valid[known_bank];
        if (must_upload) {
            bind_em_state_input_slots(ctx.d_resident[known_bank]);
            upload_em_state_inputs(host_key, all);
            CUDA_CHECK(cudaDeviceSynchronize());
            ctx.resident_valid[known_bank] = true;
            set_resident_host_clean(ctx, known_bank, true);
        }
        return known_bank;
    }

    const int fresh_bank = choose_em_resident_bank_for_reuse(ctx, avoid_bank, all);
    assign_resident_key_count(ctx, fresh_bank, host_key, BSSN_EM_STATE_COUNT);
    if (upload_if_missing) {
        bind_em_state_input_slots(ctx.d_resident[fresh_bank]);
        upload_em_state_inputs(host_key, all);
        CUDA_CHECK(cudaDeviceSynchronize());
    }
    /* Valid and host-clean exactly when the data was just uploaded. */
    ctx.resident_valid[fresh_bank] = upload_if_missing;
    set_resident_host_clean(ctx, fresh_bank, upload_if_missing);
    update_state_ready(ctx);
    return fresh_bank;
}

/*
 * Reserve a resident bank to receive output keyed by host_key (BSSN state
 * layout).
 *
 *  - An unusable key yields the current bank (or 0 when there is none).
 *  - If resident_key_matches reports that input_bank already carries
 *    host_key, input_bank is returned unchanged.
 *  - Otherwise the bank found by find_resident_bank is used, or, failing
 *    that, one from choose_resident_bank_for_reuse that skips input_bank.
 *    The bank is keyed with host_key, marked not valid, given a fresh age,
 *    and ctx.state_ready is refreshed.
 */
static int reserve_resident_output_bank(StepContext &ctx,
                                        double **host_key,
                                        size_t all,
                                        int input_bank)
{
    if (!resident_key_usable(host_key)) {
        if (ctx.current_bank >= 0)
            return ctx.current_bank;
        return 0;
    }
    if (resident_key_matches(ctx, input_bank, host_key))
        return input_bank;

    const int keyed_bank = find_resident_bank(ctx, host_key);
    const int out_bank = (keyed_bank >= 0)
        ? keyed_bank
        : choose_resident_bank_for_reuse(ctx, input_bank, all);

    assign_resident_key(ctx, out_bank, host_key);
    /* Reserved only: the bank holds no valid data until something fills it. */
    ctx.resident_valid[out_bank] = false;
    ctx.resident_age[out_bank] = ++ctx.resident_clock;
    update_state_ready(ctx);
    return out_bank;
}

/*
 * Reserve a resident bank to receive output keyed by host_key
 * (BSSN_ESCALAR_STATE_COUNT layout).
 *
 *  - An unusable key yields the current bank (or 0 when there is none).
 *  - If resident_key_matches_count reports that input_bank already carries
 *    host_key, input_bank is returned unchanged.
 *  - Otherwise the bank found by find_resident_bank_count is used, or,
 *    failing that, one from choose_escalar_resident_bank_for_reuse that
 *    skips input_bank.  The bank is keyed with host_key, marked not valid,
 *    given a fresh age, and ctx.state_ready is refreshed.
 */
static int reserve_escalar_resident_output_bank(StepContext &ctx,
                                                double **host_key,
                                                size_t all,
                                                int input_bank)
{
    if (!resident_key_usable_count(host_key, BSSN_ESCALAR_STATE_COUNT)) {
        if (ctx.current_bank >= 0)
            return ctx.current_bank;
        return 0;
    }
    if (resident_key_matches_count(ctx, input_bank, host_key, BSSN_ESCALAR_STATE_COUNT))
        return input_bank;

    const int keyed_bank = find_resident_bank_count(ctx, host_key, BSSN_ESCALAR_STATE_COUNT);
    const int out_bank = (keyed_bank >= 0)
        ? keyed_bank
        : choose_escalar_resident_bank_for_reuse(ctx, input_bank, all);

    assign_resident_key_count(ctx, out_bank, host_key, BSSN_ESCALAR_STATE_COUNT);
    /* Reserved only: the bank holds no valid data until something fills it. */
    ctx.resident_valid[out_bank] = false;
    ctx.resident_age[out_bank] = ++ctx.resident_clock;
    update_state_ready(ctx);
    return out_bank;
}

/*
 * Reserve a resident bank to receive output keyed by host_key
 * (BSSN_EM_STATE_COUNT layout).
 *
 *  - An unusable key yields the current bank (or 0 when there is none).
 *  - If resident_key_matches_count reports that input_bank already carries
 *    host_key, input_bank is returned unchanged.
 *  - Otherwise the bank found by find_resident_bank_count is used, or,
 *    failing that, one from choose_em_resident_bank_for_reuse that skips
 *    input_bank.  The bank is keyed with host_key, marked not valid, given
 *    a fresh age, and ctx.state_ready is refreshed.
 */
static int reserve_em_resident_output_bank(StepContext &ctx,
                                           double **host_key,
                                           size_t all,
                                           int input_bank)
{
    if (!resident_key_usable_count(host_key, BSSN_EM_STATE_COUNT)) {
        if (ctx.current_bank >= 0)
            return ctx.current_bank;
        return 0;
    }
    if (resident_key_matches_count(ctx, input_bank, host_key, BSSN_EM_STATE_COUNT))
        return input_bank;

    const int keyed_bank = find_resident_bank_count(ctx, host_key, BSSN_EM_STATE_COUNT);
    const int out_bank = (keyed_bank >= 0)
        ? keyed_bank
        : choose_em_resident_bank_for_reuse(ctx, input_bank, all);

    assign_resident_key_count(ctx, out_bank, host_key, BSSN_EM_STATE_COUNT);
    /* Reserved only: the bank holds no valid data until something fills it. */
    ctx.resident_valid[out_bank] = false;
    ctx.resident_age[out_bank] = ++ctx.resident_clock;
    update_state_ready(ctx);
    return out_bank;
}

/*
 * Forward declarations.  The reserve_*_resident_output_bank_avoiding
 * functions that follow call these helpers, whose definitions appear
 * further down in this file.
 */

/* True when `bank` equals any of the three avoid_* indices. */
static bool bank_is_avoided(int bank, int avoid_a, int avoid_b, int avoid_c);

/* Bank selection for the BSSN_ESCALAR_STATE_COUNT layout that skips up to three banks. */
static int choose_escalar_resident_bank_for_reuse_avoiding(StepContext &ctx,
                                                           int avoid_a,
                                                           int avoid_b,
                                                           int avoid_c,
                                                           size_t all);

/* Bank selection for the BSSN_EM_STATE_COUNT layout that skips up to three banks. */
static int choose_em_resident_bank_for_reuse_avoiding(StepContext &ctx,
                                                       int avoid_a,
                                                       int avoid_b,
                                                       int avoid_c,
                                                       size_t all);

/*
 * Reserve a resident bank to receive output keyed by host_key
 * (BSSN_ESCALAR_STATE_COUNT layout) while steering clear of up to three
 * banks.
 *
 *  - An unusable key yields the current bank (or 0 when there is none).
 *  - If one of avoid_a / avoid_b / avoid_c already carries host_key
 *    (checked in that order), that bank is returned unchanged.
 *  - Otherwise the bank found by find_resident_bank_count is used, or,
 *    failing that, one from choose_escalar_resident_bank_for_reuse_avoiding.
 *    That helper itself returns the first non-avoided, not-valid bank before
 *    it considers evicting anything, so no separate scan is needed here.
 *    The bank is keyed with host_key, marked not valid, given a fresh age,
 *    and ctx.state_ready is refreshed.
 */
static int reserve_escalar_resident_output_bank_avoiding(StepContext &ctx,
                                                         double **host_key,
                                                         size_t all,
                                                         int avoid_a,
                                                         int avoid_b,
                                                         int avoid_c)
{
    if (!resident_key_usable_count(host_key, BSSN_ESCALAR_STATE_COUNT)) {
        if (ctx.current_bank >= 0)
            return ctx.current_bank;
        return 0;
    }

    /* The key may already live in one of the banks we were told to avoid. */
    if (resident_key_matches_count(ctx, avoid_a, host_key, BSSN_ESCALAR_STATE_COUNT))
        return avoid_a;
    if (resident_key_matches_count(ctx, avoid_b, host_key, BSSN_ESCALAR_STATE_COUNT))
        return avoid_b;
    if (resident_key_matches_count(ctx, avoid_c, host_key, BSSN_ESCALAR_STATE_COUNT))
        return avoid_c;

    const int keyed_bank = find_resident_bank_count(ctx, host_key, BSSN_ESCALAR_STATE_COUNT);
    const int out_bank = (keyed_bank >= 0)
        ? keyed_bank
        : choose_escalar_resident_bank_for_reuse_avoiding(ctx, avoid_a, avoid_b, avoid_c, all);

    assign_resident_key_count(ctx, out_bank, host_key, BSSN_ESCALAR_STATE_COUNT);
    ctx.resident_valid[out_bank] = false;
    ctx.resident_age[out_bank] = ++ctx.resident_clock;
    update_state_ready(ctx);
    return out_bank;
}

/*
 * Reserve a resident bank to receive output keyed by host_key
 * (BSSN_EM_STATE_COUNT layout) while steering clear of up to three banks.
 *
 *  - An unusable key yields the current bank (or 0 when there is none).
 *  - If one of avoid_a / avoid_b / avoid_c already carries host_key
 *    (checked in that order), that bank is returned unchanged.
 *  - Otherwise the bank found by find_resident_bank_count is used, or,
 *    failing that, one from choose_em_resident_bank_for_reuse_avoiding.
 *    That helper itself returns the first non-avoided, not-valid bank before
 *    it considers evicting anything, so no separate scan is needed here.
 *    The bank is keyed with host_key, marked not valid, given a fresh age,
 *    and ctx.state_ready is refreshed.
 */
static int reserve_em_resident_output_bank_avoiding(StepContext &ctx,
                                                    double **host_key,
                                                    size_t all,
                                                    int avoid_a,
                                                    int avoid_b,
                                                    int avoid_c)
{
    if (!resident_key_usable_count(host_key, BSSN_EM_STATE_COUNT)) {
        if (ctx.current_bank >= 0)
            return ctx.current_bank;
        return 0;
    }

    /* The key may already live in one of the banks we were told to avoid. */
    if (resident_key_matches_count(ctx, avoid_a, host_key, BSSN_EM_STATE_COUNT))
        return avoid_a;
    if (resident_key_matches_count(ctx, avoid_b, host_key, BSSN_EM_STATE_COUNT))
        return avoid_b;
    if (resident_key_matches_count(ctx, avoid_c, host_key, BSSN_EM_STATE_COUNT))
        return avoid_c;

    const int keyed_bank = find_resident_bank_count(ctx, host_key, BSSN_EM_STATE_COUNT);
    const int out_bank = (keyed_bank >= 0)
        ? keyed_bank
        : choose_em_resident_bank_for_reuse_avoiding(ctx, avoid_a, avoid_b, avoid_c, all);

    assign_resident_key_count(ctx, out_bank, host_key, BSSN_EM_STATE_COUNT);
    ctx.resident_valid[out_bank] = false;
    ctx.resident_age[out_bank] = ++ctx.resident_clock;
    update_state_ready(ctx);
    return out_bank;
}

/*
 * Return true when `bank` equals any of avoid_a, avoid_b or avoid_c.
 */
static bool bank_is_avoided(int bank, int avoid_a, int avoid_b, int avoid_c)
{
    const int avoided[3] = {avoid_a, avoid_b, avoid_c};
    for (int k = 0; k < 3; ++k) {
        if (avoided[k] == bank)
            return true;
    }
    return false;
}

/*
 * Pick a resident bank that may be overwritten (BSSN_STATE_COUNT layout)
 * while skipping up to three banks.
 *
 * 1. The lowest-index bank that is neither avoided nor valid is returned
 *    untouched.
 * 2. Otherwise the non-avoided bank with the smallest resident_age is
 *    evicted (lowest index on ties): it is written back, its valid flag,
 *    host key pointers, host-clean flags and age are cleared, the
 *    current-bank handles are reset if it was current, and ctx.state_ready
 *    is refreshed.
 * 3. If every bank is avoided, the choice is delegated to
 *    choose_resident_bank_for_reuse with only avoid_a excluded.
 *
 * Returns the chosen bank index.
 */
static int choose_resident_bank_for_reuse_avoiding(StepContext &ctx,
                                                   int avoid_a,
                                                   int avoid_b,
                                                   int avoid_c,
                                                   size_t all)
{
    for (int cand = 0; cand < BSSN_RESIDENT_BANK_COUNT; ++cand) {
        if (bank_is_avoided(cand, avoid_a, avoid_b, avoid_c)) continue;
        if (ctx.resident_valid[cand]) continue;
        return cand;
    }

    int victim = -1;
    unsigned long long victim_age = 0;
    for (int cand = 0; cand < BSSN_RESIDENT_BANK_COUNT; ++cand) {
        if (bank_is_avoided(cand, avoid_a, avoid_b, avoid_c)) continue;
        if (victim >= 0 && !(ctx.resident_age[cand] < victim_age)) continue;
        victim = cand;
        victim_age = ctx.resident_age[cand];
    }

    /* Nothing left after exclusions: relax to avoiding avoid_a only. */
    if (victim < 0)
        return choose_resident_bank_for_reuse(ctx, avoid_a, all);

    /* Write back while the bank is still flagged valid, then wipe its bookkeeping. */
    writeback_resident_bank(ctx, victim, all);
    ctx.resident_valid[victim] = false;
    ctx.resident_host[victim].fill(nullptr);
    ctx.resident_host_clean[victim].fill(0);
    ctx.resident_age[victim] = 0;

    if (victim == ctx.current_bank) {
        ctx.current_bank = -1;
        ctx.d_state_curr_mem = nullptr;
        ctx.d_state_curr.fill(nullptr);
    }
    update_state_ready(ctx);
    return victim;
}

/*
 * Reserve a resident bank to receive output keyed by host_key (BSSN state
 * layout) while steering clear of up to three banks.
 *
 *  - An unusable key yields the current bank (or 0 when there is none).
 *  - If one of avoid_a / avoid_b / avoid_c already carries host_key
 *    (checked in that order), that bank is returned unchanged.
 *  - Otherwise the bank found by find_resident_bank is used, or, failing
 *    that, one from choose_resident_bank_for_reuse_avoiding.  The bank is
 *    keyed with host_key, marked not valid, given a fresh age, and
 *    ctx.state_ready is refreshed.
 */
static int reserve_resident_output_bank_avoiding(StepContext &ctx,
                                                 double **host_key,
                                                 size_t all,
                                                 int avoid_a,
                                                 int avoid_b,
                                                 int avoid_c)
{
    if (!resident_key_usable(host_key)) {
        if (ctx.current_bank >= 0)
            return ctx.current_bank;
        return 0;
    }

    /* The key may already live in one of the banks we were told to avoid. */
    if (resident_key_matches(ctx, avoid_a, host_key)) return avoid_a;
    if (resident_key_matches(ctx, avoid_b, host_key)) return avoid_b;
    if (resident_key_matches(ctx, avoid_c, host_key)) return avoid_c;

    const int keyed_bank = find_resident_bank(ctx, host_key);
    const int out_bank = (keyed_bank >= 0)
        ? keyed_bank
        : choose_resident_bank_for_reuse_avoiding(ctx, avoid_a, avoid_b, avoid_c, all);

    assign_resident_key(ctx, out_bank, host_key);
    ctx.resident_valid[out_bank] = false;
    ctx.resident_age[out_bank] = ++ctx.resident_clock;
    update_state_ready(ctx);
    return out_bank;
}

/*
 * Pick a resident bank that may be overwritten (BSSN_ESCALAR_STATE_COUNT
 * layout) while skipping up to three banks.
 *
 * 1. The lowest-index bank that is neither avoided nor valid is returned
 *    untouched.
 * 2. Otherwise the non-avoided bank with the smallest resident_age is
 *    evicted (lowest index on ties): BSSN_ESCALAR_STATE_COUNT states are
 *    written back, its valid flag, host key pointers, host-clean flags and
 *    age are cleared, the current-bank handles are reset if it was current,
 *    and ctx.state_ready is refreshed.
 * 3. If every bank is avoided, the choice is delegated to
 *    choose_escalar_resident_bank_for_reuse with only avoid_a excluded.
 *
 * Returns the chosen bank index.
 */
static int choose_escalar_resident_bank_for_reuse_avoiding(StepContext &ctx,
                                                           int avoid_a,
                                                           int avoid_b,
                                                           int avoid_c,
                                                           size_t all)
{
    for (int cand = 0; cand < BSSN_RESIDENT_BANK_COUNT; ++cand) {
        if (bank_is_avoided(cand, avoid_a, avoid_b, avoid_c)) continue;
        if (ctx.resident_valid[cand]) continue;
        return cand;
    }

    int victim = -1;
    unsigned long long victim_age = 0;
    for (int cand = 0; cand < BSSN_RESIDENT_BANK_COUNT; ++cand) {
        if (bank_is_avoided(cand, avoid_a, avoid_b, avoid_c)) continue;
        if (victim >= 0 && !(ctx.resident_age[cand] < victim_age)) continue;
        victim = cand;
        victim_age = ctx.resident_age[cand];
    }

    /* Nothing left after exclusions: relax to avoiding avoid_a only. */
    if (victim < 0)
        return choose_escalar_resident_bank_for_reuse(ctx, avoid_a, all);

    /* Write back while the bank is still flagged valid, then wipe its bookkeeping. */
    writeback_resident_bank_count(ctx, victim, all, BSSN_ESCALAR_STATE_COUNT);
    ctx.resident_valid[victim] = false;
    ctx.resident_host[victim].fill(nullptr);
    ctx.resident_host_clean[victim].fill(0);
    ctx.resident_age[victim] = 0;

    if (victim == ctx.current_bank) {
        ctx.current_bank = -1;
        ctx.d_state_curr_mem = nullptr;
        ctx.d_state_curr.fill(nullptr);
    }
    update_state_ready(ctx);
    return victim;
}

/*
 * Pick a resident bank that may be overwritten (BSSN_EM_STATE_COUNT layout)
 * while skipping up to three banks.
 *
 * 1. The lowest-index bank that is neither avoided nor valid is returned
 *    untouched.
 * 2. Otherwise the non-avoided bank with the smallest resident_age is
 *    evicted (lowest index on ties): BSSN_EM_STATE_COUNT states are written
 *    back, its valid flag, host key pointers, host-clean flags and age are
 *    cleared, the current-bank handles are reset if it was current, and
 *    ctx.state_ready is refreshed.
 * 3. If every bank is avoided, the choice is delegated to
 *    choose_em_resident_bank_for_reuse with only avoid_a excluded.
 *
 * Returns the chosen bank index.
 */
static int choose_em_resident_bank_for_reuse_avoiding(StepContext &ctx,
                                                       int avoid_a,
                                                       int avoid_b,
                                                       int avoid_c,
                                                       size_t all)
{
    for (int cand = 0; cand < BSSN_RESIDENT_BANK_COUNT; ++cand) {
        if (bank_is_avoided(cand, avoid_a, avoid_b, avoid_c)) continue;
        if (ctx.resident_valid[cand]) continue;
        return cand;
    }

    int victim = -1;
    unsigned long long victim_age = 0;
    for (int cand = 0; cand < BSSN_RESIDENT_BANK_COUNT; ++cand) {
        if (bank_is_avoided(cand, avoid_a, avoid_b, avoid_c)) continue;
        if (victim >= 0 && !(ctx.resident_age[cand] < victim_age)) continue;
        victim = cand;
        victim_age = ctx.resident_age[cand];
    }

    /* Nothing left after exclusions: relax to avoiding avoid_a only. */
    if (victim < 0)
        return choose_em_resident_bank_for_reuse(ctx, avoid_a, all);

    /* Write back while the bank is still flagged valid, then wipe its bookkeeping. */
    writeback_resident_bank_count(ctx, victim, all, BSSN_EM_STATE_COUNT);
    ctx.resident_valid[victim] = false;
    ctx.resident_host[victim].fill(nullptr);
    ctx.resident_host_clean[victim].fill(0);
    ctx.resident_age[victim] = 0;

    if (victim == ctx.current_bank) {
        ctx.current_bank = -1;
        ctx.d_state_curr_mem = nullptr;
        ctx.d_state_curr.fill(nullptr);
    }
    update_state_ready(ctx);
    return victim;
}

/*
 * Return the resident bank associated with host_key (BSSN state layout),
 * claiming one if necessary while skipping up to three banks.
 *
 *  - An unusable key yields the current bank (or 0 when there is none).
 *  - A bank located by find_resident_bank has its age refreshed; when it is
 *    not valid and upload_if_missing is set, the state is uploaded into it
 *    and it is marked valid and host-clean.
 *  - Otherwise a bank from choose_resident_bank_for_reuse_avoiding is keyed
 *    with host_key and either uploaded and marked valid/clean
 *    (upload_if_missing) or left invalid/not clean; ctx.state_ready is then
 *    refreshed.
 *
 * NOTE(review): the "bank found" path does not call update_state_ready()
 * after changing resident_valid -- confirm this is intended.
 */
static int ensure_resident_bank_avoiding(StepContext &ctx,
                                         double **host_key,
                                         size_t all,
                                         bool upload_if_missing,
                                         int avoid_a,
                                         int avoid_b,
                                         int avoid_c)
{
    if (!resident_key_usable(host_key))
        return (ctx.current_bank >= 0) ? ctx.current_bank : 0;

    const int known_bank = find_resident_bank(ctx, host_key);
    if (known_bank >= 0) {
        ctx.resident_age[known_bank] = ++ctx.resident_clock;
        const bool must_upload = upload_if_missing && !ctx.resident_valid[known_bank];
        if (must_upload) {
            bind_state_input_slots(ctx.d_resident[known_bank]);
            upload_state_inputs(host_key, all);
            ctx.resident_valid[known_bank] = true;
            set_resident_host_clean(ctx, known_bank, true);
        }
        return known_bank;
    }

    const int fresh_bank =
        choose_resident_bank_for_reuse_avoiding(ctx, avoid_a, avoid_b, avoid_c, all);
    assign_resident_key(ctx, fresh_bank, host_key);
    if (upload_if_missing) {
        bind_state_input_slots(ctx.d_resident[fresh_bank]);
        upload_state_inputs(host_key, all);
    }
    /* Valid and host-clean exactly when the data was just uploaded. */
    ctx.resident_valid[fresh_bank] = upload_if_missing;
    set_resident_host_clean(ctx, fresh_bank, upload_if_missing);
    update_state_ready(ctx);
    return fresh_bank;
}

/*
 * Return the resident bank associated with host_key for the
 * BSSN_EM_STATE_COUNT layout, claiming one if necessary while skipping up
 * to three banks.
 *
 *  - An unusable key yields the current bank (or 0 when there is none).
 *  - A bank located by find_resident_bank_count has its age refreshed; when
 *    it is not valid and upload_if_missing is set, the state is uploaded,
 *    the device is synchronized, and the bank is marked valid and host-clean.
 *  - Otherwise a bank from choose_em_resident_bank_for_reuse_avoiding is
 *    keyed with host_key and either uploaded and marked valid/clean, or left
 *    invalid/not clean; ctx.state_ready is then refreshed.
 *
 * NOTE(review): the "bank found" path does not call update_state_ready()
 * after changing resident_valid -- confirm this is intended.
 */
static int ensure_em_resident_bank_avoiding(StepContext &ctx,
                                            double **host_key,
                                            size_t all,
                                            bool upload_if_missing,
                                            int avoid_a,
                                            int avoid_b,
                                            int avoid_c)
{
    if (!resident_key_usable_count(host_key, BSSN_EM_STATE_COUNT))
        return (ctx.current_bank >= 0) ? ctx.current_bank : 0;

    const int known_bank = find_resident_bank_count(ctx, host_key, BSSN_EM_STATE_COUNT);
    if (known_bank >= 0) {
        ctx.resident_age[known_bank] = ++ctx.resident_clock;
        const bool must_upload = upload_if_missing && !ctx.resident_valid[known_bank];
        if (must_upload) {
            bind_em_state_input_slots(ctx.d_resident[known_bank]);
            upload_em_state_inputs(host_key, all);
            CUDA_CHECK(cudaDeviceSynchronize());
            ctx.resident_valid[known_bank] = true;
            set_resident_host_clean(ctx, known_bank, true);
        }
        return known_bank;
    }

    const int fresh_bank =
        choose_em_resident_bank_for_reuse_avoiding(ctx, avoid_a, avoid_b, avoid_c, all);
    assign_resident_key_count(ctx, fresh_bank, host_key, BSSN_EM_STATE_COUNT);
    if (upload_if_missing) {
        bind_em_state_input_slots(ctx.d_resident[fresh_bank]);
        upload_em_state_inputs(host_key, all);
        CUDA_CHECK(cudaDeviceSynchronize());
    }
    /* Valid and host-clean exactly when the data was just uploaded. */
    ctx.resident_valid[fresh_bank] = upload_if_missing;
    set_resident_host_clean(ctx, fresh_bank, upload_if_missing);
    update_state_ready(ctx);
    return fresh_bank;
}

/*
 * Return the resident bank associated with host_key for the
 * BSSN_ESCALAR_STATE_COUNT layout, claiming one if necessary while skipping
 * up to three banks.
 *
 *  - An unusable key yields the current bank (or 0 when there is none).
 *  - A bank located by find_resident_bank_count has its age refreshed; when
 *    it is not valid and upload_if_missing is set, the state is uploaded,
 *    the device is synchronized, and the bank is marked valid and host-clean.
 *  - Otherwise a bank from choose_escalar_resident_bank_for_reuse_avoiding
 *    is keyed with host_key and either uploaded and marked valid/clean, or
 *    left invalid/not clean; ctx.state_ready is then refreshed.
 *
 * NOTE(review): the "bank found" path does not call update_state_ready()
 * after changing resident_valid -- confirm this is intended.
 */
static int ensure_escalar_resident_bank_avoiding(StepContext &ctx,
                                                 double **host_key,
                                                 size_t all,
                                                 bool upload_if_missing,
                                                 int avoid_a,
                                                 int avoid_b,
                                                 int avoid_c)
{
    if (!resident_key_usable_count(host_key, BSSN_ESCALAR_STATE_COUNT))
        return (ctx.current_bank >= 0) ? ctx.current_bank : 0;

    const int known_bank = find_resident_bank_count(ctx, host_key, BSSN_ESCALAR_STATE_COUNT);
    if (known_bank >= 0) {
        ctx.resident_age[known_bank] = ++ctx.resident_clock;
        const bool must_upload = upload_if_missing && !ctx.resident_valid[known_bank];
        if (must_upload) {
            bind_escalar_state_input_slots(ctx.d_resident[known_bank]);
            upload_escalar_state_inputs(host_key, all);
            CUDA_CHECK(cudaDeviceSynchronize());
            ctx.resident_valid[known_bank] = true;
            set_resident_host_clean(ctx, known_bank, true);
        }
        return known_bank;
    }

    const int fresh_bank =
        choose_escalar_resident_bank_for_reuse_avoiding(ctx, avoid_a, avoid_b, avoid_c, all);
    assign_resident_key_count(ctx, fresh_bank, host_key, BSSN_ESCALAR_STATE_COUNT);
    if (upload_if_missing) {
        bind_escalar_state_input_slots(ctx.d_resident[fresh_bank]);
        upload_escalar_state_inputs(host_key, all);
        CUDA_CHECK(cudaDeviceSynchronize());
    }
    /* Valid and host-clean exactly when the data was just uploaded. */
    ctx.resident_valid[fresh_bank] = upload_if_missing;
    set_resident_host_clean(ctx, fresh_bank, upload_if_missing);
    update_state_ready(ctx);
    return fresh_bank;
}

/*
 * Resolve the bank to operate on and make it the current bank.
 *
 * When state_count names one of the known layouts (escalar, EM, plain BSSN,
 * tested in that order) and host_key is usable for that layout, the
 * matching ensure_*_resident_bank function supplies the bank, which is then
 * installed with mark_resident_current_bank and returned.
 *
 * Otherwise the existing current bank is returned if there is one; failing
 * that, the first valid bank is made current and returned; failing that, 0
 * is returned without touching ctx.
 */
static int active_or_keyed_bank(StepContext &ctx,
                                double **host_key,
                                size_t all,
                                bool upload_if_missing,
                                int state_count = BSSN_STATE_COUNT)
{
    bool keyed = true;
    int bank = 0;
    if (state_count == BSSN_ESCALAR_STATE_COUNT &&
        resident_key_usable_count(host_key, BSSN_ESCALAR_STATE_COUNT)) {
        bank = ensure_escalar_resident_bank(ctx, host_key, all, upload_if_missing);
    } else if (state_count == BSSN_EM_STATE_COUNT &&
               resident_key_usable_count(host_key, BSSN_EM_STATE_COUNT)) {
        bank = ensure_em_resident_bank(ctx, host_key, all, upload_if_missing);
    } else if (state_count == BSSN_STATE_COUNT && resident_key_usable(host_key)) {
        bank = ensure_resident_bank(ctx, host_key, all, upload_if_missing);
    } else {
        keyed = false;
    }

    if (keyed) {
        mark_resident_current_bank(ctx, bank);
        return bank;
    }

    /* No usable key: fall back to whatever is already resident. */
    if (ctx.current_bank >= 0)
        return ctx.current_bank;

    int first_valid = 0;
    while (first_valid < BSSN_RESIDENT_BANK_COUNT && !ctx.resident_valid[first_valid])
        ++first_valid;
    if (first_valid < BSSN_RESIDENT_BANK_COUNT) {
        mark_resident_current_bank(ctx, first_valid);
        return first_valid;
    }
    return 0;
}

/*
 * Enqueue the full sequence of RHS kernels and batched derivative helpers on
 * the device buffers currently bound in g_buf.slot[].
 *
 * Parameters:
 *   all             - point count handed to grid(all) and to every helper.
 *   eps             - forwarded to gpu_lopsided_kodis_state_batch().
 *   co              - when 0, the extra constraint kernels
 *                     (kern_phase3_gamma_constraint, kern_phase18_constraints)
 *                     are launched as well.
 *   compute_escalar - when true, gpu_escalar_sources() runs and a separate
 *                     derivative of S_trK is written to S_trK_x/y/z; those
 *                     slots then replace S_Kx/S_Ky/S_Kz in later kernels.
 *   escalar_a2      - forwarded to gpu_escalar_sources().
 *
 * This function performs no kernel-launch error checking and no
 * synchronization of its own, except the cuda_profile_sync() issued by
 * MARK_RHS_STAGE when stage timing is enabled.
 *
 * NOTE(review): the physical meaning of each phase is inferred from kernel and
 * slot names only; the kernels themselves are defined elsewhere.
 */
static void launch_rhs_pipeline(int all, double eps, int co, bool compute_escalar = false,
                                double escalar_a2 = 3.0)
{
    /* Sign values, cast to int below to build the soa_signs tables. */
    const double SYM = 1.0;
    const double ANTI = -1.0;
    const bool stage_timing = rhs_stage_timing_enabled();
    double stage_ms[RHS_STAGE_COUNT] = {};
    double stage_t0 = stage_timing ? cuda_profile_now_ms() : 0.0;

    /* D(s): shorthand for the device buffer bound to slot s. */
    #define D(s) g_buf.slot[s]
    /* MARK_RHS_STAGE: when timing is on, sync and add the time elapsed since
     * the previous mark to stage_ms[stage_id]; otherwise a no-op. */
    #define MARK_RHS_STAGE(stage_id) do {                             \
        if (stage_timing) {                                            \
            cuda_profile_sync();                                       \
            const double stage_t1 = cuda_profile_now_ms();             \
            stage_ms[(stage_id)] += stage_t1 - stage_t0;               \
            stage_t0 = stage_t1;                                       \
        }                                                             \
    } while (0)

    /* Stage PREP: kern_phase1_prep over Lap, chi and the three diagonal d??
     * slots, together with the alpn1, chin1 and diagonal g?? slots. */
    kern_phase1_prep<<<grid(all),BLK>>>(
        D(S_Lap), D(S_chi), D(S_dxx), D(S_dyy), D(S_dzz),
        D(S_alpn1), D(S_chin1), D(S_gxx), D(S_gyy), D(S_gzz));
    MARK_RHS_STAGE(RHS_STAGE_PREP);

    /* Stage DERIV1: one batched first-derivative call for 12 source fields.
     * soa_signs holds three entries per source field, in src_fields order.
     * NOTE(review): presumably the per-axis reflection parity of each field;
     * confirm against gpu_fderivs_batch. */
    {
        double *src_fields[] = {
            D(S_betax), D(S_betay), D(S_betaz), D(S_chi),
            D(S_dxx), D(S_gxy), D(S_gxz), D(S_dyy),
            D(S_gyz), D(S_dzz), D(S_Lap), D(S_trK)
        };
        double *fx_fields[] = {
            D(S_betaxx), D(S_betayx), D(S_betazx), D(S_chix),
            D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx),
            D(S_gyzx), D(S_gzzx), D(S_Lapx), D(S_Kx)
        };
        double *fy_fields[] = {
            D(S_betaxy), D(S_betayy), D(S_betazy), D(S_chiy),
            D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy),
            D(S_gyzy), D(S_gzzy), D(S_Lapy), D(S_Ky)
        };
        double *fz_fields[] = {
            D(S_betaxz), D(S_betayz), D(S_betazz), D(S_chiz),
            D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz),
            D(S_gyzz), D(S_gzzz), D(S_Lapz), D(S_Kz)
        };
        const int soa_signs[] = {
            (int)ANTI, (int)SYM,  (int)SYM,
            (int)SYM,  (int)ANTI, (int)SYM,
            (int)SYM,  (int)SYM,  (int)ANTI,
            (int)SYM,  (int)SYM,  (int)SYM,
            (int)SYM,  (int)SYM,  (int)SYM,
            (int)ANTI, (int)ANTI, (int)SYM,
            (int)ANTI, (int)SYM,  (int)ANTI,
            (int)SYM,  (int)SYM,  (int)SYM,
            (int)SYM,  (int)ANTI, (int)ANTI,
            (int)SYM,  (int)SYM,  (int)SYM,
            (int)SYM,  (int)SYM,  (int)SYM,
            (int)SYM,  (int)SYM,  (int)SYM
        };
        gpu_fderivs_batch((int)(sizeof(src_fields) / sizeof(src_fields[0])),
                          src_fields, fx_fields, fy_fields, fz_fields,
                          soa_signs, all);
    }
    MARK_RHS_STAGE(RHS_STAGE_DERIV1);

    /* Stage METRIC: everything from here to the RHS_STAGE_METRIC mark is
     * accounted to a single timing bucket. */
    kern_phase2_metric_rhs<<<grid(all),BLK>>>(
        D(S_alpn1), D(S_chin1),
        D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
        D(S_trK),
        D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz),
        D(S_betaxx), D(S_betaxy), D(S_betaxz),
        D(S_betayx), D(S_betayy), D(S_betayz),
        D(S_betazx), D(S_betazy), D(S_betazz),
        D(S_chi_rhs), D(S_gxx_rhs), D(S_gyy_rhs), D(S_gzz_rhs),
        D(S_gxy_rhs), D(S_gyz_rhs), D(S_gxz_rhs));

    kern_phase2_inverse<<<grid(all),BLK>>>(
        D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
        D(S_gupxx), D(S_gupxy), D(S_gupxz),
        D(S_gupyy), D(S_gupyz), D(S_gupzz));

    /* Scalar-field path: build the sources and take a separate derivative of
     * trK into S_trK_x/y/z, used below in place of S_Kx/S_Ky/S_Kz. */
    if (compute_escalar) {
        gpu_escalar_sources(all, escalar_a2);
        gpu_fderivs(D(S_trK), D(S_trK_x), D(S_trK_y), D(S_trK_z), SYM, SYM, SYM, all);
    }

    /* Only when co == 0: write the S_Gm?_Res residual slots. */
    if (co == 0) {
        kern_phase3_gamma_constraint<<<grid(all),BLK>>>(
            D(S_Gamx), D(S_Gamy), D(S_Gamz),
            D(S_gupxx), D(S_gupxy), D(S_gupxz),
            D(S_gupyy), D(S_gupyz), D(S_gupzz),
            D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), D(S_gyzx), D(S_gzzx),
            D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), D(S_gyzy), D(S_gzzy),
            D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), D(S_gyzz), D(S_gzzz),
            D(S_Gmx_Res), D(S_Gmy_Res), D(S_Gmz_Res));
    }

    kern_phase4_christoffel<<<grid(all),BLK>>>(
        D(S_gupxx), D(S_gupxy), D(S_gupxz),
        D(S_gupyy), D(S_gupyz), D(S_gupzz),
        D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), D(S_gyzx), D(S_gzzx),
        D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), D(S_gyzy), D(S_gzzy),
        D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), D(S_gyzz), D(S_gzzz),
        D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
        D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
        D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
        D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
        D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
        D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz));

    kern_phase5_6_gamma_rhs_part1_fused<<<grid(all),BLK>>>(
        D(S_Lapx), D(S_Lapy), D(S_Lapz),
        D(S_alpn1), D(S_chin1),
        D(S_chix), D(S_chiy), D(S_chiz),
        D(S_gupxx), D(S_gupxy), D(S_gupxz),
        D(S_gupyy), D(S_gupyz), D(S_gupzz),
        D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz),
        compute_escalar ? D(S_trK_x) : D(S_Kx),
        compute_escalar ? D(S_trK_y) : D(S_Ky),
        compute_escalar ? D(S_trK_z) : D(S_Kz),
        D(S_Sx), D(S_Sy), D(S_Sz),
        D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
        D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
        D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
        D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
        D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
        D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
        D(S_Gamx_rhs), D(S_Gamy_rhs), D(S_Gamz_rhs));
    MARK_RHS_STAGE(RHS_STAGE_METRIC);

    /* Stage GAUGE_DERIV, part 1: batched second derivatives of betax/y/z.
     * The output arrays reuse the S_g??x/y/z slots that received the metric
     * first-derivative outputs in DERIV1, so those values are overwritten
     * from this point on. */
    {
        double *src_fields[] = {D(S_betax), D(S_betay), D(S_betaz)};
        double *fxx_fields[] = {D(S_gxxx), D(S_gxxy), D(S_gxxz)};
        double *fxy_fields[] = {D(S_gxyx), D(S_gxyy), D(S_gxyz)};
        double *fxz_fields[] = {D(S_gxzx), D(S_gxzy), D(S_gxzz)};
        double *fyy_fields[] = {D(S_gyyx), D(S_gyyy), D(S_gyyz)};
        double *fyz_fields[] = {D(S_gyzx), D(S_gyzy), D(S_gyzz)};
        double *fzz_fields[] = {D(S_gzzx), D(S_gzzy), D(S_gzzz)};
        const int soa_signs[] = {
            (int)ANTI, (int)SYM,  (int)SYM,
            (int)SYM,  (int)ANTI, (int)SYM,
            (int)SYM,  (int)SYM,  (int)ANTI
        };
        gpu_fdderivs_batch((int)(sizeof(src_fields) / sizeof(src_fields[0])),
                           src_fields, fxx_fields, fxy_fields, fxz_fields,
                           fyy_fields, fyz_fields, fzz_fields,
                           soa_signs, all);
    }
    /* Stage GAUGE_DERIV, part 2: first derivatives of Gamx/y/z into the
     * S_Gam?? / S_Gam??_t slots. */
    {
        double *src_fields[] = {D(S_Gamx), D(S_Gamy), D(S_Gamz)};
        double *fx_fields[] = {D(S_Gamxx), D(S_Gamyx), D(S_Gamzx)};
        double *fy_fields[] = {D(S_Gamxy), D(S_Gamyy_t), D(S_Gamzy)};
        double *fz_fields[] = {D(S_Gamxz), D(S_Gamyz_t), D(S_Gamzz_t)};
        const int soa_signs[] = {
            (int)ANTI, (int)SYM,  (int)SYM,
            (int)SYM,  (int)ANTI, (int)SYM,
            (int)SYM,  (int)SYM,  (int)ANTI
        };
        gpu_fderivs_batch((int)(sizeof(src_fields) / sizeof(src_fields[0])),
                          src_fields, fx_fields, fy_fields, fz_fields,
                          soa_signs, all);
    }
    MARK_RHS_STAGE(RHS_STAGE_GAUGE_DERIV);

    /* Stage GAMMA_CONTRACT: note the 18 S_g??x/y/z slots appear twice in the
     * argument list (once after the gup slots and again at the end).
     * NOTE(review): presumably one group is read and the other written in
     * place; confirm against the kernel signature. */
    kern_phase8_9_gamma_rhs_contract_fused<<<grid(all),BLK>>>(
        D(S_gupxx), D(S_gupxy), D(S_gupxz),
        D(S_gupyy), D(S_gupyz), D(S_gupzz),
        D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx),
        D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy),
        D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz),
        D(S_Gamxxx),D(S_Gamxxy),D(S_Gamxxz),
        D(S_Gamxyy),D(S_Gamxyz),D(S_Gamxzz),
        D(S_Gamyxx),D(S_Gamyxy),D(S_Gamyxz),
        D(S_Gamyyy),D(S_Gamyyz),D(S_Gamyzz),
        D(S_Gamzxx),D(S_Gamzxy),D(S_Gamzxz),
        D(S_Gamzyy),D(S_Gamzyz),D(S_Gamzzz),
        D(S_betaxx),D(S_betaxy),D(S_betaxz),
        D(S_betayx),D(S_betayy),D(S_betayz),
        D(S_betazx),D(S_betazy),D(S_betazz),
        D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz),
        D(S_Gamx_rhs),D(S_Gamy_rhs),D(S_Gamz_rhs),
        D(S_Gamxa),D(S_Gamya),D(S_Gamza),
        D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx),
        D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy),
        D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz));
    MARK_RHS_STAGE(RHS_STAGE_GAMMA_CONTRACT);

    /* Stage RICCI_DIFF: batched helper taking the six metric source slots
     * (ordered xx, yy, zz, xy, xz, yz) and writing the matching S_R?? slots. */
    {
        double *src_fields[] = {D(S_dxx), D(S_dyy), D(S_dzz), D(S_gxy), D(S_gxz), D(S_gyz)};
        double *dst_fields[] = {D(S_Rxx), D(S_Ryy), D(S_Rzz), D(S_Rxy), D(S_Rxz), D(S_Ryz)};
        const int soa_signs[] = {
            (int)SYM,  (int)SYM,  (int)SYM,
            (int)SYM,  (int)SYM,  (int)SYM,
            (int)SYM,  (int)SYM,  (int)SYM,
            (int)ANTI, (int)ANTI, (int)SYM,
            (int)ANTI, (int)SYM,  (int)ANTI,
            (int)SYM,  (int)ANTI, (int)ANTI
        };
        gpu_phase10_ricci_batch(D(S_gupxx), D(S_gupxy), D(S_gupxz),
                                D(S_gupyy), D(S_gupyz), D(S_gupzz),
                                src_fields, dst_fields, soa_signs, all);
    }
    MARK_RHS_STAGE(RHS_STAGE_RICCI_DIFF);

    /* Stage RICCI_FUSED: fused kernel whose argument list ends with the
     * S_R?? slots. */
    kern_phase11_ricci_fused<<<grid(all),BLK>>>(
        D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz),
        D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
        D(S_Gamxa),D(S_Gamya),D(S_Gamza),
        D(S_Gamxx),D(S_Gamxy),D(S_Gamxz),
        D(S_Gamyx),D(S_Gamyy_t),D(S_Gamyz_t),
        D(S_Gamzx),D(S_Gamzy),D(S_Gamzz_t),
        D(S_Gamxxx),D(S_Gamxxy),D(S_Gamxxz),
        D(S_Gamxyy),D(S_Gamxyz),D(S_Gamxzz),
        D(S_Gamyxx),D(S_Gamyxy),D(S_Gamyxz),
        D(S_Gamyyy),D(S_Gamyyz),D(S_Gamyzz),
        D(S_Gamzxx),D(S_Gamzxy),D(S_Gamzxz),
        D(S_Gamzyy),D(S_Gamzyz),D(S_Gamzzz),
        D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx),
        D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy),
        D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz),
        D(S_Rxx),D(S_Rxy),D(S_Rxz),
        D(S_Ryy),D(S_Ryz),D(S_Rzz));
    MARK_RHS_STAGE(RHS_STAGE_RICCI_FUSED);

    /* Stage CHI: fused chi-correction kernel, again ending on the S_R??
     * slots. */
    kern_phase12_13_chi_correction_fused<<<grid((size_t)all),BLK>>>(
        D(S_chi), D(S_chin1),
        D(S_chix), D(S_chiy), D(S_chiz),
        D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
        D(S_gupxx), D(S_gupxy), D(S_gupxz),
        D(S_gupyy), D(S_gupyz), D(S_gupzz),
        D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
        D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
        D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
        D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
        D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
        D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
        D(S_Rxx), D(S_Rxy), D(S_Rxz),
        D(S_Ryy), D(S_Ryz), D(S_Rzz));
    MARK_RHS_STAGE(RHS_STAGE_CHI);

    /* Stage GAUGE_RHS: single kernel receiving the matter slots (rho, S?,
     * S??) and the *_rhs slots for dtSf, trK, A??, Lap, beta and Gam. */
    kern_phase15_trK_Aij_gauge<<<grid(all),BLK>>>(
        D(S_alpn1), D(S_chin1),
        D(S_chix), D(S_chiy), D(S_chiz),
        D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
        D(S_gupxx), D(S_gupxy), D(S_gupxz),
        D(S_gupyy), D(S_gupyz), D(S_gupzz),
        D(S_trK),
        D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz),
        D(S_Lapx), D(S_Lapy), D(S_Lapz),
        D(S_betaxx), D(S_betaxy), D(S_betaxz),
        D(S_betayx), D(S_betayy), D(S_betayz),
        D(S_betazx), D(S_betazy), D(S_betazz),
        D(S_rho),
        D(S_Sx), D(S_Sy), D(S_Sz),
        D(S_Sxx), D(S_Sxy), D(S_Sxz), D(S_Syy), D(S_Syz), D(S_Szz),
        D(S_dtSfx), D(S_dtSfy), D(S_dtSfz),
        D(S_Rxx), D(S_Rxy), D(S_Rxz), D(S_Ryy), D(S_Ryz), D(S_Rzz),
        D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
        D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
        D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
        D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
        D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
        D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
        D(S_dtSfx_rhs), D(S_dtSfy_rhs), D(S_dtSfz_rhs),
        D(S_trK_rhs),
        D(S_Axx_rhs), D(S_Axy_rhs), D(S_Axz_rhs),
        D(S_Ayy_rhs), D(S_Ayz_rhs), D(S_Azz_rhs),
        D(S_Lap_rhs),
        D(S_betax_rhs), D(S_betay_rhs), D(S_betaz_rhs),
        D(S_Gamx_rhs), D(S_Gamy_rhs), D(S_Gamz_rhs),
        D(S_f_arr), D(S_S_arr));
    MARK_RHS_STAGE(RHS_STAGE_GAUGE_RHS);

    /* Stage KODIS: batched helper driven by eps; compute_escalar is passed
     * through unchanged. */
    gpu_lopsided_kodis_state_batch(eps, all, compute_escalar);
    MARK_RHS_STAGE(RHS_STAGE_KODIS);

    /* Stage CONSTRAINTS (work only when co == 0; the mark always runs). */
    if (co == 0) {
        /* First derivatives of the six A?? fields, written once more into
         * the S_g??x/y/z scratch slots. */
        {
            double *src_fields[] = {D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz)};
            double *fx_fields[] = {D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), D(S_gyzx), D(S_gzzx)};
            double *fy_fields[] = {D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), D(S_gyzy), D(S_gzzy)};
            double *fz_fields[] = {D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), D(S_gyzz), D(S_gzzz)};
            const int soa_signs[] = {
                (int)SYM,  (int)SYM,  (int)SYM,
                (int)ANTI, (int)ANTI, (int)SYM,
                (int)ANTI, (int)SYM,  (int)ANTI,
                (int)SYM,  (int)SYM,  (int)SYM,
                (int)SYM,  (int)ANTI, (int)ANTI,
                (int)SYM,  (int)SYM,  (int)SYM
            };
            gpu_fderivs_batch((int)(sizeof(src_fields) / sizeof(src_fields[0])),
                              src_fields, fx_fields, fy_fields, fz_fields,
                              soa_signs, all);
        }

        /* Last four arguments are the S_ham_Res and S_mov?_Res slots. */
        kern_phase18_constraints<<<grid(all),BLK>>>(
            D(S_chin1),
            D(S_chix), D(S_chiy), D(S_chiz),
            D(S_gupxx), D(S_gupxy), D(S_gupxz),
            D(S_gupyy), D(S_gupyz), D(S_gupzz),
            D(S_trK),
            D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz),
            D(S_Rxx), D(S_Rxy), D(S_Rxz), D(S_Ryy), D(S_Ryz), D(S_Rzz),
            D(S_rho), D(S_Sx), D(S_Sy), D(S_Sz),
            compute_escalar ? D(S_trK_x) : D(S_Kx),
            compute_escalar ? D(S_trK_y) : D(S_Ky),
            compute_escalar ? D(S_trK_z) : D(S_Kz),
            D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
            D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
            D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
            D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
            D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
            D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
            D(S_gxxx), D(S_gxxy), D(S_gxxz),
            D(S_gxyx), D(S_gxyy), D(S_gxyz),
            D(S_gxzx), D(S_gxzy), D(S_gxzz),
            D(S_gyyx), D(S_gyyy), D(S_gyyz),
            D(S_gyzx), D(S_gyzy), D(S_gyzz),
            D(S_gzzx), D(S_gzzy), D(S_gzzz),
            D(S_ham_Res), D(S_movx_Res), D(S_movy_Res), D(S_movz_Res));
    }
    MARK_RHS_STAGE(RHS_STAGE_CONSTRAINTS);

    /* Hand the per-stage timings to the profiler (all zeros when timing is
     * disabled, since no mark added anything). */
    rhs_stage_profile_accumulate(stage_ms);
    #undef MARK_RHS_STAGE
    #undef D
}

/*
 * Copy BSSN_STATE_COUNT fields of `all` doubles each from the device,
 * starting at the S_chi_rhs slot, into the host staging buffer with one
 * blocking transfer, then scatter them field by field into
 * state_host_out[0 .. BSSN_STATE_COUNT-1].
 *
 * NOTE(review): the single bulk copy assumes the state RHS slots are laid out
 * back to back in device memory starting at S_chi_rhs; confirm against the
 * buffer allocation.
 */
static void download_state_outputs(double **state_host_out, size_t all)
{
    const size_t field_bytes = all * sizeof(double);

    CUDA_CHECK(cudaMemcpy(g_buf.h_stage, g_buf.slot[S_chi_rhs],
                          (size_t)BSSN_STATE_COUNT * field_bytes,
                          cudaMemcpyDeviceToHost));

    size_t staged_offset = 0;
    for (int field = 0; field < BSSN_STATE_COUNT; ++field) {
        std::memcpy(state_host_out[field], g_buf.h_stage + staged_offset, field_bytes);
        staged_offset += all;
    }
}

/*
 * Queue one device-to-host copy per escalar state field (source slot taken
 * from k_escalar_state_rhs_slots) straight into state_host_out[i], then
 * block with cudaDeviceSynchronize() until the device is idle.
 */
static void download_escalar_state_outputs(double **state_host_out, size_t all)
{
    const size_t field_bytes = all * sizeof(double);
    int field = 0;
    while (field < BSSN_ESCALAR_STATE_COUNT) {
        CUDA_CHECK(cudaMemcpyAsync(state_host_out[field],
                                   g_buf.slot[k_escalar_state_rhs_slots[field]],
                                   field_bytes, cudaMemcpyDeviceToHost));
        ++field;
    }
    CUDA_CHECK(cudaDeviceSynchronize());
}

/*
 * Queue one device-to-host copy per EM state field (source slot taken from
 * k_em_state_rhs_slots) straight into state_host_out[i], then block with
 * cudaDeviceSynchronize() until the device is idle.
 */
static void download_em_state_outputs(double **state_host_out, size_t all)
{
    const size_t field_bytes = all * sizeof(double);
    int field = 0;
    while (field < BSSN_EM_STATE_COUNT) {
        CUDA_CHECK(cudaMemcpyAsync(state_host_out[field],
                                   g_buf.slot[k_em_state_rhs_slots[field]],
                                   field_bytes, cudaMemcpyDeviceToHost));
        ++field;
    }
    CUDA_CHECK(cudaDeviceSynchronize());
}

/*
 * Queue one host-to-device copy per escalar state field from state_host[i]
 * into the slot named by k_escalar_state_input_slots[i].
 * No synchronization is performed here.
 */
static void upload_escalar_state_inputs(double **state_host, size_t all)
{
    const size_t field_bytes = all * sizeof(double);
    int field = 0;
    while (field < BSSN_ESCALAR_STATE_COUNT) {
        CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[k_escalar_state_input_slots[field]],
                                   state_host[field], field_bytes, cudaMemcpyHostToDevice));
        ++field;
    }
}

/*
 * Queue one host-to-device copy per EM state field from state_host[i] into
 * the slot named by k_em_state_input_slots[i].
 * No synchronization is performed here.
 */
static void upload_em_state_inputs(double **state_host, size_t all)
{
    const size_t field_bytes = all * sizeof(double);
    int field = 0;
    while (field < BSSN_EM_STATE_COUNT) {
        CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[k_em_state_input_slots[field]],
                                   state_host[field], field_bytes, cudaMemcpyHostToDevice));
        ++field;
    }
}

/*
 * Report whether EM source caching is switched on.
 *
 * The environment variable AMSS_CUDA_EM_CACHE_SOURCES is read on the first
 * call only: unset means enabled, otherwise the value is parsed with atoi()
 * and any non-zero result means enabled. The answer is remembered for all
 * later calls.
 */
static bool em_source_cache_enabled()
{
    static int cached_choice = -1;   /* -1: not yet read, 0: off, 1: on */
    if (cached_choice >= 0)
        return cached_choice != 0;

    const char *text = getenv("AMSS_CUDA_EM_CACHE_SOURCES");
    if (text == nullptr)
        cached_choice = 1;
    else
        cached_choice = (atoi(text) == 0) ? 0 : 1;
    return cached_choice != 0;
}

/*
 * Point the global slots S_EM_Jx, S_EM_Jy, S_EM_Jz and S_EM_qchar at the
 * per-context device buffers ctx.d_em_source[0..], in that order.
 */
static void bind_em_fixed_source_slots(StepContext &ctx)
{
    const int target_slots[BSSN_EM_SOURCE_COUNT] = {
        S_EM_Jx, S_EM_Jy, S_EM_Jz, S_EM_qchar
    };
    int k = 0;
    while (k < BSSN_EM_SOURCE_COUNT) {
        g_buf.slot[target_slots[k]] = ctx.d_em_source[k];
        ++k;
    }
}

/*
 * Return true only when the context already holds uploaded EM sources
 * (ctx.em_source_ready) and every entry of `source_host` is non-null and
 * identical, as a pointer, to the one recorded in ctx.em_source_host.
 * Only addresses are compared; the pointed-to data is never inspected.
 */
static bool em_fixed_source_cache_matches(const StepContext &ctx,
                                          double **source_host)
{
    if (source_host == nullptr || !ctx.em_source_ready)
        return false;

    int k = 0;
    while (k < BSSN_EM_SOURCE_COUNT) {
        const double *candidate = source_host[k];
        if (candidate == nullptr)
            return false;
        if (ctx.em_source_host[k] != candidate)
            return false;
        ++k;
    }
    return true;
}

/*
 * Bind the EM source slots to this context's device buffers and make sure the
 * buffers hold the data behind `source_host`.
 *
 * With caching enabled and the same host pointers as the previous upload,
 * nothing is transferred. Otherwise one asynchronous host-to-device copy is
 * queued per source, and the cache bookkeeping is either refreshed (caching
 * on) or cleared (caching off).
 */
static void upload_em_fixed_sources(StepContext &ctx,
                                    double **source_host,
                                    size_t all)
{
    bind_em_fixed_source_slots(ctx);

    const bool caching = em_source_cache_enabled();
    if (caching && em_fixed_source_cache_matches(ctx, source_host))
        return;

    const size_t field_bytes = all * sizeof(double);
    for (int k = 0; k < BSSN_EM_SOURCE_COUNT; ++k) {
        CUDA_CHECK(cudaMemcpyAsync(ctx.d_em_source[k],
                                   source_host[k], field_bytes, cudaMemcpyHostToDevice));
    }

    if (!caching) {
        /* Forget any earlier record so a later enable cannot match it. */
        ctx.em_source_host.fill(nullptr);
        ctx.em_source_ready = false;
        return;
    }

    /* Remember which host arrays now live on the device. */
    for (int k = 0; k < BSSN_EM_SOURCE_COUNT; ++k)
        ctx.em_source_host[k] = source_host[k];
    ctx.em_source_ready = true;
}

/*
 * Clear *flag (set it to 0) if any of the twelve input arrays holds a value
 * that compares unequal to 0.0 at any index below `all`.
 *
 * The kernel never sets the flag; the caller is expected to initialise it to
 * a non-zero value before launch. Each thread starts at its global 1-D index
 * and advances by blockDim.x * gridDim.x, so any 1-D launch shape covers the
 * whole range.
 */
__global__ void kern_check_em_zero_fields(int *flag,
                                          const double * __restrict__ Kpsi,
                                          const double * __restrict__ Kphi,
                                          const double * __restrict__ Ex,
                                          const double * __restrict__ Ey,
                                          const double * __restrict__ Ez,
                                          const double * __restrict__ Bx,
                                          const double * __restrict__ By,
                                          const double * __restrict__ Bz,
                                          const double * __restrict__ Jx,
                                          const double * __restrict__ Jy,
                                          const double * __restrict__ Jz,
                                          const double * __restrict__ qchar,
                                          int all)
{
    const unsigned int step = blockDim.x * gridDim.x;
    int p = blockIdx.x * blockDim.x + threadIdx.x;
    while (p < all) {
        const bool every_field_zero =
            Kpsi[p] == 0.0 && Kphi[p] == 0.0 &&
            Ex[p] == 0.0 && Ey[p] == 0.0 && Ez[p] == 0.0 &&
            Bx[p] == 0.0 && By[p] == 0.0 && Bz[p] == 0.0 &&
            Jx[p] == 0.0 && Jy[p] == 0.0 && Jz[p] == 0.0 &&
            qchar[p] == 0.0;
        if (!every_field_zero)
            atomicExch(flag, 0);
        p += step;
    }
}

/*
 * Report whether the EM all-zero fast path may be used.
 *
 * The environment variable AMSS_CUDA_EM_ZERO_FASTPATH is read on the first
 * call only: unset means enabled, otherwise the value is parsed with atoi()
 * and any non-zero result means enabled. The answer is remembered for all
 * later calls.
 */
static bool em_zero_fast_path_enabled()
{
    static int cached_choice = -1;   /* -1: not yet read, 0: off, 1: on */
    if (cached_choice >= 0)
        return cached_choice != 0;

    const char *text = getenv("AMSS_CUDA_EM_ZERO_FASTPATH");
    if (text == nullptr)
        cached_choice = 1;
    else
        cached_choice = (atoi(text) == 0) ? 0 : 1;
    return cached_choice != 0;
}

/*
 * Decide whether the EM fields and sources of this context are identically
 * zero, so that callers may take the zero fast path.
 *
 * Returns false immediately when the fast path is disabled through the
 * environment or when `input_bank` is out of range. Once a verdict has been
 * computed it is stored in ctx.em_zero_fast / ctx.em_zero_fast_known and
 * returned from then on without touching the device again.
 *
 * The check itself seeds a device-side int flag with 1, runs
 * kern_check_em_zero_fields (which clears the flag on any non-zero entry) and
 * reads the flag back with a blocking copy.
 *
 * NOTE(review): resident entries 24..31 are passed as the Kpsi, Kphi, E and B
 * arguments; this assumes the EM state layout places those fields there —
 * confirm against the EM state slot tables.
 */
static bool em_detect_zero_fast_path(StepContext &ctx,
                                     int input_bank,
                                     size_t all)
{
    if (!em_zero_fast_path_enabled())
        return false;
    if (ctx.em_zero_fast_known)
        return ctx.em_zero_fast;
    if (input_bank < 0 || input_bank >= BSSN_RESIDENT_BANK_COUNT)
        return false;

    /* The flag buffer is allocated lazily, on first use. */
    if (!g_em_zero_flag)
        CUDA_CHECK(cudaMalloc(&g_em_zero_flag, sizeof(int)));

    int h_flag = 1;
    CUDA_CHECK(cudaMemcpy(g_em_zero_flag, &h_flag, sizeof(int), cudaMemcpyHostToDevice));
    kern_check_em_zero_fields<<<grid(all), BLK>>>(
        g_em_zero_flag,
        ctx.d_resident[input_bank][24], ctx.d_resident[input_bank][25],
        ctx.d_resident[input_bank][26], ctx.d_resident[input_bank][27],
        ctx.d_resident[input_bank][28], ctx.d_resident[input_bank][29],
        ctx.d_resident[input_bank][30], ctx.d_resident[input_bank][31],
        ctx.d_em_source[0], ctx.d_em_source[1], ctx.d_em_source[2], ctx.d_em_source[3],
        (int)all);
    /* A launch that never started would leave the flag at its seed value 1,
     * which reads back as "everything is zero" and would be cached in ctx.
     * Surface launch failures before trusting the flag. */
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(&h_flag, g_em_zero_flag, sizeof(int), cudaMemcpyDeviceToHost));
    ctx.em_zero_fast = (h_flag != 0);
    ctx.em_zero_fast_known = true;
    return ctx.em_zero_fast;
}

/*
 * Queue an asynchronous zero-fill of `all` doubles for every RHS slot listed
 * in k_em_state_rhs_slots at indices BSSN_STATE_COUNT up to (but excluding)
 * BSSN_EM_STATE_COUNT. No synchronization is performed here.
 */
static void zero_em_output_slots_async(size_t all)
{
    int field = BSSN_STATE_COUNT;
    while (field < BSSN_EM_STATE_COUNT) {
        CUDA_CHECK(cudaMemsetAsync(g_buf.slot[k_em_state_rhs_slots[field]], 0, all * sizeof(double)));
        ++field;
    }
}

/*
 * Copy D2H_CONSTRAINT_SLOT_COUNT fields of `all` doubles each from the
 * device, starting at the S_ham_Res slot, into the host staging buffer with
 * one blocking transfer, then scatter them field by field into
 * constraint_host_out[0 .. D2H_CONSTRAINT_SLOT_COUNT-1].
 *
 * NOTE(review): the single bulk copy assumes the constraint slots are laid
 * out back to back in device memory starting at S_ham_Res; confirm against
 * the buffer allocation.
 */
static void download_constraint_outputs(double **constraint_host_out, size_t all)
{
    const size_t field_bytes = all * sizeof(double);

    CUDA_CHECK(cudaMemcpy(g_buf.h_stage, g_buf.slot[S_ham_Res],
                          (size_t)D2H_CONSTRAINT_SLOT_COUNT * field_bytes,
                          cudaMemcpyDeviceToHost));

    size_t staged_offset = 0;
    for (int field = 0; field < D2H_CONSTRAINT_SLOT_COUNT; ++field) {
        std::memcpy(constraint_host_out[field], g_buf.h_stage + staged_offset, field_bytes);
        staged_offset += all;
    }
}

/*
 * Run one RK4 substep of the BSSN + scalar-field ("escalar") system on the
 * GPU for a single block.
 *
 * Sequence: select the device, fetch the scalar parameters from Fortran,
 * set up grid parameters, bind or upload the input state, optionally run
 * kern_enforce_ga_cuda, snapshot the state into ctx.d_state0 on substep 0,
 * launch the RHS pipeline, optionally apply the Sommerfeld boundary helper
 * to every state field, finalize the substep, restore patch boundaries on
 * refined levels, and finally either keep the result resident on the device
 * or download it to state_host_out.
 *
 * Parameters (as used here):
 *   block_tag           - key handed to ensure_step_ctx() to find the context.
 *   ex                  - three extents; their product is the point count.
 *   X, Y, Z, bbox       - forwarded to the grid/boundary helpers.
 *   state_host_in/out   - host arrays; in resident mode they are passed to the
 *                         resident-bank helpers instead of being copied.
 *   propspeed, soa_flat - per-field boundary parameters; soa_flat holds three
 *                         values per field. Read only when apply_bam_bc != 0.
 *   dT, chitiny         - forwarded to gpu_escalar_rk4_finalize_batch().
 *   T                   - unused.
 *   RK4                 - substep index; must be 0..3.
 *   Lev                 - patch-boundary handling runs only when Lev > 0.
 *   eps, co             - forwarded to setup_grid_params()/launch_rhs_pipeline().
 *   keep_resident_state - non-zero keeps state on the device between calls.
 *   apply_enforce_ga    - non-zero runs kern_enforce_ga_cuda on the inputs.
 *
 * Returns 0 on success, 1 when RK4 is out of range or the scalar parameter
 * a2 is (numerically) zero.
 */
extern "C"
int bssn_escalar_cuda_rk4_substep(void *block_tag,
                                  int *ex, double *X, double *Y, double *Z,
                                  double **state_host_in,
                                  double **state_host_out,
                                  const double *propspeed,
                                  const double *soa_flat,
                                  const double *bbox,
                                  double &dT,
                                  double &T,
                                  int &RK4,
                                  int &apply_bam_bc,
                                  int &Symmetry,
                                  int &Lev,
                                  double &eps,
                                  int &co,
                                  int &keep_resident_state,
                                  int &apply_enforce_ga,
                                  double &chitiny)
{
    (void)T;
    if (RK4 < 0 || RK4 > 3) return 1;

    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    double escalar_a2 = 3.0, escalar_phi0 = 0.0, escalar_r0 = 0.0, escalar_sigma0 = 0.0, escalar_l2 = 0.0;
#ifdef fortran1
    set_escalar_parameter(escalar_a2, escalar_phi0, escalar_r0, escalar_sigma0, escalar_l2);
#endif
#ifdef fortran2
    SET_ESCALAR_PARAMETER(escalar_a2, escalar_phi0, escalar_r0, escalar_sigma0, escalar_l2);
#endif
#ifdef fortran3
    set_escalar_parameter_(escalar_a2, escalar_phi0, escalar_r0, escalar_sigma0, escalar_l2);
#endif
    /* A zero a2 is rejected on every rank; only rank 0 prints, so the
     * message appears once rather than once per process. */
    if (fabs(escalar_a2) <= 1.0e-300) {
        if (g_dispatch.my_rank == 0)
            fprintf(stderr, "CUDA BSSN-EScalar requires nonzero FR a2; got %.17g\n", escalar_a2);
        return 1;
    }

    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
    const size_t bytes = all * sizeof(double);
    int touch_xmin = 0, touch_xmax = 0;
    int touch_ymin = 0, touch_ymax = 0;
    int touch_zmin = 0, touch_zmax = 0;

    setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);
    /* Boundary-touch flags are only needed (and only computed) on refined
     * levels; on level 0 they stay zero and are never read. */
    if (Lev > 0) {
        compute_patch_boundary_flags(ex, X, Y, Z, bbox, Symmetry,
                                     touch_xmin, touch_xmax,
                                     touch_ymin, touch_ymax,
                                     touch_zmin, touch_zmax);
    }

    StepContext &ctx = ensure_step_ctx(block_tag, all);
    const bool use_resident_state = (keep_resident_state != 0);
    int input_bank = -1;
    int output_bank = -1;
    if (use_resident_state) {
        /* Resident mode: inputs and outputs live in per-context banks and the
         * global slots are re-pointed at them. */
        input_bank = ensure_escalar_resident_bank(ctx, state_host_in, all, true);
        output_bank = reserve_escalar_resident_output_bank(ctx, state_host_out, all, input_bank);
        mark_resident_current_bank(ctx, input_bank);
        mark_resident_next_bank(ctx, output_bank);
        bind_escalar_state_input_slots(ctx.d_resident[input_bank]);
        bind_escalar_state_output_slots(ctx.d_resident[output_bank]);
    } else {
        upload_escalar_state_inputs(state_host_in, all);
    }

    if (apply_enforce_ga) {
        kern_enforce_ga_cuda<<<grid(all), BLK>>>(g_buf.slot[S_dxx], g_buf.slot[S_gxy], g_buf.slot[S_gxz],
                                                 g_buf.slot[S_dyy], g_buf.slot[S_gyz], g_buf.slot[S_dzz],
                                                 g_buf.slot[S_Axx], g_buf.slot[S_Axy], g_buf.slot[S_Axz],
                                                 g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]);
        /* The kernel was handed the input slots, so the resident input bank
         * is no longer recorded as matching the host copy. */
        if (use_resident_state && input_bank >= 0)
            set_resident_host_clean(ctx, input_bank, false);
    }

    /* First substep: snapshot the input state into ctx.d_state0. */
    if (RK4 == 0) {
        if (use_resident_state) {
            CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, ctx.d_resident_mem[input_bank],
                                  (size_t)BSSN_ESCALAR_STATE_COUNT * bytes,
                                  cudaMemcpyDeviceToDevice));
        } else {
            /* Non-resident layout: BSSN_STATE_COUNT fields are copied in one
             * transfer starting at S_chi; the two scalar fields are copied
             * separately into entries 24 and 25. */
            CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi],
                                  (size_t)BSSN_STATE_COUNT * bytes,
                                  cudaMemcpyDeviceToDevice));
            CUDA_CHECK(cudaMemcpy(ctx.d_state0[24], g_buf.slot[S_Sphi],
                                  bytes, cudaMemcpyDeviceToDevice));
            CUDA_CHECK(cudaMemcpy(ctx.d_state0[25], g_buf.slot[S_Spi],
                                  bytes, cudaMemcpyDeviceToDevice));
        }
    }

    launch_rhs_pipeline((int)all, eps, co, true, escalar_a2);

    if (apply_bam_bc) {
        for (int i = 0; i < BSSN_ESCALAR_STATE_COUNT; ++i) {
            gpu_sommerfeld_routbam(g_buf.slot[k_escalar_state_input_slots[i]],
                                   g_buf.slot[k_escalar_state_rhs_slots[i]],
                                   propspeed[i],
                                   soa_flat[3 * i + 0],
                                   soa_flat[3 * i + 1],
                                   soa_flat[3 * i + 2],
                                   X, Y, Z, bbox, Symmetry);
        }
    }

    gpu_escalar_rk4_finalize_batch(ctx, all, dT, RK4, chitiny);
    if (Lev > 0) {
        gpu_escalar_restore_patch_boundary_batch(ctx, (int)all,
                                                 touch_xmin, touch_xmax,
                                                 touch_ymin, touch_ymax,
                                                 touch_zmin, touch_zmax);
    }
    if (use_resident_state) {
        /* The output bank now holds the newest state and becomes current;
         * the host copy is recorded as out of date. */
        ctx.resident_valid[output_bank] = true;
        ctx.resident_age[output_bank] = ++ctx.resident_clock;
        set_resident_host_clean(ctx, output_bank, false);
        mark_resident_current_bank(ctx, output_bank);
        update_state_ready(ctx);
    } else {
        download_escalar_state_outputs(state_host_out, all);
    }
    /* After the last substep, ctx.matter_ready is cleared. */
    if (RK4 == 3)
        ctx.matter_ready = false;
    return 0;
}

/*
 * Evaluate the constraint residuals of the BSSN + scalar-field system on the
 * GPU and copy them to the host.
 *
 * The input state is uploaded into the global slots, the RHS pipeline is run
 * with co = 0 (so its constraint kernels execute), kern_escalar_constraint_fr
 * is launched with S_f_arr as its last argument, and the results are
 * downloaded: the standard constraint slots via download_constraint_outputs()
 * and the S_f_arr slot into constraint_host_out[7].
 *
 * Lev is accepted for interface compatibility and not used.
 * Returns 1 when either host pointer table is null, 0 otherwise.
 */
extern "C"
int bssn_escalar_cuda_compute_constraints(int *ex, double *X, double *Y, double *Z,
                                          double **state_host_in,
                                          double **constraint_host_out,
                                          int &Symmetry,
                                          int &Lev,
                                          double &eps)
{
    (void)Lev;
    if (state_host_in == nullptr || constraint_host_out == nullptr)
        return 1;

    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    /* Scalar-field parameters; only a2 is consumed below. */
    double a2 = 3.0;
    double phi0 = 0.0;
    double r0 = 0.0;
    double sigma0 = 0.0;
    double l2 = 0.0;
#ifdef fortran1
    set_escalar_parameter(a2, phi0, r0, sigma0, l2);
#endif
#ifdef fortran2
    SET_ESCALAR_PARAMETER(a2, phi0, r0, sigma0, l2);
#endif
#ifdef fortran3
    set_escalar_parameter_(a2, phi0, r0, sigma0, l2);
#endif

    const size_t npts = (size_t)ex[0] * ex[1] * ex[2];

    setup_grid_params(ex, X, Y, Z, Symmetry, eps, 0);
    upload_escalar_state_inputs(state_host_in, npts);
    launch_rhs_pipeline((int)npts, eps, 0, true, a2);

    auto &slot = g_buf.slot;
    kern_escalar_constraint_fr<<<grid(npts), BLK>>>(
        slot[S_chin1],
        slot[S_gupxx], slot[S_gupxy], slot[S_gupxz],
        slot[S_gupyy], slot[S_gupyz], slot[S_gupzz],
        slot[S_trK],
        slot[S_Axx], slot[S_Axy], slot[S_Axz],
        slot[S_Ayy], slot[S_Ayz], slot[S_Azz],
        slot[S_Rxx], slot[S_Rxy], slot[S_Rxz],
        slot[S_Ryy], slot[S_Ryz], slot[S_Rzz],
        slot[S_rho],
        slot[S_Sxx], slot[S_Sxy], slot[S_Sxz],
        slot[S_Syy], slot[S_Syz], slot[S_Szz],
        slot[S_Sphi],
        a2,
        slot[S_f_arr]);

    download_constraint_outputs(constraint_host_out, npts);
    /* The S_f_arr slot is fetched separately into entry 7. */
    CUDA_CHECK(cudaMemcpy(constraint_host_out[7], slot[S_f_arr],
                          npts * sizeof(double), cudaMemcpyDeviceToHost));
    return 0;
}

/*
 * Linear combination of up to three time levels, for every state variable:
 *     dst = c1 * src1 + c2 * src2 + c3 * src3
 *
 * Launch layout: blockIdx.y picks the state variable (rows at or beyond
 * state_count exit at once); the x dimension covers the `all` cells of that
 * variable with a grid-stride loop.  Every buffer is addressed as
 * [state * all + cell].  src3 may be null; its value is then taken as 0.0
 * while the c3 product is still evaluated.
 */
__global__ void kern_prepare_inter_time_level(const double * __restrict__ src1,
                                              const double * __restrict__ src2,
                                              const double * __restrict__ src3,
                                              double * __restrict__ dst,
                                              double c1,
                                              double c2,
                                              double c3,
                                              int state_count,
                                              int all)
{
    const int plane = blockIdx.y;
    if (plane < state_count) {
        const size_t plane_base = (size_t)plane * all;
        const unsigned int stride = blockDim.x * gridDim.x;
        int cell = blockIdx.x * blockDim.x + threadIdx.x;
        while (cell < all) {
            const size_t at = plane_base + cell;
            double third = 0.0;
            if (src3)
                third = src3[at];
            dst[at] = c1 * src1[at] + c2 * src2[at] + c3 * third;
            cell += stride;
        }
    }
}

/*
 * Lagrange basis weight of node `idx` evaluated at `x`, for the `ordn`
 * equally spaced nodes 0, 1, ..., ordn-1:
 *     prod over j != idx of (x - j) / (idx - j)
 * The factors are multiplied in ascending j.
 */
__device__ double interp_lagrange_weight(int idx, double x, int ordn)
{
    const double node = (double)idx;
    double weight = 1.0;
    for (int other = 0; other < ordn; ++other) {
        if (other != idx) {
            const double at = (double)other;
            weight *= (x - at) / (node - at);
        }
    }
    return weight;
}

/*
 * Pick the `ordn`-point interpolation window along one axis for position p.
 *
 * Grid indices here are 1-based: index 1 sits at x0 and neighbours are dx
 * apart.  On return `base` is the first index of the window and `local_x`
 * is p measured in grid spacings from that first node.
 *
 * The window is first placed around the node nearest to p, then shifted up
 * if it starts below the lowest permitted index, then shifted down if it
 * ends beyond n.  The lowest permitted index is 1, except when |x0| < dx and
 * the axis is mirrored (symmetry == 2 for axes 0/1, any non-zero symmetry
 * for axis 2); in that case it is 1 - ordn/2, i.e. the window may start on
 * non-positive indices.  For a non-positive base, local_x is measured from
 * the negated coordinate of index (1 - base).
 */
__device__ void interp_axis_window(double p,
                                   double x0,
                                   double dx,
                                   int n,
                                   int ordn,
                                   int symmetry,
                                   int axis,
                                   int &base,
                                   double &local_x)
{
    const int half = ordn / 2;

    /* Lowest index the window may start on. */
    const bool mirrored =
        ((symmetry == 2 && axis < 2) || (symmetry != 0 && axis == 2)) &&
        fabs(x0) < dx;
    const int lowest = mirrored ? 1 - half : 1;

    /* Start from the window centred on the nearest node, then clamp. */
    const int nearest = (int)((p - x0) / dx + 0.4) + 1;
    int first = nearest - half + 1;
    if (first < lowest)
        first = lowest;
    if (first + ordn - 1 > n)
        first = n + 1 - ordn;

    base = first;
    if (first > 0) {
        const double origin = x0 + (double)(first - 1) * dx;
        local_x = (p - origin) / dx;
    } else {
        /* Index (1 - first) supplies the coordinate, which is then negated. */
        const double origin = x0 + (double)(-first) * dx;
        local_x = (p + origin) / dx;
    }
}

/*
 * Fetch one grid value for interpolation, given 1-based indices (fi, fj, fk)
 * that may be non-positive.
 *
 * A non-positive index c on an axis is replaced by 1 - c and the result is
 * multiplied by that axis' entry of `soa` (x, then y, then z).  If any index
 * is still outside [1, n] on its axis the function returns 0.0.  Otherwise
 * the value is read from mem[state * all + linear], with x fastest and z
 * slowest in the linear index.
 */
__device__ double load_interp_value(const double * __restrict__ mem,
                                    int nx,
                                    int ny,
                                    int nz,
                                    int all,
                                    int state,
                                    int fi,
                                    int fj,
                                    int fk,
                                    const double * __restrict__ soa)
{
    int cell[3] = {fi, fj, fk};
    const int extent[3] = {nx, ny, nz};

    double factor = 1.0;
    for (int axis = 0; axis < 3; ++axis) {
        if (cell[axis] <= 0) {
            cell[axis] = 1 - cell[axis];
            factor *= soa[axis];
        }
    }

    for (int axis = 0; axis < 3; ++axis) {
        if (cell[axis] < 1 || cell[axis] > extent[axis])
            return 0.0;
    }

    const int linear = (cell[0] - 1) + (cell[1] - 1) * nx + (cell[2] - 1) * nx * ny;
    return factor * mem[(size_t)state * (size_t)all + (size_t)linear];
}

/*
 * Interpolate three state variables at one point (px, py, pz).
 *
 * Thread f (threadIdx.x = 0, 1, 2) handles state plane state<f> with the
 * parity triple soa<f>0..soa<f>2 and writes its result to out[f]; threads
 * with threadIdx.x >= 3 do nothing.  Nothing is written when ordn is outside
 * 1..8 (the weight arrays hold 8 entries).
 *
 * The value is a tensor-product sum over an ordn^3 window whose placement
 * comes from interp_axis_window, with weights from interp_lagrange_weight
 * and samples from load_interp_value.
 */
__global__ void kern_interp_state_point3(const double * __restrict__ mem,
                                         double * __restrict__ out,
                                         int nx,
                                         int ny,
                                         int nz,
                                         int all,
                                         int state0,
                                         int state1,
                                         int state2,
                                         double x0,
                                         double y0,
                                         double z0,
                                         double dx,
                                         double dy,
                                         double dz,
                                         double px,
                                         double py,
                                         double pz,
                                         int ordn,
                                         int symmetry,
                                         double soa00, double soa01, double soa02,
                                         double soa10, double soa11, double soa12,
                                         double soa20, double soa21, double soa22)
{
    const int field = threadIdx.x;
    if (field >= 3)
        return;
    if (ordn <= 0 || ordn > 8)
        return;

    /* Select this thread's state plane and parity factors. */
    int state;
    double parity[3];
    if (field == 0) {
        state = state0;
        parity[0] = soa00; parity[1] = soa01; parity[2] = soa02;
    } else if (field == 1) {
        state = state1;
        parity[0] = soa10; parity[1] = soa11; parity[2] = soa12;
    } else {
        state = state2;
        parity[0] = soa20; parity[1] = soa21; parity[2] = soa22;
    }

    /* Window start and local coordinate on each axis. */
    int first_i, first_j, first_k;
    double ux, uy, uz;
    interp_axis_window(px, x0, dx, nx, ordn, symmetry, 0, first_i, ux);
    interp_axis_window(py, y0, dy, ny, ordn, symmetry, 1, first_j, uy);
    interp_axis_window(pz, z0, dz, nz, ordn, symmetry, 2, first_k, uz);

    double wx[8], wy[8], wz[8];
    for (int m = 0; m < ordn; ++m)
        wx[m] = interp_lagrange_weight(m, ux, ordn);
    for (int m = 0; m < ordn; ++m)
        wy[m] = interp_lagrange_weight(m, uy, ordn);
    for (int m = 0; m < ordn; ++m)
        wz[m] = interp_lagrange_weight(m, uz, ordn);

    double acc = 0.0;
    for (int k = 0; k < ordn; ++k) {
        const int gk = first_k + k;
        for (int j = 0; j < ordn; ++j) {
            const int gj = first_j + j;
            for (int i = 0; i < ordn; ++i) {
                const double coeff = wx[i] * wy[j] * wz[k];
                acc += coeff * load_interp_value(mem, nx, ny, nz, all,
                                                 state,
                                                 first_i + i, gj, gk,
                                                 parity);
            }
        }
    }
    out[field] = acc;
}

/*
 * Interpolate two fields at a list of points, one thread per point.
 *
 * Thread p = blockIdx.x * blockDim.x + threadIdx.x handles the point
 * (px[p], py[p], pz[p]) and writes out[2p] (field0) and out[2p + 1]
 * (field1).  Threads with p >= npoints exit, and nothing is written when
 * ordn is outside 1..8 (the weight arrays hold 8 entries).
 *
 * Both fields share the same window and weights; field0 uses the parity
 * triple soa00..soa02 and field1 uses soa10..soa12.  Each field pointer is
 * read as a single plane (state 0).
 */
__global__ void kern_interp_host_two_fields(const double * __restrict__ field0,
                                            const double * __restrict__ field1,
                                            const double * __restrict__ px,
                                            const double * __restrict__ py,
                                            const double * __restrict__ pz,
                                            double * __restrict__ out,
                                            int nx,
                                            int ny,
                                            int nz,
                                            int all,
                                            double x0,
                                            double y0,
                                            double z0,
                                            double dx,
                                            double dy,
                                            double dz,
                                            int npoints,
                                            int ordn,
                                            int symmetry,
                                            double soa00, double soa01, double soa02,
                                            double soa10, double soa11, double soa12)
{
    const int point = blockIdx.x * blockDim.x + threadIdx.x;
    if (point >= npoints)
        return;
    if (ordn <= 0 || ordn > 8)
        return;

    /* Window start and local coordinate on each axis. */
    int first_i, first_j, first_k;
    double ux, uy, uz;
    interp_axis_window(px[point], x0, dx, nx, ordn, symmetry, 0, first_i, ux);
    interp_axis_window(py[point], y0, dy, ny, ordn, symmetry, 1, first_j, uy);
    interp_axis_window(pz[point], z0, dz, nz, ordn, symmetry, 2, first_k, uz);

    double wx[8], wy[8], wz[8];
    for (int m = 0; m < ordn; ++m)
        wx[m] = interp_lagrange_weight(m, ux, ordn);
    for (int m = 0; m < ordn; ++m)
        wy[m] = interp_lagrange_weight(m, uy, ordn);
    for (int m = 0; m < ordn; ++m)
        wz[m] = interp_lagrange_weight(m, uz, ordn);

    const double parity0[3] = {soa00, soa01, soa02};
    const double parity1[3] = {soa10, soa11, soa12};
    double acc0 = 0.0;
    double acc1 = 0.0;
    for (int k = 0; k < ordn; ++k) {
        const int gk = first_k + k;
        for (int j = 0; j < ordn; ++j) {
            const int gj = first_j + j;
            const double wyz = wy[j] * wz[k];
            for (int i = 0; i < ordn; ++i) {
                const int gi = first_i + i;
                const double coeff = wx[i] * wyz;
                acc0 += coeff * load_interp_value(field0, nx, ny, nz, all, 0,
                                                  gi, gj, gk, parity0);
                acc1 += coeff * load_interp_value(field1, nx, ny, nz, all, 0,
                                                  gi, gj, gk, parity1);
            }
        }
    }
    out[2 * point] = acc0;
    out[2 * point + 1] = acc1;
}

/*
 * Gather a box of cells from every state plane into a contiguous buffer.
 *
 * Launch layout: blockIdx.y selects the state plane (rows at or beyond
 * state_count exit); the x dimension covers the region_all box cells with a
 * grid-stride loop.  The box starts at (i0, j0, k0) in a grid with row
 * length nx and slab size nx * ny, and is sx by sy cells in x and y.
 * Output layout is dst[state * region_all + local] with x fastest.
 * The sz argument is not read; the z extent is implied by region_all.
 */
__global__ void kern_pack_state_region_batch(const double * __restrict__ src_mem,
                                             double * __restrict__ dst,
                                             int nx, int ny,
                                             int i0, int j0, int k0,
                                             int sx, int sy, int sz,
                                             int region_all,
                                             int state_count,
                                             int all)
{
    const int plane = blockIdx.y;
    if (plane >= state_count) return;

    const size_t grid_base = (size_t)plane * all;
    const size_t pack_base = (size_t)plane * region_all;
    const int box_slab = sx * sy;
    const unsigned int stride = blockDim.x * gridDim.x;

    int cell = blockIdx.x * blockDim.x + threadIdx.x;
    while (cell < region_all) {
        const int gx = i0 + cell % sx;
        const int gy = j0 + (cell / sx) % sy;
        const int gz = k0 + cell / box_slab;
        const int from = gx + gy * nx + gz * nx * ny;
        dst[pack_base + cell] = src_mem[grid_base + from];
        cell += stride;
    }
}

/*
 * Scatter a contiguous buffer back into a box of cells of every state plane.
 *
 * Launch layout: blockIdx.y selects the state plane (rows at or beyond
 * state_count exit); the x dimension covers the region_all box cells with a
 * grid-stride loop.  Input layout is src[state * region_all + local] with x
 * fastest; the box starts at (i0, j0, k0) in a grid with row length nx and
 * slab size nx * ny.  Cells outside the box are left untouched.
 * The sz argument is not read; the z extent is implied by region_all.
 */
__global__ void kern_unpack_state_region_batch(double * __restrict__ dst_mem,
                                               const double * __restrict__ src,
                                               int nx, int ny,
                                               int i0, int j0, int k0,
                                               int sx, int sy, int sz,
                                               int region_all,
                                               int state_count,
                                               int all)
{
    const int plane = blockIdx.y;
    if (plane >= state_count) return;

    const size_t grid_base = (size_t)plane * all;
    const size_t pack_base = (size_t)plane * region_all;
    const int box_slab = sx * sy;
    const unsigned int stride = blockDim.x * gridDim.x;

    int cell = blockIdx.x * blockDim.x + threadIdx.x;
    while (cell < region_all) {
        const int gx = i0 + cell % sx;
        const int gy = j0 + (cell / sx) % sy;
        const int gz = k0 + cell / box_slab;
        const int into = gx + gy * nx + gz * nx * ny;
        dst_mem[grid_base + into] = src[pack_base + cell];
        cell += stride;
    }
}

/*
 * Gather several boxes ("segments") from every state plane in one launch.
 *
 * Launch layout: blockIdx.z selects the segment, blockIdx.y the state plane
 * (rows at or beyond state_count exit), and the x dimension covers the
 * segment's cells with a grid-stride loop.
 *
 * `meta` holds 8 ints per segment: [0..2] box origin (i0, j0, k0),
 * [3], [4] box size in x and y, [6] cell count of the box, [7] offset of the
 * segment inside dst.  Entry [5] is not read here.
 * Output layout: dst[offset + state * region_all + local], x fastest.
 */
__global__ void kern_pack_state_segments_batch(const double * __restrict__ src_mem,
                                               double * __restrict__ dst,
                                               int nx, int ny,
                                               const int * __restrict__ meta,
                                               int state_count,
                                               int all)
{
    const int plane = blockIdx.y;
    const int *desc = meta + blockIdx.z * 8;
    const int org_x = desc[0];
    const int org_y = desc[1];
    const int org_z = desc[2];
    const int len_x = desc[3];
    const int len_y = desc[4];
    const int cells = desc[6];
    const int seg_offset = desc[7];
    if (plane >= state_count) return;

    const size_t grid_base = (size_t)plane * all;
    const size_t pack_base = (size_t)seg_offset + (size_t)plane * cells;
    const int box_slab = len_x * len_y;
    const unsigned int stride = blockDim.x * gridDim.x;

    int cell = blockIdx.x * blockDim.x + threadIdx.x;
    while (cell < cells) {
        const int gx = org_x + cell % len_x;
        const int gy = org_y + (cell / len_x) % len_y;
        const int gz = org_z + cell / box_slab;
        const int from = gx + gy * nx + gz * nx * ny;
        dst[pack_base + cell] = src_mem[grid_base + from];
        cell += stride;
    }
}

/*
 * Scatter several packed boxes ("segments") back into every state plane in
 * one launch.
 *
 * Launch layout: blockIdx.z selects the segment, blockIdx.y the state plane
 * (rows at or beyond state_count exit), and the x dimension covers the
 * segment's cells with a grid-stride loop.
 *
 * `meta` holds 8 ints per segment: [0..2] box origin (i0, j0, k0),
 * [3], [4] box size in x and y, [6] cell count of the box, [7] offset of the
 * segment inside src.  Entry [5] is not read here.
 * Input layout: src[offset + state * region_all + local], x fastest.
 */
__global__ void kern_unpack_state_segments_batch(double * __restrict__ dst_mem,
                                                 const double * __restrict__ src,
                                                 int nx, int ny,
                                                 const int * __restrict__ meta,
                                                 int state_count,
                                                 int all)
{
    const int plane = blockIdx.y;
    const int *desc = meta + blockIdx.z * 8;
    const int org_x = desc[0];
    const int org_y = desc[1];
    const int org_z = desc[2];
    const int len_x = desc[3];
    const int len_y = desc[4];
    const int cells = desc[6];
    const int seg_offset = desc[7];
    if (plane >= state_count) return;

    const size_t grid_base = (size_t)plane * all;
    const size_t pack_base = (size_t)seg_offset + (size_t)plane * cells;
    const int box_slab = len_x * len_y;
    const unsigned int stride = blockDim.x * gridDim.x;

    int cell = blockIdx.x * blockDim.x + threadIdx.x;
    while (cell < cells) {
        const int gx = org_x + cell % len_x;
        const int gy = org_y + (cell / len_x) % len_y;
        const int gz = org_z + cell / box_slab;
        const int into = gx + gy * nx + gz * nx * ny;
        dst_mem[grid_base + into] = src[pack_base + cell];
        cell += stride;
    }
}

/* Forward declaration: kern_restrict_state_region_batch below calls this
 * helper before its definition appears later in the file. */
__device__ __forceinline__ double load_comm_state_cell_sym(const double * __restrict__ src_mem,
                                                           int state_index,
                                                           int x, int y, int z,
                                                           int nx, int ny,
                                                           int all);

/*
 * For every state plane, fill a box of output cells with a weighted sum of
 * source-grid samples taken on a stride-2 lattice.
 *
 * Launch layout: blockIdx.y selects the state plane (rows at or beyond
 * state_count exit); the x dimension covers the region_all output cells with
 * a grid-stride loop.  Output cell (ii, jj, kk) of the sx-by-sy box is
 * centred on source index (fi0 + 2*ii, fj0 + 2*jj, fk0 + 2*kk).
 *
 * The stencil is a tensor product of one symmetric 1-D weight table with
 * `taps` entries covering offsets (1 - taps/2) .. (taps/2); the table is
 * chosen at compile time from ghost_width (5 -> 10 taps, 4 -> 8, 3 -> 6,
 * anything else -> 4).  Samples are read through load_comm_state_cell_sym.
 * Output layout: dst[state * region_all + local], x fastest.
 * The sz argument is not read; the z extent is implied by region_all.
 */
__global__ void kern_restrict_state_region_batch(const double * __restrict__ src_mem,
                                                 double * __restrict__ dst,
                                                 int nx, int ny,
                                                 int sx, int sy, int sz,
                                                 int fi0, int fj0, int fk0,
                                                 int region_all,
                                                 int state_count,
                                                 int all)
{
    const int plane = blockIdx.y;
    if (plane >= state_count) return;

#if ghost_width == 5
    const int taps = 10;
    const double weight[10] = {
        35.0 / 65536.0, -405.0 / 65536.0, 567.0 / 16384.0, -2205.0 / 16384.0,
        19845.0 / 32768.0, 19845.0 / 32768.0,
        -2205.0 / 16384.0, 567.0 / 16384.0, -405.0 / 65536.0, 35.0 / 65536.0};
#elif ghost_width == 4
    const int taps = 8;
    const double weight[8] = {
        -5.0 / 2048.0, 49.0 / 2048.0, -245.0 / 2048.0, 1225.0 / 2048.0,
        1225.0 / 2048.0, -245.0 / 2048.0, 49.0 / 2048.0, -5.0 / 2048.0};
#elif ghost_width == 3
    const int taps = 6;
    const double weight[6] = {
        3.0 / 256.0, -25.0 / 256.0, 75.0 / 128.0,
        75.0 / 128.0, -25.0 / 256.0, 3.0 / 256.0};
#else
    const int taps = 4;
    const double weight[4] = {-1.0 / 16.0, 9.0 / 16.0, 9.0 / 16.0, -1.0 / 16.0};
#endif
    /* Offset of the first tap relative to the stencil centre. */
    const int first_tap = 1 - taps / 2;

    const size_t out_base = (size_t)plane * region_all;
    const int box_slab = sx * sy;
    const unsigned int stride = blockDim.x * gridDim.x;

    for (int cell = blockIdx.x * blockDim.x + threadIdx.x;
         cell < region_all;
         cell += stride)
    {
        /* Source index of the first tap on each axis. */
        const int x_lo = fi0 + 2 * (cell % sx) + first_tap;
        const int y_lo = fj0 + 2 * ((cell / sx) % sy) + first_tap;
        const int z_lo = fk0 + 2 * (cell / box_slab) + first_tap;

        double acc = 0.0;
        for (int tz = 0; tz < taps; ++tz) {
            const double wz = weight[tz];
            for (int ty = 0; ty < taps; ++ty) {
                const double wyz = wz * weight[ty];
                for (int tx = 0; tx < taps; ++tx) {
                    acc += wyz * weight[tx] *
                           load_comm_state_cell_sym(src_mem, plane,
                                                    x_lo + tx, y_lo + ty, z_lo + tz,
                                                    nx, ny, all);
                }
            }
        }
        dst[out_base + cell] = acc;
    }
}

/*
 * Read one cell of a state plane using 0-based indices, mirroring negative
 * indices.
 *
 * A negative index c on an axis is replaced by -c - 1 and the result is
 * multiplied by d_comm_state_soa[3 * state_index + axis] (x, then y, then
 * z).  No upper-bound check is made; the caller must keep the indices inside
 * the plane.  The linear index has x fastest with row length nx and slab
 * size nx * ny; planes are `all` cells apart.
 */
__device__ __forceinline__ double load_comm_state_cell_sym(const double * __restrict__ src_mem,
                                                           int state_index,
                                                           int x, int y, int z,
                                                           int nx, int ny,
                                                           int all)
{
    const int parity_row = 3 * state_index;
    double factor = 1.0;
    if (x < 0) {
        factor *= d_comm_state_soa[parity_row + 0];
        x = -(x + 1);
    }
    if (y < 0) {
        factor *= d_comm_state_soa[parity_row + 1];
        y = -(y + 1);
    }
    if (z < 0) {
        factor *= d_comm_state_soa[parity_row + 2];
        z = -(z + 1);
    }
    const int linear = x + y * nx + z * nx * ny;
    return factor * src_mem[(size_t)state_index * all + linear];
}

/*
 * Multi-segment form of the stride-2 weighted-sum kernel: one launch fills
 * several output boxes for every state plane.
 *
 * Launch layout: blockIdx.z selects the segment, blockIdx.y the state plane
 * (rows at or beyond state_count exit), and the x dimension covers the
 * segment's cells with a grid-stride loop.
 *
 * `meta` holds 8 ints per segment: [0], [1] box size in x and y, [3] cell
 * count, [4] offset of the segment inside dst, [5..7] source index of the
 * first output cell's stencil centre (fi0, fj0, fk0).  Entry [2] is not
 * read here.  Output cell (ii, jj, kk) is centred on source index
 * (fi0 + 2*ii, fj0 + 2*jj, fk0 + 2*kk).
 *
 * The stencil is a tensor product of one symmetric 1-D weight table with
 * `taps` entries covering offsets (1 - taps/2) .. (taps/2), chosen at
 * compile time from ghost_width (5 -> 10 taps, 4 -> 8, 3 -> 6, anything
 * else -> 4).  Samples are read through load_comm_state_cell_sym.
 * Output layout: dst[offset + state * region_all + local], x fastest.
 */
__global__ void kern_restrict_state_segments_batch(const double * __restrict__ src_mem,
                                                   double * __restrict__ dst,
                                                   int nx, int ny,
                                                   const int * __restrict__ meta,
                                                   int state_count,
                                                   int all)
{
    const int plane = blockIdx.y;
    const int *desc = meta + blockIdx.z * 8;
    const int len_x = desc[0];
    const int len_y = desc[1];
    const int cells = desc[3];
    const int seg_offset = desc[4];
    const int ctr_x = desc[5];
    const int ctr_y = desc[6];
    const int ctr_z = desc[7];
    if (plane >= state_count) return;

#if ghost_width == 5
    const int taps = 10;
    const double weight[10] = {
        35.0 / 65536.0, -405.0 / 65536.0, 567.0 / 16384.0, -2205.0 / 16384.0,
        19845.0 / 32768.0, 19845.0 / 32768.0,
        -2205.0 / 16384.0, 567.0 / 16384.0, -405.0 / 65536.0, 35.0 / 65536.0};
#elif ghost_width == 4
    const int taps = 8;
    const double weight[8] = {
        -5.0 / 2048.0, 49.0 / 2048.0, -245.0 / 2048.0, 1225.0 / 2048.0,
        1225.0 / 2048.0, -245.0 / 2048.0, 49.0 / 2048.0, -5.0 / 2048.0};
#elif ghost_width == 3
    const int taps = 6;
    const double weight[6] = {
        3.0 / 256.0, -25.0 / 256.0, 75.0 / 128.0,
        75.0 / 128.0, -25.0 / 256.0, 3.0 / 256.0};
#else
    const int taps = 4;
    const double weight[4] = {-1.0 / 16.0, 9.0 / 16.0, 9.0 / 16.0, -1.0 / 16.0};
#endif
    /* Offset of the first tap relative to the stencil centre. */
    const int first_tap = 1 - taps / 2;

    const size_t out_base = (size_t)seg_offset + (size_t)plane * cells;
    const int box_slab = len_x * len_y;
    const unsigned int stride = blockDim.x * gridDim.x;

    for (int cell = blockIdx.x * blockDim.x + threadIdx.x;
         cell < cells;
         cell += stride)
    {
        /* Source index of the first tap on each axis. */
        const int x_lo = ctr_x + 2 * (cell % len_x) + first_tap;
        const int y_lo = ctr_y + 2 * ((cell / len_x) % len_y) + first_tap;
        const int z_lo = ctr_z + 2 * (cell / box_slab) + first_tap;

        double acc = 0.0;
        for (int tz = 0; tz < taps; ++tz) {
            const double wz = weight[tz];
            for (int ty = 0; ty < taps; ++ty) {
                const double wyz = wz * weight[ty];
                for (int tx = 0; tx < taps; ++tx) {
                    acc += wyz * weight[tx] *
                           load_comm_state_cell_sym(src_mem, plane,
                                                    x_lo + tx, y_lo + ty, z_lo + tz,
                                                    nx, ny, all);
                }
            }
        }
        dst[out_base + cell] = acc;
    }
}

/*
 * For every state plane, fill a box of output cells by a weighted sum of
 * source-grid samples, where two output indices map onto one source index.
 *
 * Launch layout: blockIdx.y selects the state plane (rows at or beyond
 * state_count exit); the x dimension covers the region_all output cells with
 * a grid-stride loop.  Output cell (ii, jj, kk) of the sx-by-sy box has
 * index f = (ii0 + ii, jj0 + jj, kk0 + kk); its stencil is centred on source
 * index f / 2 - (lbc_i, lbc_j, lbc_k), using integer division.
 *
 * The stencil is a tensor product of 1-D weights with `taps` entries
 * covering offsets (1 - taps/2) .. (taps/2).  An axis whose output index is
 * even uses the table in stored order; an odd index uses the same table
 * reversed.  The table is chosen at compile time from ghost_width
 * (5 -> 10 taps, 4 -> 8, 3 -> 6, anything else -> 4).  Samples are read
 * through load_comm_state_cell_sym.
 * Output layout: dst[state * region_all + local], x fastest.
 * The sz argument is not read; the z extent is implied by region_all.
 */
__global__ void kern_prolong_state_region_batch(const double * __restrict__ src_mem,
                                                double * __restrict__ dst,
                                                int nx, int ny,
                                                int sx, int sy, int sz,
                                                int ii0, int jj0, int kk0,
                                                int lbc_i, int lbc_j, int lbc_k,
                                                int region_all,
                                                int state_count,
                                                int all)
{
    const int plane = blockIdx.y;
    if (plane >= state_count) return;

#if ghost_width == 5
    const int taps = 10;
    const double even_w[10] = {
        13585.0 / 33554432.0, -159885.0 / 33554432.0, 230945.0 / 8388608.0,
        -969969.0 / 8388608.0, 14549535.0 / 16777216.0, 4849845.0 / 16777216.0,
        -692835.0 / 8388608.0, 188955.0 / 8388608.0, -138567.0 / 33554432.0,
        12155.0 / 33554432.0};
#elif ghost_width == 4
    const int taps = 8;
    const double even_w[8] = {
        -495.0 / 262144.0, 5005.0 / 262144.0, -27027.0 / 262144.0, 225225.0 / 262144.0,
        75075.0 / 262144.0, -19305.0 / 262144.0, 4095.0 / 262144.0, -429.0 / 262144.0};
#elif ghost_width == 3
    const int taps = 6;
    const double even_w[6] = {
        77.0 / 8192.0, -693.0 / 8192.0, 3465.0 / 4096.0,
        1155.0 / 4096.0, -495.0 / 8192.0, 63.0 / 8192.0};
#else
    const int taps = 4;
    const double even_w[4] = {-7.0 / 128.0, 105.0 / 128.0, 35.0 / 128.0, -5.0 / 128.0};
#endif
    /* Offset of the first tap relative to the stencil centre. */
    const int first_tap = 1 - taps / 2;
    /* Odd output indices read the table back to front. */
    const int last = taps - 1;

    const size_t out_base = (size_t)plane * region_all;
    const int box_slab = sx * sy;
    const unsigned int stride = blockDim.x * gridDim.x;

    for (int cell = blockIdx.x * blockDim.x + threadIdx.x;
         cell < region_all;
         cell += stride)
    {
        const int out_i = ii0 + cell % sx;
        const int out_j = jj0 + (cell / sx) % sy;
        const int out_k = kk0 + cell / box_slab;
        const bool even_i = (out_i % 2 == 0);
        const bool even_j = (out_j % 2 == 0);
        const bool even_k = (out_k % 2 == 0);
        /* Source index of the first tap on each axis. */
        const int x_lo = out_i / 2 - lbc_i + first_tap;
        const int y_lo = out_j / 2 - lbc_j + first_tap;
        const int z_lo = out_k / 2 - lbc_k + first_tap;

        double acc = 0.0;
        for (int tz = 0; tz < taps; ++tz) {
            const double wz = even_k ? even_w[tz] : even_w[last - tz];
            for (int ty = 0; ty < taps; ++ty) {
                const double wyz = wz * (even_j ? even_w[ty] : even_w[last - ty]);
                for (int tx = 0; tx < taps; ++tx) {
                    const double wx = even_i ? even_w[tx] : even_w[last - tx];
                    acc += wyz * wx *
                           load_comm_state_cell_sym(src_mem, plane,
                                                    x_lo + tx, y_lo + ty, z_lo + tz,
                                                    nx, ny, all);
                }
            }
        }
        dst[out_base + cell] = acc;
    }
}

/*
 * Multi-segment form of the two-to-one weighted-sum kernel: one launch fills
 * several output boxes for every state plane.
 *
 * Launch layout: blockIdx.z selects the segment, blockIdx.y the state plane
 * (rows at or beyond state_count exit), and the x dimension covers the
 * segment's cells with a grid-stride loop.
 *
 * `meta` holds 11 ints per segment: [0], [1] box size in x and y, [3] cell
 * count, [4] offset of the segment inside dst, [5..7] output index of the
 * first cell (ii0, jj0, kk0), [8..10] source-index shift (lbc_i, lbc_j,
 * lbc_k).  Entry [2] is not read here.  Output cell index f has its stencil
 * centred on source index f / 2 - lbc, using integer division.
 *
 * The stencil is a tensor product of 1-D weights with `taps` entries
 * covering offsets (1 - taps/2) .. (taps/2).  An axis whose output index is
 * even uses the table in stored order; an odd index uses the same table
 * reversed.  The table is chosen at compile time from ghost_width
 * (5 -> 10 taps, 4 -> 8, 3 -> 6, anything else -> 4).  Samples are read
 * through load_comm_state_cell_sym.
 * Output layout: dst[offset + state * region_all + local], x fastest.
 */
__global__ void kern_prolong_state_segments_batch(const double * __restrict__ src_mem,
                                                  double * __restrict__ dst,
                                                  int nx, int ny,
                                                  const int * __restrict__ meta,
                                                  int state_count,
                                                  int all)
{
    const int plane = blockIdx.y;
    const int *desc = meta + blockIdx.z * 11;
    const int len_x = desc[0];
    const int len_y = desc[1];
    const int cells = desc[3];
    const int seg_offset = desc[4];
    const int org_i = desc[5];
    const int org_j = desc[6];
    const int org_k = desc[7];
    const int shift_i = desc[8];
    const int shift_j = desc[9];
    const int shift_k = desc[10];
    if (plane >= state_count) return;

#if ghost_width == 5
    const int taps = 10;
    const double even_w[10] = {
        13585.0 / 33554432.0, -159885.0 / 33554432.0, 230945.0 / 8388608.0,
        -969969.0 / 8388608.0, 14549535.0 / 16777216.0, 4849845.0 / 16777216.0,
        -692835.0 / 8388608.0, 188955.0 / 8388608.0, -138567.0 / 33554432.0,
        12155.0 / 33554432.0};
#elif ghost_width == 4
    const int taps = 8;
    const double even_w[8] = {
        -495.0 / 262144.0, 5005.0 / 262144.0, -27027.0 / 262144.0, 225225.0 / 262144.0,
        75075.0 / 262144.0, -19305.0 / 262144.0, 4095.0 / 262144.0, -429.0 / 262144.0};
#elif ghost_width == 3
    const int taps = 6;
    const double even_w[6] = {
        77.0 / 8192.0, -693.0 / 8192.0, 3465.0 / 4096.0,
        1155.0 / 4096.0, -495.0 / 8192.0, 63.0 / 8192.0};
#else
    const int taps = 4;
    const double even_w[4] = {-7.0 / 128.0, 105.0 / 128.0, 35.0 / 128.0, -5.0 / 128.0};
#endif
    /* Offset of the first tap relative to the stencil centre. */
    const int first_tap = 1 - taps / 2;
    /* Odd output indices read the table back to front. */
    const int last = taps - 1;

    const size_t out_base = (size_t)seg_offset + (size_t)plane * cells;
    const int box_slab = len_x * len_y;
    const unsigned int stride = blockDim.x * gridDim.x;

    for (int cell = blockIdx.x * blockDim.x + threadIdx.x;
         cell < cells;
         cell += stride)
    {
        const int out_i = org_i + cell % len_x;
        const int out_j = org_j + (cell / len_x) % len_y;
        const int out_k = org_k + cell / box_slab;
        const bool even_i = (out_i % 2 == 0);
        const bool even_j = (out_j % 2 == 0);
        const bool even_k = (out_k % 2 == 0);
        /* Source index of the first tap on each axis. */
        const int x_lo = out_i / 2 - shift_i + first_tap;
        const int y_lo = out_j / 2 - shift_j + first_tap;
        const int z_lo = out_k / 2 - shift_k + first_tap;

        double acc = 0.0;
        for (int tz = 0; tz < taps; ++tz) {
            const double wz = even_k ? even_w[tz] : even_w[last - tz];
            for (int ty = 0; ty < taps; ++ty) {
                const double wyz = wz * (even_j ? even_w[ty] : even_w[last - ty]);
                for (int tx = 0; tx < taps; ++tx) {
                    const double wx = even_i ? even_w[tx] : even_w[last - tx];
                    acc += wyz * wx *
                           load_comm_state_cell_sym(src_mem, plane,
                                                    x_lo + tx, y_lo + ty, z_lo + tz,
                                                    nx, ny, all);
                }
            }
        }
        dst[out_base + cell] = acc;
    }
}

/*
 * Copy a chosen subset of whole state planes into a dense buffer.
 *
 * Launch layout: blockIdx.y is the slot in the subset (rows at or beyond
 * subset_count exit); the x dimension covers the `all` cells of a plane with
 * a grid-stride loop.  Slot s is read from plane d_subset_state_indices[s]
 * of src_mem and written to dst[s * all + cell].
 */
__global__ void kern_pack_state_subset(const double * __restrict__ src_mem,
                                       double * __restrict__ dst,
                                       int subset_count,
                                       int all)
{
    const int slot = blockIdx.y;
    if (slot >= subset_count) return;

    const int plane = d_subset_state_indices[slot];
    const size_t from_base = (size_t)plane * all;
    const size_t into_base = (size_t)slot * all;
    const unsigned int stride = blockDim.x * gridDim.x;

    int cell = blockIdx.x * blockDim.x + threadIdx.x;
    while (cell < all) {
        dst[into_base + cell] = src_mem[from_base + cell];
        cell += stride;
    }
}

/*
 * Copy a dense buffer of whole planes back into a chosen subset of state
 * planes.
 *
 * Launch layout: blockIdx.y is the slot in the subset (rows at or beyond
 * subset_count exit); the x dimension covers the `all` cells of a plane with
 * a grid-stride loop.  Slot s is read from src[s * all + cell] and written
 * to plane d_subset_state_indices[s] of dst_mem.
 */
__global__ void kern_unpack_state_subset(double * __restrict__ dst_mem,
                                         const double * __restrict__ src,
                                         int subset_count,
                                         int all)
{
    const int slot = blockIdx.y;
    if (slot >= subset_count) return;

    const int plane = d_subset_state_indices[slot];
    const size_t into_base = (size_t)plane * all;
    const size_t from_base = (size_t)slot * all;
    const unsigned int stride = blockDim.x * gridDim.x;

    int cell = blockIdx.x * blockDim.x + threadIdx.x;
    while (cell < all) {
        dst_mem[into_base + cell] = src[from_base + cell];
        cell += stride;
    }
}

/*
 * Copy a box of one state variable between a full-size host array and the
 * matching plane of a device-resident bank.  The box sits at the same
 * position (i0, j0, k0) on both sides and both sides use a row pitch of
 * ex[0] doubles.
 *
 *   block_tag       key passed to ensure_step_ctx to find/create the context
 *   state_index     plane inside the bank; out-of-range values are ignored
 *   host_state      host array of ex[0]*ex[1]*ex[2] doubles
 *   ex              grid extents (3 ints)
 *   i0, j0, k0      box origin, in cells
 *   sx, sy, sz      box size, in cells; a non-positive size is a no-op
 *   kind            cudaMemcpyDeviceToHost copies device -> host; any other
 *                   value is treated as host -> device
 *   state_host_key  forwarded to active_or_keyed_bank to select the bank
 *
 * After a host -> device copy the bank is marked valid and current, its age
 * is bumped, and the state's host-clean flag is cleared; after a
 * device -> host copy the state's host-clean flag is set.
 */
static void copy_state_region_cuda(void *block_tag,
                                   int state_index,
                                   double *host_state,
                                   const int *ex,
                                   int i0, int j0, int k0,
                                   int sx, int sy, int sz,
                                   cudaMemcpyKind kind,
                                   double **state_host_key = nullptr)
{
    if (state_index < 0 || state_index >= BSSN_RESIDENT_STATE_CAPACITY) return;
    if (sx <= 0 || sy <= 0 || sz <= 0) return;

    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
    const size_t pitch = (size_t)ex[0] * sizeof(double);
    const bool to_host = (kind == cudaMemcpyDeviceToHost);

    StepContext &ctx = ensure_step_ctx(block_tag, all);
    const int bank = active_or_keyed_bank(ctx, state_host_key, all,
                                          kind == cudaMemcpyHostToDevice);
    double *device_plane = ctx.d_resident_mem[bank] + (size_t)state_index * all;

    const cudaPitchedPtr host_side =
        make_cudaPitchedPtr((void *)host_state, pitch, ex[0], ex[1]);
    const cudaPitchedPtr device_side =
        make_cudaPitchedPtr((void *)device_plane, pitch, ex[0], ex[1]);

    /* With no CUDA array involved, x extents and positions are in bytes. */
    cudaMemcpy3DParms p = {};
    p.extent = make_cudaExtent((size_t)sx * sizeof(double), (size_t)sy, (size_t)sz);
    p.srcPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0);
    p.dstPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0);

    /* The zero-initialised parms leave kind == cudaMemcpyHostToHost, which
     * does not describe either transfer; set it to match the pointer pair. */
    if (to_host) {
        p.srcPtr = device_side;
        p.dstPtr = host_side;
        p.kind = cudaMemcpyDeviceToHost;
    } else {
        p.srcPtr = host_side;
        p.dstPtr = device_side;
        p.kind = cudaMemcpyHostToDevice;
    }
    CUDA_CHECK(cudaMemcpy3D(&p));

    if (kind == cudaMemcpyHostToDevice) {
        ctx.resident_valid[bank] = true;
        ctx.resident_age[bank] = ++ctx.resident_clock;
        mark_resident_current_bank(ctx, bank);
        mark_resident_host_state_clean(ctx, bank, state_index, false);
        update_state_ready(ctx);
    } else {
        mark_resident_host_state_clean(ctx, bank, state_index, true);
    }
}

/*
 * Copy one rectangular sub-region of a single resident state between the
 * device bank and a tightly packed host buffer with cudaMemcpy3D.
 *
 *   block_tag       key handed to ensure_step_ctx() to obtain the StepContext
 *   state_index     slot of the state inside the resident bank
 *   host_buffer     packed host buffer; rows are sx doubles apart, planes are
 *                   sx*sy doubles apart
 *   ex              grid extents {ex[0], ex[1], ex[2]} of the resident state
 *   i0, j0, k0      origin of the region inside the resident state
 *   sx, sy, sz      region extents in grid points
 *   kind            cudaMemcpyDeviceToHost packs the region into host_buffer;
 *                   otherwise host_buffer is unpacked into the device state
 *   state_host_key  optional key forwarded to active_or_keyed_bank()
 *   state_count     state count forwarded to active_or_keyed_bank()
 *
 * Out-of-range state_index / state_count and empty regions are ignored.
 */
static void copy_state_region_packed_cuda(void *block_tag,
                                          int state_index,
                                          double *host_buffer,
                                          const int *ex,
                                          int i0, int j0, int k0,
                                          int sx, int sy, int sz,
                                          cudaMemcpyKind kind,
                                          double **state_host_key = nullptr,
                                          int state_count = BSSN_STATE_COUNT)
{
    if (state_index < 0 || state_index >= BSSN_RESIDENT_STATE_CAPACITY) return;
    if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return;
    if (sx <= 0 || sy <= 0 || sz <= 0) return;

    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
    /* Row pitch of the full resident state and of the packed host buffer. */
    const size_t grid_pitch = (size_t)ex[0] * sizeof(double);
    const size_t packed_pitch = (size_t)sx * sizeof(double);

    StepContext &ctx = ensure_step_ctx(block_tag, all);
    const bool is_upload = (kind == cudaMemcpyHostToDevice);
    const int bank = active_or_keyed_bank(ctx, state_host_key, all,
                                          is_upload, state_count);
    double *device_state = ctx.d_resident_mem[bank] + (size_t)state_index * all;

    cudaMemcpy3DParms p = {};
    p.extent = make_cudaExtent((size_t)sx * sizeof(double), (size_t)sy, (size_t)sz);

    if (kind == cudaMemcpyDeviceToHost) {
        p.srcPtr = make_cudaPitchedPtr((void *)device_state, grid_pitch, ex[0], ex[1]);
        p.srcPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0);
        p.dstPtr = make_cudaPitchedPtr((void *)host_buffer, packed_pitch, sx, sy);
        p.dstPos = make_cudaPos(0, 0, 0);
    } else {
        p.srcPtr = make_cudaPitchedPtr((void *)host_buffer, packed_pitch, sx, sy);
        p.srcPos = make_cudaPos(0, 0, 0);
        p.dstPtr = make_cudaPitchedPtr((void *)device_state, grid_pitch, ex[0], ex[1]);
        p.dstPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0);
    }

    CUDA_CHECK(cudaMemcpy3D(&p));

    if (is_upload) {
        /* The device copy now differs from the host state array. */
        ctx.resident_valid[bank] = true;
        ctx.resident_age[bank] = ++ctx.resident_clock;
        mark_resident_current_bank(ctx, bank);
        mark_resident_host_state_clean(ctx, bank, state_index, false);
        update_state_ready(ctx);
    } else if (sx == ex[0] && sy == ex[1] && sz == ex[2] &&
               i0 == 0 && j0 == 0 && k0 == 0) {
        /* Only a copy of the whole grid may flag the host state as clean
         * (same rule as copy_state_region_packed_batch_cuda); flagging it
         * after a partial copy would let later downloads be skipped while
         * the rest of the host state is still stale. */
        mark_resident_host_state_clean(ctx, bank, state_index, true);
    }
}

/*
 * Copy the same rectangular sub-region of the first state_count resident
 * states between the device bank and one packed host buffer.
 *
 * The region of every state is gathered into (or scattered from) the step
 * communication buffer by a pack/unpack kernel, so a single cudaMemcpy of
 * state_count * sx*sy*sz doubles moves the whole batch.
 *
 *   block_tag       key handed to ensure_step_ctx() to obtain the StepContext
 *   state_count     number of states in the batch
 *   host_buffer     packed host buffer of state_count * sx*sy*sz doubles
 *   ex              grid extents of the resident states
 *   i0, j0, k0      origin of the region inside each state
 *   sx, sy, sz      region extents in grid points
 *   kind            cudaMemcpyDeviceToHost packs and downloads; otherwise the
 *                   buffer is uploaded and unpacked
 *   state_host_key  optional key forwarded to active_or_keyed_bank()
 *
 * Out-of-range state_count and empty regions are ignored.
 */
static void copy_state_region_packed_batch_cuda(void *block_tag,
                                                int state_count,
                                                double *host_buffer,
                                                const int *ex,
                                                int i0, int j0, int k0,
                                                int sx, int sy, int sz,
                                                cudaMemcpyKind kind,
                                                double **state_host_key = nullptr)
{
    if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return;
    if (sx <= 0 || sy <= 0 || sz <= 0) return;

    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
    StepContext &ctx = ensure_step_ctx(block_tag, all);
    const int bank = active_or_keyed_bank(ctx, state_host_key, all,
                                          kind == cudaMemcpyHostToDevice,
                                          state_count);
    double *base_mem = ctx.d_resident_mem[bank];
    const int region_all = sx * sy * sz;
    const size_t total_doubles = (size_t)state_count * (size_t)region_all;
    double *d_comm = ensure_step_comm_buffer(ctx, total_doubles);

    /* grid.x covers the points of one region, grid.y selects the state. */
    const dim3 launch_grid((unsigned int)grid((size_t)region_all),
                           (unsigned int)state_count);

    if (kind == cudaMemcpyDeviceToHost) {
        kern_pack_state_region_batch<<<launch_grid, BLK>>>(
            base_mem, d_comm, ex[0], ex[1],
            i0, j0, k0, sx, sy, sz, region_all, state_count,
            ex[0] * ex[1] * ex[2]);
        /* A rejected launch is only reported through cudaGetLastError(). */
        CUDA_CHECK(cudaGetLastError());
        CUDA_CHECK(cudaMemcpy(host_buffer, d_comm,
                              total_doubles * sizeof(double),
                              cudaMemcpyDeviceToHost));
        /* The host side counts as clean only when the whole grid was copied. */
        if (sx == ex[0] && sy == ex[1] && sz == ex[2] &&
            i0 == 0 && j0 == 0 && k0 == 0) {
            mark_resident_host_subset_clean(ctx, bank, state_count, nullptr, true);
        }
    } else {
        CUDA_CHECK(cudaMemcpy(d_comm, host_buffer,
                              total_doubles * sizeof(double),
                              cudaMemcpyHostToDevice));
        kern_unpack_state_region_batch<<<launch_grid, BLK>>>(
            base_mem, d_comm, ex[0], ex[1],
            i0, j0, k0, sx, sy, sz, region_all, state_count,
            ex[0] * ex[1] * ex[2]);
        /* Do not mark the bank valid if the unpack launch was rejected. */
        CUDA_CHECK(cudaGetLastError());
        ctx.resident_valid[bank] = true;
        ctx.resident_age[bank] = ++ctx.resident_clock;
        mark_resident_current_bank(ctx, bank);
        mark_resident_host_subset_clean(ctx, bank, state_count, nullptr, false);
        update_state_ready(ctx);
    }
}

/*
 * Download the first state_count resident states of a block into the host
 * arrays state_host_out[0 .. state_count-1].
 *
 * The bank keyed by state_host_out is used when one exists; otherwise the
 * current bank (or the bank chosen by active_or_keyed_bank()) is used and,
 * if the key is usable, re-keyed to state_host_out.  Nothing is copied when
 * the host copies are already flagged clean.
 *
 * AMSS_CUDA_DIRECT_STATE_DOWNLOAD (read once, default 1) selects between
 * per-state copies straight into the host arrays and one bulk copy through
 * g_buf.h_stage followed by host memcpy.
 *
 *   block_tag       key handed to ensure_step_ctx()
 *   ex              grid extents of the resident states
 *   state_host_out  destination host arrays, one per state
 *   state_count     number of states to download; values outside
 *                   1..BSSN_RESIDENT_STATE_CAPACITY are ignored
 */
static void download_resident_state_count(void *block_tag, int *ex, double **state_host_out, int state_count)
{
    /* Same guard as the sibling helpers: the per-bank arrays indexed below
     * hold BSSN_RESIDENT_STATE_CAPACITY entries. */
    if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY)
        return;

    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
    const size_t bytes = all * sizeof(double);
    StepContext &ctx = ensure_step_ctx(block_tag, all);
    int bank = find_resident_bank_count(ctx, state_host_out, state_count);
    const bool bank_matches_output_key = (bank >= 0);
    if (bank < 0) {
        bank = (ctx.current_bank >= 0) ? ctx.current_bank : active_or_keyed_bank(ctx, nullptr, all, false);
    }
    mark_resident_current_bank(ctx, bank);
    if (!bank_matches_output_key &&
        resident_key_usable_count(state_host_out, state_count)) {
        assign_resident_key_count(ctx, bank, state_host_out, state_count);
    }
    const bool profile = cuda_profile_enabled();
    const double t0 = profile ? cuda_profile_now_ms() : 0.0;
    static int direct_download = -1;
    if (direct_download < 0) {
        const char *env = getenv("AMSS_CUDA_DIRECT_STATE_DOWNLOAD");
        direct_download = env ? ((atoi(env) != 0) ? 1 : 0) : 1;
    }

    /* Host copies already match the device: nothing to transfer. */
    if (resident_host_subset_clean(ctx, bank, state_count, nullptr))
        return;

    if (direct_download) {
        for (int i = 0; i < state_count; ++i) {
            CUDA_CHECK(cudaMemcpyAsync(state_host_out[i], ctx.d_resident[bank][i],
                                       bytes, cudaMemcpyDeviceToHost));
        }
        CUDA_CHECK(cudaDeviceSynchronize());
    } else {
        CUDA_CHECK(cudaMemcpy(g_buf.h_stage, ctx.d_resident_mem[bank],
                              (size_t)state_count * bytes,
                              cudaMemcpyDeviceToHost));
        for (int i = 0; i < state_count; ++i) {
            std::memcpy(state_host_out[i], g_buf.h_stage + (size_t)i * all, bytes);
        }
    }

    set_resident_host_clean(ctx, bank, true);
    if (profile) {
        CudaProfileStats &stats = cuda_profile_stats();
        stats.resident_download_calls++;
        stats.resident_download_ms += cuda_profile_now_ms() - t0;
        stats.resident_download_gb += (double)((size_t)state_count * bytes) / 1.0e9;
    }
}

/* Download the BSSN_STATE_COUNT resident states of a block into state_host_out. */
static void download_resident_state(void *block_tag, int *ex, double **state_host_out)
{
    const int bssn_count = BSSN_STATE_COUNT;
    download_resident_state_count(block_tag, ex, state_host_out, bssn_count);
}

/*
 * Upload a complete set of host state arrays into a resident bank.
 *
 * state_count selects the variant (escalar, EM or plain BSSN); each variant
 * obtains its bank, binds the bank's slots as state inputs and uploads
 * state_host_in.  Any other state_count is ignored.  After the device has
 * been synchronised the bank is flagged valid, host-clean and current.
 */
static void upload_resident_state_count(void *block_tag, int *ex, double **state_host_in, int state_count)
{
    const size_t npts = (size_t)ex[0] * ex[1] * ex[2];
    StepContext &step = ensure_step_ctx(block_tag, npts);

    int target = -1;
    if (state_count == BSSN_ESCALAR_STATE_COUNT) {
        target = ensure_escalar_resident_bank(step, state_host_in, npts, false);
        bind_escalar_state_input_slots(step.d_resident[target]);
        upload_escalar_state_inputs(state_host_in, npts);
    } else if (state_count == BSSN_EM_STATE_COUNT) {
        target = ensure_em_resident_bank(step, state_host_in, npts, false);
        bind_em_state_input_slots(step.d_resident[target]);
        upload_em_state_inputs(state_host_in, npts);
    } else if (state_count == BSSN_STATE_COUNT) {
        target = ensure_resident_bank(step, state_host_in, npts, false);
        bind_state_input_slots(step.d_resident[target]);
        upload_state_inputs(state_host_in, npts);
    }
    if (target < 0 && state_count != BSSN_ESCALAR_STATE_COUNT &&
        state_count != BSSN_EM_STATE_COUNT && state_count != BSSN_STATE_COUNT) {
        /* Unsupported state count: nothing was uploaded. */
        return;
    }

    /* Wait for the uploads before publishing the bank. */
    CUDA_CHECK(cudaDeviceSynchronize());

    step.resident_valid[target] = true;
    step.resident_age[target] = ++step.resident_clock;
    set_resident_host_clean(step, target, true);
    mark_resident_current_bank(step, target);
    update_state_ready(step);
}

/*
 * Invalidate every resident bank of a block except the one keyed by
 * state_host_key.
 *
 * Does nothing when state_count is out of range, the block has no step
 * context, or no valid bank matches the key.  Otherwise all banks and the
 * current/next state bookkeeping are reset, and the surviving bank gets its
 * key and per-state host-clean flags restored for the first state_count
 * states before being made current again.  ex is unused.
 */
static void keep_only_resident_state_count(void *block_tag,
                                           int *ex,
                                           double **state_host_key,
                                           int state_count)
{
    (void)ex;

    const bool count_in_range =
        state_count > 0 && state_count <= BSSN_RESIDENT_STATE_CAPACITY;
    if (!count_in_range)
        return;

    auto entry = g_step_ctx.find(block_tag);
    if (entry == g_step_ctx.end())
        return;
    StepContext &ctx = entry->second;

    const int survivor = find_resident_bank_count(ctx, state_host_key, state_count);
    if (survivor < 0)
        return;
    if (!ctx.resident_valid[survivor])
        return;

    /* Remember the survivor's clean flags; the wipe below clears them. */
    auto saved_clean = ctx.resident_host_clean[survivor];

    /* Wipe the bookkeeping of every bank and of the step state pointers. */
    for (int bank = 0; bank < BSSN_RESIDENT_BANK_COUNT; ++bank) {
        ctx.resident_valid[bank] = false;
        ctx.resident_host[bank].fill(nullptr);
        ctx.resident_host_clean[bank].fill(0);
        ctx.resident_age[bank] = 0;
    }
    ctx.d_state_curr_mem = nullptr;
    ctx.d_state_next_mem = nullptr;
    ctx.d_state_curr.fill(nullptr);
    ctx.d_state_next.fill(nullptr);
    ctx.current_bank = -1;
    ctx.resident_clock = 0;
    ctx.matter_ready = false;

    /* Re-establish the surviving bank. */
    for (int s = 0; s < state_count; ++s) {
        ctx.resident_host[survivor][s] = state_host_key[s];
        ctx.resident_host_clean[survivor][s] = saved_clean[s] ? 1 : 0;
    }
    ctx.resident_valid[survivor] = true;
    ctx.resident_age[survivor] = ++ctx.resident_clock;
    mark_resident_current_bank(ctx, survivor);
    update_state_ready(ctx);
}

static bool download_resident_state_count_if_present(void *block_tag,
                                                     int *ex,
                                                     double **state_host_out,
                                                     int state_count);

/*
 * BSSN_STATE_COUNT variant of download_resident_state_count_if_present();
 * returns that function's result unchanged.
 */
static bool download_resident_state_if_present(void *block_tag, int *ex, double **state_host_out)
{
    const bool downloaded =
        download_resident_state_count_if_present(block_tag, ex, state_host_out, BSSN_STATE_COUNT);
    return downloaded;
}

/*
 * Download the resident states keyed by state_host_out, but only if such a
 * bank already exists; no step context or bank is created.
 *
 * Returns false when state_count is out of range, the block has no step
 * context, or no valid bank matches the key.  Returns true once the host
 * arrays are up to date, either because they were already flagged clean or
 * because they were just downloaded.
 *
 * AMSS_CUDA_DIRECT_STATE_DOWNLOAD (read once, default 1) selects per-state
 * copies into the host arrays versus one bulk copy through g_buf.h_stage.
 */
static bool download_resident_state_count_if_present(void *block_tag,
                                                     int *ex,
                                                     double **state_host_out,
                                                     int state_count)
{
    if (!(state_count > 0 && state_count <= BSSN_RESIDENT_STATE_CAPACITY))
        return false;

    auto entry = g_step_ctx.find(block_tag);
    if (entry == g_step_ctx.end())
        return false;
    StepContext &step = entry->second;

    const int slot = find_resident_bank_count(step, state_host_out, state_count);
    if (slot < 0)
        return false;
    if (!step.resident_valid[slot])
        return false;

    mark_resident_current_bank(step, slot);
    if (resident_host_subset_clean(step, slot, state_count, nullptr))
        return true;

    const size_t npts = (size_t)ex[0] * ex[1] * ex[2];
    const size_t field_bytes = npts * sizeof(double);

    static int direct_download = -1;
    if (direct_download < 0) {
        const char *setting = getenv("AMSS_CUDA_DIRECT_STATE_DOWNLOAD");
        if (!setting)
            direct_download = 1;
        else
            direct_download = (atoi(setting) != 0) ? 1 : 0;
    }

    if (!direct_download) {
        /* Staged path: one bulk copy, then scatter on the host. */
        CUDA_CHECK(cudaMemcpy(g_buf.h_stage, step.d_resident_mem[slot],
                              (size_t)state_count * field_bytes,
                              cudaMemcpyDeviceToHost));
        for (int s = 0; s < state_count; ++s) {
            std::memcpy(state_host_out[s], g_buf.h_stage + (size_t)s * npts, field_bytes);
        }
    } else {
        /* Direct path: one asynchronous copy per state, then wait. */
        for (int s = 0; s < state_count; ++s) {
            CUDA_CHECK(cudaMemcpyAsync(state_host_out[s], step.d_resident[slot][s],
                                       field_bytes, cudaMemcpyDeviceToHost));
        }
        CUDA_CHECK(cudaDeviceSynchronize());
    }

    set_resident_host_clean(step, slot, true);
    return true;
}

/*
 * Copy a subset of whole resident states between the device bank and the
 * host arrays state_host[i], where state_indices[i] names the bank slot of
 * entry i.
 *
 * Entries are skipped when the index is out of range, when the host pointer
 * is null, or (device-to-host only) when the host copy is already flagged
 * clean.  The remaining states are gathered into / scattered from the step
 * communication buffers by kern_pack_state_subset / kern_unpack_state_subset
 * so a single cudaMemcpy moves all of them.
 *
 *   block_tag      key handed to ensure_step_ctx()
 *   ex             grid extents of the resident states
 *   subset_count   number of entries in state_indices / state_host
 *   state_indices  bank slot per entry
 *   state_host     host array per entry; may be null, in which case nothing
 *                  is copied
 *   kind           cudaMemcpyDeviceToHost downloads, otherwise uploads
 */
static void copy_state_subset(void *block_tag,
                              int *ex,
                              int subset_count,
                              const int *state_indices,
                              double **state_host,
                              cudaMemcpyKind kind)
{
    if (subset_count <= 0) return;
    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
    const size_t bytes = all * sizeof(double);
    StepContext &ctx = ensure_step_ctx(block_tag, all);
    double **full_key = (subset_count == BSSN_RESIDENT_STATE_CAPACITY) ? state_host : nullptr;
    int bank = -1;
    if (state_host) {
        if (full_key) {
            bank = (subset_count == BSSN_ESCALAR_STATE_COUNT)
                 ? find_resident_bank_count(ctx, full_key, BSSN_ESCALAR_STATE_COUNT)
                 : find_resident_bank(ctx, full_key);
        } else {
            bank = find_resident_bank_subset(ctx, state_host, state_indices, subset_count);
        }
        /* A download must not read from a bank that holds no valid data. */
        if (kind == cudaMemcpyDeviceToHost &&
            (bank < 0 || !ctx.resident_valid[bank])) {
            bank = -1;
        }
    }
    if (bank < 0) {
        bank = active_or_keyed_bank(ctx, full_key, all,
                                    kind == cudaMemcpyHostToDevice,
                                    subset_count);
    } else {
        mark_resident_current_bank(ctx, bank);
    }
    double *base_mem = ctx.d_resident_mem[bank];
    int active_state_indices[BSSN_RESIDENT_STATE_CAPACITY];
    double *active_state_host[BSSN_RESIDENT_STATE_CAPACITY];
    int active_count = 0;

    for (int i = 0; i < subset_count; ++i) {
        const int state_index = state_indices[i];
        if (state_index < 0 || state_index >= BSSN_RESIDENT_STATE_CAPACITY) continue;
        if (kind == cudaMemcpyDeviceToHost &&
            ctx.resident_host_clean[bank][state_index])
            continue;
        /* state_host itself may be null (see the bank lookup above). */
        if (!state_host || !state_host[i]) continue;
        /* The scratch arrays hold at most BSSN_RESIDENT_STATE_CAPACITY
         * entries; never write past them. */
        if (active_count >= BSSN_RESIDENT_STATE_CAPACITY) {
            fprintf(stderr,
                    "copy_state_subset: more than %d active states requested; "
                    "extra entries ignored\n",
                    (int)BSSN_RESIDENT_STATE_CAPACITY);
            break;
        }
        active_state_indices[active_count] = state_index;
        active_state_host[active_count] = state_host[i];
        ++active_count;
    }
    if (active_count <= 0) return;

    const size_t total_doubles = (size_t)active_count * all;
    double *d_comm = ensure_step_comm_buffer(ctx, total_doubles);
    double *h_comm = ensure_step_host_comm_buffer(ctx, total_doubles);

    /* The pack/unpack kernels read the slot list from d_subset_state_indices. */
    CUDA_CHECK(cudaMemcpyToSymbol(d_subset_state_indices, active_state_indices,
                                  (size_t)active_count * sizeof(int),
                                  0, cudaMemcpyHostToDevice));
    /* grid.x covers the points of one state, grid.y selects the subset entry. */
    const dim3 launch_grid((unsigned int)grid(all), (unsigned int)active_count);
    if (kind == cudaMemcpyDeviceToHost) {
        kern_pack_state_subset<<<launch_grid, BLK>>>(
            base_mem, d_comm, active_count, (int)all);
        /* A rejected launch is only reported through cudaGetLastError(). */
        CUDA_CHECK(cudaGetLastError());
        CUDA_CHECK(cudaMemcpy(h_comm, d_comm,
                              total_doubles * sizeof(double),
                              cudaMemcpyDeviceToHost));
        for (int i = 0; i < active_count; ++i) {
            std::memcpy(active_state_host[i],
                        h_comm + (size_t)i * all,
                        bytes);
        }
        mark_resident_host_subset_clean(ctx, bank, active_count,
                                        active_state_indices, true);
    } else {
        for (int i = 0; i < active_count; ++i) {
            std::memcpy(h_comm + (size_t)i * all,
                        active_state_host[i],
                        bytes);
        }
        CUDA_CHECK(cudaMemcpy(d_comm, h_comm,
                              total_doubles * sizeof(double),
                              cudaMemcpyHostToDevice));
        kern_unpack_state_subset<<<launch_grid, BLK>>>(
            base_mem, d_comm, active_count, (int)all);
        /* Do not mark the bank valid if the unpack launch was rejected. */
        CUDA_CHECK(cudaGetLastError());
        ctx.resident_valid[bank] = true;
        ctx.resident_age[bank] = ++ctx.resident_clock;
        mark_resident_current_bank(ctx, bank);
        mark_resident_host_subset_clean(ctx, bank, active_count,
                                        active_state_indices, true);
        update_state_ready(ctx);
    }
}

/*
 * True when the block has a step context and any_resident_bank_valid()
 * reports at least one valid resident bank for it.
 */
static bool has_resident_state(void *block_tag)
{
    auto entry = g_step_ctx.find(block_tag);
    if (entry == g_step_ctx.end())
        return false;
    return any_resident_bank_valid(entry->second);
}

/* ================================================================== */
/*  Main host function — drop-in replacement for bssn_rhs_c.C          */
/* ================================================================== */

/*
 * f_compute_rhs_bssn — host driver for one BSSN right-hand-side evaluation
 * on the GPU.
 *
 * Steps:
 *   1. select this rank's device and call setup_grid_params();
 *   2. stage the input fields listed in h2d_src through g_buf.h_stage and
 *      upload them with one cudaMemcpy starting at slot S_chi (this relies on
 *      the input slots being contiguous in h2d_src order);
 *   3. launch the phase kernels / derivative helpers in order;
 *   4. download D2H_BASE_SLOT_COUNT output slots (plus the
 *      D2H_CONSTRAINT_SLOT_COUNT constraint slots when co == 0) starting at
 *      slot S_chi_rhs and scatter them into the caller's arrays.
 *
 *   ex            grid extents {nx, ny, nz}
 *   X, Y, Z       forwarded to setup_grid_params()
 *   chi .. Szz    input fields, nx*ny*nz doubles each
 *   *_rhs, Gam*, R*  output fields, nx*ny*nz doubles each
 *   ham_Res .. Gmz_Res  constraint outputs, written only when co == 0
 *   Symmetry, eps, co   forwarded to setup_grid_params(); eps is also passed
 *                       to gpu_lopsided_kodis_state_batch(); co == 0 enables
 *                       the constraint phases
 *   T, Lev        unused
 *
 * Returns 0.
 */
extern "C"
int f_compute_rhs_bssn(int *ex, double &T,
    double *X, double *Y, double *Z,
    double *chi, double *trK,
    double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
    double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
    double *Gamx, double *Gamy, double *Gamz,
    double *Lap, double *betax, double *betay, double *betaz,
    double *dtSfx, double *dtSfy, double *dtSfz,
    double *chi_rhs, double *trK_rhs,
    double *gxx_rhs, double *gxy_rhs, double *gxz_rhs,
    double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
    double *Axx_rhs, double *Axy_rhs, double *Axz_rhs,
    double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
    double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
    double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
    double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
    double *rho, double *Sx, double *Sy, double *Sz,
    double *Sxx, double *Sxy_m, double *Sxz, double *Syy, double *Syz_m, double *Szz,
    double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy,
    double *Gamxyz, double *Gamxzz,
    double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy,
    double *Gamyyz, double *Gamyzz,
    double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy,
    double *Gamzyz, double *Gamzzz,
    double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
    double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
    double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
    int &Symmetry, int &Lev, double &eps, int &co)
{
    /* Part of the Fortran-compatible interface but not needed here. */
    (void)T;
    (void)Lev;

    /* --- Multi-GPU: select device --- */
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    const int nx = ex[0], ny = ex[1], nz = ex[2];
    const int all = nx * ny * nz;
    const double SYM = 1.0, ANTI = -1.0;

    setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);

    /* --- Shorthand for device slot pointers --- */
    #define D(s) g_buf.slot[s]
    const size_t bytes = (size_t)all * sizeof(double);

    /* --- H2D: stage all inputs, then one bulk copy --- */
    double *h2d_src[] = {
        chi, trK, dxx, gxy, gxz, dyy, gyz, dzz,
        Axx, Axy, Axz, Ayy, Ayz, Azz,
        Gamx, Gamy, Gamz,
        Lap, betax, betay, betaz,
        dtSfx, dtSfy, dtSfz,
        rho, Sx, Sy, Sz,
        Sxx, Sxy_m, Sxz, Syy, Syz_m, Szz
    };
    static_assert((int)(sizeof(h2d_src) / sizeof(h2d_src[0])) == H2D_INPUT_SLOT_COUNT,
                  "h2d_src list must match H2D_INPUT_SLOT_COUNT");
    for (int s = 0; s < H2D_INPUT_SLOT_COUNT; ++s) {
        std::memcpy(g_buf.h_stage + (size_t)s * all, h2d_src[s], bytes);
    }
    CUDA_CHECK(cudaMemcpy(D(S_chi), g_buf.h_stage,
                          (size_t)H2D_INPUT_SLOT_COUNT * bytes,
                          cudaMemcpyHostToDevice));

    /* ============================================================ */
    /*  Phase 1: prep — alpn1, chin1, gxx, gyy, gzz                 */
    /* ============================================================ */
    kern_phase1_prep<<<grid(all),BLK>>>(
        D(S_Lap), D(S_chi), D(S_dxx), D(S_dyy), D(S_dzz),
        D(S_alpn1), D(S_chin1), D(S_gxx), D(S_gyy), D(S_gzz));

    /* 12x fderivs */
    gpu_fderivs(D(S_betax), D(S_betaxx),D(S_betaxy),D(S_betaxz), ANTI,SYM,SYM, all);
    gpu_fderivs(D(S_betay), D(S_betayx),D(S_betayy),D(S_betayz), SYM,ANTI,SYM, all);
    gpu_fderivs(D(S_betaz), D(S_betazx),D(S_betazy),D(S_betazz), SYM,SYM,ANTI, all);
    gpu_fderivs(D(S_chi),   D(S_chix),D(S_chiy),D(S_chiz),       SYM,SYM,SYM, all);
    gpu_fderivs(D(S_dxx),   D(S_gxxx),D(S_gxxy),D(S_gxxz),       SYM,SYM,SYM, all);
    gpu_fderivs(D(S_gxy),   D(S_gxyx),D(S_gxyy),D(S_gxyz),       ANTI,ANTI,SYM, all);
    gpu_fderivs(D(S_gxz),   D(S_gxzx),D(S_gxzy),D(S_gxzz),       ANTI,SYM,ANTI, all);
    gpu_fderivs(D(S_dyy),   D(S_gyyx),D(S_gyyy),D(S_gyyz),       SYM,SYM,SYM, all);
    gpu_fderivs(D(S_gyz),   D(S_gyzx),D(S_gyzy),D(S_gyzz),       SYM,ANTI,ANTI, all);
    gpu_fderivs(D(S_dzz),   D(S_gzzx),D(S_gzzy),D(S_gzzz),       SYM,SYM,SYM, all);
    gpu_fderivs(D(S_Lap),   D(S_Lapx),D(S_Lapy),D(S_Lapz),       SYM,SYM,SYM, all);
    gpu_fderivs(D(S_trK),   D(S_Kx),D(S_Ky),D(S_Kz),             SYM,SYM,SYM, all);

    /* ============================================================ */
    /*  Phase 2: metric RHS + inverse                                */
    /* ============================================================ */
    kern_phase2_metric_rhs<<<grid(all),BLK>>>(
        D(S_alpn1), D(S_chin1),
        D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
        D(S_trK),
        D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz),
        D(S_betaxx), D(S_betaxy), D(S_betaxz),
        D(S_betayx), D(S_betayy), D(S_betayz),
        D(S_betazx), D(S_betazy), D(S_betazz),
        D(S_chi_rhs), D(S_gxx_rhs), D(S_gyy_rhs), D(S_gzz_rhs),
        D(S_gxy_rhs), D(S_gyz_rhs), D(S_gxz_rhs));

    kern_phase2_inverse<<<grid(all),BLK>>>(
        D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
        D(S_gupxx), D(S_gupxy), D(S_gupxz),
        D(S_gupyy), D(S_gupyz), D(S_gupzz));

    /* Phase 3: Gamma constraint (co==0) */
    if (co == 0) {
        kern_phase3_gamma_constraint<<<grid(all),BLK>>>(
            D(S_Gamx), D(S_Gamy), D(S_Gamz),
            D(S_gupxx), D(S_gupxy), D(S_gupxz),
            D(S_gupyy), D(S_gupyz), D(S_gupzz),
            D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), D(S_gyzx), D(S_gzzx),
            D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), D(S_gyzy), D(S_gzzy),
            D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), D(S_gyzz), D(S_gzzz),
            D(S_Gmx_Res), D(S_Gmy_Res), D(S_Gmz_Res));
    }

    /* Phase 4: Christoffel symbols */
    kern_phase4_christoffel<<<grid(all),BLK>>>(
        D(S_gupxx), D(S_gupxy), D(S_gupxz),
        D(S_gupyy), D(S_gupyz), D(S_gupzz),
        D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), D(S_gyzx), D(S_gzzx),
        D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), D(S_gyzy), D(S_gzzy),
        D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), D(S_gyzz), D(S_gzzz),
        D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
        D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
        D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
        D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
        D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
        D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz));

    /* Phase 5+6: raise A in registers, then build Gamma_rhs part 1 */
    kern_phase5_6_gamma_rhs_part1_fused<<<grid(all),BLK>>>(
        D(S_Lapx), D(S_Lapy), D(S_Lapz),
        D(S_alpn1), D(S_chin1),
        D(S_chix), D(S_chiy), D(S_chiz),
        D(S_gupxx), D(S_gupxy), D(S_gupxz),
        D(S_gupyy), D(S_gupyz), D(S_gupzz),
        D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz),
        D(S_Kx), D(S_Ky), D(S_Kz),
        D(S_Sx), D(S_Sy), D(S_Sz),
        D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
        D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
        D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
        D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
        D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
        D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
        D(S_Gamx_rhs), D(S_Gamy_rhs), D(S_Gamz_rhs));

    /* Phase 7: fdderivs(beta) + fderivs(Gamma) */
    gpu_fdderivs(D(S_betax), D(S_gxxx),D(S_gxyx),D(S_gxzx),
                 D(S_gyyx),D(S_gyzx),D(S_gzzx), ANTI,SYM,SYM, all);
    gpu_fdderivs(D(S_betay), D(S_gxxy),D(S_gxyy),D(S_gxzy),
                 D(S_gyyy),D(S_gyzy),D(S_gzzy), SYM,ANTI,SYM, all);
    gpu_fdderivs(D(S_betaz), D(S_gxxz),D(S_gxyz),D(S_gxzz),
                 D(S_gyyz),D(S_gyzz),D(S_gzzz), SYM,SYM,ANTI, all);
    gpu_fderivs(D(S_Gamx), D(S_Gamxx),D(S_Gamxy),D(S_Gamxz), ANTI,SYM,SYM, all);
    gpu_fderivs(D(S_Gamy), D(S_Gamyx),D(S_Gamyy_t),D(S_Gamyz_t), SYM,ANTI,SYM, all);
    gpu_fderivs(D(S_Gamz), D(S_Gamzx),D(S_Gamzy),D(S_Gamzz_t), SYM,SYM,ANTI, all);

    /* Phase 8/9: Gamma_rhs part 2 (fused contraction kernel) */
    kern_phase8_9_gamma_rhs_contract_fused<<<grid(all),BLK>>>(
        D(S_gupxx), D(S_gupxy), D(S_gupxz),
        D(S_gupyy), D(S_gupyz), D(S_gupzz),
        D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx),
        D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy),
        D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz),
        D(S_Gamxxx),D(S_Gamxxy),D(S_Gamxxz),
        D(S_Gamxyy),D(S_Gamxyz),D(S_Gamxzz),
        D(S_Gamyxx),D(S_Gamyxy),D(S_Gamyxz),
        D(S_Gamyyy),D(S_Gamyyz),D(S_Gamyzz),
        D(S_Gamzxx),D(S_Gamzxy),D(S_Gamzxz),
        D(S_Gamzyy),D(S_Gamzyz),D(S_Gamzzz),
        D(S_betaxx),D(S_betaxy),D(S_betaxz),
        D(S_betayx),D(S_betayy),D(S_betayz),
        D(S_betazx),D(S_betazy),D(S_betazz),
        D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz),
        D(S_Gamx_rhs),D(S_Gamy_rhs),D(S_Gamz_rhs),
        D(S_Gamxa),D(S_Gamya),D(S_Gamza),
        D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx),
        D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy),
        D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz));

    /* Phase 10: 6x fdderivs(metric) + Ricci contract */
    {
        double *src_fields[] = {D(S_dxx), D(S_dyy), D(S_dzz), D(S_gxy), D(S_gxz), D(S_gyz)};
        double *dst_fields[] = {D(S_Rxx), D(S_Ryy), D(S_Rzz), D(S_Rxy), D(S_Rxz), D(S_Ryz)};
        /* Three parity signs per source field, in src_fields order. */
        const int soa_signs[] = {
            (int)SYM,  (int)SYM,  (int)SYM,
            (int)SYM,  (int)SYM,  (int)SYM,
            (int)SYM,  (int)SYM,  (int)SYM,
            (int)ANTI, (int)ANTI, (int)SYM,
            (int)ANTI, (int)SYM,  (int)ANTI,
            (int)SYM,  (int)ANTI, (int)ANTI
        };
        gpu_phase10_ricci_batch(D(S_gupxx), D(S_gupxy), D(S_gupxz),
                                D(S_gupyy), D(S_gupyz), D(S_gupzz),
                                src_fields, dst_fields, soa_signs, all);
    }

    /* Phase 11: fused Ricci assembly */
    kern_phase11_ricci_fused<<<grid(all),BLK>>>(
        D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz),
        D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
        D(S_Gamxa),D(S_Gamya),D(S_Gamza),
        D(S_Gamxx),D(S_Gamxy),D(S_Gamxz),
        D(S_Gamyx),D(S_Gamyy_t),D(S_Gamyz_t),
        D(S_Gamzx),D(S_Gamzy),D(S_Gamzz_t),
        D(S_Gamxxx),D(S_Gamxxy),D(S_Gamxxz),
        D(S_Gamxyy),D(S_Gamxyz),D(S_Gamxzz),
        D(S_Gamyxx),D(S_Gamyxy),D(S_Gamyxz),
        D(S_Gamyyy),D(S_Gamyyz),D(S_Gamyzz),
        D(S_Gamzxx),D(S_Gamzxy),D(S_Gamzxz),
        D(S_Gamzyy),D(S_Gamzyz),D(S_Gamzzz),
        D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx),
        D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy),
        D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz),
        D(S_Rxx),D(S_Rxy),D(S_Rxz),
        D(S_Ryy),D(S_Ryz),D(S_Rzz));

    /* ============================================================ */
    /*  Phase 12/13: chi fdderivs + chi correction                   */
    /* ============================================================ */
    kern_phase12_13_chi_correction_fused<<<grid((size_t)all),BLK>>>(
        D(S_chi), D(S_chin1),
        D(S_chix), D(S_chiy), D(S_chiz),
        D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
        D(S_gupxx), D(S_gupxy), D(S_gupxz),
        D(S_gupyy), D(S_gupyz), D(S_gupzz),
        D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
        D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
        D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
        D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
        D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
        D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
        D(S_Rxx), D(S_Rxy), D(S_Rxz),
        D(S_Ryy), D(S_Ryz), D(S_Rzz));

    /* ============================================================ */
    /*  Phase 14/15: fused trK_rhs, Aij_rhs, gauge                   */
    /* ============================================================ */
    kern_phase15_trK_Aij_gauge<<<grid(all),BLK>>>(
        D(S_alpn1), D(S_chin1),
        D(S_chix), D(S_chiy), D(S_chiz),
        D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
        D(S_gupxx), D(S_gupxy), D(S_gupxz),
        D(S_gupyy), D(S_gupyz), D(S_gupzz),
        D(S_trK),
        D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz),
        D(S_Lapx), D(S_Lapy), D(S_Lapz),
        D(S_betaxx), D(S_betaxy), D(S_betaxz),
        D(S_betayx), D(S_betayy), D(S_betayz),
        D(S_betazx), D(S_betazy), D(S_betazz),
        D(S_rho),
        D(S_Sx), D(S_Sy), D(S_Sz),
        D(S_Sxx), D(S_Sxy), D(S_Sxz), D(S_Syy), D(S_Syz), D(S_Szz),
        D(S_dtSfx), D(S_dtSfy), D(S_dtSfz),
        D(S_Rxx), D(S_Rxy), D(S_Rxz), D(S_Ryy), D(S_Ryz), D(S_Rzz),
        D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
        D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
        D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
        D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
        D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
        D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
        D(S_dtSfx_rhs), D(S_dtSfy_rhs), D(S_dtSfz_rhs),
        D(S_trK_rhs),
        D(S_Axx_rhs), D(S_Axy_rhs), D(S_Axz_rhs),
        D(S_Ayy_rhs), D(S_Ayz_rhs), D(S_Azz_rhs),
        D(S_Lap_rhs),
        D(S_betax_rhs), D(S_betay_rhs), D(S_betaz_rhs),
        D(S_Gamx_rhs), D(S_Gamy_rhs), D(S_Gamz_rhs),
        D(S_f_arr), D(S_S_arr));

    /* ============================================================ */
    /*  Phase 16/17: advection + KO dissipation (shared ord=3 pack)  */
    /* ============================================================ */
    gpu_lopsided_kodis_state_batch(eps, all);

    /* ============================================================ */
    /*  Phase 18: Hamilton & momentum constraints (co==0)             */
    /* ============================================================ */
    if (co == 0) {
        /* 6x fderivs on Aij — reuse gxxx..gzzz slots for dA/dx output */
        gpu_fderivs(D(S_Axx), D(S_gxxx),D(S_gxxy),D(S_gxxz), SYM,SYM,SYM,      all);
        gpu_fderivs(D(S_Axy), D(S_gxyx),D(S_gxyy),D(S_gxyz), ANTI,ANTI,SYM,     all);
        gpu_fderivs(D(S_Axz), D(S_gxzx),D(S_gxzy),D(S_gxzz), ANTI,SYM,ANTI,     all);
        gpu_fderivs(D(S_Ayy), D(S_gyyx),D(S_gyyy),D(S_gyyz), SYM,SYM,SYM,       all);
        gpu_fderivs(D(S_Ayz), D(S_gyzx),D(S_gyzy),D(S_gyzz), SYM,ANTI,ANTI,     all);
        gpu_fderivs(D(S_Azz), D(S_gzzx),D(S_gzzy),D(S_gzzz), SYM,SYM,SYM,       all);

        kern_phase18_constraints<<<grid(all),BLK>>>(
            D(S_chin1),
            D(S_chix), D(S_chiy), D(S_chiz),
            D(S_gupxx), D(S_gupxy), D(S_gupxz),
            D(S_gupyy), D(S_gupyz), D(S_gupzz),
            D(S_trK),
            D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz),
            D(S_Rxx), D(S_Rxy), D(S_Rxz), D(S_Ryy), D(S_Ryz), D(S_Rzz),
            D(S_rho), D(S_Sx), D(S_Sy), D(S_Sz),
            D(S_Kx), D(S_Ky), D(S_Kz),
            D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
            D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
            D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
            D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
            D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
            D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
            /* dA/dx arrays */
            D(S_gxxx), D(S_gxxy), D(S_gxxz),
            D(S_gxyx), D(S_gxyy), D(S_gxyz),
            D(S_gxzx), D(S_gxzy), D(S_gxzz),
            D(S_gyyx), D(S_gyyy), D(S_gyyz),
            D(S_gyzx), D(S_gyzy), D(S_gyzz),
            D(S_gzzx), D(S_gzzy), D(S_gzzz),
            D(S_ham_Res), D(S_movx_Res), D(S_movy_Res), D(S_movz_Res));
    }

    /* A rejected kernel launch is reported only through cudaGetLastError();
     * check before copying back so stale device data is never returned as a
     * result. */
    CUDA_CHECK(cudaGetLastError());

    /* ============================================================ */
    /*  D2H: copy all output arrays back to host                     */
    /* ============================================================ */
    const int d2h_slot_count = D2H_BASE_SLOT_COUNT +
                               ((co == 0) ? D2H_CONSTRAINT_SLOT_COUNT : 0);
    CUDA_CHECK(cudaMemcpy(g_buf.h_stage, D(S_chi_rhs),
                          (size_t)d2h_slot_count * bytes,
                          cudaMemcpyDeviceToHost));

    double *d2h_dst[] = {
        chi_rhs, trK_rhs,
        gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs,
        Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs,
        Gamx_rhs, Gamy_rhs, Gamz_rhs,
        Lap_rhs, betax_rhs, betay_rhs, betaz_rhs,
        dtSfx_rhs, dtSfy_rhs, dtSfz_rhs,
        Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
        Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
        Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
        Rxx, Rxy, Rxz, Ryy, Ryz, Rzz
    };
    static_assert((int)(sizeof(d2h_dst) / sizeof(d2h_dst[0])) == D2H_BASE_SLOT_COUNT,
                  "d2h_dst list must match D2H_BASE_SLOT_COUNT");
    for (int s = 0; s < D2H_BASE_SLOT_COUNT; ++s) {
        std::memcpy(d2h_dst[s], g_buf.h_stage + (size_t)s * all, bytes);
    }
    if (co == 0) {
        /* Constraint outputs follow the base outputs in the staging buffer. */
        double *d2h_dst_co[] = {
            ham_Res, movx_Res, movy_Res, movz_Res, Gmx_Res, Gmy_Res, Gmz_Res
        };
        static_assert((int)(sizeof(d2h_dst_co) / sizeof(d2h_dst_co[0])) ==
                          D2H_CONSTRAINT_SLOT_COUNT,
                      "d2h_dst_co list must match D2H_CONSTRAINT_SLOT_COUNT");
        for (int s = 0; s < D2H_CONSTRAINT_SLOT_COUNT; ++s) {
            std::memcpy(d2h_dst_co[s],
                        g_buf.h_stage + (size_t)(D2H_BASE_SLOT_COUNT + s) * all,
                        bytes);
        }
    }

    #undef D
    return 0;
}

/*
 * bssn_cuda_rk4_substep — run one RK4 substep of the BSSN system for one block.
 *
 * Stages, in order (each is timed separately when profiling is enabled):
 *   1. state   : bind resident banks (keep_resident_state != 0) or upload
 *                state_host_in; optionally launch kern_enforce_ga_cuda.
 *   2. matter  : zero or upload the matter cache; on RK4 == 0 also snapshot
 *                the input state into ctx.d_state0_mem.
 *   3. rhs     : launch_rhs_pipeline.
 *   4. bc      : gpu_sommerfeld_routbam for each of the BSSN_STATE_COUNT state
 *                variables when apply_bam_bc is non-zero.
 *   5. finalize: gpu_rk4_finalize_batch, plus gpu_copy_patch_boundary_batch
 *                when Lev > 0.
 *   6. output  : mark the output bank current (resident mode) or download the
 *                result into state_host_out.
 *
 * propspeed[i] and soa_flat[3*i .. 3*i+2] are read for i in
 * [0, BSSN_STATE_COUNT) only when apply_bam_bc is non-zero.
 * T is accepted but not used by this function.
 *
 * Returns 1 when RK4 is outside [0, 3], otherwise 0.
 */
extern "C"
int bssn_cuda_rk4_substep(void *block_tag,
                          int *ex, double *X, double *Y, double *Z,
                          double **state_host_in,
                          double **state_host_out,
                          double **matter_host,
                          const double *propspeed,
                          const double *soa_flat,
                          const double *bbox,
                          double &dT,
                          double &T,
                          int &RK4,
                          int &apply_bam_bc,
                          int &Symmetry,
                          int &Lev,
                          double &eps,
                          int &co,
                          int &use_zero_matter,
                          int &keep_resident_state,
                          int &apply_enforce_ga,
                          double &chitiny)
{
    (void)T;

    if (RK4 < 0 || RK4 > 3) return 1;

    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
    const bool profile = cuda_profile_enabled();
    const double t_total0 = profile ? cuda_profile_now_ms() : 0.0;
    double state_ms = 0.0;
    double matter_ms = 0.0;
    double rhs_ms = 0.0;
    double bc_ms = 0.0;
    double finalize_ms = 0.0;
    double output_ms = 0.0;

    /* Number of grid points in the block and the size of one field in bytes. */
    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
    const size_t bytes = all * sizeof(double);
    int touch_xmin = 0, touch_xmax = 0;
    int touch_ymin = 0, touch_ymax = 0;
    int touch_zmin = 0, touch_zmax = 0;

    setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);
    /* The touch_* flags stay 0 on level 0; they are only consumed when Lev > 0. */
    if (Lev > 0) {
        compute_patch_boundary_flags(ex, X, Y, Z, bbox, Symmetry,
                                     touch_xmin, touch_xmax,
                                     touch_ymin, touch_ymax,
                                     touch_zmin, touch_zmax);
    }
    StepContext &ctx = ensure_step_ctx(block_tag, all);
    const bool use_resident_state = (keep_resident_state != 0);
    int input_bank = -1;
    int output_bank = -1;
    if (use_resident_state) {
        /* Resident mode: the state slots are bound to device banks owned by ctx. */
        input_bank = ensure_resident_bank(ctx, state_host_in, all, true);
        output_bank = reserve_resident_output_bank(ctx, state_host_out, all, input_bank);
        mark_resident_current_bank(ctx, input_bank);
        mark_resident_next_bank(ctx, output_bank);
        bind_state_input_slots(ctx.d_resident[input_bank]);
        bind_state_output_slots(ctx.d_resident[output_bank]);
    }
    double t0 = profile ? cuda_profile_now_ms() : 0.0;
    if (!use_resident_state) {
        upload_state_inputs(state_host_in, all);
    }
    if (apply_enforce_ga) {
        kern_enforce_ga_cuda<<<grid(all), BLK>>>(g_buf.slot[S_dxx], g_buf.slot[S_gxy], g_buf.slot[S_gxz],
                                                 g_buf.slot[S_dyy], g_buf.slot[S_gyz], g_buf.slot[S_dzz],
                                                 g_buf.slot[S_Axx], g_buf.slot[S_Axy], g_buf.slot[S_Axz],
                                                 g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]);
        /* A kernel launch reports configuration errors only via cudaGetLastError. */
        CUDA_CHECK(cudaGetLastError());
        /* The kernel was handed the input slots, so the host copy is marked not clean. */
        if (use_resident_state && input_bank >= 0)
            set_resident_host_clean(ctx, input_bank, false);
    }
    if (profile) {
        cuda_profile_sync();
        state_ms += cuda_profile_now_ms() - t0;
    }

    t0 = profile ? cuda_profile_now_ms() : 0.0;
    if (RK4 == 0) {
        if (use_zero_matter) {
            if (!ctx.matter_ready) zero_matter_cache(ctx, all);
        } else {
            upload_matter_cache(ctx, matter_host, all);
        }
        /* First substep: snapshot BSSN_STATE_COUNT fields of the input state
         * into ctx.d_state0_mem with a single contiguous copy. */
        const double *state0_src = use_resident_state
                                 ? ctx.d_resident_mem[input_bank]
                                 : g_buf.slot[S_chi];
        CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, state0_src,
                              (size_t)BSSN_STATE_COUNT * bytes,
                              cudaMemcpyDeviceToDevice));
    } else if (use_zero_matter) {
        if (!ctx.matter_ready) zero_matter_cache(ctx, all);
    } else {
        upload_matter_cache(ctx, matter_host, all);
    }
    bind_matter_slots(ctx);
    if (profile) {
        cuda_profile_sync();
        matter_ms += cuda_profile_now_ms() - t0;
    }

    t0 = profile ? cuda_profile_now_ms() : 0.0;
    launch_rhs_pipeline((int)all, eps, co);
    if (profile) {
        cuda_profile_sync();
        rhs_ms += cuda_profile_now_ms() - t0;
    }

    t0 = profile ? cuda_profile_now_ms() : 0.0;
    if (apply_bam_bc) {
        /* One boundary call per state variable, pairing its input slot with its RHS slot. */
        for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
            gpu_sommerfeld_routbam(g_buf.slot[k_state_input_slots[i]],
                                   g_buf.slot[k_state_rhs_slots[i]],
                                   propspeed[i],
                                   soa_flat[3 * i + 0],
                                   soa_flat[3 * i + 1],
                                   soa_flat[3 * i + 2],
                                   X, Y, Z, bbox, Symmetry);
        }
    }
    if (profile) {
        cuda_profile_sync();
        bc_ms += cuda_profile_now_ms() - t0;
    }

    t0 = profile ? cuda_profile_now_ms() : 0.0;
    gpu_rk4_finalize_batch(ctx, all, dT, RK4, chitiny);
    if (Lev > 0) {
        gpu_copy_patch_boundary_batch((int)all,
                                      touch_xmin, touch_xmax,
                                      touch_ymin, touch_ymax,
                                      touch_zmin, touch_zmax);
    }
    if (profile) {
        cuda_profile_sync();
        finalize_ms += cuda_profile_now_ms() - t0;
    }

    t0 = profile ? cuda_profile_now_ms() : 0.0;
    if (use_resident_state) {
        /* The result stays on the device: publish the output bank as current
         * and flag its host copy as not clean. */
        ctx.resident_valid[output_bank] = true;
        ctx.resident_age[output_bank] = ++ctx.resident_clock;
        set_resident_host_clean(ctx, output_bank, false);
        mark_resident_current_bank(ctx, output_bank);
        update_state_ready(ctx);
    } else {
        download_state_outputs(state_host_out, all);
    }
    if (RK4 == 3) {
        ctx.matter_ready = false;  /* invalidate matter cache for next timestep */
    }
    if (profile) {
        cuda_profile_sync();
        output_ms += cuda_profile_now_ms() - t0;
        CudaProfileStats &stats = cuda_profile_stats();
        stats.calls++;
        stats.total_ms += cuda_profile_now_ms() - t_total0;
        stats.state_ms += state_ms;
        stats.matter_ms += matter_ms;
        stats.rhs_ms += rhs_ms;
        stats.bc_ms += bc_ms;
        stats.finalize_ms += finalize_ms;
        stats.output_ms += output_ms;
        cuda_profile_maybe_log();
    }
    return 0;
}

/*
 * bssn_em_cuda_rk4_substep — run one RK4 substep of the BSSN+EM system for
 * one block.
 *
 * Stages, in order (each is timed separately when profiling is enabled):
 *   1. state   : bind EM resident banks (keep_resident_state != 0) or upload
 *                state_host_in; upload the fixed sources; optionally launch
 *                kern_enforce_ga_cuda; on RK4 == 0 snapshot the input state
 *                into ctx.d_state0 / ctx.d_state0_mem.
 *   2. em      : either the zero fast path (zeroed matter cache and EM output
 *                slots) or gpu_em_rhs_sources.
 *   3. rhs     : launch_rhs_pipeline.
 *   4. bc      : gpu_sommerfeld_routbam for each of the BSSN_EM_STATE_COUNT
 *                state variables when apply_bam_bc is non-zero.
 *   5. finalize: the plain or EM RK4 finalize batch, plus the matching patch
 *                boundary routine when Lev > 0.
 *   6. output  : mark the output bank current (resident mode) or download the
 *                result into state_host_out.
 *
 * The zero fast path is only considered in resident mode and is decided by
 * em_detect_zero_fast_path. T is accepted but not used by this function.
 * The EM stage time is accumulated into stats.matter_ms.
 *
 * Returns 1 when RK4 is outside [0, 3], otherwise 0.
 */
extern "C"
int bssn_em_cuda_rk4_substep(void *block_tag,
                             int *ex, double *X, double *Y, double *Z,
                             double **state_host_in,
                             double **state_host_out,
                             double **source_host,
                             const double *propspeed,
                             const double *soa_flat,
                             const double *bbox,
                             double &dT,
                             double &T,
                             int &RK4,
                             int &apply_bam_bc,
                             int &Symmetry,
                             int &Lev,
                             double &eps,
                             int &co,
                             int &keep_resident_state,
                             int &apply_enforce_ga,
                             double &chitiny)
{
    (void)T;
    if (RK4 < 0 || RK4 > 3) return 1;

    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
    const bool profile = cuda_profile_enabled();
    const double t_total0 = profile ? cuda_profile_now_ms() : 0.0;
    double state_ms = 0.0;
    double em_ms = 0.0;
    double rhs_ms = 0.0;
    double bc_ms = 0.0;
    double finalize_ms = 0.0;
    double output_ms = 0.0;

    /* Number of grid points in the block and the size of one field in bytes. */
    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
    const size_t bytes = all * sizeof(double);
    int touch_xmin = 0, touch_xmax = 0;
    int touch_ymin = 0, touch_ymax = 0;
    int touch_zmin = 0, touch_zmax = 0;

    setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);
    /* The touch_* flags stay 0 on level 0; they are only consumed when Lev > 0. */
    if (Lev > 0) {
        compute_patch_boundary_flags(ex, X, Y, Z, bbox, Symmetry,
                                     touch_xmin, touch_xmax,
                                     touch_ymin, touch_ymax,
                                     touch_zmin, touch_zmax);
    }

    StepContext &ctx = ensure_step_ctx(block_tag, all);
    double t0 = profile ? cuda_profile_now_ms() : 0.0;
    const bool use_resident_state = (keep_resident_state != 0);
    int input_bank = -1;
    int output_bank = -1;
    if (use_resident_state) {
        /* Resident mode: the EM state slots are bound to device banks owned by ctx. */
        input_bank = ensure_em_resident_bank(ctx, state_host_in, all, true);
        output_bank = reserve_em_resident_output_bank(ctx, state_host_out, all, input_bank);
        mark_resident_current_bank(ctx, input_bank);
        mark_resident_next_bank(ctx, output_bank);
        bind_em_state_input_slots(ctx.d_resident[input_bank]);
        bind_em_state_output_slots(ctx.d_resident[output_bank]);
    } else {
        upload_em_state_inputs(state_host_in, all);
    }
    upload_em_fixed_sources(ctx, source_host, all);
    /* Short-circuit keeps the detector from running with input_bank == -1. */
    const bool use_em_zero_fast =
        use_resident_state && em_detect_zero_fast_path(ctx, input_bank, all);

    if (apply_enforce_ga) {
        kern_enforce_ga_cuda<<<grid(all), BLK>>>(g_buf.slot[S_dxx], g_buf.slot[S_gxy], g_buf.slot[S_gxz],
                                                 g_buf.slot[S_dyy], g_buf.slot[S_gyz], g_buf.slot[S_dzz],
                                                 g_buf.slot[S_Axx], g_buf.slot[S_Axy], g_buf.slot[S_Axz],
                                                 g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]);
        /* A kernel launch reports configuration errors only via cudaGetLastError. */
        CUDA_CHECK(cudaGetLastError());
        /* The kernel was handed the input slots, so the host copy is marked not clean. */
        if (use_resident_state && input_bank >= 0)
            set_resident_host_clean(ctx, input_bank, false);
    }
    if (profile) {
        cuda_profile_sync();
        state_ms += cuda_profile_now_ms() - t0;
    }

    t0 = profile ? cuda_profile_now_ms() : 0.0;
    if (RK4 == 0) {
        /* First substep: snapshot the input state for the later finalize stage. */
        if (use_resident_state) {
            CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, ctx.d_resident_mem[input_bank],
                                  (size_t)BSSN_EM_STATE_COUNT * bytes,
                                  cudaMemcpyDeviceToDevice));
        } else {
            /* Non-resident slots are copied one field at a time via the slot table. */
            for (int i = 0; i < BSSN_EM_STATE_COUNT; ++i) {
                CUDA_CHECK(cudaMemcpy(ctx.d_state0[i], g_buf.slot[k_em_state_input_slots[i]],
                                      bytes, cudaMemcpyDeviceToDevice));
            }
        }
    }
    if (profile) {
        cuda_profile_sync();
        state_ms += cuda_profile_now_ms() - t0;
    }

    t0 = profile ? cuda_profile_now_ms() : 0.0;
    if (use_em_zero_fast) {
        zero_matter_cache(ctx, all);
        bind_matter_slots(ctx);
        zero_em_output_slots_async(all);
    } else {
        gpu_em_rhs_sources((int)all, eps);
    }
    if (profile) {
        cuda_profile_sync();
        em_ms += cuda_profile_now_ms() - t0;
    }

    t0 = profile ? cuda_profile_now_ms() : 0.0;
    launch_rhs_pipeline((int)all, eps, co);
    if (profile) {
        cuda_profile_sync();
        rhs_ms += cuda_profile_now_ms() - t0;
    }

    t0 = profile ? cuda_profile_now_ms() : 0.0;
    if (apply_bam_bc) {
        /* One boundary call per state variable, pairing its input slot with its RHS slot. */
        for (int i = 0; i < BSSN_EM_STATE_COUNT; ++i) {
            gpu_sommerfeld_routbam(g_buf.slot[k_em_state_input_slots[i]],
                                   g_buf.slot[k_em_state_rhs_slots[i]],
                                   propspeed[i],
                                   soa_flat[3 * i + 0],
                                   soa_flat[3 * i + 1],
                                   soa_flat[3 * i + 2],
                                   X, Y, Z, bbox, Symmetry);
        }
    }
    if (profile) {
        cuda_profile_sync();
        bc_ms += cuda_profile_now_ms() - t0;
    }

    t0 = profile ? cuda_profile_now_ms() : 0.0;
    /* The zero fast path reuses the plain BSSN finalize and boundary routines. */
    if (use_em_zero_fast)
        gpu_rk4_finalize_batch(ctx, all, dT, RK4, chitiny);
    else
        gpu_em_rk4_finalize_batch(ctx, all, dT, RK4, chitiny);
    if (Lev > 0) {
        if (use_em_zero_fast)
            gpu_copy_patch_boundary_batch((int)all,
                                          touch_xmin, touch_xmax,
                                          touch_ymin, touch_ymax,
                                          touch_zmin, touch_zmax);
        else
            gpu_em_restore_patch_boundary_batch(ctx, (int)all,
                                                touch_xmin, touch_xmax,
                                                touch_ymin, touch_ymax,
                                                touch_zmin, touch_zmax);
    }
    if (profile) {
        cuda_profile_sync();
        finalize_ms += cuda_profile_now_ms() - t0;
    }

    t0 = profile ? cuda_profile_now_ms() : 0.0;
    if (use_resident_state) {
        /* The result stays on the device: publish the output bank as current
         * and flag its host copy as not clean. */
        ctx.resident_valid[output_bank] = true;
        ctx.resident_age[output_bank] = ++ctx.resident_clock;
        set_resident_host_clean(ctx, output_bank, false);
        mark_resident_current_bank(ctx, output_bank);
        update_state_ready(ctx);
    } else {
        download_em_state_outputs(state_host_out, all);
    }
    if (profile) {
        cuda_profile_sync();
        output_ms += cuda_profile_now_ms() - t0;
        CudaProfileStats &stats = cuda_profile_stats();
        stats.calls++;
        stats.total_ms += cuda_profile_now_ms() - t_total0;
        stats.state_ms += state_ms;
        stats.matter_ms += em_ms;
        stats.rhs_ms += rhs_ms;
        stats.bc_ms += bc_ms;
        stats.finalize_ms += finalize_ms;
        stats.output_ms += output_ms;
        cuda_profile_maybe_log();
    }
    /* Last substep: force the matter cache to be rebuilt on the next use. */
    if (RK4 == 3)
        ctx.matter_ready = false;
    return 0;
}

/*
 * Report whether the EM zero fast path is known to be active for a block.
 *
 * Returns 1 only when a step context is registered under block_tag and both
 * its em_zero_fast_known and em_zero_fast flags are set; returns 0 otherwise,
 * including when no context exists. No context is created by this lookup.
 */
extern "C"
int bssn_em_cuda_resident_zero_fast_state(void *block_tag)
{
    const auto found = g_step_ctx.find(block_tag);
    if (found != g_step_ctx.end()) {
        const StepContext &step = found->second;
        if (step.em_zero_fast_known && step.em_zero_fast)
            return 1;
    }
    return 0;
}

/*
 * Copy a sub-region of one state variable from the device to host_state.
 *
 * Selects this process's dispatch device and forwards every argument to
 * copy_state_region_cuda with the device-to-host direction.
 * Always returns 0.
 */
extern "C"
int bssn_cuda_copy_state_region_to_host(void *block_tag,
                                        int state_index,
                                        double *host_state,
                                        int *ex,
                                        int i0, int j0, int k0,
                                        int sx, int sy, int sz)
{
    init_gpu_dispatch();
    const int device = g_dispatch.my_device;
    CUDA_CHECK(cudaSetDevice(device));

    const cudaMemcpyKind direction = cudaMemcpyDeviceToHost;
    copy_state_region_cuda(block_tag, state_index, host_state, ex,
                           i0, j0, k0,
                           sx, sy, sz,
                           direction);
    return 0;
}

/*
 * Copy a sub-region of one state variable from host_state to the device.
 *
 * Selects this process's dispatch device and forwards every argument to
 * copy_state_region_cuda with the host-to-device direction.
 * Always returns 0.
 */
extern "C"
int bssn_cuda_copy_state_region_from_host(void *block_tag,
                                          int state_index,
                                          double *host_state,
                                          int *ex,
                                          int i0, int j0, int k0,
                                          int sx, int sy, int sz)
{
    init_gpu_dispatch();
    const int device = g_dispatch.my_device;
    CUDA_CHECK(cudaSetDevice(device));

    const cudaMemcpyKind direction = cudaMemcpyHostToDevice;
    copy_state_region_cuda(block_tag, state_index, host_state, ex,
                           i0, j0, k0,
                           sx, sy, sz,
                           direction);
    return 0;
}

/*
 * C entry point for download_resident_state.
 *
 * Selects this process's dispatch device, then hands block_tag, ex and
 * state_host_out to download_resident_state. Always returns 0.
 */
extern "C"
int bssn_cuda_download_resident_state(void *block_tag,
                                      int *ex,
                                      double **state_host_out)
{
    init_gpu_dispatch();
    const int device = g_dispatch.my_device;
    CUDA_CHECK(cudaSetDevice(device));

    download_resident_state(block_tag, ex, state_host_out);
    return 0;
}

/*
 * Download the resident state using the scalar-field variable count.
 *
 * Selects this process's dispatch device, then calls
 * download_resident_state_count with BSSN_ESCALAR_STATE_COUNT.
 * Always returns 0.
 */
extern "C"
int bssn_escalar_cuda_download_resident_state(void *block_tag,
                                              int *ex,
                                              double **state_host_out)
{
    init_gpu_dispatch();
    const int device = g_dispatch.my_device;
    CUDA_CHECK(cudaSetDevice(device));

    download_resident_state_count(block_tag, ex, state_host_out,
                                  BSSN_ESCALAR_STATE_COUNT);
    return 0;
}

/*
 * Upload state_count host fields through upload_resident_state_count.
 *
 * The dispatch device is selected before the argument check.
 * Returns 1 when state_count is outside [1, BSSN_RESIDENT_STATE_CAPACITY],
 * otherwise 0 after the upload call.
 */
extern "C"
int bssn_cuda_upload_resident_state_count(void *block_tag,
                                          int *ex,
                                          double **state_host_in,
                                          int state_count)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    const bool count_in_range =
        state_count > 0 && state_count <= BSSN_RESIDENT_STATE_CAPACITY;
    if (!count_in_range) {
        return 1;
    }

    upload_resident_state_count(block_tag, ex, state_host_in, state_count);
    return 0;
}

/*
 * Scalar-field convenience wrapper: uploads BSSN_ESCALAR_STATE_COUNT fields
 * via bssn_cuda_upload_resident_state_count and returns its status.
 */
extern "C"
int bssn_escalar_cuda_upload_resident_state(void *block_tag,
                                            int *ex,
                                            double **state_host_in)
{
    const int status =
        bssn_cuda_upload_resident_state_count(block_tag, ex, state_host_in,
                                              BSSN_ESCALAR_STATE_COUNT);
    return status;
}

/*
 * C entry point for keep_only_resident_state_count.
 *
 * The dispatch device is selected before the argument check.
 * Returns 1 when state_count is outside [1, BSSN_RESIDENT_STATE_CAPACITY],
 * otherwise 0 after forwarding block_tag, ex, state_host_key and state_count.
 */
extern "C"
int bssn_cuda_keep_only_resident_state_count(void *block_tag,
                                             int *ex,
                                             double **state_host_key,
                                             int state_count)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    const bool count_in_range =
        state_count > 0 && state_count <= BSSN_RESIDENT_STATE_CAPACITY;
    if (!count_in_range) {
        return 1;
    }

    keep_only_resident_state_count(block_tag, ex, state_host_key, state_count);
    return 0;
}

/*
 * Scalar-field convenience wrapper: calls
 * bssn_cuda_keep_only_resident_state_count with BSSN_ESCALAR_STATE_COUNT
 * and returns its status.
 */
extern "C"
int bssn_escalar_cuda_keep_only_resident_state(void *block_tag,
                                               int *ex,
                                               double **state_host_key)
{
    const int status =
        bssn_cuda_keep_only_resident_state_count(block_tag, ex, state_host_key,
                                                 BSSN_ESCALAR_STATE_COUNT);
    return status;
}

/*
 * C entry point for download_resident_state_count_if_present.
 *
 * The dispatch device is selected before the argument check.
 * Returns 1 when state_count is outside [1, BSSN_RESIDENT_STATE_CAPACITY],
 * otherwise 0 after forwarding the arguments. The return value does not say
 * whether anything was actually downloaded.
 */
extern "C"
int bssn_cuda_download_resident_state_count_if_present(void *block_tag,
                                                       int *ex,
                                                       double **state_host_out,
                                                       int state_count)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    const bool count_in_range =
        state_count > 0 && state_count <= BSSN_RESIDENT_STATE_CAPACITY;
    if (!count_in_range) {
        return 1;
    }

    download_resident_state_count_if_present(block_tag, ex, state_host_out, state_count);
    return 0;
}

/*
 * C entry point for download_resident_state_if_present.
 *
 * The dispatch device is selected before the argument check.
 * Returns 1 when block_tag, ex or state_host_out is null, otherwise 0 after
 * forwarding the arguments. The return value does not say whether anything
 * was actually downloaded.
 */
extern "C"
int bssn_cuda_download_resident_state_if_present(void *block_tag,
                                                int *ex,
                                                double **state_host_out)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    const bool have_all_args = block_tag && ex && state_host_out;
    if (!have_all_args) {
        return 1;
    }

    download_resident_state_if_present(block_tag, ex, state_host_out);
    return 0;
}

/*
 * Download the constraint output fields into constraint_host_out.
 *
 * ex supplies the three block extents; their product is the per-field point
 * count handed to download_constraint_outputs.
 *
 * Returns 1 when ex is null (it would otherwise be dereferenced), else 0.
 */
extern "C"
int bssn_cuda_download_constraint_outputs(int *ex,
                                          double **constraint_host_out)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
    /* Same convention as the other entry points: bad pointer -> status 1. */
    if (!ex)
        return 1;
    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
    download_constraint_outputs(constraint_host_out, all);
    return 0;
}

/*
 * Pack a sub-region of one state variable from the device into host_buffer.
 *
 * Selects this process's dispatch device and forwards every argument to
 * copy_state_region_packed_cuda with the device-to-host direction.
 * Always returns 0.
 */
extern "C"
int bssn_cuda_pack_state_region_to_host_buffer(void *block_tag,
                                               int state_index,
                                               double *host_buffer,
                                               int *ex,
                                               int i0, int j0, int k0,
                                               int sx, int sy, int sz)
{
    init_gpu_dispatch();
    const int device = g_dispatch.my_device;
    CUDA_CHECK(cudaSetDevice(device));

    const cudaMemcpyKind direction = cudaMemcpyDeviceToHost;
    copy_state_region_packed_cuda(block_tag, state_index, host_buffer, ex,
                                  i0, j0, k0,
                                  sx, sy, sz,
                                  direction);
    return 0;
}

/*
 * Interpolate three resident state variables at a single point.
 *
 * state0..state2 select the variables inside the resident bank matched by
 * find_resident_bank_subset(ctx, state_host_key, ...). The kernel runs as one
 * block of three threads and writes three doubles, which are copied to out3.
 * soa3 supplies nine doubles (soa3[0..8]) that are passed to the kernel.
 *
 * Returns 1 when:
 *   - block_tag, ex, out3 or soa3 is null;
 *   - any state index is outside [0, BSSN_RESIDENT_STATE_CAPACITY);
 *   - any extent is non-positive, ordn is outside [1, 8], or an extent is
 *     smaller than ordn;
 *   - no valid resident bank matches.
 * Returns 0 after out3 has been filled.
 */
extern "C"
int bssn_cuda_interp_state_point3(void *block_tag,
                                  int *ex,
                                  int state0,
                                  int state1,
                                  int state2,
                                  double x0,
                                  double y0,
                                  double z0,
                                  double dx,
                                  double dy,
                                  double dz,
                                  double px,
                                  double py,
                                  double pz,
                                  int ordn,
                                  int symmetry,
                                  double **state_host_key,
                                  const double *soa3,
                                  double *out3)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
    if (!block_tag || !ex || !out3 || !soa3)
        return 1;
    if (state0 < 0 || state0 >= BSSN_RESIDENT_STATE_CAPACITY ||
        state1 < 0 || state1 >= BSSN_RESIDENT_STATE_CAPACITY ||
        state2 < 0 || state2 >= BSSN_RESIDENT_STATE_CAPACITY)
        return 1;
    if (ex[0] <= 0 || ex[1] <= 0 || ex[2] <= 0 ||
        ordn <= 0 || ordn > 8 ||
        ex[0] < ordn || ex[1] < ordn || ex[2] < ordn)
        return 1;

    /* Point count of the block, computed once and shared by both uses. */
    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
    StepContext &ctx = ensure_step_ctx(block_tag, all);
    const int interp_states[3] = {state0, state1, state2};
    const int bank = find_resident_bank_subset(ctx, state_host_key, interp_states, 3);
    if (bank < 0 || !ctx.resident_valid[bank])
        return 1;

    double *d_out = ensure_step_comm_buffer(ctx, 3);
    kern_interp_state_point3<<<1, 3>>>(
        ctx.d_resident_mem[bank], d_out,
        ex[0], ex[1], ex[2], (int)all,
        state0, state1, state2,
        x0, y0, z0, dx, dy, dz,
        px, py, pz, ordn, symmetry,
        soa3[0], soa3[1], soa3[2],
        soa3[3], soa3[4], soa3[5],
        soa3[6], soa3[7], soa3[8]);
    /* Surface launch failures here rather than attributing them to the copy below. */
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaMemcpy(out3, d_out, 3 * sizeof(double), cudaMemcpyDeviceToHost));
    return 0;
}

/*
 * Interpolate two host-resident fields at npoints locations on the GPU.
 *
 * field0 and field1 each hold ex[0]*ex[1]*ex[2] doubles; px, py and pz each
 * hold npoints doubles. All of them are uploaded into temporary device
 * buffers for this call and released before returning. out_interleaved
 * receives 2*npoints doubles produced by kern_interp_host_two_fields.
 * soa6 supplies six doubles (soa6[0..5]) that are passed to the kernel.
 * block_tag is accepted but not used.
 *
 * Returns 1 when:
 *   - any pointer argument other than block_tag is null or npoints <= 0;
 *   - any extent is non-positive, ordn is outside [1, 8], or an extent is
 *     smaller than ordn;
 *   - the point count of the block does not fit in a 32-bit int, which is
 *     the type the kernel receives it in.
 * Returns 0 after out_interleaved has been filled.
 */
extern "C"
int bssn_cuda_interp_host_two_fields(void *block_tag,
                                     int *ex,
                                     double *field0,
                                     double *field1,
                                     double x0,
                                     double y0,
                                     double z0,
                                     double dx,
                                     double dy,
                                     double dz,
                                     const double *px,
                                     const double *py,
                                     const double *pz,
                                     int npoints,
                                     int ordn,
                                     int symmetry,
                                     const double *soa6,
                                     double *out_interleaved)
{
    (void)block_tag;
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
    if (!ex || !field0 || !field1 || !px || !py || !pz || !soa6 ||
        !out_interleaved || npoints <= 0)
        return 1;
    if (ex[0] <= 0 || ex[1] <= 0 || ex[2] <= 0 ||
        ordn <= 0 || ordn > 8 ||
        ex[0] < ordn || ex[1] < ordn || ex[2] < ordn)
        return 1;

    /* Form the point count in size_t: the plain int product overflows (UB)
     * for large blocks and would under-size the device allocations. The
     * check is staged so that no intermediate product can wrap. */
    const size_t int_limit = (size_t)0x7fffffff;
    const size_t plane = (size_t)ex[0] * (size_t)ex[1];
    if (plane > int_limit || plane * (size_t)ex[2] > int_limit)
        return 1;
    const int all = (int)(plane * (size_t)ex[2]);
    const size_t field_bytes = (size_t)all * sizeof(double);
    const size_t point_bytes = (size_t)npoints * sizeof(double);
    const size_t out_bytes = (size_t)2 * npoints * sizeof(double);

    double *d_field0 = nullptr;
    double *d_field1 = nullptr;
    double *d_px = nullptr;
    double *d_py = nullptr;
    double *d_pz = nullptr;
    double *d_out = nullptr;
    CUDA_CHECK(cudaMalloc(&d_field0, field_bytes));
    CUDA_CHECK(cudaMalloc(&d_field1, field_bytes));
    CUDA_CHECK(cudaMalloc(&d_px, point_bytes));
    CUDA_CHECK(cudaMalloc(&d_py, point_bytes));
    CUDA_CHECK(cudaMalloc(&d_pz, point_bytes));
    CUDA_CHECK(cudaMalloc(&d_out, out_bytes));

    CUDA_CHECK(cudaMemcpy(d_field0, field0, field_bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_field1, field1, field_bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_px, px, point_bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_py, py, point_bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_pz, pz, point_bytes, cudaMemcpyHostToDevice));

    /* One thread per interpolation point, rounded up to whole blocks. */
    const int threads = 256;
    const int blocks = (int)(((size_t)npoints + (size_t)threads - 1) / (size_t)threads);
    kern_interp_host_two_fields<<<blocks, threads>>>(
        d_field0, d_field1, d_px, d_py, d_pz, d_out,
        ex[0], ex[1], ex[2], all,
        x0, y0, z0, dx, dy, dz,
        npoints, ordn, symmetry,
        soa6[0], soa6[1], soa6[2],
        soa6[3], soa6[4], soa6[5]);
    CUDA_CHECK(cudaGetLastError());
    /* The blocking copy also waits for the kernel to finish. */
    CUDA_CHECK(cudaMemcpy(out_interleaved, d_out, out_bytes, cudaMemcpyDeviceToHost));

    cudaFree(d_out);
    cudaFree(d_pz);
    cudaFree(d_py);
    cudaFree(d_px);
    cudaFree(d_field1);
    cudaFree(d_field0);
    return 0;
}

/*
 * kern_shell_pack_host_fields — weighted-stencil sum, one output per thread.
 *
 * Launch: 1-D grid of 1-D blocks covering at least npoints * nvars threads;
 * surplus threads return immediately. No shared memory is used.
 *
 * Thread t handles point p = t / nvars and variable v = t % nvars:
 *   - point_block[p] selects the block; block_shapes holds three extents per
 *     block and fields[block * nvars + v] is the field that is read;
 *   - point_sind holds three start indices per point;
 *   - point_coef holds 3 * ordn weights per point.
 * point_dimh[p] == 3 sums an ordn^3 stencil using three weight vectors.
 * point_dimh[p] == 1 sums an ordn-long line using the first weight vector;
 * point_dumyd[p] selects which index of the field the line runs along
 * (1: fastest index, 0: second index). Any other combination writes 0.0.
 * The result is stored in out[t].
 */
__global__ void kern_shell_pack_host_fields(double **fields,
                                            const int *block_shapes,
                                            const int *point_block,
                                            const int *point_dimh,
                                            const int *point_dumyd,
                                            const int *point_sind,
                                            const double *point_coef,
                                            double *out,
                                            int npoints,
                                            int nvars,
                                            int ordn)
{
    const int slot = blockIdx.x * blockDim.x + threadIdx.x;
    if (slot >= npoints * nvars) return;

    /* Split the flat thread index into (point, variable). */
    const int point = slot / nvars;
    const int var = slot - point * nvars;
    const int block = point_block[point];

    const int nx = block_shapes[3 * block + 0];
    const int ny = block_shapes[3 * block + 1];
    const int *start = point_sind + 3 * point;
    const double *weights = point_coef + 3 * ordn * point;
    const double *field = fields[block * nvars + var];
    const int dimh = point_dimh[point];
    const int dumyd = point_dumyd[point];

    double acc = 0.0;
    if (dimh == 3) {
        /* Full stencil: the weight vectors are stored back to back. */
        const double *wx = weights;
        const double *wy = weights + ordn;
        const double *wz = weights + 2 * ordn;
        for (int kk = 0; kk < ordn; ++kk) {
            for (int jj = 0; jj < ordn; ++jj) {
                const int row = nx * ((start[1] + jj) + ny * (start[2] + kk));
                for (int ii = 0; ii < ordn; ++ii) {
                    acc += wx[ii] * wy[jj] * wz[kk] * field[(start[0] + ii) + row];
                }
            }
        }
    } else if (dimh == 1) {
        if (dumyd == 1) {
            /* Line along the fastest index; start[0] is where it begins. */
            const int row = nx * (start[1] + ny * start[2]);
            for (int ii = 0; ii < ordn; ++ii) {
                acc += weights[ii] * field[(start[0] + ii) + row];
            }
        } else if (dumyd == 0) {
            /* Line along the second index; here start[1] is the fastest index. */
            for (int jj = 0; jj < ordn; ++jj) {
                acc += weights[jj] * field[start[1] + nx * ((start[0] + jj) + ny * start[2])];
            }
        }
    }

    out[slot] = acc;
}

/*
 * One cached device copy of a host field used by
 * bssn_cuda_shell_pack_host_fields. Entries are created with a brace
 * initialiser, so the member order below must not change.
 */
struct ShellPackCachedField {
    double *device;   /* device buffer returned by cudaMalloc */
    size_t bytes;     /* size of the buffer; compared on lookup */
    int generation;   /* value of g_shell_pack_generation when the entry was stored */
};

/* Cache keyed by host field pointer; emptied by the cache_begin/cache_end entry points. */
static std::unordered_map<const double *, ShellPackCachedField> g_shell_pack_cache;
/* Incremented by bssn_cuda_shell_pack_cache_begin; a lookup only hits on a matching value. */
static int g_shell_pack_generation = 0;

/*
 * Start a new shell-pack cache generation.
 *
 * Selects this process's dispatch device, frees every device buffer held in
 * g_shell_pack_cache, empties the cache and increments
 * g_shell_pack_generation.
 */
extern "C"
void bssn_cuda_shell_pack_cache_begin()
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    for (auto entry = g_shell_pack_cache.begin();
         entry != g_shell_pack_cache.end(); ++entry) {
        cudaFree(entry->second.device);
    }
    g_shell_pack_cache.clear();
    g_shell_pack_generation += 1;
}

/*
 * Release the shell-pack cache.
 *
 * Selects this process's dispatch device, frees every device buffer held in
 * g_shell_pack_cache and empties the cache. The generation counter is left
 * unchanged.
 */
extern "C"
void bssn_cuda_shell_pack_cache_end()
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    for (auto entry = g_shell_pack_cache.begin();
         entry != g_shell_pack_cache.end(); ++entry) {
        cudaFree(entry->second.device);
    }
    g_shell_pack_cache.clear();
}

/*
 * Evaluate weighted-stencil sums over host fields on the GPU.
 *
 * Inputs:
 *   block_var_fields : nblocks * nvars host field pointers, indexed
 *                      [block * nvars + var]
 *   block_shapes     : three extents per block
 *   point_block      : block index per point
 *   point_dimh,
 *   point_dumyd      : per-point mode selectors consumed by the kernel
 *   point_sind       : three start indices per point
 *   point_coef       : 3 * ordn weights per point
 * Output:
 *   out              : npoints * nvars doubles, indexed [point * nvars + var]
 *
 * Host fields are uploaded once and kept in g_shell_pack_cache, keyed by host
 * pointer. A cached entry is reused only when its byte size and generation
 * match. A stale entry is replaced and its old device buffer is freed at the
 * end of this call (deferred, in case the old buffer was already handed to
 * the kernel earlier in the same call).
 *
 * Returns 1 when a count is non-positive, ordn is outside [1, 8], a pointer
 * argument or any entry of block_var_fields is null, or nblocks * nvars or
 * npoints * nvars does not fit in a 32-bit int. Returns 0 after out has been
 * filled.
 */
extern "C"
int bssn_cuda_shell_pack_host_fields(int npoints,
                                     int nvars,
                                     int nblocks,
                                     int ordn,
                                     double **block_var_fields,
                                     int *block_shapes,
                                     int *point_block,
                                     int *point_dimh,
                                     int *point_dumyd,
                                     int *point_sind,
                                     double *point_coef,
                                     double *out)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
    if (npoints <= 0 || nvars <= 0 || nblocks <= 0 || ordn <= 0 || ordn > 8 ||
        !block_var_fields || !block_shapes || !point_block || !point_dimh ||
        !point_dumyd || !point_sind || !point_coef || !out)
        return 1;

    /* The kernel indexes with 32-bit int; reject products that would overflow. */
    const size_t int_limit = (size_t)0x7fffffff;
    const size_t field_count_sz = (size_t)nblocks * (size_t)nvars;
    const size_t total_sz = (size_t)npoints * (size_t)nvars;
    if (field_count_sz > int_limit || total_sz > int_limit)
        return 1;
    const int field_count = (int)field_count_sz;
    const int total = (int)total_sz;

    /* Validate every host field up front so a failure leaves the cache untouched. */
    for (int idx = 0; idx < field_count; ++idx) {
        if (!block_var_fields[idx]) return 1;
    }

    std::vector<double *> h_device_fields((size_t)field_count, nullptr);
    std::vector<double *> stale_buffers;  /* replaced cache buffers, freed at the end */
    double **d_fields = nullptr;
    int *d_block_shapes = nullptr;
    int *d_point_block = nullptr;
    int *d_point_dimh = nullptr;
    int *d_point_dumyd = nullptr;
    int *d_point_sind = nullptr;
    double *d_point_coef = nullptr;
    double *d_out = nullptr;

    for (int b = 0; b < nblocks; ++b) {
        const size_t all = (size_t)block_shapes[3 * b] *
                           (size_t)block_shapes[3 * b + 1] *
                           (size_t)block_shapes[3 * b + 2];
        const size_t bytes = all * sizeof(double);
        for (int v = 0; v < nvars; ++v) {
            const int idx = b * nvars + v;
            double *host_ptr = block_var_fields[idx];
            auto it = g_shell_pack_cache.find(host_ptr);
            if (it != g_shell_pack_cache.end() &&
                it->second.bytes == bytes &&
                it->second.generation == g_shell_pack_generation) {
                h_device_fields[idx] = it->second.device;
                continue;
            }
            if (it != g_shell_pack_cache.end()) {
                /* Stale entry: overwriting it without this would leak its buffer. */
                stale_buffers.push_back(it->second.device);
                g_shell_pack_cache.erase(it);
            }
            double *device_ptr = nullptr;
            CUDA_CHECK(cudaMalloc(&device_ptr, bytes));
            CUDA_CHECK(cudaMemcpy(device_ptr, host_ptr, bytes, cudaMemcpyHostToDevice));
            g_shell_pack_cache[host_ptr] = {device_ptr, bytes, g_shell_pack_generation};
            h_device_fields[idx] = device_ptr;
        }
    }

    CUDA_CHECK(cudaMalloc(&d_fields, (size_t)field_count * sizeof(double *)));
    CUDA_CHECK(cudaMemcpy(d_fields, h_device_fields.data(),
                          (size_t)field_count * sizeof(double *),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMalloc(&d_block_shapes, (size_t)nblocks * 3 * sizeof(int)));
    CUDA_CHECK(cudaMemcpy(d_block_shapes, block_shapes,
                          (size_t)nblocks * 3 * sizeof(int),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMalloc(&d_point_block, (size_t)npoints * sizeof(int)));
    CUDA_CHECK(cudaMalloc(&d_point_dimh, (size_t)npoints * sizeof(int)));
    CUDA_CHECK(cudaMalloc(&d_point_dumyd, (size_t)npoints * sizeof(int)));
    CUDA_CHECK(cudaMalloc(&d_point_sind, (size_t)npoints * 3 * sizeof(int)));
    CUDA_CHECK(cudaMalloc(&d_point_coef, (size_t)npoints * 3 * ordn * sizeof(double)));
    CUDA_CHECK(cudaMalloc(&d_out, total_sz * sizeof(double)));

    CUDA_CHECK(cudaMemcpy(d_point_block, point_block,
                          (size_t)npoints * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_point_dimh, point_dimh,
                          (size_t)npoints * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_point_dumyd, point_dumyd,
                          (size_t)npoints * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_point_sind, point_sind,
                          (size_t)npoints * 3 * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_point_coef, point_coef,
                          (size_t)npoints * 3 * ordn * sizeof(double),
                          cudaMemcpyHostToDevice));

    /* One thread per (point, variable) pair, rounded up to whole blocks. */
    const int threads = 256;
    const int blocks = (int)((total_sz + (size_t)threads - 1) / (size_t)threads);
    kern_shell_pack_host_fields<<<blocks, threads>>>(
        d_fields, d_block_shapes, d_point_block, d_point_dimh,
        d_point_dumyd, d_point_sind, d_point_coef, d_out,
        npoints, nvars, ordn);
    CUDA_CHECK(cudaGetLastError());
    /* The blocking copy also waits for the kernel to finish. */
    CUDA_CHECK(cudaMemcpy(out, d_out, (size_t)total * sizeof(double),
                          cudaMemcpyDeviceToHost));

    cudaFree(d_out);
    cudaFree(d_point_coef);
    cudaFree(d_point_sind);
    cudaFree(d_point_dumyd);
    cudaFree(d_point_dimh);
    cudaFree(d_point_block);
    cudaFree(d_block_shapes);
    cudaFree(d_fields);
    /* Safe now: the kernel that might have read these buffers has completed. */
    for (size_t i = 0; i < stale_buffers.size(); ++i)
        cudaFree(stale_buffers[i]);
    return 0;
}

/*
 * Unpack host_buffer into a sub-region of one state variable on the device.
 *
 * Selects this process's dispatch device and forwards every argument to
 * copy_state_region_packed_cuda with the host-to-device direction.
 * Always returns 0.
 */
extern "C"
int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
                                                   int state_index,
                                                   double *host_buffer,
                                                   int *ex,
                                                   int i0, int j0, int k0,
                                                   int sx, int sy, int sz)
{
    init_gpu_dispatch();
    const int device = g_dispatch.my_device;
    CUDA_CHECK(cudaSetDevice(device));

    const cudaMemcpyKind direction = cudaMemcpyHostToDevice;
    copy_state_region_packed_cuda(block_tag, state_index, host_buffer, ex,
                                  i0, j0, k0,
                                  sx, sy, sz,
                                  direction);
    return 0;
}

/*
 * Keyed variant of the single-state host-buffer unpack.
 *
 * After selecting g_dispatch.my_device, rejects the call (returns 1) when
 * state_host_key is null or state_count lies outside
 * [1, BSSN_RESIDENT_STATE_CAPACITY].  Otherwise forwards to
 * copy_state_region_packed_cuda with cudaMemcpyHostToDevice plus the key and
 * count, and returns 0.
 */
extern "C"
int bssn_cuda_unpack_state_region_from_host_buffer_for_host_views(void *block_tag,
                                                                  double **state_host_key,
                                                                  int state_count,
                                                                  double *host_buffer,
                                                                  int *ex,
                                                                  int i0, int j0, int k0,
                                                                  int sx, int sy, int sz)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    const bool key_present = (state_host_key != nullptr);
    const bool count_in_range =
        (state_count > 0) && (state_count <= BSSN_RESIDENT_STATE_CAPACITY);
    if (!(key_present && count_in_range))
        return 1;

    const cudaMemcpyKind direction = cudaMemcpyHostToDevice;
    copy_state_region_packed_cuda(
        block_tag, state_index, host_buffer, ex,
        i0, j0, k0,
        sx, sy, sz,
        direction,
        state_host_key, state_count);
    return 0;
}

/*
 * C entry point: makes g_dispatch.my_device current and forwards to
 * copy_state_region_packed_batch_cuda with cudaMemcpyDeviceToHost.
 * Always returns 0.
 */
extern "C"
int bssn_cuda_pack_state_batch_to_host_buffer(void *block_tag,
                                              int state_count,
                                              double *host_buffer,
                                              int *ex,
                                              int i0, int j0, int k0,
                                              int sx, int sy, int sz)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    const cudaMemcpyKind direction = cudaMemcpyDeviceToHost;
    copy_state_region_packed_batch_cuda(
        block_tag, state_count, host_buffer, ex,
        i0, j0, k0,
        sx, sy, sz,
        direction);
    return 0;
}

/*
 * Keyed variant of the batched host-buffer pack: same as the unkeyed entry
 * point but additionally passes state_host_key to
 * copy_state_region_packed_batch_cuda.  Direction is cudaMemcpyDeviceToHost.
 * Always returns 0.
 */
extern "C"
int bssn_cuda_pack_state_batch_to_host_buffer_for_host_views(void *block_tag,
                                                             double **state_host_key,
                                                             int state_count,
                                                             double *host_buffer,
                                                             int *ex,
                                                             int i0, int j0, int k0,
                                                             int sx, int sy, int sz)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    const cudaMemcpyKind direction = cudaMemcpyDeviceToHost;
    copy_state_region_packed_batch_cuda(
        block_tag, state_count, host_buffer, ex,
        i0, j0, k0,
        sx, sy, sz,
        direction,
        state_host_key);
    return 0;
}

/*
 * C entry point: makes g_dispatch.my_device current and forwards to
 * copy_state_region_packed_batch_cuda with cudaMemcpyHostToDevice.
 * Always returns 0.
 */
extern "C"
int bssn_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
                                                  int state_count,
                                                  double *host_buffer,
                                                  int *ex,
                                                  int i0, int j0, int k0,
                                                  int sx, int sy, int sz)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    const cudaMemcpyKind direction = cudaMemcpyHostToDevice;
    copy_state_region_packed_batch_cuda(
        block_tag, state_count, host_buffer, ex,
        i0, j0, k0,
        sx, sy, sz,
        direction);
    return 0;
}

/*
 * Keyed variant of the batched host-buffer unpack: additionally passes
 * state_host_key to copy_state_region_packed_batch_cuda.  Direction is
 * cudaMemcpyHostToDevice.  Always returns 0.
 */
extern "C"
int bssn_cuda_unpack_state_batch_from_host_buffer_for_host_views(void *block_tag,
                                                                 double **state_host_key,
                                                                 int state_count,
                                                                 double *host_buffer,
                                                                 int *ex,
                                                                 int i0, int j0, int k0,
                                                                 int sx, int sy, int sz)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    const cudaMemcpyKind direction = cudaMemcpyHostToDevice;
    copy_state_region_packed_batch_cuda(
        block_tag, state_count, host_buffer, ex,
        i0, j0, k0,
        sx, sy, sz,
        direction,
        state_host_key);
    return 0;
}

/*
 * Pack or unpack one box-shaped region for state_count states between a
 * resident bank and a caller-supplied buffer, using a single kernel launch.
 *
 *   block_tag        key handed to ensure_step_ctx to obtain the StepContext
 *   state_count      must be in [1, BSSN_RESIDENT_STATE_CAPACITY]
 *   device_buffer    buffer pointer handed to the pack/unpack kernel; non-null
 *   ex               block extents ex[0..2]; non-null
 *   i0, j0, k0       region origin, forwarded to the kernel
 *   sx, sy, sz       region extents; all must be positive
 *   pack_not_unpack  non-zero launches kern_pack_state_region_batch,
 *                    zero launches kern_unpack_state_region_batch
 *   state_host_key   optional key forwarded to active_or_keyed_bank
 *
 * Invalid arguments turn the call into a no-op.  After an unpack the chosen
 * bank is marked valid, current and not host-clean.
 */
static void copy_state_device_batch(void *block_tag,
                                    int state_count,
                                    double *device_buffer,
                                    const int *ex,
                                    int i0, int j0, int k0,
                                    int sx, int sy, int sz,
                                    int pack_not_unpack,
                                    double **state_host_key = nullptr)
{
    /* Same null-buffer policy as the restrict/prolong segment helpers. */
    if (!ex || !device_buffer) return;
    if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return;
    if (sx <= 0 || sy <= 0 || sz <= 0) return;

    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
    StepContext &ctx = ensure_step_ctx(block_tag, all);
    const int bank = active_or_keyed_bank(ctx, state_host_key, all,
                                          pack_not_unpack == 0 || state_host_key != nullptr,
                                          state_count);
    double *base_mem = ctx.d_resident_mem[bank];
    const int region_all = sx * sy * sz;

    /* grid.x covers the region's points, grid.y enumerates the states. */
    dim3 launch_grid((unsigned int)grid((size_t)region_all),
                     (unsigned int)state_count);

    if (pack_not_unpack) {
        kern_pack_state_region_batch<<<launch_grid, BLK>>>(
            base_mem, device_buffer,
            ex[0], ex[1], i0, j0, k0, sx, sy, sz,
            region_all, state_count,
            ex[0] * ex[1] * ex[2]);
        /* A rejected launch is only reported through cudaGetLastError. */
        CUDA_CHECK(cudaGetLastError());
    } else {
        kern_unpack_state_region_batch<<<launch_grid, BLK>>>(
            base_mem, device_buffer,
            ex[0], ex[1], i0, j0, k0, sx, sy, sz,
            region_all, state_count,
            ex[0] * ex[1] * ex[2]);
        /* Confirm the launch was accepted before flagging the bank valid. */
        CUDA_CHECK(cudaGetLastError());
        ctx.resident_valid[bank] = true;
        ctx.resident_age[bank] = ++ctx.resident_clock;
        mark_resident_current_bank(ctx, bank);
        set_resident_host_clean(ctx, bank, false);
        update_state_ready(ctx);
    }
}

/*
 * Pack or unpack several segments for state_count states between a resident
 * bank and a caller-supplied buffer, using a single kernel launch.
 *
 *   segment_meta     segment_count records of 8 ints each; entries [3], [4],
 *                    [5] and [6] of every record must be positive, and the
 *                    largest entry [6] sizes grid.x
 *   pack_not_unpack  non-zero launches kern_pack_state_segments_batch,
 *                    zero launches kern_unpack_state_segments_batch
 *   state_host_key   optional key forwarded to active_or_keyed_bank
 *
 * The metadata is copied to the buffer returned by
 * ensure_comm_segment_meta_buffer before the launch.  Invalid arguments
 * (including any record failing the positivity test) turn the whole call
 * into a no-op.  After an unpack the chosen bank is marked valid, current
 * and not host-clean.
 */
static void copy_state_device_segments(void *block_tag,
                                       int state_count,
                                       double *device_buffer,
                                       const int *ex,
                                       int segment_count,
                                       const int *segment_meta,
                                       int pack_not_unpack,
                                       double **state_host_key = nullptr)
{
    /* Same null-buffer policy as the restrict/prolong segment helpers. */
    if (!ex || !device_buffer) return;
    if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return;
    if (segment_count <= 0 || !segment_meta) return;

    int max_region_all = 0;
    for (int s = 0; s < segment_count; ++s) {
        const int *m = segment_meta + s * 8;
        if (m[3] <= 0 || m[4] <= 0 || m[5] <= 0 || m[6] <= 0) return;
        if (m[6] > max_region_all) max_region_all = m[6];
    }
    if (max_region_all <= 0) return;

    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
    StepContext &ctx = ensure_step_ctx(block_tag, all);
    const int bank = active_or_keyed_bank(ctx, state_host_key, all,
                                          pack_not_unpack == 0 || state_host_key != nullptr,
                                          state_count);
    double *base_mem = ctx.d_resident_mem[bank];

    const size_t meta_ints = (size_t)segment_count * 8;
    int *d_meta = ensure_comm_segment_meta_buffer(meta_ints);
    CUDA_CHECK(cudaMemcpy(d_meta, segment_meta,
                          meta_ints * sizeof(int),
                          cudaMemcpyHostToDevice));

    /* grid.x spans the largest segment, grid.y the states, grid.z the segments. */
    dim3 launch_grid((unsigned int)grid((size_t)max_region_all),
                     (unsigned int)state_count,
                     (unsigned int)segment_count);
    if (pack_not_unpack) {
        kern_pack_state_segments_batch<<<launch_grid, BLK>>>(
            base_mem, device_buffer,
            ex[0], ex[1], d_meta, state_count,
            ex[0] * ex[1] * ex[2]);
        /* A rejected launch is only reported through cudaGetLastError. */
        CUDA_CHECK(cudaGetLastError());
    } else {
        kern_unpack_state_segments_batch<<<launch_grid, BLK>>>(
            base_mem, device_buffer,
            ex[0], ex[1], d_meta, state_count,
            ex[0] * ex[1] * ex[2]);
        /* Confirm the launch was accepted before flagging the bank valid. */
        CUDA_CHECK(cudaGetLastError());
        ctx.resident_valid[bank] = true;
        ctx.resident_age[bank] = ++ctx.resident_clock;
        mark_resident_current_bank(ctx, bank);
        set_resident_host_clean(ctx, bank, false);
        update_state_ready(ctx);
    }
}

/*
 * Launch kern_restrict_state_segments_batch for several segments of
 * state_count states, reading from a resident bank and writing to
 * device_buffer (argument order as passed to the kernel).
 *
 *   segment_meta    segment_count records of 8 ints each; entries [0]..[3]
 *                   of every record must be positive, and the largest
 *                   entry [3] sizes grid.x
 *   state_host_key  optional key forwarded to active_or_keyed_bank
 *   state_soa       forwarded to upload_comm_state_soa together with
 *                   state_count (may be null)
 *
 * Invalid arguments (including any record failing the positivity test) turn
 * the whole call into a no-op.
 */
static void restrict_state_device_segments(void *block_tag,
                                           int state_count,
                                           double *device_buffer,
                                           const int *ex,
                                           int segment_count,
                                           const int *segment_meta,
                                           double **state_host_key = nullptr,
                                           const double *state_soa = nullptr)
{
    if (!ex) return;
    if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return;
    if (segment_count <= 0 || !segment_meta || !device_buffer) return;

    int max_region_all = 0;
    for (int s = 0; s < segment_count; ++s) {
        const int *m = segment_meta + s * 8;
        if (m[0] <= 0 || m[1] <= 0 || m[2] <= 0 || m[3] <= 0) return;
        if (m[3] > max_region_all) max_region_all = m[3];
    }
    if (max_region_all <= 0) return;

    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
    StepContext &ctx = ensure_step_ctx(block_tag, all);
    const int bank = active_or_keyed_bank(ctx, state_host_key, all,
                                          state_host_key != nullptr,
                                          state_count);

    const size_t meta_ints = (size_t)segment_count * 8;
    int *d_meta = ensure_comm_segment_meta_buffer(meta_ints);
    CUDA_CHECK(cudaMemcpy(d_meta, segment_meta,
                          meta_ints * sizeof(int),
                          cudaMemcpyHostToDevice));
    upload_comm_state_soa(state_soa, state_count);

    /* grid.x spans the largest segment, grid.y the states, grid.z the segments. */
    dim3 launch_grid((unsigned int)grid((size_t)max_region_all),
                     (unsigned int)state_count,
                     (unsigned int)segment_count);
    kern_restrict_state_segments_batch<<<launch_grid, BLK>>>(
        ctx.d_resident_mem[bank], device_buffer,
        ex[0], ex[1], d_meta, state_count,
        ex[0] * ex[1] * ex[2]);
    /* A rejected launch is only reported through cudaGetLastError. */
    CUDA_CHECK(cudaGetLastError());
}

/*
 * Launch kern_prolong_state_segments_batch for several segments of
 * state_count states, reading from a resident bank and writing to
 * device_buffer (argument order as passed to the kernel).
 *
 *   segment_meta    segment_count records of 11 ints each (note: wider than
 *                   the 8-int records used by the restrict/copy helpers);
 *                   entries [0]..[3] of every record must be positive, and
 *                   the largest entry [3] sizes grid.x
 *   state_host_key  optional key forwarded to active_or_keyed_bank
 *   state_soa       forwarded to upload_comm_state_soa together with
 *                   state_count (may be null)
 *
 * Invalid arguments (including any record failing the positivity test) turn
 * the whole call into a no-op.
 */
static void prolong_state_device_segments(void *block_tag,
                                          int state_count,
                                          double *device_buffer,
                                          const int *ex,
                                          int segment_count,
                                          const int *segment_meta,
                                          double **state_host_key = nullptr,
                                          const double *state_soa = nullptr)
{
    if (!ex) return;
    if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return;
    if (segment_count <= 0 || !segment_meta || !device_buffer) return;

    int max_region_all = 0;
    for (int s = 0; s < segment_count; ++s) {
        const int *m = segment_meta + s * 11;
        if (m[0] <= 0 || m[1] <= 0 || m[2] <= 0 || m[3] <= 0) return;
        if (m[3] > max_region_all) max_region_all = m[3];
    }
    if (max_region_all <= 0) return;

    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
    StepContext &ctx = ensure_step_ctx(block_tag, all);
    const int bank = active_or_keyed_bank(ctx, state_host_key, all,
                                          state_host_key != nullptr,
                                          state_count);

    const size_t meta_ints = (size_t)segment_count * 11;
    int *d_meta = ensure_comm_segment_meta_buffer(meta_ints);
    CUDA_CHECK(cudaMemcpy(d_meta, segment_meta,
                          meta_ints * sizeof(int),
                          cudaMemcpyHostToDevice));
    upload_comm_state_soa(state_soa, state_count);

    /* grid.x spans the largest segment, grid.y the states, grid.z the segments. */
    dim3 launch_grid((unsigned int)grid((size_t)max_region_all),
                     (unsigned int)state_count,
                     (unsigned int)segment_count);
    kern_prolong_state_segments_batch<<<launch_grid, BLK>>>(
        ctx.d_resident_mem[bank], device_buffer,
        ex[0], ex[1], d_meta, state_count,
        ex[0] * ex[1] * ex[2]);
    /* A rejected launch is only reported through cudaGetLastError. */
    CUDA_CHECK(cudaGetLastError());
}

/*
 * C entry point: makes g_dispatch.my_device current and calls
 * copy_state_device_batch with pack_not_unpack = 1 and no host key.
 * Always returns 0, even when the helper rejects its arguments.
 */
extern "C"
int bssn_cuda_pack_state_batch_to_device_buffer(void *block_tag,
                                                 int state_count,
                                                 double *device_buffer,
                                                 int *ex,
                                                 int i0, int j0, int k0,
                                                 int sx, int sy, int sz)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    const int pack_flag = 1;
    copy_state_device_batch(
        block_tag, state_count, device_buffer, ex,
        i0, j0, k0,
        sx, sy, sz,
        pack_flag);
    return 0;
}

/*
 * Keyed variant: makes g_dispatch.my_device current and calls
 * copy_state_device_batch with pack_not_unpack = 1 and state_host_key.
 * Always returns 0, even when the helper rejects its arguments.
 */
extern "C"
int bssn_cuda_pack_state_batch_to_device_buffer_for_host_views(void *block_tag,
                                                               double **state_host_key,
                                                               int state_count,
                                                               double *device_buffer,
                                                               int *ex,
                                                               int i0, int j0, int k0,
                                                               int sx, int sy, int sz)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    const int pack_flag = 1;
    copy_state_device_batch(
        block_tag, state_count, device_buffer, ex,
        i0, j0, k0,
        sx, sy, sz,
        pack_flag, state_host_key);
    return 0;
}

/*
 * C entry point: makes g_dispatch.my_device current and calls
 * copy_state_device_batch with pack_not_unpack = 0 and no host key.
 * Always returns 0, even when the helper rejects its arguments.
 */
extern "C"
int bssn_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
                                                     int state_count,
                                                     double *device_buffer,
                                                     int *ex,
                                                     int i0, int j0, int k0,
                                                     int sx, int sy, int sz)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    const int unpack_flag = 0;
    copy_state_device_batch(
        block_tag, state_count, device_buffer, ex,
        i0, j0, k0,
        sx, sy, sz,
        unpack_flag);
    return 0;
}

/*
 * Keyed variant: makes g_dispatch.my_device current and calls
 * copy_state_device_batch with pack_not_unpack = 0 and state_host_key.
 * Always returns 0, even when the helper rejects its arguments.
 */
extern "C"
int bssn_cuda_unpack_state_batch_from_device_buffer_for_host_views(void *block_tag,
                                                                   double **state_host_key,
                                                                   int state_count,
                                                                   double *device_buffer,
                                                                   int *ex,
                                                                   int i0, int j0, int k0,
                                                                   int sx, int sy, int sz)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    const int unpack_flag = 0;
    copy_state_device_batch(
        block_tag, state_count, device_buffer, ex,
        i0, j0, k0,
        sx, sy, sz,
        unpack_flag, state_host_key);
    return 0;
}

/*
 * C entry point: makes g_dispatch.my_device current and calls
 * copy_state_device_segments with pack_not_unpack = 1 and no host key.
 * Always returns 0, even when the helper rejects its arguments.
 */
extern "C"
int bssn_cuda_pack_state_segments_to_device_buffer(void *block_tag,
                                                   int state_count,
                                                   double *device_buffer,
                                                   int *ex,
                                                   int segment_count,
                                                   const int *segment_meta)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    const int pack_flag = 1;
    copy_state_device_segments(
        block_tag, state_count, device_buffer, ex,
        segment_count, segment_meta,
        pack_flag);
    return 0;
}

/*
 * Keyed variant: makes g_dispatch.my_device current and calls
 * copy_state_device_segments with pack_not_unpack = 1 and state_host_key.
 * Always returns 0, even when the helper rejects its arguments.
 */
extern "C"
int bssn_cuda_pack_state_segments_to_device_buffer_for_host_views(void *block_tag,
                                                                  double **state_host_key,
                                                                  int state_count,
                                                                  double *device_buffer,
                                                                  int *ex,
                                                                  int segment_count,
                                                                  const int *segment_meta)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    const int pack_flag = 1;
    copy_state_device_segments(
        block_tag, state_count, device_buffer, ex,
        segment_count, segment_meta,
        pack_flag, state_host_key);
    return 0;
}

/*
 * C entry point: makes g_dispatch.my_device current and calls
 * copy_state_device_segments with pack_not_unpack = 0 and no host key.
 * Always returns 0, even when the helper rejects its arguments.
 */
extern "C"
int bssn_cuda_unpack_state_segments_from_device_buffer(void *block_tag,
                                                       int state_count,
                                                       double *device_buffer,
                                                       int *ex,
                                                       int segment_count,
                                                       const int *segment_meta)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    const int unpack_flag = 0;
    copy_state_device_segments(
        block_tag, state_count, device_buffer, ex,
        segment_count, segment_meta,
        unpack_flag);
    return 0;
}

/*
 * Keyed variant: makes g_dispatch.my_device current and calls
 * copy_state_device_segments with pack_not_unpack = 0 and state_host_key.
 * Always returns 0, even when the helper rejects its arguments.
 */
extern "C"
int bssn_cuda_unpack_state_segments_from_device_buffer_for_host_views(void *block_tag,
                                                                      double **state_host_key,
                                                                      int state_count,
                                                                      double *device_buffer,
                                                                      int *ex,
                                                                      int segment_count,
                                                                      const int *segment_meta)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    const int unpack_flag = 0;
    copy_state_device_segments(
        block_tag, state_count, device_buffer, ex,
        segment_count, segment_meta,
        unpack_flag, state_host_key);
    return 0;
}

/*
 * C entry point: makes g_dispatch.my_device current and calls
 * restrict_state_device_segments, leaving its host-key and state_soa
 * parameters at their nullptr defaults.  Always returns 0, even when the
 * helper rejects its arguments.
 */
extern "C"
int bssn_cuda_restrict_state_segments_to_device_buffer(void *block_tag,
                                                       int state_count,
                                                       double *device_buffer,
                                                       int *ex,
                                                       int segment_count,
                                                       const int *segment_meta)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    restrict_state_device_segments(
        block_tag, state_count, device_buffer, ex,
        segment_count, segment_meta);
    return 0;
}

/*
 * Keyed variant: makes g_dispatch.my_device current and calls
 * restrict_state_device_segments with state_host_key and state_soa.
 * Always returns 0, even when the helper rejects its arguments.
 */
extern "C"
int bssn_cuda_restrict_state_segments_to_device_buffer_for_host_views(void *block_tag,
                                                                      double **state_host_key,
                                                                      int state_count,
                                                                      double *device_buffer,
                                                                      int *ex,
                                                                      int segment_count,
                                                                      const int *segment_meta,
                                                                      const double *state_soa)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    restrict_state_device_segments(
        block_tag, state_count, device_buffer, ex,
        segment_count, segment_meta,
        state_host_key, state_soa);
    return 0;
}

/*
 * C entry point: makes g_dispatch.my_device current and calls
 * prolong_state_device_segments, leaving its host-key and state_soa
 * parameters at their nullptr defaults.  Always returns 0, even when the
 * helper rejects its arguments.
 */
extern "C"
int bssn_cuda_prolong_state_segments_to_device_buffer(void *block_tag,
                                                      int state_count,
                                                      double *device_buffer,
                                                      int *ex,
                                                      int segment_count,
                                                      const int *segment_meta)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    prolong_state_device_segments(
        block_tag, state_count, device_buffer, ex,
        segment_count, segment_meta);
    return 0;
}

/*
 * Keyed variant: makes g_dispatch.my_device current and calls
 * prolong_state_device_segments with state_host_key and state_soa.
 * Always returns 0, even when the helper rejects its arguments.
 */
extern "C"
int bssn_cuda_prolong_state_segments_to_device_buffer_for_host_views(void *block_tag,
                                                                     double **state_host_key,
                                                                     int state_count,
                                                                     double *device_buffer,
                                                                     int *ex,
                                                                     int segment_count,
                                                                     const int *segment_meta,
                                                                     const double *state_soa)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    prolong_state_device_segments(
        block_tag, state_count, device_buffer, ex,
        segment_count, segment_meta,
        state_host_key, state_soa);
    return 0;
}

/*
 * C entry point: launch kern_restrict_state_region_batch on
 * ctx.d_state_curr_mem for one sx*sy*sz region and state_count states,
 * with device_buffer as the kernel's second pointer argument.
 *
 *   state_count    must be in [1, BSSN_RESIDENT_STATE_CAPACITY]
 *   device_buffer  non-null
 *   ex             block extents ex[0..2]; non-null
 *   sx, sy, sz     region extents; all must be positive
 *   fi0, fj0, fk0  forwarded unchanged to the kernel
 *
 * Returns 1 when an argument is rejected, 0 after the launch is accepted.
 */
extern "C"
int bssn_cuda_restrict_state_batch_to_device_buffer(void *block_tag,
                                                    int state_count,
                                                    double *device_buffer,
                                                    int *ex,
                                                    int sx, int sy, int sz,
                                                    int fi0, int fj0, int fk0)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
    if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return 1;
    if (!ex || !device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1;

    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
    const int region_all = sx * sy * sz;
    upload_comm_state_soa(nullptr, state_count);

    /* grid.x covers the region's points, grid.y enumerates the states. */
    dim3 launch_grid((unsigned int)grid((size_t)region_all),
                     (unsigned int)state_count);
    kern_restrict_state_region_batch<<<launch_grid, BLK>>>(
        ctx.d_state_curr_mem, device_buffer,
        ex[0], ex[1], sx, sy, sz,
        fi0, fj0, fk0, region_all, state_count,
        ex[0] * ex[1] * ex[2]);
    /* A rejected launch is only reported through cudaGetLastError. */
    CUDA_CHECK(cudaGetLastError());
    return 0;
}

/*
 * Keyed variant of the batched region restrict: the source is the resident
 * bank chosen by active_or_keyed_bank(ctx, state_host_key, ...), and
 * state_soa is forwarded to upload_comm_state_soa.
 *
 *   state_count    must be in [1, BSSN_RESIDENT_STATE_CAPACITY]
 *   device_buffer  non-null
 *   ex             block extents ex[0..2]; non-null
 *   sx, sy, sz     region extents; all must be positive
 *   fi0, fj0, fk0  forwarded unchanged to the kernel
 *
 * Returns 1 when an argument is rejected, 0 after the launch is accepted.
 */
extern "C"
int bssn_cuda_restrict_state_batch_to_device_buffer_for_host_views(void *block_tag,
                                                                   double **state_host_key,
                                                                   int state_count,
                                                                   double *device_buffer,
                                                                   int *ex,
                                                                   int sx, int sy, int sz,
                                                                   int fi0, int fj0, int fk0,
                                                                   const double *state_soa)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
    if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return 1;
    if (!ex || !device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1;

    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
    StepContext &ctx = ensure_step_ctx(block_tag, all);
    const int bank = active_or_keyed_bank(ctx, state_host_key, all, true, state_count);
    const int region_all = sx * sy * sz;
    upload_comm_state_soa(state_soa, state_count);

    /* grid.x covers the region's points, grid.y enumerates the states. */
    dim3 launch_grid((unsigned int)grid((size_t)region_all),
                     (unsigned int)state_count);
    kern_restrict_state_region_batch<<<launch_grid, BLK>>>(
        ctx.d_resident_mem[bank], device_buffer,
        ex[0], ex[1], sx, sy, sz,
        fi0, fj0, fk0, region_all, state_count,
        ex[0] * ex[1] * ex[2]);
    /* A rejected launch is only reported through cudaGetLastError. */
    CUDA_CHECK(cudaGetLastError());
    return 0;
}

/*
 * C entry point: launch kern_prolong_state_region_batch on
 * ctx.d_state_curr_mem for one sx*sy*sz region and state_count states,
 * with device_buffer as the kernel's second pointer argument.
 *
 *   state_count          must be in [1, BSSN_RESIDENT_STATE_CAPACITY]
 *   device_buffer        non-null
 *   ex                   block extents ex[0..2]; non-null
 *   sx, sy, sz           region extents; all must be positive
 *   ii0, jj0, kk0        forwarded unchanged to the kernel
 *   lbc_i, lbc_j, lbc_k  forwarded unchanged to the kernel
 *
 * Returns 1 when an argument is rejected, 0 after the launch is accepted.
 */
extern "C"
int bssn_cuda_prolong_state_batch_to_device_buffer(void *block_tag,
                                                   int state_count,
                                                   double *device_buffer,
                                                   int *ex,
                                                   int sx, int sy, int sz,
                                                   int ii0, int jj0, int kk0,
                                                   int lbc_i, int lbc_j, int lbc_k)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
    if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return 1;
    if (!ex || !device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1;

    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
    const int region_all = sx * sy * sz;
    upload_comm_state_soa(nullptr, state_count);

    /* grid.x covers the region's points, grid.y enumerates the states. */
    dim3 launch_grid((unsigned int)grid((size_t)region_all),
                     (unsigned int)state_count);
    kern_prolong_state_region_batch<<<launch_grid, BLK>>>(
        ctx.d_state_curr_mem, device_buffer,
        ex[0], ex[1], sx, sy, sz,
        ii0, jj0, kk0, lbc_i, lbc_j, lbc_k,
        region_all, state_count,
        ex[0] * ex[1] * ex[2]);
    /* A rejected launch is only reported through cudaGetLastError. */
    CUDA_CHECK(cudaGetLastError());
    return 0;
}

/*
 * Keyed variant of the batched region prolong: the source is the resident
 * bank chosen by active_or_keyed_bank(ctx, state_host_key, ...), and
 * state_soa is forwarded to upload_comm_state_soa.
 *
 *   state_count          must be in [1, BSSN_RESIDENT_STATE_CAPACITY]
 *   device_buffer        non-null
 *   ex                   block extents ex[0..2]; non-null
 *   sx, sy, sz           region extents; all must be positive
 *   ii0, jj0, kk0        forwarded unchanged to the kernel
 *   lbc_i, lbc_j, lbc_k  forwarded unchanged to the kernel
 *
 * Returns 1 when an argument is rejected, 0 after the launch is accepted.
 */
extern "C"
int bssn_cuda_prolong_state_batch_to_device_buffer_for_host_views(void *block_tag,
                                                                  double **state_host_key,
                                                                  int state_count,
                                                                  double *device_buffer,
                                                                  int *ex,
                                                                  int sx, int sy, int sz,
                                                                  int ii0, int jj0, int kk0,
                                                                  int lbc_i, int lbc_j, int lbc_k,
                                                                  const double *state_soa)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
    if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return 1;
    if (!ex || !device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1;

    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
    StepContext &ctx = ensure_step_ctx(block_tag, all);
    const int bank = active_or_keyed_bank(ctx, state_host_key, all, true, state_count);
    const int region_all = sx * sy * sz;
    upload_comm_state_soa(state_soa, state_count);

    /* grid.x covers the region's points, grid.y enumerates the states. */
    dim3 launch_grid((unsigned int)grid((size_t)region_all),
                     (unsigned int)state_count);
    kern_prolong_state_region_batch<<<launch_grid, BLK>>>(
        ctx.d_resident_mem[bank], device_buffer,
        ex[0], ex[1], sx, sy, sz,
        ii0, jj0, kk0, lbc_i, lbc_j, lbc_k,
        region_all, state_count,
        ex[0] * ex[1] * ex[2]);
    /* A rejected launch is only reported through cudaGetLastError. */
    CUDA_CHECK(cudaGetLastError());
    return 0;
}

/*
 * C entry point: makes g_dispatch.my_device current and forwards to
 * copy_state_subset with cudaMemcpyDeviceToHost.  Always returns 0.
 */
extern "C"
int bssn_cuda_download_state_subset(void *block_tag,
                                    int *ex,
                                    int subset_count,
                                    const int *state_indices,
                                    double **state_host_out)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    const cudaMemcpyKind direction = cudaMemcpyDeviceToHost;
    copy_state_subset(
        block_tag, ex,
        subset_count, state_indices, state_host_out,
        direction);
    return 0;
}

/*
 * C entry point: makes g_dispatch.my_device current and forwards to
 * copy_state_subset with cudaMemcpyHostToDevice.  Always returns 0.
 */
extern "C"
int bssn_cuda_upload_state_subset(void *block_tag,
                                  int *ex,
                                  int subset_count,
                                  const int *state_indices,
                                  double **state_host_in)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    const cudaMemcpyKind direction = cudaMemcpyHostToDevice;
    copy_state_subset(
        block_tag, ex,
        subset_count, state_indices, state_host_in,
        direction);
    return 0;
}

/*
 * C entry point: fill the resident bank keyed by dst_host_key from two or
 * three source banks by launching kern_prepare_inter_time_level with weights
 * (c1, c2, c3) selected from source_count and tindex.
 *
 *   ex              block extents ex[0..2]; non-null
 *   state_count     must be in [1, BSSN_RESIDENT_STATE_CAPACITY]; also
 *                   selects which family of bank helpers is used
 *                   (escalar / EM / default)
 *   src*_host_key   source keys; src3_host_key is only consulted when
 *                   source_count == 3
 *   dst_host_key    destination key
 *   source_count    2 or 3
 *   tindex          0, 1 or -1
 *
 * Weights:
 *   source_count == 2:  tindex  0 -> (1/2, 1/2)
 *                       tindex  1 -> (3/4, 1/4)
 *                       tindex -1 -> (1/4, 3/4)
 *   source_count == 3:  tindex  0 -> (3/8, 3/4, -1/8)
 *                       tindex +-1 -> (5/32, 15/16, -3/32)
 *
 * Returns 1 when an argument is rejected (nothing is launched), otherwise 0.
 * On success the destination bank is marked valid, current and not
 * host-clean.  When cuda_aux_profile_enabled() is true the call also
 * synchronizes and accumulates timing statistics.
 */
extern "C"
int bssn_cuda_prepare_inter_time_level(void *block_tag,
                                       int *ex,
                                       int state_count,
                                       double **src1_host_key,
                                       double **src2_host_key,
                                       double **src3_host_key,
                                       double **dst_host_key,
                                       int source_count,
                                       int tindex)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
    const bool profile = cuda_aux_profile_enabled();
    const double t0 = profile ? cuda_profile_now_ms() : 0.0;
    if (!ex)
        return 1;
    if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY)
        return 1;
    if (source_count != 2 && source_count != 3) return 1;
    if (!resident_key_usable_count(src1_host_key, state_count) ||
        !resident_key_usable_count(src2_host_key, state_count) ||
        !resident_key_usable_count(dst_host_key, state_count))
        return 1;
    if (source_count == 3 && !resident_key_usable_count(src3_host_key, state_count))
        return 1;

    double c1 = 0.0, c2 = 0.0, c3 = 0.0;
    if (source_count == 2) {
        if (tindex == 0) {
            c1 = 0.5; c2 = 0.5;
        } else if (tindex == 1) {
            c1 = 0.75; c2 = 0.25;
        } else if (tindex == -1) {
            c1 = 0.25; c2 = 0.75;
        } else {
            return 1;
        }
    } else {
        /*
         * NOTE(review): unlike the two-source branch above, tindex 1 and -1
         * share one set of weights here — confirm this matches the reference
         * CPU routine before relying on it.
         */
        if (tindex == 0) {
            c1 = 3.0 / 8.0; c2 = 3.0 / 4.0; c3 = -1.0 / 8.0;
        } else if (tindex == 1 || tindex == -1) {
            c1 = 5.0 / 32.0; c2 = 15.0 / 16.0; c3 = -3.0 / 32.0;
        } else {
            return 1;
        }
    }

    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
    StepContext &ctx = ensure_step_ctx(block_tag, all);

    /*
     * Each later lookup is told which banks were already picked (see the
     * trailing arguments) so the sources and the destination end up in
     * distinct banks; src3_bank stays -1 when only two sources are used.
     */
    int src1_bank, src2_bank, src3_bank, dst_bank;
    if (state_count == BSSN_ESCALAR_STATE_COUNT) {
        src1_bank = ensure_escalar_resident_bank(ctx, src1_host_key, all, true);
        src2_bank = ensure_escalar_resident_bank(ctx, src2_host_key, all, true, src1_bank);
        src3_bank = (source_count == 3)
                  ? ensure_escalar_resident_bank_avoiding(ctx, src3_host_key, all, true,
                                                          src1_bank, src2_bank, -1)
                  : -1;
        dst_bank = reserve_escalar_resident_output_bank_avoiding(ctx, dst_host_key, all,
                                                                 src1_bank, src2_bank, src3_bank);
    } else if (state_count == BSSN_EM_STATE_COUNT) {
        src1_bank = ensure_em_resident_bank(ctx, src1_host_key, all, true);
        src2_bank = ensure_em_resident_bank(ctx, src2_host_key, all, true, src1_bank);
        src3_bank = (source_count == 3)
                  ? ensure_em_resident_bank_avoiding(ctx, src3_host_key, all, true,
                                                     src1_bank, src2_bank, -1)
                  : -1;
        dst_bank = reserve_em_resident_output_bank_avoiding(ctx, dst_host_key, all,
                                                            src1_bank, src2_bank, src3_bank);
    } else {
        src1_bank = ensure_resident_bank(ctx, src1_host_key, all, true);
        src2_bank = ensure_resident_bank(ctx, src2_host_key, all, true, src1_bank);
        src3_bank = (source_count == 3)
                  ? ensure_resident_bank_avoiding(ctx, src3_host_key, all, true,
                                                  src1_bank, src2_bank, -1)
                  : -1;
        dst_bank = reserve_resident_output_bank_avoiding(ctx, dst_host_key, all,
                                                         src1_bank, src2_bank, src3_bank);
    }

    /* grid.x covers the block's points, grid.y enumerates the states. */
    dim3 launch_grid((unsigned int)grid(all), (unsigned int)state_count);
    kern_prepare_inter_time_level<<<launch_grid, BLK>>>(
        ctx.d_resident_mem[src1_bank],
        ctx.d_resident_mem[src2_bank],
        (source_count == 3) ? ctx.d_resident_mem[src3_bank] : nullptr,
        ctx.d_resident_mem[dst_bank],
        c1, c2, c3, state_count, (int)all);
    /* Confirm the launch was accepted before flagging the bank valid. */
    CUDA_CHECK(cudaGetLastError());
    if (profile)
        cuda_profile_sync();
    ctx.resident_valid[dst_bank] = true;
    ctx.resident_age[dst_bank] = ++ctx.resident_clock;
    set_resident_host_clean(ctx, dst_bank, false);
    mark_resident_current_bank(ctx, dst_bank);
    update_state_ready(ctx);
    if (profile) {
        CudaAuxProfileStats &stats = cuda_aux_profile_stats();
        stats.prepare_calls++;
        stats.prepare_ms += cuda_profile_now_ms() - t0;
        cuda_aux_profile_maybe_log();
    }
    return 0;
}

/*
 * C entry point: makes g_dispatch.my_device current, then reports the result
 * of has_resident_state(block_tag) as 1 (true) or 0 (false).
 */
extern "C"
int bssn_cuda_has_resident_state(void *block_tag)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    if (has_resident_state(block_tag))
        return 1;
    return 0;
}

/*
 * C entry point: makes g_dispatch.my_device current, then hands block_tag to
 * release_step_ctx.
 */
extern "C"
void bssn_cuda_release_step_ctx(void *block_tag)
{
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    release_step_ctx(block_tag);
}