AMSS-NCKU/AMSS_NCKU_source/bssn_cuda_ops.cu

#include "bssn_cuda_ops.h"
#include "bssn_gpu.h"
#include "rungekutta4_rout.h"
#include "sommerfeld_rout.h"

#include <cmath>
#include <cstdio>
#include <cuda_runtime.h>
#include <unordered_map>
#include <vector>

namespace {

inline void report_cuda_error(const char *where, cudaError_t err)
{
  if (err != cudaSuccess)
    std::fprintf(stderr, "CUDA error at %s: %s\n", where, cudaGetErrorString(err));
}

inline int count_points(const int ex[3])
{
  return ex[0] * ex[1] * ex[2];
}

inline int div_up(int a, int b)
{
  return (a + b - 1) / b;
}

struct DeviceArrays
{
  double *x = nullptr;
  double *y = nullptr;
  double *z = nullptr;
  double *a = nullptr;
  double *b = nullptr;
  double *c = nullptr;
  double *d = nullptr;
};

struct CachedBuffer
{
  double *ptr = nullptr;
  size_t capacity = 0;
};

struct CachedIntBuffer
{
  int *ptr = nullptr;
  size_t capacity = 0;
};

struct CachedPtrBuffer
{
  void *ptr = nullptr;
  size_t capacity = 0;
};

inline void release_buffer(CachedBuffer &buffer)
{
  if (buffer.ptr)
  {
    cudaError_t free_err = cudaFree(buffer.ptr);
    if (free_err != cudaSuccess)
      report_cuda_error("cudaFree", free_err);
    buffer.ptr = nullptr;
  }
  buffer.capacity = 0;
}

inline void release_buffer(CachedIntBuffer &buffer)
{
  if (buffer.ptr)
  {
    cudaError_t free_err = cudaFree(buffer.ptr);
    if (free_err != cudaSuccess)
      report_cuda_error("cudaFree", free_err);
    buffer.ptr = nullptr;
  }
  buffer.capacity = 0;
}

inline void release_buffer(CachedPtrBuffer &buffer)
{
  if (buffer.ptr)
  {
    cudaError_t free_err = cudaFree(buffer.ptr);
    if (free_err != cudaSuccess)
      report_cuda_error("cudaFree", free_err);
    buffer.ptr = nullptr;
  }
  buffer.capacity = 0;
}

inline bool ensure_capacity(CachedBuffer &buffer, size_t bytes)
{
  if (bytes <= buffer.capacity && buffer.ptr)
    return true;

  if (buffer.ptr)
  {
    cudaError_t free_err = cudaFree(buffer.ptr);
    if (free_err != cudaSuccess)
      report_cuda_error("cudaFree", free_err);
    buffer.ptr = nullptr;
    buffer.capacity = 0;
  }

  cudaError_t err = cudaMalloc(&buffer.ptr, bytes);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMalloc", err);
    return false;
  }

  buffer.capacity = bytes;
  return true;
}

inline bool ensure_capacity(CachedIntBuffer &buffer, size_t bytes)
{
  if (bytes <= buffer.capacity && buffer.ptr)
    return true;

  if (buffer.ptr)
  {
    cudaError_t free_err = cudaFree(buffer.ptr);
    if (free_err != cudaSuccess)
      report_cuda_error("cudaFree", free_err);
    buffer.ptr = nullptr;
    buffer.capacity = 0;
  }

  cudaError_t err = cudaMalloc(&buffer.ptr, bytes);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMalloc", err);
    return false;
  }

  buffer.capacity = bytes;
  return true;
}

inline bool ensure_capacity(CachedPtrBuffer &buffer, size_t bytes)
{
  if (bytes <= buffer.capacity && buffer.ptr)
    return true;

  if (buffer.ptr)
  {
    cudaError_t free_err = cudaFree(buffer.ptr);
    if (free_err != cudaSuccess)
      report_cuda_error("cudaFree", free_err);
    buffer.ptr = nullptr;
    buffer.capacity = 0;
  }

  cudaError_t err = cudaMalloc(&buffer.ptr, bytes);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMalloc", err);
    return false;
  }

  buffer.capacity = bytes;
  return true;
}

struct Rk4VarCache
{
  CachedBuffer X, Y, Z;
  CachedBuffer state0, boundary, stage, rhs;
  const double *host_X = nullptr;
  const double *host_Y = nullptr;
  const double *host_Z = nullptr;
  const double *host_state0 = nullptr;
  double *host_rhs = nullptr;
  int nx = 0;
  int ny = 0;
  int nz = 0;
  bool rhs_resident = false;
};

struct InterpStencilCacheEntry
{
  const double *X = nullptr;
  const double *Y = nullptr;
  const double *Z = nullptr;
  const double *px = nullptr;
  const double *py = nullptr;
  const double *pz = nullptr;
  int nx = 0;
  int ny = 0;
  int nz = 0;
  int num_points = 0;
  int ordn = 0;
  int symmetry = 0;
  bool valid = false;
  CachedBuffer weights;
  CachedIntBuffer indices;
  CachedIntBuffer reflect;
};

struct InterpBatchCache
{
  CachedBuffer out;
  CachedBuffer soa;
  CachedBuffer field_ptrs;
  CachedIntBuffer error_flag;
  std::vector<CachedBuffer> host_field_copies;
  InterpStencilCacheEntry stencil_entry;
};

struct Rk4BatchCache
{
  CachedPtrBuffer state0_ptrs;
  CachedPtrBuffer stage_ptrs;
  CachedPtrBuffer rhs_ptrs;
};

std::unordered_map<const double *, Rk4VarCache> &rk4_var_cache_map()
{
  static thread_local std::unordered_map<const double *, Rk4VarCache> cache_map;
  return cache_map;
}

InterpBatchCache &interp_batch_cache()
{
  static thread_local InterpBatchCache cache;
  return cache;
}

Rk4BatchCache &rk4_batch_cache()
{
  static thread_local Rk4BatchCache cache;
  return cache;
}

inline void release_interp_stencil_cache(InterpStencilCacheEntry &entry)
{
  release_buffer(entry.weights);
  release_buffer(entry.indices);
  release_buffer(entry.reflect);
  entry.X = nullptr;
  entry.Y = nullptr;
  entry.Z = nullptr;
  entry.px = nullptr;
  entry.py = nullptr;
  entry.pz = nullptr;
  entry.nx = 0;
  entry.ny = 0;
  entry.nz = 0;
  entry.num_points = 0;
  entry.ordn = 0;
  entry.symmetry = 0;
  entry.valid = false;
}

inline void release_interp_batch_cache(InterpBatchCache &cache)
{
  release_buffer(cache.out);
  release_buffer(cache.soa);
  release_buffer(cache.field_ptrs);
  release_buffer(cache.error_flag);
  for (size_t i = 0; i < cache.host_field_copies.size(); ++i)
    release_buffer(cache.host_field_copies[i]);
  cache.host_field_copies.clear();
  release_interp_stencil_cache(cache.stencil_entry);
}

inline bool copy_to_device(CachedIntBuffer &dst, const int *src, size_t bytes)
{
  if (!ensure_capacity(dst, bytes))
    return false;

  cudaError_t err = cudaMemcpy(dst.ptr, src, bytes, cudaMemcpyHostToDevice);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMemcpy(H2D int)", err);
    return false;
  }
  return true;
}

inline bool copy_to_device(CachedBuffer &dst, const double *src, size_t bytes)
{
  if (!ensure_capacity(dst, bytes))
    return false;

  cudaError_t err = cudaMemcpy(dst.ptr, src, bytes, cudaMemcpyHostToDevice);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMemcpy(H2D)", err);
    return false;
  }
  return true;
}

inline bool copy_to_device_preferring_device(CachedBuffer &dst, const double *src, size_t bytes)
{
  if (!ensure_capacity(dst, bytes))
    return false;

  const double *device_src = bssn_gpu_find_device_buffer(src);
  cudaMemcpyKind kind = cudaMemcpyHostToDevice;
  const void *copy_src = src;
  if (device_src)
  {
    copy_src = device_src;
    kind = cudaMemcpyDeviceToDevice;
  }

  cudaError_t err = cudaMemcpy(dst.ptr, copy_src, bytes, kind);
  if (err != cudaSuccess)
  {
    report_cuda_error(kind == cudaMemcpyDeviceToDevice ? "cudaMemcpy(D2D)" : "cudaMemcpy(H2D)", err);
    return false;
  }
  return true;
}

inline bool sync_host_from_mapped_device(double *host_ptr, int count, const char *label)
{
  const double *device_ptr = bssn_gpu_find_device_buffer(host_ptr);
  if (!device_ptr)
    return true;

  bssn_gpu_prepare_host_buffer(host_ptr, count);
  const size_t bytes = static_cast<size_t>(count) * sizeof(double);
  cudaError_t err = cudaMemcpy(host_ptr, device_ptr, bytes, cudaMemcpyDeviceToHost);
  if (err != cudaSuccess)
  {
    report_cuda_error(label, err);
    return false;
  }
  return true;
}

inline bool copy_region_to_padded_stage(CachedBuffer &dst,
                                        const double *src,
                                        const int src_shape[3],
                                        int src_i0, int src_j0, int src_k0,
                                        int region_nx, int region_ny, int region_nz,
                                        int dst_sx, int dst_sy, int dst_sz,
                                        int dst_x0, int dst_y0, int dst_z0,
                                        const char *where)
{
  if (!src || !where || region_nx <= 0 || region_ny <= 0 || region_nz <= 0 ||
      dst_sx <= 0 || dst_sy <= 0 || dst_sz <= 0)
  {
    return false;
  }

  const size_t dst_bytes =
      static_cast<size_t>(dst_sx) * static_cast<size_t>(dst_sy) * static_cast<size_t>(dst_sz) * sizeof(double);
  if (!ensure_capacity(dst, dst_bytes))
    return false;

  const double *device_src = bssn_gpu_find_device_buffer(src);
  const void *copy_src = device_src ? static_cast<const void *>(device_src) : static_cast<const void *>(src);
  cudaMemcpyKind kind = device_src ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice;

  if (!device_src)
  {
    const int src_count = src_shape[0] * src_shape[1] * src_shape[2];
    bssn_gpu_prepare_host_buffer(src, src_count);
  }

  cudaMemcpy3DParms parms = {};
  parms.srcPtr = make_cudaPitchedPtr(const_cast<void *>(copy_src),
                                     static_cast<size_t>(src_shape[0]) * sizeof(double),
                                     static_cast<size_t>(src_shape[0]),
                                     static_cast<size_t>(src_shape[1]));
  parms.dstPtr = make_cudaPitchedPtr(dst.ptr,
                                     static_cast<size_t>(dst_sx) * sizeof(double),
                                     static_cast<size_t>(dst_sx),
                                     static_cast<size_t>(dst_sy));
  parms.srcPos = make_cudaPos(static_cast<size_t>(src_i0) * sizeof(double),
                              static_cast<size_t>(src_j0),
                              static_cast<size_t>(src_k0));
  parms.dstPos = make_cudaPos(static_cast<size_t>(dst_x0) * sizeof(double),
                              static_cast<size_t>(dst_y0),
                              static_cast<size_t>(dst_z0));
  parms.extent = make_cudaExtent(static_cast<size_t>(region_nx) * sizeof(double),
                                 static_cast<size_t>(region_ny),
                                 static_cast<size_t>(region_nz));
  parms.kind = kind;

  cudaError_t err = cudaMemcpy3D(&parms);
  if (err != cudaSuccess)
  {
    report_cuda_error(where, err);
    return false;
  }
  return true;
}

inline int interp_idint_like_host(double x)
{
  return static_cast<int>(x);
}

inline void lagrange_weights_ord6_host(double x, double *w)
{
  static const double denom[6] = {-120.0, 24.0, -12.0, 12.0, -24.0, 120.0};
  for (int i = 0; i < 6; ++i)
  {
    double num = 1.0;
    for (int j = 0; j < 6; ++j)
    {
      if (j != i)
        num *= (x - static_cast<double>(j));
    }
    w[i] = num / denom[i];
  }
}

inline bool map_interp_index_host(int logical_idx, int n, int *mapped_idx, int *reflected)
{
  int idx = logical_idx;
  *reflected = 0;
  if (idx < 0)
  {
    idx = -idx;
    *reflected = 1;
  }
  if (idx < 0 || idx >= n)
    return false;
  *mapped_idx = idx;
  return true;
}

inline bool compute_interp_window_host(double x, const double *coord, int n,
                                       int ordn, int allow_reflect,
                                       int *start_idx, double *cx)
{
  const double dx = coord[1] - coord[0];
  const int center = interp_idint_like_host((x - coord[0]) / dx + 0.4);
  const int cmin = allow_reflect ? (-ordn / 2 + 1) : 0;
  const int cmax = n - 1;

  int begin = center - ordn / 2 + 1;
  int end = begin + ordn - 1;
  if (begin < cmin)
  {
    begin = cmin;
    end = begin + ordn - 1;
  }
  if (end > cmax)
  {
    end = cmax;
    begin = end + 1 - ordn;
  }

  if (begin >= 0)
    *cx = (x - coord[begin]) / dx;
  else
    *cx = (x + coord[-begin]) / dx;

  *start_idx = begin;
  return (begin >= cmin && end <= cmax);
}

__global__ void enforce_ga_kernel(int n,
                                  double *dxx, double *gxy, double *gxz,
                                  double *dyy, double *gyz, double *dzz,
                                  double *Axx, double *Axy, double *Axz,
                                  double *Ayy, double *Ayz, double *Azz)
{
  const double one = 1.0;
  const double two = 2.0;
  const double one_third = 1.0 / 3.0;

  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
  {
    double lgxx = dxx[idx] + one;
    double lgyy = dyy[idx] + one;
    double lgzz = dzz[idx] + one;
    double lgxy = gxy[idx];
    double lgxz = gxz[idx];
    double lgyz = gyz[idx];

    double det = lgxx * lgyy * lgzz + lgxy * lgyz * lgxz + lgxz * lgxy * lgyz
               - lgxz * lgyy * lgxz - lgxy * lgxy * lgzz - lgxx * lgyz * lgyz;
    double scale = 1.0 / cbrt(det);

    lgxx *= scale;
    lgxy *= scale;
    lgxz *= scale;
    lgyy *= scale;
    lgyz *= scale;
    lgzz *= scale;

    dxx[idx] = lgxx - one;
    gxy[idx] = lgxy;
    gxz[idx] = lgxz;
    dyy[idx] = lgyy - one;
    gyz[idx] = lgyz;
    dzz[idx] = lgzz - one;

    double gupxx = (lgyy * lgzz - lgyz * lgyz);
    double gupxy = -(lgxy * lgzz - lgyz * lgxz);
    double gupxz = (lgxy * lgyz - lgyy * lgxz);
    double gupyy = (lgxx * lgzz - lgxz * lgxz);
    double gupyz = -(lgxx * lgyz - lgxy * lgxz);
    double gupzz = (lgxx * lgyy - lgxy * lgxy);

    double trA = gupxx * Axx[idx] + gupyy * Ayy[idx] + gupzz * Azz[idx]
               + two * (gupxy * Axy[idx] + gupxz * Axz[idx] + gupyz * Ayz[idx]);

    Axx[idx] -= one_third * lgxx * trA;
    Axy[idx] -= one_third * lgxy * trA;
    Axz[idx] -= one_third * lgxz * trA;
    Ayy[idx] -= one_third * lgyy * trA;
    Ayz[idx] -= one_third * lgyz * trA;
    Azz[idx] -= one_third * lgzz * trA;
  }
}

__device__ inline int index3(int i, int j, int k, int nx, int ny)
{
  return i + j * nx + k * nx * ny;
}

__device__ inline double load_symmetry_ord1(const double *f, int i, int j, int k,
                                            int nx, int ny, int nz,
                                            const double soa[3], int symmetry)
{
  double sign = 1.0;
  if (i < 0)
  {
    i = -i - 1;
    sign *= soa[0];
  }
  if (j < 0)
  {
    j = -j - 1;
    sign *= soa[1];
  }
  if (k < 0)
  {
    k = -k - 1;
    sign *= soa[2];
  }
  if (i >= nx) i = nx - 1;
  if (j >= ny) j = ny - 1;
  if (k >= nz) k = nz - 1;
  (void)symmetry;
  return sign * f[index3(i, j, k, nx, ny)];
}

__device__ inline double load_prolong_cell_padded(const double *f,
                                                  int i, int j, int k,
                                                  int sx, int sy)
{
  return f[(i + 2) + (j + 2) * sx + (k + 2) * sx * sy];
}

__device__ inline double load_restrict_cell_padded(const double *f,
                                                   int i, int j, int k,
                                                   int sx, int sy)
{
  return f[(i + 1) + (j + 1) * sx + (k + 1) * sx * sy];
}

__global__ void prolong3_cell_kernel(const double *funcc, double *funf,
                                     int sxc, int syc,
                                     int nxc, int nyc, int nzc,
                                     int nxf, int nyf, int nzf,
                                     int lbc0, int lbc1, int lbc2,
                                     int lbf0, int lbf1, int lbf2,
                                     int ibegin, int iend,
                                     int jbegin, int jend,
                                     int kbegin, int kend,
                                     int *error_flag)
{
  const double C1 = 7.7e1 / 8.192e3;
  const double C2 = -6.93e2 / 8.192e3;
  const double C3 = 3.465e3 / 4.096e3;
  const double C4 = 1.155e3 / 4.096e3;
  const double C5 = -4.95e2 / 8.192e3;
  const double C6 = 6.3e1 / 8.192e3;
  const double w_even[6] = {C1, C2, C3, C4, C5, C6};
  const double w_odd[6] = {C6, C5, C4, C3, C2, C1};

  const int n = nxf * nyf * nzf;
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
  {
    const int plane = nxf * nyf;
    const int k = idx / plane;
    const int rem = idx - k * plane;
    const int j = rem / nxf;
    const int i = rem - j * nxf;

    if (i < ibegin || i > iend || j < jbegin || j > jend || k < kbegin || k > kend)
      continue;

    const int ii = i + lbf0;
    const int jj = j + lbf1;
    const int kk = k + lbf2;

    const int ic0 = ii / 2 - lbc0 - 1;
    const int jc0 = jj / 2 - lbc1 - 1;
    const int kc0 = kk / 2 - lbc2 - 1;

    const double *wx = (ii & 1) ? w_odd : w_even;
    const double *wy = (jj & 1) ? w_odd : w_even;
    const double *wz = (kk & 1) ? w_odd : w_even;

    double value = 0.0;
    for (int ox = 0; ox < 6; ++ox)
    {
      const int cx = ic0 + ox;
      double yz = 0.0;
      for (int oy = 0; oy < 6; ++oy)
      {
        const int cy = jc0 + oy;
        double zsum = 0.0;
        for (int oz = 0; oz < 6; ++oz)
        {
          const int cz = kc0 + oz;
          if (cx < -2 || cx > nxc || cy < -2 || cy > nyc || cz < -2 || cz > nzc)
          {
            if (error_flag)
              *error_flag = 1;
            zsum = 0.0;
            yz = 0.0;
            value = 0.0;
            goto prolong3_cell_kernel_next_point;
          }
          zsum += wz[oz] * load_prolong_cell_padded(funcc, cx, cy, cz, sxc, syc);
        }
        yz += wy[oy] * zsum;
      }
      value += wx[ox] * yz;
    }

    funf[idx] = value;
prolong3_cell_kernel_next_point:
    ;
  }
}

__global__ void prolong3_fill_equatorial_reflect_kernel(double *funcc,
                                                        int sxc, int syc,
                                                        int nxc, int nyc,
                                                        double z_soa)
{
  const int n = nxc * nyc * 3;
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
  {
    const int plane = nxc * nyc;
    const int zoff = idx / plane;
    const int rem = idx - zoff * plane;
    const int j = rem / nxc;
    const int i = rem - j * nxc;

    const int li = i + 1;
    const int lj = j + 1;
    const int source_k = zoff + 1;
    const int target_k = -zoff;

    funcc[(li + 2) + (lj + 2) * sxc + (target_k + 2) * sxc * syc] =
        z_soa * funcc[(li + 2) + (lj + 2) * sxc + (source_k + 2) * sxc * syc];
  }
}

__global__ void restrict3_cell_kernel(const double *funf, double *func,
                                      int sxf, int syf,
                                      int nxf, int nyf, int nzf,
                                      int nxc, int nyc, int nzc,
                                      int lbc0, int lbc1, int lbc2,
                                      int lbf0, int lbf1, int lbf2,
                                      int ibegin, int iend,
                                      int jbegin, int jend,
                                      int kbegin, int kend,
                                      int *error_flag)
{
  const double C1 = 3.0 / 256.0;
  const double C2 = -25.0 / 256.0;
  const double C3 = 75.0 / 128.0;
  const double w[6] = {C1, C2, C3, C3, C2, C1};

  const int n = nxc * nyc * nzc;
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
  {
    const int plane = nxc * nyc;
    const int k = idx / plane;
    const int rem = idx - k * plane;
    const int j = rem / nxc;
    const int i = rem - j * nxc;

    if (i < ibegin || i > iend || j < jbegin || j > jend || k < kbegin || k > kend)
      continue;

    const int fi = 2 * (i + lbc0) - lbf0;
    const int fj = 2 * (j + lbc1) - lbf1;
    const int fk = 2 * (k + lbc2) - lbf2;

    double value = 0.0;
    for (int ox = 0; ox < 6; ++ox)
    {
      const int fx = fi - 2 + ox;
      double yz = 0.0;
      for (int oy = 0; oy < 6; ++oy)
      {
        const int fy = fj - 2 + oy;
        double zsum = 0.0;
        for (int oz = 0; oz < 6; ++oz)
        {
          const int fz = fk - 2 + oz;
          if (fx < -1 || fx > nxf || fy < -1 || fy > nyf || fz < -1 || fz > nzf)
          {
            if (error_flag)
              *error_flag = 1;
            zsum = 0.0;
            yz = 0.0;
            value = 0.0;
            goto restrict3_cell_kernel_next_point;
          }
          zsum += w[oz] * load_restrict_cell_padded(funf, fx, fy, fz, sxf, syf);
        }
        yz += w[oy] * zsum;
      }
      value += w[ox] * yz;
    }

    func[idx] = value;
restrict3_cell_kernel_next_point:
    ;
  }
}

__global__ void interp_points_ord6_kernel(int num_points, int num_var,
                                          int nx, int ny,
                                          const double *const *fields,
                                          const double *soa_flat,
                                          const int *stencil_idx,
                                          const int *stencil_reflect,
                                          const double *stencil_weights,
                                          double *out,
                                          int *error_flag)
{
  const int total = num_points * num_var;
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total; idx += blockDim.x * gridDim.x)
  {
    const int point_id = idx / num_var;
    const int var_id = idx - point_id * num_var;
    const double *field = fields[var_id];
    const double *soa = soa_flat + 3 * var_id;
    const int *idxp = stencil_idx + point_id * 18;
    const int *refp = stencil_reflect + point_id * 18;
    const double *wp = stencil_weights + point_id * 18;

    double value = 0.0;
    #pragma unroll
    for (int kz = 0; kz < 6; ++kz)
    {
      const int z_idx = idxp[12 + kz];
      const double z_sign = refp[12 + kz] ? soa[2] : 1.0;
      double yz_sum = 0.0;
      #pragma unroll
      for (int jy = 0; jy < 6; ++jy)
      {
        const int y_idx = idxp[6 + jy];
        const double y_sign = refp[6 + jy] ? soa[1] : 1.0;
        double x_sum = 0.0;
        #pragma unroll
        for (int ix = 0; ix < 6; ++ix)
        {
          const int x_idx = idxp[ix];
          const double x_sign = refp[ix] ? soa[0] : 1.0;
          const double sign = x_sign * y_sign * z_sign;
          x_sum += wp[ix] * sign * field[index3(x_idx, y_idx, z_idx, nx, ny)];
        }
        yz_sum += wp[6 + jy] * x_sum;
      }
      value += wp[12 + kz] * yz_sum;
    }
    out[idx] = value;
  }
  (void)error_flag;
}

__global__ void rk4_kernel(int n, double dT,
                           const double *f0,
                           double *f1,
                           double *rhs,
                           int stage)
{
  const double half = 0.5;
  const double one_sixth = 1.0 / 6.0;

  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
  {
    if (stage == 0)
    {
      f1[idx] = f0[idx] + half * dT * rhs[idx];
    }
    else if (stage == 1)
    {
      rhs[idx] += 2.0 * f1[idx];
      f1[idx] = f0[idx] + half * dT * f1[idx];
    }
    else if (stage == 2)
    {
      rhs[idx] += 2.0 * f1[idx];
      f1[idx] = f0[idx] + dT * f1[idx];
    }
    else
    {
      f1[idx] = f0[idx] + one_sixth * dT * (f1[idx] + rhs[idx]);
    }
  }
}

__global__ void lowerbound_kernel(int n, double *chi, double tinny)
{
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
  {
    if (chi[idx] < tinny)
      chi[idx] = tinny;
  }
}

__global__ void copy_physical_boundary_kernel(int nx, int ny, int nz,
                                              int has_xmin, int has_ymin, int has_zmin,
                                              int has_xmax, int has_ymax, int has_zmax,
                                              const double *src, double *dst)
{
  const int n = nx * ny * nz;
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
  {
    const int plane = nx * ny;
    const int k = idx / plane;
    const int rem = idx - k * plane;
    const int j = rem / nx;
    const int i = rem - j * nx;

    if ((has_xmin && i == 0) || (has_xmax && i == nx - 1) ||
        (has_ymin && j == 0) || (has_ymax && j == ny - 1) ||
        (has_zmin && k == 0) || (has_zmax && k == nz - 1))
    {
      dst[idx] = src[idx];
    }
  }
}

__global__ void rk4_boundary_batch_kernel(int n, int nx, int ny, int nz,
                                          int has_xmin, int has_ymin, int has_zmin,
                                          int has_xmax, int has_ymax, int has_zmax,
                                          int num_var, double dT,
                                          const double *const *state0_list,
                                          double *const *stage_list,
                                          double *const *rhs_list,
                                          int stage)
{
  const double half = 0.5;
  const double one_sixth = 1.0 / 6.0;

  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
  {
    const int plane = nx * ny;
    const int k = idx / plane;
    const int rem = idx - k * plane;
    const int j = rem / nx;
    const int i = rem - j * nx;
    const bool is_boundary =
        (has_xmin && i == 0) || (has_xmax && i == nx - 1) ||
        (has_ymin && j == 0) || (has_ymax && j == ny - 1) ||
        (has_zmin && k == 0) || (has_zmax && k == nz - 1);

    for (int v = 0; v < num_var; ++v)
    {
      const double *f0 = state0_list[v];
      double *f1 = stage_list[v];
      double *rhs = rhs_list[v];

      double out;
      if (stage == 0)
      {
        out = f0[idx] + half * dT * rhs[idx];
      }
      else if (stage == 1)
      {
        rhs[idx] += 2.0 * f1[idx];
        out = f0[idx] + half * dT * f1[idx];
      }
      else if (stage == 2)
      {
        rhs[idx] += 2.0 * f1[idx];
        out = f0[idx] + dT * f1[idx];
      }
      else
      {
        out = f0[idx] + one_sixth * dT * (f1[idx] + rhs[idx]);
      }

      if (is_boundary)
        out = f0[idx];
      f1[idx] = out;
    }
  }
}

__global__ void sommerfeld_bam_kernel(int nx, int ny, int nz,
                                      const double *X, const double *Y, const double *Z,
                                      double xmin, double ymin, double zmin,
                                      double xmax, double ymax, double zmax,
                                      int has_xmin, int has_ymin, int has_zmin,
                                      int has_xmax, int has_ymax, int has_zmax,
                                      int imin, int jmin, int kmin,
                                      double propspeed,
                                      const double *f0,
                                      double *target,
                                      double soa0, double soa1, double soa2,
                                      int symmetry)
{
  const double one = 1.0;
  const double two = 2.0;
  const int n = nx * ny * nz;
  const double dX = X[1] - X[0];
  const double dY = Y[1] - Y[0];
  const double dZ = Z[1] - Z[0];
  const double d2dx = one / (two * dX);
  const double d2dy = one / (two * dY);
  const double d2dz = one / (two * dZ);
  const double soa[3] = {soa0, soa1, soa2};

  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
  {
    const int plane = nx * ny;
    const int k = idx / plane;
    const int rem = idx - k * plane;
    const int j = rem / nx;
    const int i = rem - j * nx;

    const bool on_boundary =
        (has_xmin && i == 0) || (has_xmax && i == nx - 1) ||
        (has_ymin && j == 0) || (has_ymax && j == ny - 1) ||
        (has_zmin && k == 0) || (has_zmax && k == nz - 1);

    if (!on_boundary)
      continue;

    const double radius = sqrt(X[i] * X[i] + Y[j] * Y[j] + Z[k] * Z[k]);
    if (radius == 0.0)
      continue;

    const double wx = propspeed * X[i] / radius;
    const double wy = propspeed * Y[j] / radius;
    const double wz = propspeed * Z[k] / radius;

    double fx = 0.0;
    double fy = 0.0;
    double fz = 0.0;

    if (wx > 0.0)
    {
      if (i - 2 >= imin)
        fx = d2dx * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
                   - 4.0 * load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i - 2, j, k, nx, ny, nz, soa, symmetry));
      else if (i - 1 >= imin)
        fx = d2dx * (-load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry));
      else
        fx = d2dx * (-load_symmetry_ord1(f0, i + 2, j, k, nx, ny, nz, soa, symmetry)
                   + 4.0 * load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry)
                   - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
    }
    else if (wx < 0.0)
    {
      if (i + 2 <= nx - 1)
        fx = d2dx * (-load_symmetry_ord1(f0, i + 2, j, k, nx, ny, nz, soa, symmetry)
                   + 4.0 * load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry)
                   - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
      else if (i + 1 <= nx - 1)
        fx = d2dx * (-load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry));
      else
        fx = d2dx * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
                   - 4.0 * load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i - 2, j, k, nx, ny, nz, soa, symmetry));
    }

    if (wy > 0.0)
    {
      if (j - 2 >= jmin)
        fy = d2dy * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
                   - 4.0 * load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j - 2, k, nx, ny, nz, soa, symmetry));
      else if (j - 1 >= jmin)
        fy = d2dy * (-load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry));
      else
        fy = d2dy * (-load_symmetry_ord1(f0, i, j + 2, k, nx, ny, nz, soa, symmetry)
                   + 4.0 * load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry)
                   - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
    }
    else if (wy < 0.0)
    {
      if (j + 2 <= ny - 1)
        fy = d2dy * (-load_symmetry_ord1(f0, i, j + 2, k, nx, ny, nz, soa, symmetry)
                   + 4.0 * load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry)
                   - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
      else if (j + 1 <= ny - 1)
        fy = d2dy * (-load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry));
      else
        fy = d2dy * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
                   - 4.0 * load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j - 2, k, nx, ny, nz, soa, symmetry));
    }

    if (wz > 0.0)
    {
      if (k - 2 >= kmin)
        fz = d2dz * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
                   - 4.0 * load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j, k - 2, nx, ny, nz, soa, symmetry));
      else if (k - 1 >= kmin)
        fz = d2dz * (-load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry));
      else
        fz = d2dz * (-load_symmetry_ord1(f0, i, j, k + 2, nx, ny, nz, soa, symmetry)
                   + 4.0 * load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry)
                   - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
    }
    else if (wz < 0.0)
    {
      if (k + 2 <= nz - 1)
        fz = d2dz * (-load_symmetry_ord1(f0, i, j, k + 2, nx, ny, nz, soa, symmetry)
                   + 4.0 * load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry)
                   - 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
      else if (k + 1 <= nz - 1)
        fz = d2dz * (-load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry));
      else
        fz = d2dz * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
                   - 4.0 * load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
                   + load_symmetry_ord1(f0, i, j, k - 2, nx, ny, nz, soa, symmetry));
    }

    target[idx] = -propspeed * (fx * X[i] + fy * Y[j] + fz * Z[k] + f0[idx]) / radius;
  }
}

inline bool launch_kernel(dim3 grid, dim3 block, const void *kernel, void **args)
{
  cudaError_t err = cudaLaunchKernel(kernel, grid, block, args, 0, nullptr);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaLaunchKernel", err);
    return false;
  }
  err = cudaPeekAtLastError();
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaPeekAtLastError", err);
    return false;
  }
  return true;
}

} // namespace

int bssn_cuda_enforce_ga(int *ex,
                         double *dxx, double *gxy, double *gxz,
                         double *dyy, double *gyz, double *dzz,
                         double *Axx, double *Axy, double *Axz,
                         double *Ayy, double *Ayz, double *Azz)
{
  struct EnforceGaCache
  {
    CachedBuffer dxx, gxy, gxz, dyy, gyz, dzz;
    CachedBuffer Axx, Axy, Axz, Ayy, Ayz, Azz;
  };
  static thread_local EnforceGaCache cache;

  int n = count_points(ex);
  const size_t bytes = static_cast<size_t>(n) * sizeof(double);
  dim3 block(256);
  dim3 grid(div_up(n, static_cast<int>(block.x)));

  bool ok = copy_to_device_preferring_device(cache.dxx, dxx, bytes) &&
            copy_to_device_preferring_device(cache.gxy, gxy, bytes) &&
            copy_to_device_preferring_device(cache.gxz, gxz, bytes) &&
            copy_to_device_preferring_device(cache.dyy, dyy, bytes) &&
            copy_to_device_preferring_device(cache.gyz, gyz, bytes) &&
            copy_to_device_preferring_device(cache.dzz, dzz, bytes) &&
            copy_to_device_preferring_device(cache.Axx, Axx, bytes) &&
            copy_to_device_preferring_device(cache.Axy, Axy, bytes) &&
            copy_to_device_preferring_device(cache.Axz, Axz, bytes) &&
            copy_to_device_preferring_device(cache.Ayy, Ayy, bytes) &&
            copy_to_device_preferring_device(cache.Ayz, Ayz, bytes) &&
            copy_to_device_preferring_device(cache.Azz, Azz, bytes);

  if (ok)
  {
    double *d_dxx = cache.dxx.ptr, *d_gxy = cache.gxy.ptr, *d_gxz = cache.gxz.ptr;
    double *d_dyy = cache.dyy.ptr, *d_gyz = cache.gyz.ptr, *d_dzz = cache.dzz.ptr;
    double *d_Axx = cache.Axx.ptr, *d_Axy = cache.Axy.ptr, *d_Axz = cache.Axz.ptr;
    double *d_Ayy = cache.Ayy.ptr, *d_Ayz = cache.Ayz.ptr, *d_Azz = cache.Azz.ptr;
    void *args[] = {&n, &d_dxx, &d_gxy, &d_gxz, &d_dyy, &d_gyz, &d_dzz,
                    &d_Axx, &d_Axy, &d_Axz, &d_Ayy, &d_Ayz, &d_Azz};
    ok = launch_kernel(grid, block, (const void *)enforce_ga_kernel, args);
  }

  if (ok)
  {
    // The next GPU RHS stage consumes these fields immediately.
    // Keep them device-resident and expose the mapping so gpu_rhs() can
    // reuse them via D2D copies instead of forcing an intermediate D2H round-trip.
    bssn_gpu_register_device_buffer(dxx, cache.dxx.ptr);
    bssn_gpu_register_device_buffer(gxy, cache.gxy.ptr);
    bssn_gpu_register_device_buffer(gxz, cache.gxz.ptr);
    bssn_gpu_register_device_buffer(dyy, cache.dyy.ptr);
    bssn_gpu_register_device_buffer(gyz, cache.gyz.ptr);
    bssn_gpu_register_device_buffer(dzz, cache.dzz.ptr);
    bssn_gpu_register_device_buffer(Axx, cache.Axx.ptr);
    bssn_gpu_register_device_buffer(Axy, cache.Axy.ptr);
    bssn_gpu_register_device_buffer(Axz, cache.Axz.ptr);
    bssn_gpu_register_device_buffer(Ayy, cache.Ayy.ptr);
    bssn_gpu_register_device_buffer(Ayz, cache.Ayz.ptr);
    bssn_gpu_register_device_buffer(Azz, cache.Azz.ptr);
  }

  return ok ? 0 : 1;
}

int bssn_cuda_rk4_boundary_var(int *ex, double dT,
                               const double *X, const double *Y, const double *Z,
                               double xmin, double ymin, double zmin,
                               double xmax, double ymax, double zmax,
                               const double *state0,
                               const double *phi_field,
                               const double *lap_field,
                               const double *boundary_src,
                               double *stage_data,
                               double *rhs_accum,
                               double propspeed,
                               const double SoA[3],
                               int symmetry,
                               int lev,
                               int rk_stage,
                               bool force_host_boundary_fix,
                               bool download_to_host)
{
  Rk4VarCache &cache = rk4_var_cache_map()[state0];

  int nx = ex[0];
  int ny = ex[1];
  int nz = ex[2];
  int n = count_points(ex);
  const size_t bytes = static_cast<size_t>(n) * sizeof(double);
  const size_t bytes_x = static_cast<size_t>(nx) * sizeof(double);
  const size_t bytes_y = static_cast<size_t>(ny) * sizeof(double);
  const size_t bytes_z = static_cast<size_t>(nz) * sizeof(double);
  dim3 block(256);
  dim3 grid(div_up(n, static_cast<int>(block.x)));

  const bool need_bam_boundary = (lev == 0);
  const bool need_coord_copy = need_bam_boundary;
  const bool need_boundary_input = need_bam_boundary && (rk_stage != 0);
  const bool need_stage_input = (rk_stage != 0);
  bool ok = true;
  if (need_coord_copy &&
      (cache.host_X != X || cache.host_Y != Y || cache.host_Z != Z ||
       cache.nx != nx || cache.ny != ny || cache.nz != nz))
  {
    ok = copy_to_device(cache.X, X, bytes_x) &&
         copy_to_device(cache.Y, Y, bytes_y) &&
         copy_to_device(cache.Z, Z, bytes_z);
    if (ok)
    {
      cache.host_X = X;
      cache.host_Y = Y;
      cache.host_Z = Z;
      cache.nx = nx;
      cache.ny = ny;
      cache.nz = nz;
    }
  }

  const bool refresh_state0 =
      (rk_stage == 0) || cache.host_state0 != state0 || cache.nx != nx || cache.ny != ny || cache.nz != nz;
  const bool refresh_rhs =
      (rk_stage == 0) || !cache.rhs_resident || cache.host_rhs != rhs_accum;
  double *stage_ptr = nullptr;
  const double *mapped_state0_ptr = refresh_state0 ? bssn_gpu_find_device_buffer(state0) : cache.state0.ptr;
  const double *mapped_boundary_ptr = need_boundary_input ? bssn_gpu_find_device_buffer(boundary_src) : nullptr;
  const double *mapped_stage_ptr = need_stage_input ? bssn_gpu_find_device_buffer(stage_data) : nullptr;
  const double *mapped_rhs_ptr = refresh_rhs ? bssn_gpu_find_device_buffer(rhs_accum) : cache.rhs.ptr;

  if (refresh_state0 && !mapped_state0_ptr)
    bssn_gpu_prepare_host_buffer(state0, n);
  if (need_boundary_input && !mapped_boundary_ptr)
    bssn_gpu_prepare_host_buffer(boundary_src, n);
  if (need_stage_input && !mapped_stage_ptr)
    bssn_gpu_prepare_host_buffer(stage_data, n);
  if (refresh_rhs && !mapped_rhs_ptr)
    bssn_gpu_prepare_host_buffer(rhs_accum, n);

  ok = ok &&
       (!refresh_state0 || copy_to_device_preferring_device(cache.state0, state0, bytes)) &&
       (!need_boundary_input || copy_to_device_preferring_device(cache.boundary, boundary_src, bytes)) &&
       (!refresh_rhs || copy_to_device_preferring_device(cache.rhs, rhs_accum, bytes));

  if (ok && need_stage_input)
  {
    if (mapped_stage_ptr)
    {
      stage_ptr = const_cast<double *>(mapped_stage_ptr);
    }
    else
    {
      ok = copy_to_device_preferring_device(cache.stage, stage_data, bytes);
      stage_ptr = cache.stage.ptr;
    }
  }
  else if (ok)
  {
    ok = ensure_capacity(cache.stage, bytes);
    stage_ptr = cache.stage.ptr;
  }

  if (!ok)
    return 1;

  if (refresh_state0)
  {
    cache.host_state0 = state0;
    bssn_gpu_register_device_buffer(state0, cache.state0.ptr);
  }
  if (refresh_rhs)
  {
    cache.host_rhs = rhs_accum;
    cache.rhs_resident = true;
    bssn_gpu_register_device_buffer(rhs_accum, cache.rhs.ptr);
  }

  double dX = X[1] - X[0];
  double dY = Y[1] - Y[0];
  double dZ = Z[1] - Z[0];
  const int no_symm = 0, eq_symm = 1, octant = 2;
  int has_xmax = (std::fabs(X[nx - 1] - xmax) < dX);
  int has_ymax = (std::fabs(Y[ny - 1] - ymax) < dY);
  int has_zmax = (std::fabs(Z[nz - 1] - zmax) < dZ);
  int has_xmin = (std::fabs(X[0] - xmin) < dX) && !(symmetry == octant && std::fabs(xmin) < dX / 2.0);
  int has_ymin = (std::fabs(Y[0] - ymin) < dY) && !(symmetry == octant && std::fabs(ymin) < dY / 2.0);
  int has_zmin = (std::fabs(Z[0] - zmin) < dZ) && !(symmetry > no_symm && std::fabs(zmin) < dZ / 2.0);
  double soa0 = SoA[0];
  double soa1 = SoA[1];
  double soa2 = SoA[2];

  if (need_bam_boundary)
  {
    int imin = 1;
    int jmin = 1;
    int kmin = 1;
    if (symmetry > no_symm && std::fabs(Z[0]) < dZ) kmin = 0;
    if (symmetry > eq_symm && std::fabs(X[0]) < dX) imin = 0;
    if (symmetry > eq_symm && std::fabs(Y[0]) < dY) jmin = 0;

    double *d_X = cache.X.ptr, *d_Y = cache.Y.ptr, *d_Z = cache.Z.ptr;
    double *d_state0 = cache.state0.ptr, *d_boundary = cache.boundary.ptr;
    double *d_rhs = cache.rhs.ptr;
    double *bam_target = (rk_stage == 0) ? d_rhs : stage_ptr;
    const double *bam_source = (rk_stage == 0) ? d_state0 : d_boundary;
    void *args[] = {&nx, &ny, &nz, &d_X, &d_Y, &d_Z,
                    &xmin, &ymin, &zmin, &xmax, &ymax, &zmax,
                    &has_xmin, &has_ymin, &has_zmin,
                    &has_xmax, &has_ymax, &has_zmax,
                    &imin, &jmin, &kmin, &propspeed,
                    &bam_source, &bam_target,
                    &soa0, &soa1, &soa2,
                    &symmetry};
    ok = launch_kernel(grid, block, (const void *)sommerfeld_bam_kernel, args);
  }

  if (ok && (lev == 0 || !force_host_boundary_fix))
  {
    double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr, *d_rhs = cache.rhs.ptr;
    void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage};
    ok = launch_kernel(grid, block, (const void *)rk4_kernel, args);
  }

  if (ok && lev > 0 && !force_host_boundary_fix)
  {
    double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr;
    void *args[] = {&nx, &ny, &nz,
                    &has_xmin, &has_ymin, &has_zmin,
                    &has_xmax, &has_ymax, &has_zmax,
                    &d_state0, &d_stage};
    ok = launch_kernel(grid, block, (const void *)copy_physical_boundary_kernel, args);
  }

  if (ok && lev > 0 && force_host_boundary_fix)
  {
    double *host_state0 = const_cast<double *>(state0);
    double *host_phi = const_cast<double *>(phi_field);
    double *host_lap = const_cast<double *>(lap_field);
    double *host_rhs = rhs_accum;

    // state0/phi/lap are read-only during the current RK step, so the host copies
    // remain valid even if cached device mirrors exist. Only the RHS accumulator
    // is updated on device and must be synchronized back for the CPU fallback.
    ok = sync_host_from_mapped_device(host_rhs, n, "cudaMemcpy(D2H) rhs_accum");
    if (ok)
    {
      bssn_gpu_prepare_host_buffer(stage_data, n);
      cudaError_t err = cudaMemcpy(stage_data, stage_ptr, bytes, cudaMemcpyDeviceToHost);
      if (err != cudaSuccess)
      {
        report_cuda_error("cudaMemcpy(D2H) stage_data", err);
        ok = false;
      }
    }

    if (ok)
    {
      int rk_stage_host = rk_stage;
      int cor = 1;
      f_rungekutta4_rout(ex, dT, host_state0, stage_data, host_rhs, rk_stage_host);
      f_sommerfeld_rout(ex,
                        const_cast<double *>(X), const_cast<double *>(Y), const_cast<double *>(Z),
                        xmin, ymin, zmin, xmax, ymax, zmax,
                        dT,
                        host_phi, host_lap,
                        host_state0, stage_data,
                        const_cast<double *>(SoA),
                        symmetry, cor);

      cudaError_t err = cudaMemcpy(stage_ptr, stage_data, bytes, cudaMemcpyHostToDevice);
      if (err != cudaSuccess)
      {
        report_cuda_error("cudaMemcpy(H2D) stage_data sommerfeld", err);
        ok = false;
      }
    }
  }

  if (ok)
  {
    bssn_gpu_register_device_buffer(stage_data, stage_ptr);
    if (download_to_host)
    {
      cudaError_t err = cudaMemcpy(stage_data, stage_ptr, bytes, cudaMemcpyDeviceToHost);
      if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err);
      ok = err == cudaSuccess;
    }
  }

  return ok ? 0 : 1;
}

int bssn_cuda_rk4_boundary_batch(int *ex, double dT,
                                 const double *X, const double *Y, const double *Z,
                                 double xmin, double ymin, double zmin,
                                 double xmax, double ymax, double zmax,
                                 int symmetry,
                                 const double *const *state0_list,
                                 double *const *stage_data_list,
                                 double *const *rhs_accum_list,
                                 int num_var,
                                 int rk_stage,
                                 bool download_to_host)
{
  if (!state0_list || !stage_data_list || !rhs_accum_list || num_var <= 0)
    return 1;

  const int nx = ex[0];
  const int ny = ex[1];
  const int nz = ex[2];
  const int n = count_points(ex);
  const size_t bytes = static_cast<size_t>(n) * sizeof(double);
  const size_t ptr_bytes = static_cast<size_t>(num_var) * sizeof(double *);
  dim3 block(256);
  dim3 grid(div_up(n, static_cast<int>(block.x)));

  std::vector<const double *> host_state0_ptrs(num_var);
  std::vector<double *> host_stage_ptrs(num_var);
  std::vector<double *> host_rhs_ptrs(num_var);

  bool ok = true;
  for (int v = 0; v < num_var && ok; ++v)
  {
    const double *state0 = state0_list[v];
    double *stage_data = stage_data_list[v];
    double *rhs_accum = rhs_accum_list[v];
    Rk4VarCache &cache = rk4_var_cache_map()[state0];

    const bool refresh_state0 =
        (rk_stage == 0) || cache.host_state0 != state0 || cache.nx != nx || cache.ny != ny || cache.nz != nz;
    const bool refresh_rhs =
        (rk_stage == 0) || !cache.rhs_resident || cache.host_rhs != rhs_accum;
    const bool need_stage_input = (rk_stage != 0);
    double *stage_ptr = nullptr;
    const double *mapped_state0_ptr = refresh_state0 ? bssn_gpu_find_device_buffer(state0) : cache.state0.ptr;
    const double *mapped_stage_ptr = need_stage_input ? bssn_gpu_find_device_buffer(stage_data) : nullptr;
    const double *mapped_rhs_ptr = refresh_rhs ? bssn_gpu_find_device_buffer(rhs_accum) : cache.rhs.ptr;

    if (refresh_state0 && !mapped_state0_ptr)
      bssn_gpu_prepare_host_buffer(state0, n);
    if (need_stage_input && !mapped_stage_ptr)
      bssn_gpu_prepare_host_buffer(stage_data, n);
    if (refresh_rhs && !mapped_rhs_ptr)
      bssn_gpu_prepare_host_buffer(rhs_accum, n);

    ok = (!refresh_state0 || copy_to_device_preferring_device(cache.state0, state0, bytes)) &&
         (!refresh_rhs || copy_to_device_preferring_device(cache.rhs, rhs_accum, bytes));
    if (!ok)
      break;

    if (need_stage_input)
    {
      if (mapped_stage_ptr)
      {
        stage_ptr = const_cast<double *>(mapped_stage_ptr);
      }
      else
      {
        ok = copy_to_device_preferring_device(cache.stage, stage_data, bytes);
        stage_ptr = cache.stage.ptr;
      }
    }
    else
    {
      ok = ensure_capacity(cache.stage, bytes);
      stage_ptr = cache.stage.ptr;
    }
    if (!ok)
      break;

    if (refresh_state0)
    {
      cache.host_state0 = state0;
      cache.nx = nx;
      cache.ny = ny;
      cache.nz = nz;
      bssn_gpu_register_device_buffer(state0, cache.state0.ptr);
    }
    if (refresh_rhs)
    {
      cache.host_rhs = rhs_accum;
      cache.rhs_resident = true;
      bssn_gpu_register_device_buffer(rhs_accum, cache.rhs.ptr);
    }

    host_state0_ptrs[v] = cache.state0.ptr;
    host_stage_ptrs[v] = stage_ptr;
    host_rhs_ptrs[v] = cache.rhs.ptr;
  }

  if (!ok)
    return 1;

  Rk4BatchCache &batch_cache = rk4_batch_cache();
  ok = ensure_capacity(batch_cache.state0_ptrs, ptr_bytes) &&
       ensure_capacity(batch_cache.stage_ptrs, ptr_bytes) &&
       ensure_capacity(batch_cache.rhs_ptrs, ptr_bytes);
  if (!ok)
    return 1;

  cudaError_t err = cudaMemcpy(batch_cache.state0_ptrs.ptr, &host_state0_ptrs[0], ptr_bytes, cudaMemcpyHostToDevice);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMemcpy(H2D) batch state0 ptrs", err);
    return 1;
  }
  err = cudaMemcpy(batch_cache.stage_ptrs.ptr, &host_stage_ptrs[0], ptr_bytes, cudaMemcpyHostToDevice);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMemcpy(H2D) batch stage ptrs", err);
    return 1;
  }
  err = cudaMemcpy(batch_cache.rhs_ptrs.ptr, &host_rhs_ptrs[0], ptr_bytes, cudaMemcpyHostToDevice);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMemcpy(H2D) batch rhs ptrs", err);
    return 1;
  }

  double dX = X[1] - X[0];
  double dY = Y[1] - Y[0];
  double dZ = Z[1] - Z[0];
  const int no_symm = 0, octant = 2;
  int has_xmax = (std::fabs(X[nx - 1] - xmax) < dX);
  int has_ymax = (std::fabs(Y[ny - 1] - ymax) < dY);
  int has_zmax = (std::fabs(Z[nz - 1] - zmax) < dZ);
  int has_xmin = (std::fabs(X[0] - xmin) < dX) && !(symmetry == octant && std::fabs(xmin) < dX / 2.0);
  int has_ymin = (std::fabs(Y[0] - ymin) < dY) && !(symmetry == octant && std::fabs(ymin) < dY / 2.0);
  int has_zmin = (std::fabs(Z[0] - zmin) < dZ) && !(symmetry > no_symm && std::fabs(zmin) < dZ / 2.0);

  int n_arg = n, nx_arg = nx, ny_arg = ny, nz_arg = nz;
  int num_var_arg = num_var, rk_stage_arg = rk_stage;
  void *args[] = {&n_arg, &nx_arg, &ny_arg, &nz_arg,
                  &has_xmin, &has_ymin, &has_zmin,
                  &has_xmax, &has_ymax, &has_zmax,
                  &num_var_arg, &dT,
                  &batch_cache.state0_ptrs.ptr,
                  &batch_cache.stage_ptrs.ptr,
                  &batch_cache.rhs_ptrs.ptr,
                  &rk_stage_arg};
  ok = launch_kernel(grid, block, (const void *)rk4_boundary_batch_kernel, args);

  if (!ok)
    return 1;

  for (int v = 0; v < num_var; ++v)
  {
    bssn_gpu_register_device_buffer(stage_data_list[v], host_stage_ptrs[v]);
    if (download_to_host)
    {
      err = cudaMemcpy(stage_data_list[v], host_stage_ptrs[v], bytes, cudaMemcpyDeviceToHost);
      if (err != cudaSuccess)
      {
        report_cuda_error("cudaMemcpy(D2H) batch stage_data", err);
        return 1;
      }
    }
  }

  return 0;
}

void bssn_cuda_release_rk4_caches()
{
  std::unordered_map<const double *, Rk4VarCache> &cache_map = rk4_var_cache_map();
  for (std::unordered_map<const double *, Rk4VarCache>::iterator it = cache_map.begin();
       it != cache_map.end(); ++it)
  {
    Rk4VarCache &cache = it->second;
    release_buffer(cache.X);
    release_buffer(cache.Y);
    release_buffer(cache.Z);
    release_buffer(cache.state0);
    release_buffer(cache.boundary);
    release_buffer(cache.stage);
    release_buffer(cache.rhs);
  }
  cache_map.clear();
  release_buffer(rk4_batch_cache().state0_ptrs);
  release_buffer(rk4_batch_cache().stage_ptrs);
  release_buffer(rk4_batch_cache().rhs_ptrs);
}

void bssn_cuda_release_interp_caches()
{
  release_interp_batch_cache(interp_batch_cache());
}

int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_host)
{
  static thread_local CachedBuffer d_chi;

  int n = count_points(ex);
  const size_t bytes = static_cast<size_t>(n) * sizeof(double);
  dim3 block(256);
  dim3 grid(div_up(n, static_cast<int>(block.x)));

  double *device_chi = nullptr;
  const double *mapped = bssn_gpu_find_device_buffer(chi);
  bool ok = true;
  if (mapped)
  {
    device_chi = const_cast<double *>(mapped);
  }
  else
  {
    ok = copy_to_device_preferring_device(d_chi, chi, bytes);
    device_chi = d_chi.ptr;
  }

  if (ok)
  {
    double *ptr = device_chi;
    void *args[] = {&n, &ptr, &tinny};
    ok = launch_kernel(grid, block, (const void *)lowerbound_kernel, args);
  }

  if (ok)
  {
    bssn_gpu_register_device_buffer(chi, device_chi);
    if (download_to_host)
    {
      bssn_gpu_prepare_host_buffer(chi, n);
      cudaError_t err = cudaMemcpy(chi, device_chi, bytes, cudaMemcpyDeviceToHost);
      if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) chi", err);
      ok = err == cudaSuccess;
    }
  }
  return ok ? 0 : 1;
}

int bssn_cuda_download_buffer(int *ex, double *host_ptr)
{
  const double *device_ptr = bssn_gpu_find_device_buffer(host_ptr);
  if (!device_ptr)
    return 1;

  const int n = count_points(ex);
  bssn_gpu_prepare_host_buffer(host_ptr, n);
  const size_t bytes = static_cast<size_t>(n) * sizeof(double);
  cudaError_t err = cudaMemcpy(host_ptr, device_ptr, bytes, cudaMemcpyDeviceToHost);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMemcpy(D2H) buffered download", err);
    return 1;
  }
  return 0;
}

int bssn_cuda_interp_points_batch(const int *ex,
                                  const double *X, const double *Y, const double *Z,
                                  const double *const *fields,
                                  const double *soa_flat,
                                  int num_var,
                                  const double *px, const double *py, const double *pz,
                                  int num_points,
                                  int ordn,
                                  int symmetry,
                                  double *out)
{
  if (!ex || !X || !Y || !Z || !fields || !soa_flat || !px || !py || !pz || !out)
    return 1;
  if (num_var <= 0 || num_points <= 0 || ordn != 6)
    return 1;
  if (ex[0] < ordn || ex[1] < ordn || ex[2] < ordn)
    return 1;

  InterpBatchCache &cache = interp_batch_cache();

  const int nx = ex[0];
  const int ny = ex[1];
  const int nz = ex[2];
  const int field_points = count_points(ex);
  const size_t field_bytes = static_cast<size_t>(field_points) * sizeof(double);
  const size_t out_bytes = static_cast<size_t>(num_points) * static_cast<size_t>(num_var) * sizeof(double);
  const size_t soa_bytes = static_cast<size_t>(3 * num_var) * sizeof(double);
  const size_t ptr_bytes = static_cast<size_t>(num_var) * sizeof(double *);
  const size_t point_stencil_doubles = static_cast<size_t>(num_points) * 18;
  const size_t point_stencil_ints = static_cast<size_t>(num_points) * 18;
  const size_t weights_bytes = point_stencil_doubles * sizeof(double);
  const size_t indices_bytes = point_stencil_ints * sizeof(int);

  bool ok = true;
  InterpStencilCacheEntry &stencil_cache = cache.stencil_entry;
  const bool stencil_match =
      stencil_cache.valid &&
      stencil_cache.X == X && stencil_cache.Y == Y && stencil_cache.Z == Z &&
      stencil_cache.px == px && stencil_cache.py == py && stencil_cache.pz == pz &&
      stencil_cache.nx == nx && stencil_cache.ny == ny && stencil_cache.nz == nz &&
      stencil_cache.num_points == num_points && stencil_cache.ordn == ordn &&
      stencil_cache.symmetry == symmetry;

  if (!stencil_match)
  {
    release_interp_stencil_cache(stencil_cache);
    stencil_cache.X = X;
    stencil_cache.Y = Y;
    stencil_cache.Z = Z;
    stencil_cache.px = px;
    stencil_cache.py = py;
    stencil_cache.pz = pz;
    stencil_cache.nx = nx;
    stencil_cache.ny = ny;
    stencil_cache.nz = nz;
    stencil_cache.num_points = num_points;
    stencil_cache.ordn = ordn;
    stencil_cache.symmetry = symmetry;
    stencil_cache.valid = true;

    std::vector<double> host_weights(point_stencil_doubles);
    std::vector<int> host_indices(point_stencil_ints);
    std::vector<int> host_reflect(point_stencil_ints);
    const double dx = X[1] - X[0];
    const double dy = Y[1] - Y[0];
    const double dz = Z[1] - Z[0];
    const int allow_reflect_x = (symmetry == 2 && std::fabs(X[0]) < dx);
    const int allow_reflect_y = (symmetry == 2 && std::fabs(Y[0]) < dy);
    const int allow_reflect_z = (symmetry != 0 && std::fabs(Z[0]) < dz);
    for (int p = 0; p < num_points; ++p)
    {
      int start_x = 0, start_y = 0, start_z = 0;
      double cx = 0.0, cy = 0.0, cz = 0.0;
      const bool ok_x = compute_interp_window_host(px[p], X, nx, ordn, allow_reflect_x, &start_x, &cx);
      const bool ok_y = compute_interp_window_host(py[p], Y, ny, ordn, allow_reflect_y, &start_y, &cy);
      const bool ok_z = compute_interp_window_host(pz[p], Z, nz, ordn, allow_reflect_z, &start_z, &cz);
      if (!ok_x || !ok_y || !ok_z)
        return 1;

      lagrange_weights_ord6_host(cx, host_weights.data() + p * 18);
      lagrange_weights_ord6_host(cy, host_weights.data() + p * 18 + 6);
      lagrange_weights_ord6_host(cz, host_weights.data() + p * 18 + 12);

      for (int i = 0; i < 6; ++i)
      {
        if (!map_interp_index_host(start_x + i, nx, &host_indices[p * 18 + i], &host_reflect[p * 18 + i]))
          return 1;
        if (!map_interp_index_host(start_y + i, ny, &host_indices[p * 18 + 6 + i], &host_reflect[p * 18 + 6 + i]))
          return 1;
        if (!map_interp_index_host(start_z + i, nz, &host_indices[p * 18 + 12 + i], &host_reflect[p * 18 + 12 + i]))
          return 1;
      }
    }

    ok = ok &&
         copy_to_device(stencil_cache.weights, host_weights.data(), weights_bytes) &&
         copy_to_device(stencil_cache.indices, host_indices.data(), indices_bytes) &&
         copy_to_device(stencil_cache.reflect, host_reflect.data(), indices_bytes);
    if (!ok)
      return 1;
  }

  ok = ok &&
       copy_to_device(cache.soa, soa_flat, soa_bytes) &&
       ensure_capacity(cache.out, out_bytes) &&
       ensure_capacity(cache.field_ptrs, ptr_bytes) &&
       ensure_capacity(cache.error_flag, sizeof(int));
  if (!ok)
    return 1;

  if (static_cast<int>(cache.host_field_copies.size()) < num_var)
    cache.host_field_copies.resize(num_var);

  std::vector<const double *> device_fields(num_var);
  for (int v = 0; v < num_var; ++v)
  {
    const double *device_field = bssn_gpu_find_device_buffer(fields[v]);
    if (!device_field)
    {
      ok = copy_to_device(cache.host_field_copies[v], fields[v], field_bytes);
      device_field = cache.host_field_copies[v].ptr;
    }
    device_fields[v] = device_field;
    if (!ok || !device_fields[v])
      return 1;
  }

  int zero = 0;
  cudaError_t err = cudaMemcpy(cache.field_ptrs.ptr, device_fields.data(), ptr_bytes, cudaMemcpyHostToDevice);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMemcpy(H2D) field_ptrs", err);
    return 1;
  }
  err = cudaMemcpy(cache.error_flag.ptr, &zero, sizeof(int), cudaMemcpyHostToDevice);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMemcpy(H2D) interp_error_flag", err);
    return 1;
  }

  dim3 block(128);
  dim3 grid(div_up(num_points * num_var, static_cast<int>(block.x)));
  if (grid.x > 4096)
    grid.x = 4096;

  int nx_local = nx;
  int ny_local = ny;
  const double *dsoa = cache.soa.ptr;
  const double *const *dfields = reinterpret_cast<const double *const *>(cache.field_ptrs.ptr);
  const double *dweights = stencil_cache.weights.ptr;
  const int *dindices = stencil_cache.indices.ptr;
  const int *dreflect = stencil_cache.reflect.ptr;
  double *dout = cache.out.ptr;
  int *derror = cache.error_flag.ptr;

  void *args[] = {
      &num_points, &num_var,
      &nx_local, &ny_local,
      &dfields,
      &dsoa,
      &dindices,
      &dreflect,
      &dweights,
      &dout,
      &derror};
  ok = launch_kernel(grid, block, (const void *)interp_points_ord6_kernel, args);
  if (!ok)
    return 1;

  int error_flag = 0;
  err = cudaMemcpy(&error_flag, cache.error_flag.ptr, sizeof(int), cudaMemcpyDeviceToHost);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMemcpy(D2H) interp_error_flag", err);
    return 1;
  }
  if (error_flag != 0)
    return 1;

  err = cudaMemcpy(out, cache.out.ptr, out_bytes, cudaMemcpyDeviceToHost);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMemcpy(D2H) interp_out", err);
    return 1;
  }

  return 0;
}

int bssn_cuda_prolong3_pack(int wei,
                            const double *llbc, const double *uubc, const int *extc, const double *func,
                            const double *llbf, const double *uubf, const int *extf, double *funf,
                            const double *llbp, const double *uubp,
                            const double *SoA, int symmetry)
{
  const int no_symm = 0;
  const int eq_symm = 1;

  if (wei != 3 || !llbc || !uubc || !extc || !func || !llbf || !uubf || !extf || !funf || !llbp || !uubp || !SoA)
    return 1;

  // Keep the CPU fallback for octant and any unrecognized symmetry modes.
  if (symmetry != no_symm && symmetry != eq_symm)
    return 1;

  auto idint_like = [](double x) -> int {
    return static_cast<int>(std::trunc(x));
  };

  double base[3];
  double CD[3];
  double FD[3];
  int lbc[3], ubc[3], lbf[3], ubf[3], lbp[3], ubp[3], lbpc[3], ubpc[3];
  for (int d = 0; d < 3; ++d)
  {
    CD[d] = (uubc[d] - llbc[d]) / static_cast<double>(extc[d]);
    FD[d] = (uubf[d] - llbf[d]) / static_cast<double>(extf[d]);
    if (std::fabs(CD[d] - 2.0 * FD[d]) > 1.0e-10)
      return 1;

    if (llbc[d] <= llbf[d])
    {
      base[d] = llbc[d];
    }
    else
    {
      int j = idint_like((llbc[d] - llbf[d]) / FD[d] + 0.4);
      base[d] = (j / 2 * 2 == j) ? llbf[d] : (llbf[d] - CD[d] / 2.0);
    }

    lbf[d] = idint_like((llbf[d] - base[d]) / FD[d] + 0.4) + 1;
    ubf[d] = idint_like((uubf[d] - base[d]) / FD[d] + 0.4);
    lbc[d] = idint_like((llbc[d] - base[d]) / CD[d] + 0.4) + 1;
    ubc[d] = idint_like((uubc[d] - base[d]) / CD[d] + 0.4);
    lbp[d] = idint_like((llbp[d] - base[d]) / FD[d] + 0.4) + 1;
    lbpc[d] = idint_like((llbp[d] - base[d]) / CD[d] + 0.4) + 1;
    ubp[d] = idint_like((uubp[d] - base[d]) / FD[d] + 0.4);
    ubpc[d] = idint_like((uubp[d] - base[d]) / CD[d] + 0.4);
    (void)ubc[d];
    (void)ubf[d];
  }

  const int imino = lbp[0] - lbf[0] + 1;
  const int imaxo = ubp[0] - lbf[0] + 1;
  const int jmino = lbp[1] - lbf[1] + 1;
  const int jmaxo = ubp[1] - lbf[1] + 1;
  const int kmino = lbp[2] - lbf[2] + 1;
  const int kmaxo = ubp[2] - lbf[2] + 1;

  const int imini = lbpc[0] - lbc[0] + 1;
  const int imaxi = ubpc[0] - lbc[0] + 1;
  const int jmini = lbpc[1] - lbc[1] + 1;
  const int jmaxi = ubpc[1] - lbc[1] + 1;
  const int kmini = lbpc[2] - lbc[2] + 1;
  const int kmaxi = ubpc[2] - lbc[2] + 1;

  if (imino < 1 || jmino < 1 || kmino < 1 ||
      imini < 1 || jmini < 1 || kmini < 1 ||
      imaxo > extf[0] || jmaxo > extf[1] || kmaxo > extf[2] ||
      imaxi > extc[0] - 2 || jmaxi > extc[1] - 2 || kmaxi > extc[2] - 2)
  {
    return 1;
  }

  const bool thin_x = (extf[0] <= 2 * wei);
  const bool thin_y = (extf[1] <= 2 * wei);
  const bool thin_z = (extf[2] <= 2 * wei);
  // Keep thin slabs on the CPU path until the dedicated slab kernel is
  // fully validated. The first z-thin specialization still triggers illegal
  // accesses on the common extf=(48,48,6) case.
  if (thin_x || thin_y || thin_z)
    return 1;

  auto coarse_center_index = [](int fine_idx, int lbf_d, int lbc_d) -> int {
    int ii = fine_idx + lbf_d - 1;
    return ii / 2 - lbc_d + 1;
  };
  const int ic_min = coarse_center_index(imino, lbf[0], lbc[0]);
  const int ic_max = coarse_center_index(imaxo, lbf[0], lbc[0]);
  const int jc_min = coarse_center_index(jmino, lbf[1], lbc[1]);
  const int jc_max = coarse_center_index(jmaxo, lbf[1], lbc[1]);
  const int kc_min = coarse_center_index(kmino, lbf[2], lbc[2]);
  const int kc_max = coarse_center_index(kmaxo, lbf[2], lbc[2]);
  const bool need_equatorial_z_reflect = (symmetry == eq_symm && kc_min - 2 < 1);

  // Match the active Fortran implementation's later stencil safety check.
  if (ic_max + 3 > extc[0] || jc_max + 3 > extc[1] || kc_max + 3 > extc[2] ||
      ic_min < 1 || jc_min < 1 || kc_min < 0)
  {
    return 1;
  }

  // Match the active Cell-centered Fortran fast path: only the z<0
  // reflection needed by equatorial symmetry is handled here.
  if (ic_min - 2 < 1 || jc_min - 2 < 1 || (kc_min - 2 < 1 && !need_equatorial_z_reflect))
    return 1;

  struct ProlongCache
  {
    CachedBuffer coarse;
    CachedBuffer fine;
    CachedIntBuffer error_flag;
  };
  static thread_local ProlongCache cache;

  int nxf = extf[0], nyf = extf[1], nzf = extf[2];
  const int stage_ix_lo = ic_min - 2;
  const int stage_ix_hi = ic_max + 3;
  const int stage_iy_lo = jc_min - 2;
  const int stage_iy_hi = jc_max + 3;
  const int stage_iz_lo = need_equatorial_z_reflect ? 1 : (kc_min - 2);
  const int stage_iz_hi = kc_max + 3;
  int nxc = stage_ix_hi - stage_ix_lo + 1;
  int nyc = stage_iy_hi - stage_iy_lo + 1;
  int nzc = stage_iz_hi - stage_iz_lo + 1;
  int sxc = nxc + 3;
  int syc = nyc + 3;
  int szc = nzc + 3;
  int fine_points = nxf * nyf * nzf;
  const size_t fine_bytes = static_cast<size_t>(fine_points) * sizeof(double);
  if (!copy_region_to_padded_stage(cache.coarse,
                                   func,
                                   extc,
                                   stage_ix_lo - 1,
                                   stage_iy_lo - 1,
                                   stage_iz_lo - 1,
                                   nxc,
                                   nyc,
                                   nzc,
                                   sxc,
                                   syc,
                                   szc,
                                   3,
                                   3,
                                   3,
                                   "cudaMemcpy3D prolong3 stage"))
  {
    return 1;
  }

  double *d_func = cache.coarse.ptr;

  if (need_equatorial_z_reflect)
  {
    dim3 reflect_block(256);
    dim3 reflect_grid(div_up(nxc * nyc * 3, static_cast<int>(reflect_block.x)));
    double z_soa = SoA[2];
    void *reflect_args[] = {
        &d_func,
        &sxc,
        &syc,
        &nxc,
        &nyc,
        &z_soa};
    if (!launch_kernel(reflect_grid, reflect_block,
                       (const void *)prolong3_fill_equatorial_reflect_kernel,
                       reflect_args))
    {
      return 1;
    }
  }

  if (!ensure_capacity(cache.fine, fine_bytes))
  {
    return 1;
  }
  if (!ensure_capacity(cache.error_flag, sizeof(int)))
  {
    return 1;
  }

  int zero = 0;
  cudaError_t err = cudaMemcpy(cache.error_flag.ptr, &zero, sizeof(int), cudaMemcpyHostToDevice);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMemcpy(H2D) prolong3_error_flag", err);
    return 1;
  }

  dim3 block(256);
  dim3 grid(div_up(fine_points, static_cast<int>(block.x)));
  double *d_funf = cache.fine.ptr;
  int lbc_local[3] = {lbc[0] + stage_ix_lo - 1,
                      lbc[1] + stage_iy_lo - 1,
                      lbc[2] + stage_iz_lo - 1};
  int ibegin = imino - 1, iend = imaxo - 1;
  int jbegin = jmino - 1, jend = jmaxo - 1;
  int kbegin = kmino - 1, kend = kmaxo - 1;
  void *args[] = {&d_func, &d_funf,
                  &sxc, &syc,
                  &nxc, &nyc, &nzc,
                  &nxf, &nyf, &nzf,
                  &lbc_local[0], &lbc_local[1], &lbc_local[2],
                  &lbf[0], &lbf[1], &lbf[2],
                  &ibegin, &iend,
                  &jbegin, &jend,
                  &kbegin, &kend,
                  &cache.error_flag.ptr};

  if (!launch_kernel(grid, block, (const void *)prolong3_cell_kernel, args))
    return 1;

  int host_error_flag = 0;
  err = cudaMemcpy(&host_error_flag, cache.error_flag.ptr, sizeof(int), cudaMemcpyDeviceToHost);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMemcpy(D2H) prolong3_error_flag", err);
    return 1;
  }
  if (host_error_flag)
    return 1;

  err = cudaMemcpy(funf, cache.fine.ptr, fine_bytes, cudaMemcpyDeviceToHost);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMemcpy(D2H) prolong3", err);
    return 1;
  }
  return 0;
}

int bssn_cuda_restrict3_pack(int wei,
                             const double *llbc, const double *uubc, const int *extc, double *func,
                             const double *llbf, const double *uubf, const int *extf, const double *funf,
                             const double *llbr, const double *uubr,
                             const double *SoA, int symmetry)
{
  const int no_symm = 0;
  const int eq_symm = 1;

  if (wei != 3 || !llbc || !uubc || !extc || !func || !llbf || !uubf || !extf || !funf || !llbr || !uubr || !SoA)
    return 1;

  if (symmetry != no_symm && symmetry != eq_symm)
    return 1;

  auto idint_like = [](double x) -> int {
    return static_cast<int>(std::trunc(x));
  };

  double base[3];
  double CD[3];
  double FD[3];
  int lbc[3], ubc[3], lbf[3], ubf[3], lbr_i[3], ubr_i[3], lbrf[3], ubrf[3];
  for (int d = 0; d < 3; ++d)
  {
    CD[d] = (uubc[d] - llbc[d]) / static_cast<double>(extc[d]);
    FD[d] = (uubf[d] - llbf[d]) / static_cast<double>(extf[d]);
    if (std::fabs(CD[d] - 2.0 * FD[d]) > 1.0e-10)
      return 1;

    if (llbc[d] <= llbf[d])
    {
      base[d] = llbc[d];
    }
    else
    {
      int j = idint_like((llbc[d] - llbf[d]) / FD[d] + 0.4);
      base[d] = (j / 2 * 2 == j) ? llbf[d] : (llbf[d] - CD[d] / 2.0);
    }

    lbf[d] = idint_like((llbf[d] - base[d]) / FD[d] + 0.4) + 1;
    ubf[d] = idint_like((uubf[d] - base[d]) / FD[d] + 0.4);
    lbc[d] = idint_like((llbc[d] - base[d]) / CD[d] + 0.4) + 1;
    ubc[d] = idint_like((uubc[d] - base[d]) / CD[d] + 0.4);
    lbr_i[d] = idint_like((llbr[d] - base[d]) / CD[d] + 0.4) + 1;
    lbrf[d] = idint_like((llbr[d] - base[d]) / FD[d] + 0.4) + 1;
    ubr_i[d] = idint_like((uubr[d] - base[d]) / CD[d] + 0.4);
    ubrf[d] = idint_like((uubr[d] - base[d]) / FD[d] + 0.4);
    (void)ubc[d];
    (void)ubf[d];
  }

  const int imino = lbr_i[0] - lbc[0] + 1;
  const int imaxo = ubr_i[0] - lbc[0] + 1;
  const int jmino = lbr_i[1] - lbc[1] + 1;
  const int jmaxo = ubr_i[1] - lbc[1] + 1;
  const int kmino = lbr_i[2] - lbc[2] + 1;
  const int kmaxo = ubr_i[2] - lbc[2] + 1;

  const int imini = lbrf[0] - lbf[0] + 1;
  const int imaxi = ubrf[0] - lbf[0] + 1;
  const int jmini = lbrf[1] - lbf[1] + 1;
  const int jmaxi = ubrf[1] - lbf[1] + 1;
  const int kmini = lbrf[2] - lbf[2] + 1;
  const int kmaxi = ubrf[2] - lbf[2] + 1;

  if (imino < 1 || jmino < 1 || kmino < 1 ||
      imini < 1 || jmini < 1 || kmini < 1 ||
      imaxo > extc[0] || jmaxo > extc[1] || kmaxo > extc[2] ||
      imaxi > extf[0] - 2 || jmaxi > extf[1] - 2 || kmaxi > extf[2] - 2)
  {
    return 1;
  }

  // Keep thin restrict slabs on the established CPU path for now.
  if (extc[0] <= 2 * wei || extc[1] <= 2 * wei || extc[2] <= 2 * wei ||
      extf[0] <= 2 * wei || extf[1] <= 2 * wei || extf[2] <= 2 * wei)
  {
    return 1;
  }

  const int fi_min = 2 * (imino + lbc[0] - 1) - lbf[0];
  const int fi_max = 2 * (imaxo + lbc[0] - 1) - lbf[0];
  const int fj_min = 2 * (jmino + lbc[1] - 1) - lbf[1];
  const int fj_max = 2 * (jmaxo + lbc[1] - 1) - lbf[1];
  const int fk_min = 2 * (kmino + lbc[2] - 1) - lbf[2];
  const int fk_max = 2 * (kmaxo + lbc[2] - 1) - lbf[2];

  const int ii_lo = fi_min - 2;
  const int ii_hi = fi_max + 3;
  const int jj_lo = fj_min - 2;
  const int jj_hi = fj_max + 3;
  const int kk_lo = fk_min - 2;
  const int kk_hi = fk_max + 3;

  if (ii_hi > extf[0] || jj_hi > extf[1] || kk_hi > extf[2] ||
      ii_lo < -1 || jj_lo < -1 || kk_lo < -1)
  {
    return 1;
  }

  const bool need_equatorial_z_reflect = (symmetry == eq_symm && kk_lo < 1);
  // Keep symmetry-reflected restrict windows on the CPU path for now.
  if (ii_lo < 1 || jj_lo < 1 || kk_lo < 1)
    return 1;

  struct RestrictCache
  {
    CachedBuffer fine;
    CachedBuffer coarse;
    CachedIntBuffer error_flag;
  };
  static thread_local RestrictCache cache;

  int nxc = extc[0];
  int nyc = extc[1];
  int nzc = extc[2];
  const int stage_fx_lo = ii_lo;
  const int stage_fx_hi = ii_hi;
  const int stage_fy_lo = jj_lo;
  const int stage_fy_hi = jj_hi;
  const int stage_fz_lo = kk_lo;
  const int stage_fz_hi = kk_hi;
  int nxf = stage_fx_hi - stage_fx_lo + 1;
  int nyf = stage_fy_hi - stage_fy_lo + 1;
  int nzf = stage_fz_hi - stage_fz_lo + 1;
  int sxf = nxf + 2;
  int syf = nyf + 2;
  int szf = nzf + 2;
  int coarse_points = nxc * nyc * nzc;
  const size_t coarse_bytes = static_cast<size_t>(coarse_points) * sizeof(double);
  (void)need_equatorial_z_reflect;

  if (!copy_region_to_padded_stage(cache.fine,
                                   funf,
                                   extf,
                                   stage_fx_lo - 1,
                                   stage_fy_lo - 1,
                                   stage_fz_lo - 1,
                                   nxf,
                                   nyf,
                                   nzf,
                                   sxf,
                                   syf,
                                   szf,
                                   2,
                                   2,
                                   2,
                                   "cudaMemcpy3D restrict3 stage"))
    return 1;
  if (!ensure_capacity(cache.coarse, coarse_bytes))
    return 1;
  if (!ensure_capacity(cache.error_flag, sizeof(int)))
    return 1;

  int zero = 0;
  cudaError_t err = cudaMemcpy(cache.error_flag.ptr, &zero, sizeof(int), cudaMemcpyHostToDevice);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMemcpy(H2D) restrict3_error_flag", err);
    return 1;
  }

  dim3 block(256);
  dim3 grid(div_up(coarse_points, static_cast<int>(block.x)));
  double *d_funf = cache.fine.ptr;
  double *d_func = cache.coarse.ptr;
  int lbf_local[3] = {lbf[0] + stage_fx_lo - 1,
                      lbf[1] + stage_fy_lo - 1,
                      lbf[2] + stage_fz_lo - 1};
  int ibegin = imino - 1, iend = imaxo - 1;
  int jbegin = jmino - 1, jend = jmaxo - 1;
  int kbegin = kmino - 1, kend = kmaxo - 1;
  void *args[] = {&d_funf, &d_func,
                  &sxf, &syf,
                  &nxf, &nyf, &nzf,
                  &nxc, &nyc, &nzc,
                  &lbc[0], &lbc[1], &lbc[2],
                  &lbf_local[0], &lbf_local[1], &lbf_local[2],
                  &ibegin, &iend,
                  &jbegin, &jend,
                  &kbegin, &kend,
                  &cache.error_flag.ptr};

  if (!launch_kernel(grid, block, (const void *)restrict3_cell_kernel, args))
    return 1;

  int host_error_flag = 0;
  err = cudaMemcpy(&host_error_flag, cache.error_flag.ptr, sizeof(int), cudaMemcpyDeviceToHost);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMemcpy(D2H) restrict3_error_flag", err);
    return 1;
  }
  if (host_error_flag)
    return 1;

  err = cudaMemcpy(func, cache.coarse.ptr, coarse_bytes, cudaMemcpyDeviceToHost);
  if (err != cudaSuccess)
  {
    report_cuda_error("cudaMemcpy(D2H) restrict3", err);
    return 1;
  }

  return 0;
}