Add direct CUDA resident-state sync path and profiling hooks

2026-04-13 00:57:05 +08:00
parent 7f2a391dd2
commit 636e35bfd8
5 changed files with 1188 additions and 527 deletions
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
--- a/AMSS_NCKU_source/Parallel.h
+++ b/AMSS_NCKU_source/Parallel.h
@@ -100,12 +100,14 @@ namespace Parallel
    MyList<gridseg> **combined_dst;
    int *send_lengths;
    int *recv_lengths;
-    double **send_bufs;
+    double **send_bufs;
-    double **recv_bufs;
+    double **recv_bufs;
-    int *send_buf_caps;
+    int *send_buf_caps;
-    int *recv_buf_caps;
+    int *recv_buf_caps;
-    MPI_Request *reqs;
+    unsigned char *send_buf_pinned;
-    MPI_Status *stats;
+    unsigned char *recv_buf_pinned;
    MPI_Request *reqs;
    MPI_Status *stats;
    int max_reqs;
    bool lengths_valid;
    int *tc_req_node;
@@ -116,10 +118,11 @@ namespace Parallel
    void destroy();
  };
-  void Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache);
+  void Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache);
-  void transfer_cached(MyList<gridseg> **src, MyList<gridseg> **dst,
+  void Sync_ensure_cache(MyList<Patch> *PatL, int Symmetry, SyncCache &cache);
-                       MyList<var> *VarList1, MyList<var> *VarList2,
+  void transfer_cached(MyList<gridseg> **src, MyList<gridseg> **dst,
-                       int Symmetry, SyncCache &cache);
+                       MyList<var> *VarList1, MyList<var> *VarList2,
                       int Symmetry, SyncCache &cache);
  struct AsyncSyncState {
    int req_no;
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -76,6 +76,48 @@ bool fill_bssn_cuda_views(Block *cg, MyList<var> *vars,
  return idx == BSSN_CUDA_STATE_COUNT && vars == 0;
 }
 bool bssn_cuda_use_resident_sync(int lev)
 {
 #ifdef WithShell
  (void)lev;
  return false;
 #else
  return lev == 0;
 #endif
 }
 void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int myrank)
 {
  MyList<Patch> *Pp = PatL;
  while (Pp)
  {
    MyList<Block> *BP = Pp->data->blb;
    while (BP)
    {
      Block *cg = BP->data;
      if (myrank == cg->rank)
      {
        double *state_out[BSSN_CUDA_STATE_COUNT];
        if (!fill_bssn_cuda_views(cg, vars, state_out))
        {
          cout << "CUDA BSSN state list mismatch on resident state download" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        if (bssn_cuda_download_resident_state(cg, cg->shape, state_out))
        {
          cout << "CUDA resident state download failed" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        bssn_cuda_release_step_ctx(cg);
      }
      if (BP == Pp->data->ble)
        break;
      BP = BP->next;
    }
    Pp = Pp->next;
  }
 }
 } // namespace
 #endif
@@ -3058,13 +3100,18 @@ void bssn_class::RecursiveStep(int lev, int num) // in all 2^(lev+1)-1 steps
 #if (PSTR == 0)
 #if 1
-void bssn_class::Step(int lev, int YN)
+void bssn_class::Step(int lev, int YN)
-{
+{
-  setpbh(BH_num, Porg0, Mass, BH_num_input);
+  setpbh(BH_num, Porg0, Mass, BH_num_input);
-
+
-  double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
+  double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
-
+#if USE_CUDA_BSSN
-// new code 2013-2-15, zjcao
+  const bool use_cuda_resident_sync = bssn_cuda_use_resident_sync(lev);
 #else
  const bool use_cuda_resident_sync = false;
 #endif
 // new code 2013-2-15, zjcao
 #if (MAPBH == 1)
  // for black hole position
  if (BH_num > 0 && lev == GH->levels - 1)
@@ -3128,15 +3175,17 @@ void bssn_class::Step(int lev, int YN)
      Block *cg = BP->data;
      if (myrank == cg->rank)
      {
-#if (AGM == 0)
+#if (AGM == 0)
-        f_enforce_ga(cg->shape,
+        if (!use_cuda_resident_sync)
-                     cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
+          f_enforce_ga(cg->shape,
-                     cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
+                       cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
-                     cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
+                       cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
-                     cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
+                       cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
-#endif
+                       cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
-
+#endif
        bool used_gpu_substep = false;
        bool used_gpu_resident_state = false;
 #if USE_CUDA_BSSN
        {
          double *state_in[BSSN_CUDA_STATE_COUNT];
@@ -3154,6 +3203,11 @@ void bssn_class::Step(int lev, int YN)
            MPI_Abort(MPI_COMM_WORLD, 1);
          }
          int apply_bam_bc = 0;
          int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
          int apply_enforce_ga = 0;
 #if (AGM == 0)
          apply_enforce_ga = 1;
 #endif
 #if (SommerType == 0)
 #ifndef WithShell
          apply_bam_bc = (lev == 0) ? 1 : 0;
@@ -3164,7 +3218,8 @@ void bssn_class::Step(int lev, int YN)
                                    state_in, state_out, matter,
                                    propspeed, soa_flat, Pp->data->bbox,
                                    dT_lev, TRK4, iter_count, apply_bam_bc,
-                                    Symmetry, lev, ndeps, pre))
+                                    Symmetry, lev, ndeps, pre,
                                    keep_resident_state, apply_enforce_ga, chitiny))
          {
            cout << "CUDA predictor substep failed in domain: ("
                 << cg->bbox[0] << ":" << cg->bbox[3] << ","
@@ -3173,6 +3228,7 @@ void bssn_class::Step(int lev, int YN)
            ERROR = 1;
          }
          used_gpu_substep = true;
          used_gpu_resident_state = (keep_resident_state != 0);
        }
 #endif
        if (!used_gpu_substep)
@@ -3277,8 +3333,9 @@ void bssn_class::Step(int lev, int YN)
            varlrhs = varlrhs->next;
          }
        }
-        f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny);
+        if (!used_gpu_resident_state)
-      }
+          f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny);
      }
      if (BP == Pp->data->ble)
        break;
      BP = BP->next;
@@ -3436,9 +3493,9 @@ void bssn_class::Step(int lev, int YN)
    MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req);
  }
 #endif
-
+
-  Parallel::AsyncSyncState async_pre;
+  Parallel::AsyncSyncState async_pre;
-  Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
+  Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
 #ifdef WithShell
  if (lev == 0)
@@ -3455,9 +3512,9 @@ void bssn_class::Step(int lev, int YN)
           << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
           << " seconds! " << endl;
    }
-  }
+  }
-#endif
+#endif
-  Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
+  Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
 #ifdef WithShell
  // Complete non-blocking error reduction and check
@@ -3530,22 +3587,24 @@ void bssn_class::Step(int lev, int YN)
        Block *cg = BP->data;
        if (myrank == cg->rank)
        {
-#if (AGM == 0)
+#if (AGM == 0)
-          f_enforce_ga(cg->shape,
+          if (!use_cuda_resident_sync)
-                       cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
+            f_enforce_ga(cg->shape,
-                       cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
+                         cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
-                       cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], 
+                         cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
-                       cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
+                         cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], 
-#elif (AGM == 1)
+                         cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
-          if (iter_count == 3)
+#elif (AGM == 1)
-            f_enforce_ga(cg->shape,
+          if (iter_count == 3 && !use_cuda_resident_sync)
-                         cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
+            f_enforce_ga(cg->shape,
-                         cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
+                         cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
-                         cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], 
+                         cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
-                         cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
+                         cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], 
-#endif
+                         cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
-
+#endif
          bool used_gpu_substep = false;
          bool used_gpu_resident_state = false;
 #if USE_CUDA_BSSN
          {
            double *state_in[BSSN_CUDA_STATE_COUNT];
@@ -3563,6 +3622,13 @@ void bssn_class::Step(int lev, int YN)
              MPI_Abort(MPI_COMM_WORLD, 1);
            }
            int apply_bam_bc = 0;
            int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
            int apply_enforce_ga = 0;
 #if (AGM == 0)
            apply_enforce_ga = 1;
 #elif (AGM == 1)
            apply_enforce_ga = (iter_count == 3) ? 1 : 0;
 #endif
 #if (SommerType == 0)
 #ifndef WithShell
            apply_bam_bc = (lev == 0) ? 1 : 0;
@@ -3573,7 +3639,8 @@ void bssn_class::Step(int lev, int YN)
                                      state_in, state_out, matter,
                                      propspeed, soa_flat, Pp->data->bbox,
                                      dT_lev, TRK4, iter_count, apply_bam_bc,
-                                      Symmetry, lev, ndeps, cor))
+                                      Symmetry, lev, ndeps, cor,
                                      keep_resident_state, apply_enforce_ga, chitiny))
            {
              cout << "CUDA corrector substep failed in domain: ("
                   << cg->bbox[0] << ":" << cg->bbox[3] << ","
@@ -3582,6 +3649,7 @@ void bssn_class::Step(int lev, int YN)
              ERROR = 1;
            }
            used_gpu_substep = true;
            used_gpu_resident_state = (keep_resident_state != 0);
          }
 #endif
          if (!used_gpu_substep)
@@ -3686,8 +3754,9 @@ void bssn_class::Step(int lev, int YN)
              varlrhs = varlrhs->next;
            }
          }
-          f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny);
+          if (!used_gpu_resident_state)
-        }
+            f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny);
        }
        if (BP == Pp->data->ble)
          break;
        BP = BP->next;
@@ -3842,9 +3911,9 @@ void bssn_class::Step(int lev, int YN)
      MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
    }
 #endif
-
+
-    Parallel::AsyncSyncState async_cor;
+    Parallel::AsyncSyncState async_cor;
-    Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
+    Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
 #ifdef WithShell
    if (lev == 0)
@@ -3862,8 +3931,8 @@ void bssn_class::Step(int lev, int YN)
             << " seconds! " << endl;
      }
    }
-#endif
+#endif
-    Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
+    Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
 #ifdef WithShell
    // Complete non-blocking error reduction and check
@@ -3968,10 +4037,14 @@ void bssn_class::Step(int lev, int YN)
      }
 #endif
    }
-  }
+  }
-#if (RPS == 0)
+#if USE_CUDA_BSSN
-  // mesh refinement boundary part
+  if (use_cuda_resident_sync)
-  RestrictProlong(lev, YN, BB);
+    bssn_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank);
 #endif
 #if (RPS == 0)
  // mesh refinement boundary part
  RestrictProlong(lev, YN, BB);
 #ifdef WithShell
  if (lev == 0)
--- a/AMSS_NCKU_source/bssn_rhs_cuda.cu
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu
@@ -6,6 +6,7 @@
 */
 #include <array>
 #include <chrono>
 #include <cstdio>
 #include <cstdlib>
 #include <cmath>
@@ -62,6 +63,72 @@ static void init_gpu_dispatch() {
    }
    g_dispatch.inited = true;
 }
 struct CudaProfileStats {
    long long calls;
    double total_ms;
    double state_ms;
    double matter_ms;
    double rhs_ms;
    double bc_ms;
    double finalize_ms;
    double output_ms;
 };
 static CudaProfileStats &cuda_profile_stats() {
    static CudaProfileStats stats = {0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
    return stats;
 }
 static bool cuda_profile_enabled() {
    static int enabled = -1;
    if (enabled < 0) {
        const char *env = getenv("AMSS_PROFILE_CUDA");
        enabled = (env && atoi(env) != 0) ? 1 : 0;
    }
    return enabled != 0;
 }
 static int cuda_profile_every() {
    static int every = -1;
    if (every < 0) {
        const char *env = getenv("AMSS_PROFILE_CUDA_EVERY");
        every = (env && atoi(env) > 0) ? atoi(env) : 100;
    }
    return every;
 }
 static double cuda_profile_now_ms() {
    using clock = std::chrono::steady_clock;
    return std::chrono::duration<double, std::milli>(
               clock::now().time_since_epoch()).count();
 }
 static void cuda_profile_sync() {
    cudaError_t err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error %s:%d: %s\n",
                __FILE__, __LINE__, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
 }
 static void cuda_profile_maybe_log() {
    if (!cuda_profile_enabled()) return;
    CudaProfileStats &stats = cuda_profile_stats();
    if (stats.calls <= 0 || stats.calls % cuda_profile_every() != 0) return;
    fprintf(stderr,
            "[AMSS-CUDA][rank %d][dev %d] calls=%lld avg_total=%.3f ms avg_state=%.3f ms avg_matter=%.3f ms avg_rhs=%.3f ms avg_bc=%.3f ms avg_finalize=%.3f ms avg_output=%.3f ms\n",
            g_dispatch.my_rank, g_dispatch.my_device, stats.calls,
            stats.total_ms / (double)stats.calls,
            stats.state_ms / (double)stats.calls,
            stats.matter_ms / (double)stats.calls,
            stats.rhs_ms / (double)stats.calls,
            stats.bc_ms / (double)stats.calls,
            stats.finalize_ms / (double)stats.calls,
            stats.output_ms / (double)stats.calls);
    fflush(stderr);
 }
 /* ------------------------------------------------------------------ */
 /*  Error checking                                                     */
@@ -248,12 +315,17 @@ static const int k_matter_slots[BSSN_MATTER_COUNT] = {
 struct StepContext {
    double *d_state0_mem;
    double *d_accum_mem;
    double *d_state_curr_mem;
    double *d_state_next_mem;
    double *d_matter_mem;
    std::array<double *, BSSN_STATE_COUNT> d_state0;
    std::array<double *, BSSN_STATE_COUNT> d_accum;
    std::array<double *, BSSN_STATE_COUNT> d_state_curr;
    std::array<double *, BSSN_STATE_COUNT> d_state_next;
    std::array<double *, BSSN_MATTER_COUNT> d_matter;
    size_t cap_all;
    bool matter_ready;
    bool state_ready;
 };
 static std::unordered_map<void *, StepContext> g_step_ctx;
@@ -321,19 +393,32 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all)
            cudaFree(ctx.d_accum_mem);
            ctx.d_accum_mem = nullptr;
        }
        if (ctx.d_state_curr_mem) {
            cudaFree(ctx.d_state_curr_mem);
            ctx.d_state_curr_mem = nullptr;
        }
        if (ctx.d_state_next_mem) {
            cudaFree(ctx.d_state_next_mem);
            ctx.d_state_next_mem = nullptr;
        }
        if (ctx.d_matter_mem) {
            cudaFree(ctx.d_matter_mem);
            ctx.d_matter_mem = nullptr;
        }
        CUDA_CHECK(cudaMalloc(&ctx.d_state0_mem, BSSN_STATE_COUNT * all * sizeof(double)));
        CUDA_CHECK(cudaMalloc(&ctx.d_accum_mem, BSSN_STATE_COUNT * all * sizeof(double)));
        CUDA_CHECK(cudaMalloc(&ctx.d_state_curr_mem, BSSN_STATE_COUNT * all * sizeof(double)));
        CUDA_CHECK(cudaMalloc(&ctx.d_state_next_mem, BSSN_STATE_COUNT * all * sizeof(double)));
        CUDA_CHECK(cudaMalloc(&ctx.d_matter_mem, BSSN_MATTER_COUNT * all * sizeof(double)));
        ctx.cap_all = all;
        ctx.matter_ready = false;
        ctx.state_ready = false;
    }
    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
        ctx.d_state0[i] = ctx.d_state0_mem + (size_t)i * all;
        ctx.d_accum[i] = ctx.d_accum_mem + (size_t)i * all;
        ctx.d_state_curr[i] = ctx.d_state_curr_mem + (size_t)i * all;
        ctx.d_state_next[i] = ctx.d_state_next_mem + (size_t)i * all;
    }
    for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
        ctx.d_matter[i] = ctx.d_matter_mem + (size_t)i * all;
@@ -347,6 +432,8 @@ static void release_step_ctx(void *block_tag)
    if (it == g_step_ctx.end()) return;
    if (it->second.d_state0_mem) cudaFree(it->second.d_state0_mem);
    if (it->second.d_accum_mem) cudaFree(it->second.d_accum_mem);
    if (it->second.d_state_curr_mem) cudaFree(it->second.d_state_curr_mem);
    if (it->second.d_state_next_mem) cudaFree(it->second.d_state_next_mem);
    if (it->second.d_matter_mem) cudaFree(it->second.d_matter_mem);
    g_step_ctx.erase(it);
 }
@@ -1050,6 +1137,76 @@ __global__ void kern_rk4_finalize(const double * __restrict__ f0,
    }
 }
 __global__ void kern_enforce_ga_cuda(double * __restrict__ dxx,
                                     double * __restrict__ gxy,
                                     double * __restrict__ gxz,
                                     double * __restrict__ dyy,
                                     double * __restrict__ gyz,
                                     double * __restrict__ dzz,
                                     double * __restrict__ Axx,
                                     double * __restrict__ Axy,
                                     double * __restrict__ Axz,
                                     double * __restrict__ Ayy,
                                     double * __restrict__ Ayz,
                                     double * __restrict__ Azz)
 {
    constexpr double F1O3 = 1.0 / 3.0;
    constexpr double ONE = 1.0;
    constexpr double TWO = 2.0;
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
         i < d_gp.all;
         i += blockDim.x * gridDim.x)
    {
        double lgxx = dxx[i] + ONE;
        double lgyy = dyy[i] + ONE;
        double lgzz = dzz[i] + ONE;
        double lgxy = gxy[i];
        double lgxz = gxz[i];
        double lgyz = gyz[i];
        double lscale = lgxx * lgyy * lgzz
                      + lgxy * lgyz * lgxz
                      + lgxz * lgxy * lgyz
                      - lgxz * lgyy * lgxz
                      - lgxy * lgxy * lgzz
                      - lgxx * lgyz * lgyz;
        lscale = ONE / cbrt(lscale);
        lgxx *= lscale;
        lgxy *= lscale;
        lgxz *= lscale;
        lgyy *= lscale;
        lgyz *= lscale;
        lgzz *= lscale;
        dxx[i] = lgxx - ONE;
        gxy[i] = lgxy;
        gxz[i] = lgxz;
        dyy[i] = lgyy - ONE;
        gyz[i] = lgyz;
        dzz[i] = lgzz - ONE;
        const double lgupxx =   (lgyy * lgzz - lgyz * lgyz);
        const double lgupxy = - (lgxy * lgzz - lgyz * lgxz);
        const double lgupxz =   (lgxy * lgyz - lgyy * lgxz);
        const double lgupyy =   (lgxx * lgzz - lgxz * lgxz);
        const double lgupyz = - (lgxx * lgyz - lgxy * lgxz);
        const double lgupzz =   (lgxx * lgyy - lgxy * lgxy);
        const double ltrA = lgupxx * Axx[i] + lgupyy * Ayy[i] + lgupzz * Azz[i]
                          + TWO * (lgupxy * Axy[i] + lgupxz * Axz[i] + lgupyz * Ayz[i]);
        Axx[i] -= F1O3 * lgxx * ltrA;
        Axy[i] -= F1O3 * lgxy * ltrA;
        Axz[i] -= F1O3 * lgxz * ltrA;
        Ayy[i] -= F1O3 * lgyy * ltrA;
        Ayz[i] -= F1O3 * lgyz * ltrA;
        Azz[i] -= F1O3 * lgzz * ltrA;
    }
 }
 __global__ void kern_lowerboundset_cuda(double * __restrict__ chi, double tinny)
 {
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -2429,6 +2586,20 @@ static void bind_matter_slots(const StepContext &ctx)
    }
 }
 static void bind_state_input_slots(const std::array<double *, BSSN_STATE_COUNT> &state)
 {
    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
        g_buf.slot[k_state_input_slots[i]] = state[i];
    }
 }
 static void bind_state_output_slots(const std::array<double *, BSSN_STATE_COUNT> &state)
 {
    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
        g_buf.slot[k_state_rhs_slots[i]] = state[i];
    }
 }
 static void launch_rhs_pipeline(int all, double eps, int co)
 {
    const double SYM = 1.0;
@@ -2763,6 +2934,87 @@ static void download_state_outputs(double **state_host_out, size_t all)
    }
 }
 static void copy_state_region_cuda(void *block_tag,
                                   int state_index,
                                   double *host_state,
                                   const int *ex,
                                   int i0, int j0, int k0,
                                   int sx, int sy, int sz,
                                   cudaMemcpyKind kind)
 {
    if (state_index < 0 || state_index >= BSSN_STATE_COUNT) return;
    if (sx <= 0 || sy <= 0 || sz <= 0) return;
    const size_t pitch = (size_t)ex[0] * sizeof(double);
    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
    cudaMemcpy3DParms p = {};
    p.extent = make_cudaExtent((size_t)sx * sizeof(double), (size_t)sy, (size_t)sz);
    p.srcPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0);
    p.dstPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0);
    if (kind == cudaMemcpyDeviceToHost) {
        p.srcPtr = make_cudaPitchedPtr((void *)ctx.d_state_curr[state_index], pitch, ex[0], ex[1]);
        p.dstPtr = make_cudaPitchedPtr((void *)host_state, pitch, ex[0], ex[1]);
    } else {
        p.srcPtr = make_cudaPitchedPtr((void *)host_state, pitch, ex[0], ex[1]);
        p.dstPtr = make_cudaPitchedPtr((void *)ctx.d_state_curr[state_index], pitch, ex[0], ex[1]);
    }
    CUDA_CHECK(cudaMemcpy3D(&p));
 }
 static void copy_state_region_packed_cuda(void *block_tag,
                                          int state_index,
                                          double *host_buffer,
                                          const int *ex,
                                          int i0, int j0, int k0,
                                          int sx, int sy, int sz,
                                          cudaMemcpyKind kind)
 {
    if (state_index < 0 || state_index >= BSSN_STATE_COUNT) return;
    if (sx <= 0 || sy <= 0 || sz <= 0) return;
    const size_t src_pitch = (size_t)ex[0] * sizeof(double);
    const size_t dst_pitch = (size_t)sx * sizeof(double);
    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
    cudaMemcpy3DParms p = {};
    p.extent = make_cudaExtent((size_t)sx * sizeof(double), (size_t)sy, (size_t)sz);
    if (kind == cudaMemcpyDeviceToHost) {
        p.srcPtr = make_cudaPitchedPtr((void *)ctx.d_state_curr[state_index], src_pitch, ex[0], ex[1]);
        p.srcPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0);
        p.dstPtr = make_cudaPitchedPtr((void *)host_buffer, dst_pitch, sx, sy);
        p.dstPos = make_cudaPos(0, 0, 0);
    } else {
        p.srcPtr = make_cudaPitchedPtr((void *)host_buffer, dst_pitch, sx, sy);
        p.srcPos = make_cudaPos(0, 0, 0);
        p.dstPtr = make_cudaPitchedPtr((void *)ctx.d_state_curr[state_index], src_pitch, ex[0], ex[1]);
        p.dstPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0);
    }
    CUDA_CHECK(cudaMemcpy3D(&p));
 }
 static void download_resident_state(void *block_tag, int *ex, double **state_host_out)
 {
    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
    const size_t bytes = all * sizeof(double);
    StepContext &ctx = ensure_step_ctx(block_tag, all);
    CUDA_CHECK(cudaMemcpy(g_buf.h_stage, ctx.d_state_curr_mem,
                          (size_t)BSSN_STATE_COUNT * bytes,
                          cudaMemcpyDeviceToHost));
    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
        std::memcpy(state_host_out[i], g_buf.h_stage + (size_t)i * all, bytes);
    }
 }
 static bool has_resident_state(void *block_tag)
 {
    auto it = g_step_ctx.find(block_tag);
    return it != g_step_ctx.end() && it->second.state_ready;
 }
 /* ================================================================== */
 /*  Main host function — drop-in replacement for bssn_rhs_c.C          */
 /* ================================================================== */
@@ -3266,22 +3518,53 @@ int bssn_cuda_rk4_substep(void *block_tag,
                          int &Symmetry,
                          int &Lev,
                          double &eps,
-                          int &co)
+                          int &co,
                          int &keep_resident_state,
                          int &apply_enforce_ga,
                          double &chitiny)
 {
    (void)T;
-    (void)Lev;
+    (void)state_host_out;
    if (RK4 < 0 || RK4 > 3) return 1;
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
    const bool profile = cuda_profile_enabled();
    const double t_total0 = profile ? cuda_profile_now_ms() : 0.0;
    double state_ms = 0.0;
    double matter_ms = 0.0;
    double rhs_ms = 0.0;
    double bc_ms = 0.0;
    double finalize_ms = 0.0;
    double output_ms = 0.0;
    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
    const size_t bytes = all * sizeof(double);
    setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);
    StepContext &ctx = ensure_step_ctx(block_tag, all);
-    upload_state_inputs(state_host_in, all);
+    const bool use_resident_state = (keep_resident_state != 0);
    if (use_resident_state) {
        bind_state_input_slots(ctx.d_state_curr);
        bind_state_output_slots(ctx.d_state_next);
    }
    double t0 = profile ? cuda_profile_now_ms() : 0.0;
    if (!use_resident_state || RK4 == 0 || !ctx.state_ready) {
        upload_state_inputs(state_host_in, all);
    }
    if (apply_enforce_ga) {
        kern_enforce_ga_cuda<<<grid(all), BLK>>>(g_buf.slot[S_dxx], g_buf.slot[S_gxy], g_buf.slot[S_gxz],
                                                 g_buf.slot[S_dyy], g_buf.slot[S_gyz], g_buf.slot[S_dzz],
                                                 g_buf.slot[S_Axx], g_buf.slot[S_Axy], g_buf.slot[S_Axz],
                                                 g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]);
    }
    if (profile) {
        cuda_profile_sync();
        state_ms += cuda_profile_now_ms() - t0;
    }
    t0 = profile ? cuda_profile_now_ms() : 0.0;
    if (RK4 == 0) {
        upload_matter_cache(ctx, matter_host, all);
        CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi],
@@ -3291,9 +3574,19 @@ int bssn_cuda_rk4_substep(void *block_tag,
        upload_matter_cache(ctx, matter_host, all);
    }
    bind_matter_slots(ctx);
    if (profile) {
        cuda_profile_sync();
        matter_ms += cuda_profile_now_ms() - t0;
    }
    t0 = profile ? cuda_profile_now_ms() : 0.0;
    launch_rhs_pipeline((int)all, eps, co);
    if (profile) {
        cuda_profile_sync();
        rhs_ms += cuda_profile_now_ms() - t0;
    }
    t0 = profile ? cuda_profile_now_ms() : 0.0;
    if (apply_bam_bc) {
        for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
            gpu_sommerfeld_routbam(g_buf.slot[k_state_input_slots[i]],
@@ -3305,7 +3598,12 @@ int bssn_cuda_rk4_substep(void *block_tag,
                                   X, Y, Z, bbox, Symmetry);
        }
    }
    if (profile) {
        cuda_profile_sync();
        bc_ms += cuda_profile_now_ms() - t0;
    }
    t0 = profile ? cuda_profile_now_ms() : 0.0;
    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
        kern_rk4_finalize<<<grid(all), BLK>>>(ctx.d_state0[i],
                                              g_buf.slot[k_state_rhs_slots[i]],
@@ -3314,13 +3612,119 @@ int bssn_cuda_rk4_substep(void *block_tag,
                                              RK4);
    }
-    download_state_outputs(state_host_out, all);
+    kern_lowerboundset_cuda<<<grid(all), BLK>>>(g_buf.slot[S_chi_rhs], chitiny);
-    if (RK4 == 3) {
+    if (profile) {
        cuda_profile_sync();
        finalize_ms += cuda_profile_now_ms() - t0;
    }
    t0 = profile ? cuda_profile_now_ms() : 0.0;
    if (use_resident_state) {
        std::swap(ctx.d_state_curr_mem, ctx.d_state_next_mem);
        ctx.d_state_curr.swap(ctx.d_state_next);
        ctx.state_ready = true;
    } else {
        download_state_outputs(state_host_out, all);
    }
    if (RK4 == 3 && !use_resident_state) {
        release_step_ctx(block_tag);
    }
    if (profile) {
        cuda_profile_sync();
        output_ms += cuda_profile_now_ms() - t0;
        CudaProfileStats &stats = cuda_profile_stats();
        stats.calls++;
        stats.total_ms += cuda_profile_now_ms() - t_total0;
        stats.state_ms += state_ms;
        stats.matter_ms += matter_ms;
        stats.rhs_ms += rhs_ms;
        stats.bc_ms += bc_ms;
        stats.finalize_ms += finalize_ms;
        stats.output_ms += output_ms;
        cuda_profile_maybe_log();
    }
    return 0;
 }
 extern "C"
 int bssn_cuda_copy_state_region_to_host(void *block_tag,
                                        int state_index,
                                        double *host_state,
                                        int *ex,
                                        int i0, int j0, int k0,
                                        int sx, int sy, int sz)
 {
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
    copy_state_region_cuda(block_tag, state_index, host_state, ex,
                           i0, j0, k0, sx, sy, sz, cudaMemcpyDeviceToHost);
    return 0;
 }
 extern "C"
 int bssn_cuda_copy_state_region_from_host(void *block_tag,
                                          int state_index,
                                          double *host_state,
                                          int *ex,
                                          int i0, int j0, int k0,
                                          int sx, int sy, int sz)
 {
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
    copy_state_region_cuda(block_tag, state_index, host_state, ex,
                           i0, j0, k0, sx, sy, sz, cudaMemcpyHostToDevice);
    return 0;
 }
 extern "C"
 int bssn_cuda_download_resident_state(void *block_tag,
                                      int *ex,
                                      double **state_host_out)
 {
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
    download_resident_state(block_tag, ex, state_host_out);
    return 0;
 }
 extern "C"
 int bssn_cuda_pack_state_region_to_host_buffer(void *block_tag,
                                               int state_index,
                                               double *host_buffer,
                                               int *ex,
                                               int i0, int j0, int k0,
                                               int sx, int sy, int sz)
 {
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
    copy_state_region_packed_cuda(block_tag, state_index, host_buffer, ex,
                                  i0, j0, k0, sx, sy, sz, cudaMemcpyDeviceToHost);
    return 0;
 }
 extern "C"
 int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
                                                   int state_index,
                                                   double *host_buffer,
                                                   int *ex,
                                                   int i0, int j0, int k0,
                                                   int sx, int sy, int sz)
 {
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
    copy_state_region_packed_cuda(block_tag, state_index, host_buffer, ex,
                                  i0, j0, k0, sx, sy, sz, cudaMemcpyHostToDevice);
    return 0;
 }
 extern "C"
 int bssn_cuda_has_resident_state(void *block_tag)
 {
    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
    return has_resident_state(block_tag) ? 1 : 0;
 }
 extern "C"
 void bssn_cuda_release_step_ctx(void *block_tag)
 {
--- a/AMSS_NCKU_source/bssn_rhs_cuda.h
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.h
@@ -49,7 +49,44 @@ int bssn_cuda_rk4_substep(void *block_tag,
                          int &Symmetry,
                          int &Lev,
                          double &eps,
-                          int &co);
+                          int &co,
                          int &keep_resident_state,
                          int &apply_enforce_ga,
                          double &chitiny);
 int bssn_cuda_copy_state_region_to_host(void *block_tag,
                                        int state_index,
                                        double *host_state,
                                        int *ex,
                                        int i0, int j0, int k0,
                                        int sx, int sy, int sz);
 int bssn_cuda_copy_state_region_from_host(void *block_tag,
                                          int state_index,
                                          double *host_state,
                                          int *ex,
                                          int i0, int j0, int k0,
                                          int sx, int sy, int sz);
 int bssn_cuda_download_resident_state(void *block_tag,
                                      int *ex,
                                      double **state_host_out);
 int bssn_cuda_pack_state_region_to_host_buffer(void *block_tag,
                                               int state_index,
                                               double *host_buffer,
                                               int *ex,
                                               int i0, int j0, int k0,
                                               int sx, int sy, int sz);
 int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
                                                   int state_index,
                                                   double *host_buffer,
                                                   int *ex,
                                                   int i0, int j0, int k0,
                                                   int sx, int sy, int sz);
 int bssn_cuda_has_resident_state(void *block_tag);
 void bssn_cuda_release_step_ctx(void *block_tag);