Save Z4C CUDA optimization progress

2026-05-02 00:49:02 +08:00
parent 531b31e8db
commit 383e936e88
6 changed files with 343 additions and 66 deletions
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -335,7 +335,7 @@ bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg
    if (z4c_amr_device < 0)
    {
      const char *env = getenv("AMSS_CUDA_Z4C_AMR_DEVICE");
-      z4c_amr_device = (env && atoi(env) != 0) ? 1 : 0;
+      z4c_amr_device = (!env || atoi(env) != 0) ? 1 : 0;
    }
    if (!z4c_amr_device)
      return false;
--- a/AMSS_NCKU_source/Z4c_class.C
+++ b/AMSS_NCKU_source/Z4c_class.C
@@ -228,7 +228,13 @@ bool z4c_cuda_keep_resident_after_step(int lev, int trfls_in, int analysis_lev)
  if (enabled < 0)
  {
    const char *env = getenv("AMSS_CUDA_Z4C_KEEP_RESIDENT_AFTER_STEP");
-    enabled = (env && atoi(env) != 0) ? 1 : 0;
+    if (env)
+      enabled = (atoi(env) != 0) ? 1 : 0;
+    else
+    {
+      env = getenv("AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP");
+      enabled = (env && atoi(env) != 0) ? 1 : 0;
+    }
  }
  if (!enabled)
    return false;
@@ -478,6 +484,89 @@ bool z4c_cuda_compute_porg_rhs_resident(cgh *GH,
  return true;
 }

+bool z4c_cuda_download_bh_shift_level(MyList<Patch> *PatL, // Copies three resident state fields of every local block in PatL back to host memory.
+                                      int myrank,          // only blocks whose cg->rank equals myrank are processed
+                                      var *forx, var *fory, var *forz) // host destinations are cg->fgfs[<var>->sgfn] for each of the three vars
+{ // Returns true when no download failed (including when nothing was resident); false at the first failed download.
+  MyList<Patch> *Pp = PatL;
+  while (Pp)
+  {
+    MyList<Block> *BP = Pp->data->blb; // iteration over this patch starts at blb
+    while (BP)
+    {
+      Block *cg = BP->data;
+      if (myrank == cg->rank && z4c_cuda_has_resident_state(cg)) // blocks of other ranks or without resident state are skipped silently
+      {
+        double *fields[3] = {
+            cg->fgfs[forx->sgfn],
+            cg->fgfs[fory->sgfn],
+            cg->fgfs[forz->sgfn]};
+        if (z4c_cuda_download_state_subset(cg, cg->shape, 3, // a nonzero return is treated as failure
+                                           k_z4c_cuda_bh_state_indices, // defined outside this hunk; presumably the shift-component state slots -- TODO confirm
+                                           fields))
+          return false; // stop immediately; remaining blocks are left untouched
+      }
+      if (BP == Pp->data->ble) // ble is the last block handled for this patch
+        break;
+      BP = BP->next;
+    }
+    Pp = Pp->next;
+  }
+  return true;
+}
+
+bool z4c_cuda_refresh_constraint_level(MyList<Patch> *PatL, // For each local block with resident state: recompute constraints on device and copy them to host.
+                                       int myrank,          // only blocks whose cg->rank equals myrank are processed
+                                       var *Cons_Ham, var *Cons_Px, // the seven Cons_* vars name the host arrays (cg->fgfs[...->sgfn]) that receive the constraint outputs
+                                       var *Cons_Py, var *Cons_Pz,
+                                       var *Cons_Gx, var *Cons_Gy,
+                                       var *Cons_Gz, var *TZ0, // TZ0 names the host array that receives state slot tz_index
+                                       int Symmetry, int lev, double eps) // NOTE(review): lev is not referenced anywhere in the body
+{ // Returns true only if every local block had resident state; blocks without it are skipped and make the result false.
+  bool all_resident = true;
+  const int tz_index = 24; // state slot copied into TZ0; presumably the TZ/Theta index of the CUDA state layout -- TODO confirm against z4c_rhs_cuda.h
+  MyList<Patch> *Pp = PatL;
+  while (Pp)
+  {
+    MyList<Block> *BP = Pp->data->blb; // iteration over this patch starts at blb
+    while (BP)
+    {
+      Block *cg = BP->data;
+      if (myrank == cg->rank)
+      {
+        if (!z4c_cuda_has_resident_state(cg))
+        {
+          all_resident = false; // nothing is refreshed for this block; the caller learns about it through the return value
+        }
+        else
+        {
+          double *constraints[7] = {
+              cg->fgfs[Cons_Ham->sgfn], cg->fgfs[Cons_Px->sgfn],
+              cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
+              cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn],
+              cg->fgfs[Cons_Gz->sgfn]};
+          double *tz_out[1] = {cg->fgfs[TZ0->sgfn]};
+          int co = 0; // forwarded unchanged as the co argument of the compute call
+          if (z4c_cuda_compute_constraints_resident(cg, cg->shape, // nonzero return from either call is treated as failure
+                                                   cg->X[0], cg->X[1], cg->X[2],
+                                                   Symmetry, eps, co,
+                                                   constraints) ||
+              z4c_cuda_download_state_subset(cg, cg->shape, 1, &tz_index, tz_out)) // only attempted when the compute call returned 0
+          {
+            cout << "CUDA Z4C resident constraint refresh failed" << endl;
+            MPI_Abort(MPI_COMM_WORLD, 1); // failure is fatal for the whole MPI job rather than reported to the caller
+          }
+        }
+      }
+      if (BP == Pp->data->ble) // ble is the last block handled for this patch
+        break;
+      BP = BP->next;
+    }
+    Pp = Pp->next;
+  }
+  return all_resident;
+}
+
 } // namespace
 #endif

@@ -496,6 +585,33 @@ void Z4c_class::Step(int lev, int YN)
  int iter_count = 0;
  int pre = 0, cor = 1;
  int ERROR = 0;
+  const double dT_mon = dT * pow(0.5, Mymax(0, trfls));
+  const bool need_constraint_after_step = (LastConsOut + dT_mon >= AnasTime);
+
+  if (BH_num > 0 && lev == GH->levels - 1)
+  {
+    if (!z4c_cuda_download_bh_shift_level(GH->PatL[lev], myrank, Sfx0, Sfy0, Sfz0))
+    {
+      if (myrank == 0 && ErrorMonitor->outfile)
+        ErrorMonitor->outfile << "CUDA Z4C failed to download predictor black-hole shift at t = "
+                              << PhysTime << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+    compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
+    for (int ithBH = 0; ithBH < BH_num; ithBH++)
+    {
+      f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count);
+      f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg[ithBH][1], Porg_rhs[ithBH][1], iter_count);
+      f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg[ithBH][2], Porg_rhs[ithBH][2], iter_count);
+      if (Symmetry > 0)
+        Porg[ithBH][2] = fabs(Porg[ithBH][2]);
+      if (Symmetry == 2)
+      {
+        Porg[ithBH][0] = fabs(Porg[ithBH][0]);
+        Porg[ithBH][1] = fabs(Porg[ithBH][1]);
+      }
+    }
+  }

  MyList<Patch> *Pp = GH->PatL[lev];
  while (Pp)
@@ -565,24 +681,6 @@ void Z4c_class::Step(int lev, int YN)

  Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);

-  if (BH_num > 0 && lev == GH->levels - 1)
-  {
-    compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
-    for (int ithBH = 0; ithBH < BH_num; ithBH++)
-    {
-      f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count);
-      f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg[ithBH][1], Porg_rhs[ithBH][1], iter_count);
-      f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg[ithBH][2], Porg_rhs[ithBH][2], iter_count);
-      if (Symmetry > 0)
-        Porg[ithBH][2] = fabs(Porg[ithBH][2]);
-      if (Symmetry == 2)
-      {
-        Porg[ithBH][0] = fabs(Porg[ithBH][0]);
-        Porg[ithBH][1] = fabs(Porg[ithBH][1]);
-      }
-    }
-  }
-
  if ((lev == a_lev) && (LastAnas + dT_lev >= AnasTime))
    z4c_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false);
  if (lev == a_lev)
@@ -640,6 +738,25 @@ void Z4c_class::Step(int lev, int YN)
                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
            ERROR = 1;
          }
+          if (!ERROR && iter_count == 3 && need_constraint_after_step)
+          {
+            double *constraints[7] = {
+                cg->fgfs[Cons_Ham->sgfn], cg->fgfs[Cons_Px->sgfn],
+                cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
+                cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn],
+                cg->fgfs[Cons_Gz->sgfn]};
+            double *tz_out[1] = {cg->fgfs[TZ0->sgfn]};
+            const int tz_index = 24;
+            if (z4c_cuda_download_constraint_outputs(cg->shape, constraints) ||
+                z4c_cuda_download_state_subset(cg, cg->shape, 1, &tz_index, tz_out))
+            {
+              cout << "CUDA Z4C constraint download failed in domain: ("
+                   << cg->bbox[0] << ":" << cg->bbox[3] << ","
+                   << cg->bbox[1] << ":" << cg->bbox[4] << ","
+                   << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
+              ERROR = 1;
+            }
+          }
        }
        if (BP == Pp->data->ble)
          break;
@@ -719,7 +836,10 @@ void Z4c_class::Step(int lev, int YN)

  {
    const bool keep_resident = z4c_cuda_keep_resident_after_step(lev, trfls, a_lev);
-    z4c_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, !keep_resident);
+    const bool need_host_after_step =
+        ((lev == a_lev) && (LastAnas + dT_lev >= AnasTime));
+    if (!keep_resident || need_host_after_step)
+      z4c_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, !keep_resident);
  }

 #if (RPS == 0)
@@ -2991,17 +3111,23 @@ void Z4c_class::Check_extrop()

 //================================================================================================

-void Z4c_class::Constraint_Out()
-{
-  // here we have to use the same variable name as in the parent class
-  LastConsOut += dT * pow(0.5, Mymax(0, trfls));
-  
-  if (LastConsOut >= AnasTime)
-  // Constraint violation
-  {
-    // recompute least the constraint data lost for moved new grid
-    for (int lev = 0; lev < GH->levels; lev++)
-    {
+void Z4c_class::Constraint_Out()
+{
+  // here we have to use the same variable name as in the parent class
+  LastConsOut += dT * pow(0.5, Mymax(0, trfls));
+  
+  if (LastConsOut >= AnasTime)
+  // Constraint violation
+  {
+#if USE_CUDA_Z4C && (ABEtype == 2)
+    bool cuda_constraints_ready = true;
+#else
+    const bool cuda_constraints_ready = false;
+#endif
+    // recompute least the constraint data lost for moved new grid
+    if (!cuda_constraints_ready)
+      for (int lev = 0; lev < GH->levels; lev++)
+    {
      // make sure the data consistent for higher levels
      if (lev > 0)
      {
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -28,6 +28,9 @@ using namespace std;
 #if USE_CUDA_BSSN
 #include "bssn_rhs_cuda.h"
 #endif
+#if USE_CUDA_Z4C && (ABEtype == 2)
+#include "z4c_rhs_cuda.h"
+#endif
 #include "initial_puncture.h"
 #include "enforce_algebra.h"
 #include "rungekutta4_rout.h"
@@ -36,6 +39,12 @@ using namespace std;
 #include "shellfunctions.h"
 #include "parameters.h"

+#if (ABEtype == 1) || ((ABEtype == 2) && !USE_CUDA_Z4C) // ABEtype 1 always, ABEtype 2 only when USE_CUDA_Z4C is off (an undefined macro evaluates as 0 here)
+#define AMSS_LEGACY_ABE_TRANSFER 1 // in the hunks below this selects Parallel::Sync / Parallel::Restrict
+#else
+#define AMSS_LEGACY_ABE_TRANSFER 0 // in the hunks below this selects the Sync_cached / Sync_start / Restrict_cached variants
+#endif
+
 #ifdef With_AHF
 #include "derivatives.h"
 #include "myglobal.h"
@@ -647,6 +656,87 @@ void bssn_cuda_flush_level_before_regrid(MyList<Patch> *PatL,
  bssn_cuda_release_level_state(PatL, myrank);
 }

+#if USE_CUDA_Z4C && (ABEtype == 2)
+bool fill_z4c_cuda_views_for_regrid(Block *cg, MyList<var> *vars, // Stores cg->fgfs[var->sgfn] for each entry of vars into host_views, in list order.
+                                    double **host_views) // caller must provide room for Z4C_CUDA_STATE_COUNT pointers
+{ // Returns true only when vars holds exactly Z4C_CUDA_STATE_COUNT entries.
+  int idx = 0;
+  while (vars && idx < Z4C_CUDA_STATE_COUNT) // never writes past Z4C_CUDA_STATE_COUNT slots, even for a longer list
+  {
+    host_views[idx] = cg->fgfs[vars->data->sgfn];
+    vars = vars->next;
+    ++idx;
+  }
+  return idx == Z4C_CUDA_STATE_COUNT && vars == 0; // false for a list that is too short (idx small) or too long (vars still non-null)
+}
+
+void z4c_cuda_download_level_state_if_present_for_regrid(MyList<Patch> *PatL, // Downloads the resident device state of every local block in PatL into the host arrays named by vars.
+                                                         MyList<var> *vars, // must contain exactly Z4C_CUDA_STATE_COUNT entries, otherwise the run is aborted
+                                                         int myrank) // only blocks whose cg->rank equals myrank are processed
+{ // Despite the name, the visible hunks also call this before Dump_Data, d2Dump_Data and checkpoint writing.
+  MyList<Patch> *Pp = PatL;
+  while (Pp)
+  {
+    MyList<Block> *BP = Pp->data->blb; // iteration over this patch starts at blb
+    while (BP)
+    {
+      Block *cg = BP->data;
+      if (myrank == cg->rank && z4c_cuda_has_resident_state(cg)) // blocks without resident state are left as they are on the host
+      {
+        double *state_out[Z4C_CUDA_STATE_COUNT];
+        if (!fill_z4c_cuda_views_for_regrid(cg, vars, state_out)) // false means vars does not have exactly Z4C_CUDA_STATE_COUNT entries
+        {
+          cout << "CUDA Z4C state list mismatch on regrid flush" << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        if (z4c_cuda_download_resident_state(cg, cg->shape, state_out)) // a nonzero return is treated as failure
+        {
+          cout << "CUDA Z4C resident state regrid download failed" << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+      }
+      if (BP == Pp->data->ble) // ble is the last block handled for this patch
+        break;
+      BP = BP->next;
+    }
+    Pp = Pp->next;
+  }
+}
+
+void z4c_cuda_release_level_state_for_regrid(MyList<Patch> *PatL, int myrank) // Calls z4c_cuda_release_step_ctx on every local block in PatL that has resident state.
+{ // Nothing is downloaded here; callers that need the data on the host must fetch it before calling this.
+  MyList<Patch> *Pp = PatL;
+  while (Pp)
+  {
+    MyList<Block> *BP = Pp->data->blb; // iteration over this patch starts at blb
+    while (BP)
+    {
+      Block *cg = BP->data;
+      if (myrank == cg->rank && z4c_cuda_has_resident_state(cg)) // blocks of other ranks or without resident state are skipped
+        z4c_cuda_release_step_ctx(cg);
+      if (BP == Pp->data->ble) // ble is the last block handled for this patch
+        break;
+      BP = BP->next;
+    }
+    Pp = Pp->next;
+  }
+}
+
+void z4c_cuda_flush_level_before_regrid(MyList<Patch> *PatL, // Downloads resident state into four host variable lists, then releases the device contexts of the level.
+                                        MyList<var> *corL, // each list must hold exactly Z4C_CUDA_STATE_COUNT entries or the download helper aborts
+                                        MyList<var> *oldL,
+                                        MyList<var> *stateL,
+                                        MyList<var> *preL,
+                                        int myrank) // only blocks owned by this rank are touched
+{ // NOTE(review): every call below goes through the same z4c_cuda_download_resident_state(cg, ...) path, so all four lists appear to receive the same device data -- confirm this is intended
+  z4c_cuda_download_level_state_if_present_for_regrid(PatL, corL, myrank);
+  z4c_cuda_download_level_state_if_present_for_regrid(PatL, oldL, myrank);
+  z4c_cuda_download_level_state_if_present_for_regrid(PatL, stateL, myrank);
+  z4c_cuda_download_level_state_if_present_for_regrid(PatL, preL, myrank);
+  z4c_cuda_release_level_state_for_regrid(PatL, myrank); // must stay last: after this the blocks no longer have a step context to download from
+}
+#endif
+
 bool bssn_cuda_regrid_flush_enabled()
 {
  static int enabled = -1;
@@ -2969,6 +3059,10 @@ void bssn_class::Evolve(int Steps)
      STEP_TIMER_DECL(timer_dump3d);
      //       misc::tillherecheck("before Dump_Data");

+#if USE_CUDA_Z4C && (ABEtype == 2)
+      for (int lev = 0; lev < GH->levels; lev++)
+        z4c_cuda_download_level_state_if_present_for_regrid(GH->PatL[lev], StateList, myrank);
+#endif
      for (int lev = 0; lev < GH->levels; lev++)
        Parallel::Dump_Data(GH->PatL[lev], DumpList, 0, PhysTime, dT_mon);
 #ifdef WithShell
@@ -2990,6 +3084,10 @@ void bssn_class::Evolve(int Steps)
      STEP_TIMER_DECL(timer_dump2d);
      //       misc::tillherecheck("before 2dDump_Data");

+#if USE_CUDA_Z4C && (ABEtype == 2)
+      for (int lev = 0; lev < GH->levels; lev++)
+        z4c_cuda_download_level_state_if_present_for_regrid(GH->PatL[lev], StateList, myrank);
+#endif
      for (int lev = 0; lev < GH->levels; lev++)
        Parallel::d2Dump_Data(GH->PatL[lev], DumpList, 0, PhysTime, dT_mon);
      STEP_TIMER_ADD(TB_DUMP_2D, timer_dump2d);
@@ -3018,13 +3116,21 @@ void bssn_class::Evolve(int Steps)

 #if (REGLEV == 1)
    STEP_TIMER_DECL(timer_regrid);
-#if USE_CUDA_BSSN
+#if USE_CUDA_BSSN && (ABEtype != 2)
    for (int il = 0; il < GH->levels; il++)
      if (bssn_cuda_should_flush_before_regrid(GH, il, Symmetry, BH_num, Porg0))
        bssn_cuda_flush_level_before_regrid(GH->PatL[il],
                                            SynchList_cor, OldStateList,
                                            StateList, SynchList_pre,
                                            myrank);
+#endif
+#if USE_CUDA_Z4C && USE_CUDA_BSSN && (ABEtype == 2)
+    for (int il = 0; il < GH->levels; il++)
+      if (bssn_cuda_should_flush_before_regrid(GH, il, Symmetry, BH_num, Porg0))
+        z4c_cuda_flush_level_before_regrid(GH->PatL[il],
+                                           SynchList_cor, OldStateList,
+                                           StateList, SynchList_pre,
+                                           myrank);
 #endif
    GH->Regrid(Symmetry, BH_num, Porgbr, Porg0,
               SynchList_cor, OldStateList, StateList, SynchList_pre,
@@ -3113,6 +3219,10 @@ void bssn_class::Evolve(int Steps)
      STEP_TIMER_DECL(timer_checkpoint);
      LastCheck = 0;

+#if USE_CUDA_Z4C && (ABEtype == 2)
+      for (int lev = 0; lev < GH->levels; lev++)
+        z4c_cuda_download_level_state_if_present_for_regrid(GH->PatL[lev], StateList, myrank);
+#endif
      CheckPoint->write_Black_Hole_position(BH_num_input, BH_num, Porg0, Porgbr, Mass);
      CheckPoint->writecheck_cgh(PhysTime, GH);
 #ifdef WithShell
@@ -4346,7 +4456,7 @@ void bssn_class::Step(int lev, int YN)

  STEP_TIMER_DECL(timer_predictor_sync);
  Parallel::AsyncSyncState async_pre;
-#if (ABEtype == 1 || ABEtype == 2)
+#if AMSS_LEGACY_ABE_TRANSFER
  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
 #else
  Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
@@ -4369,7 +4479,7 @@ void bssn_class::Step(int lev, int YN)
    }
  }
 #endif
-#if (ABEtype != 1 && ABEtype != 2)
+#if !AMSS_LEGACY_ABE_TRANSFER
  Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
 #endif
  STEP_TIMER_ADD(TB_PREDICTOR_SYNC, timer_predictor_sync);
@@ -4793,7 +4903,7 @@ void bssn_class::Step(int lev, int YN)

    STEP_TIMER_DECL(timer_corrector_sync);
    Parallel::AsyncSyncState async_cor;
-#if (ABEtype == 1 || ABEtype == 2)
+#if AMSS_LEGACY_ABE_TRANSFER
    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
 #else
    Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
@@ -4816,7 +4926,7 @@ void bssn_class::Step(int lev, int YN)
      }
    }
 #endif
-#if (ABEtype != 1 && ABEtype != 2)
+#if !AMSS_LEGACY_ABE_TRANSFER
    Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
 #endif
    STEP_TIMER_ADD(TB_CORRECTOR_SYNC, timer_corrector_sync);
@@ -5312,7 +5422,7 @@ void bssn_class::Step(int lev, int YN)
 #endif

  Parallel::AsyncSyncState async_pre;
-#if (ABEtype == 1 || ABEtype == 2)
+#if AMSS_LEGACY_ABE_TRANSFER
  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
 #else
  Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
@@ -5335,7 +5445,7 @@ void bssn_class::Step(int lev, int YN)
    }
  }
 #endif
-#if (ABEtype != 1 && ABEtype != 2)
+#if !AMSS_LEGACY_ABE_TRANSFER
  Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
 #endif

@@ -5663,7 +5773,7 @@ void bssn_class::Step(int lev, int YN)
 #endif

    Parallel::AsyncSyncState async_cor;
-#if (ABEtype == 1 || ABEtype == 2)
+#if AMSS_LEGACY_ABE_TRANSFER
    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
 #else
    Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
@@ -5686,7 +5796,7 @@ void bssn_class::Step(int lev, int YN)
      }
    }
 #endif
-#if (ABEtype != 1 && ABEtype != 2)
+#if !AMSS_LEGACY_ABE_TRANSFER
    Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
 #endif

@@ -6080,7 +6190,7 @@ void bssn_class::Step(int lev, int YN)

  //   misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor sync");

-#if (ABEtype == 1 || ABEtype == 2)
+#if AMSS_LEGACY_ABE_TRANSFER
  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
 #else
  Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
@@ -6285,7 +6395,7 @@ void bssn_class::Step(int lev, int YN)

    //    misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector sync");

-#if (ABEtype == 1 || ABEtype == 2)
+#if AMSS_LEGACY_ABE_TRANSFER
    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
 #else
    Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
@@ -6926,7 +7036,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif

 #if (RPB == 0)
-#if (ABEtype == 1 || ABEtype == 2)
+#if AMSS_LEGACY_ABE_TRANSFER
      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry);
 #else
      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
@@ -6943,7 +7053,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 //       misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
 #endif

-#if (ABEtype == 1 || ABEtype == 2)
+#if AMSS_LEGACY_ABE_TRANSFER
      Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
 #else
 #if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
@@ -6960,7 +7070,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,

 #if (RPB == 0)
 #if (MIXOUTB == 0)
-#if (ABEtype == 1 || ABEtype == 2)
+#if AMSS_LEGACY_ABE_TRANSFER
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
@@ -7001,7 +7111,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif

 #if (RPB == 0)
-#if (ABEtype == 1 || ABEtype == 2)
+#if AMSS_LEGACY_ABE_TRANSFER
      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
 #else
      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]);
@@ -7018,7 +7128,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 //       misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
 #endif

-#if (ABEtype == 1 || ABEtype == 2)
+#if AMSS_LEGACY_ABE_TRANSFER
      Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry);
 #else
 #if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
@@ -7035,7 +7145,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,

 #if (RPB == 0)
 #if (MIXOUTB == 0)
-#if (ABEtype == 1 || ABEtype == 2)
+#if AMSS_LEGACY_ABE_TRANSFER
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
@@ -7066,7 +7176,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif
    }

-#if (ABEtype == 1 || ABEtype == 2)
+#if AMSS_LEGACY_ABE_TRANSFER
    Parallel::Sync(GH->PatL[lev], SL, Symmetry);
 #else
    Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]);
@@ -7124,7 +7234,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
      }

 #if (RPB == 0)
-#if (ABEtype == 1 || ABEtype == 2)
+#if AMSS_LEGACY_ABE_TRANSFER
      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry);
 #else
      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
@@ -7134,7 +7244,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
 #endif

-#if (ABEtype == 1 || ABEtype == 2)
+#if AMSS_LEGACY_ABE_TRANSFER
      Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
 #else
 #if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
@@ -7144,7 +7254,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,

 #if (RPB == 0)
 #if (MIXOUTB == 0)
-#if (ABEtype == 1 || ABEtype == 2)
+#if AMSS_LEGACY_ABE_TRANSFER
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
@@ -7170,7 +7280,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
    else // no time refinement levels and for all same time levels
    {
 #if (RPB == 0)
-#if (ABEtype == 1 || ABEtype == 2)
+#if AMSS_LEGACY_ABE_TRANSFER
      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
 #else
      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]);
@@ -7180,7 +7290,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
 #endif

-#if (ABEtype == 1 || ABEtype == 2)
+#if AMSS_LEGACY_ABE_TRANSFER
      Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry);
 #else
 #if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
@@ -7190,7 +7300,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,

 #if (RPB == 0)
 #if (MIXOUTB == 0)
-#if (ABEtype == 1 || ABEtype == 2)
+#if AMSS_LEGACY_ABE_TRANSFER
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
@@ -7214,7 +7324,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
 #endif
    }

-#if (ABEtype == 1 || ABEtype == 2)
+#if AMSS_LEGACY_ABE_TRANSFER
    Parallel::Sync(GH->PatL[lev], SL, Symmetry);
 #else
    Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]);
@@ -7265,7 +7375,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
      }

 #if (RPB == 0)
-#if (ABEtype == 1 || ABEtype == 2)
+#if AMSS_LEGACY_ABE_TRANSFER
      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry);
 #else
      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
@@ -7275,7 +7385,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry);
 #endif

-#if (ABEtype == 1 || ABEtype == 2)
+#if AMSS_LEGACY_ABE_TRANSFER
      Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
 #else
 #if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
@@ -7285,7 +7395,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)

 #if (RPB == 0)
 #if (MIXOUTB == 0)
-#if (ABEtype == 1 || ABEtype == 2)
+#if AMSS_LEGACY_ABE_TRANSFER
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
@@ -7313,7 +7423,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
      if (myrank == 0)
        cout << "===: " << GH->Lt[lev - 1] << "," << GH->Lt[lev] + dT_lev << endl;
 #if (RPB == 0)
-#if (ABEtype == 1 || ABEtype == 2)
+#if AMSS_LEGACY_ABE_TRANSFER
      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry);
 #else
      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry, sync_cache_restrict[lev]);
@@ -7323,7 +7433,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry);
 #endif

-#if (ABEtype == 1 || ABEtype == 2)
+#if AMSS_LEGACY_ABE_TRANSFER
      Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry);
 #else
 #if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
@@ -7333,7 +7443,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)

 #if (RPB == 0)
 #if (MIXOUTB == 0)
-#if (ABEtype == 1 || ABEtype == 2)
+#if AMSS_LEGACY_ABE_TRANSFER
      Ppc = GH->PatL[lev - 1];
      while (Ppc)
      {
@@ -7357,7 +7467,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
 #endif
    }

-#if (ABEtype == 1 || ABEtype == 2)
+#if AMSS_LEGACY_ABE_TRANSFER
    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
 #else
    Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
@@ -9065,7 +9175,7 @@ void bssn_class::AH_Step_Find(int lev, double dT_lev)

          ihn++;
        }
-#if (ABEtype == 1 || ABEtype == 2)
+#if AMSS_LEGACY_ABE_TRANSFER
      if (PhysTime > 10)
      {
        ihn--;
--- a/AMSS_NCKU_source/z4c_rhs_cuda.cu
+++ b/AMSS_NCKU_source/z4c_rhs_cuda.cu
@@ -7788,7 +7788,7 @@ extern "C" int z4c_cuda_rk4_substep(void *block_tag,
    }

    double t0 = profile ? cuda_profile_now_ms() : 0.0;
-    if (!use_resident_state || RK4 == 0 || !ctx.state_ready) {
+    if (!use_resident_state || !ctx.state_ready) {
        upload_state_inputs(state_host_in, all);
    }
    if (apply_enforce_ga) {
@@ -8117,6 +8117,35 @@ extern "C" int z4c_cuda_upload_state_subset(void *block_tag,
    return 0;
 }

+extern "C" int z4c_cuda_compute_constraints_resident(void *block_tag, // Runs the Z4c RHS pipeline on the block's resident state and downloads the constraint outputs to host.
+                                                     int *ex, double *X, double *Y, double *Z, // ex[0..2] are multiplied to get the point count; X/Y/Z go to setup_grid_params unchecked
+                                                     int Symmetry, double eps, int co,
+                                                     double **constraint_host_out) // host destination pointers handed to download_constraint_outputs
+{ // Returns 1 if block_tag, ex or constraint_host_out is null or the context has no ready state; otherwise 0.
+    using namespace z4c_cuda;
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); // device selection happens before the argument checks
+    if (!block_tag || !ex || !constraint_host_out) return 1;
+    StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); // NOTE(review): called before the readiness check; presumably may create a context for an untracked block -- confirm
+    if (!ctx.state_ready) return 1; // nothing resident to compute from
+    setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);
+    bind_state_input_slots(ctx.d_state_curr); // the pipeline input is the context's current device state
+    launch_z4c_rhs_pipeline((int)((size_t)ex[0] * ex[1] * ex[2]), eps); // point count is narrowed to int here; NOTE(review): no explicit launch error check in this function -- confirm the helper checks
+    download_constraint_outputs(constraint_host_out, (size_t)ex[0] * ex[1] * ex[2]);
+    return 0; // the helper calls above return nothing to this function, so 0 only means the preconditions passed
+}
+
+extern "C" int z4c_cuda_download_constraint_outputs(int *ex, // ex[0..2] are multiplied to get the number of points per output array
+                                                    double **constraint_host_out) // host destination pointers handed to download_constraint_outputs
+{ // Returns 1 if ex or constraint_host_out is null, otherwise 0. NOTE(review): takes no block_tag; presumably reads buffers left by the most recent pipeline launch -- confirm call ordering
+    using namespace z4c_cuda;
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); // device selection happens before the argument checks
+    if (!ex || !constraint_host_out) return 1;
+    download_constraint_outputs(constraint_host_out, (size_t)ex[0] * ex[1] * ex[2]); // no compute is launched here, only the copy helper
+    return 0;
+}
+
 extern "C" int z4c_cuda_has_resident_state(void *block_tag)
 {
    using namespace z4c_cuda;
--- a/AMSS_NCKU_source/z4c_rhs_cuda.h
+++ b/AMSS_NCKU_source/z4c_rhs_cuda.h
@@ -133,6 +133,14 @@ int z4c_cuda_upload_state_subset(void *block_tag,
                                 const int *state_indices,
                                 double **state_host_in);

+int z4c_cuda_compute_constraints_resident(void *block_tag,
+                                          int *ex, double *X, double *Y, double *Z,
+                                          int Symmetry, double eps, int co,
+                                          double **constraint_host_out);
+
+int z4c_cuda_download_constraint_outputs(int *ex,
+                                         double **constraint_host_out);
+
 int z4c_cuda_has_resident_state(void *block_tag);

 void z4c_cuda_release_step_ctx(void *block_tag);
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -145,7 +145,9 @@ def _gpu_runtime_env():
        "AMSS_ANALYSIS_MAP_EVERY": "1000000",
        "AMSS_CUDA_AWARE_MPI": "1",
        "AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP": "1",
+        "AMSS_CUDA_Z4C_KEEP_RESIDENT_AFTER_STEP": "1",
        "AMSS_CUDA_KEEP_ALL_LEVELS": "1",
+        "AMSS_CUDA_Z4C_AMR_DEVICE": "1",
        "AMSS_CUDA_AMR_RESTRICT_DEVICE": "1",
        "AMSS_CUDA_AMR_RESTRICT_BATCH": "0",
        "AMSS_CUDA_DEVICE_SEGMENT_BATCH": "0",
@@ -276,7 +278,9 @@ def run_ABE():
        print(f"   AMSS_ANALYSIS_MAP_EVERY={mpi_env.get('AMSS_ANALYSIS_MAP_EVERY', '')}")
        print(f"   AMSS_CUDA_AWARE_MPI={mpi_env.get('AMSS_CUDA_AWARE_MPI', '')}")
        print(f"   AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP={mpi_env.get('AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP', '')}")
+        print(f"   AMSS_CUDA_Z4C_KEEP_RESIDENT_AFTER_STEP={mpi_env.get('AMSS_CUDA_Z4C_KEEP_RESIDENT_AFTER_STEP', '')}")
        print(f"   AMSS_CUDA_KEEP_ALL_LEVELS={mpi_env.get('AMSS_CUDA_KEEP_ALL_LEVELS', '')}")
+        print(f"   AMSS_CUDA_Z4C_AMR_DEVICE={mpi_env.get('AMSS_CUDA_Z4C_AMR_DEVICE', '')}")
        print(f"   AMSS_CUDA_AMR_RESTRICT_DEVICE={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_DEVICE', '')}")
        print(f"   AMSS_CUDA_AMR_RESTRICT_BATCH={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_BATCH', '')}")
        print(f"   AMSS_CUDA_DEVICE_SEGMENT_BATCH={mpi_env.get('AMSS_CUDA_DEVICE_SEGMENT_BATCH', '')}")