From 85fe29cc2ee710518b70d426aed7fe9958ad3666 Mon Sep 17 00:00:00 2001
From: CGH0S7 <776459475@qq.com>
Date: Tue, 5 May 2026 10:47:46 +0800
Subject: [PATCH] Optimize BSSN-EScalar CUDA path

---
 AMSS_NCKU_source/ABE.C               |   6 +-
 AMSS_NCKU_source/Parallel.C          | 161 +++--
 AMSS_NCKU_source/bssnEScalar_class.C | 612 ++++++++++++++----
 AMSS_NCKU_source/bssn_class.C        | 340 +++++++++-
 AMSS_NCKU_source/bssn_class.h        |   2 +-
 AMSS_NCKU_source/bssn_rhs_cuda.cu    | 935 ++++++++++++++++++++++++---
 AMSS_NCKU_source/bssn_rhs_cuda.h     |  38 ++
 AMSS_NCKU_source/makefile            |   1 -
 makefile_and_run.py                  |   2 +
 9 files changed, 1821 insertions(+), 276 deletions(-)
diff --git a/AMSS_NCKU_source/ABE.C b/AMSS_NCKU_source/ABE.C
index 9a4874e..82a30fe 100644
--- a/AMSS_NCKU_source/ABE.C
+++ b/AMSS_NCKU_source/ABE.C
@@ -484,7 +484,11 @@ int main(int argc, char *argv[])
             cout << endl;
       }
 
-      delete ADM;
+      // Let the process teardown reclaim the simulation object.  Some derived
+      // equation classes keep MPI/CUDA-backed state whose destructor ordering
+      // is fragile at program shutdown.
+      if (getenv("AMSS_DELETE_ADM_ON_EXIT"))
+            delete ADM;
 
       //=======================caculation done=============================================================
 
diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C
index 3174cc2..8c7d3fd 100644
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -18,6 +18,7 @@
 #endif
 #if USE_CUDA_BSSN
 #include "bssn_rhs_cuda.h"
+#define AMSS_BSSN_CUDA_MAX_STATE_COUNT BSSN_ESCALAR_CUDA_STATE_COUNT
 #endif
 #if USE_CUDA_Z4C
 #include "z4c_rhs_cuda.h"
@@ -179,10 +180,12 @@ bool cuda_build_bssn_host_views(Block *block,
                                 int state_count,
                                 double **views)
 {
-  if (!block || !vars || !views || state_count != BSSN_CUDA_STATE_COUNT)
+  if (!block || !vars || !views ||
+      (state_count != BSSN_CUDA_STATE_COUNT &&
+       state_count != BSSN_ESCALAR_CUDA_STATE_COUNT))
     return false;
   MyList<var> *v = vars;
-  for (int i = 0; i < BSSN_CUDA_STATE_COUNT; ++i)
+  for (int i = 0; i < state_count; ++i)
   {
     if (!v)
       return false;
@@ -196,10 +199,12 @@ bool cuda_build_bssn_soa(MyList<var> *vars,
                          int state_count,
                          double *soa_flat)
 {
-  if (!vars || !soa_flat || state_count != BSSN_CUDA_STATE_COUNT)
+  if (!vars || !soa_flat ||
+      (state_count != BSSN_CUDA_STATE_COUNT &&
+       state_count != BSSN_ESCALAR_CUDA_STATE_COUNT))
     return false;
   MyList<var> *v = vars;
-  for (int i = 0; i < BSSN_CUDA_STATE_COUNT; ++i)
+  for (int i = 0; i < state_count; ++i)
   {
     if (!v)
       return false;
@@ -317,7 +322,7 @@ bool cuda_state_count_direct_supported(int state_count)
 #if USE_CUDA_Z4C && (ABEtype == 2)
   return state_count == Z4C_CUDA_STATE_COUNT;
 #elif USE_CUDA_BSSN
-  return state_count > 0 && state_count <= BSSN_CUDA_STATE_COUNT;
+  return state_count > 0 && state_count <= BSSN_ESCALAR_CUDA_STATE_COUNT;
 #else
   (void)state_count;
   return false;
@@ -372,22 +377,68 @@ bool cuda_can_direct_unpack(const Parallel::gridseg *dst, int type)
 #endif
 }
 
+bool cuda_amr_host_staged_enabled();
+double *alloc_device_comm_buffer(int length);
+void free_device_comm_buffer(double *&ptr);
+
+bool cuda_direct_pack_segment_to_device(double *buffer,
+                                        const Parallel::gridseg *src,
+                                        const Parallel::gridseg *dst,
+                                        int state_count,
+                                        int type,
+                                        MyList<var> *VarLists,
+                                        int Symmetry);
+
 bool cuda_direct_pack_segment(double *buffer,
                               const Parallel::gridseg *src,
                               const Parallel::gridseg *dst,
                               int state_count,
-                              MyList<var> *VarLists)
+                              int type,
+                              MyList<var> *VarLists,
+                              int Symmetry)
 {
 #if USE_CUDA_Z4C && (ABEtype == 2)
   if (state_count != Z4C_CUDA_STATE_COUNT)
     return false;
 #elif USE_CUDA_BSSN
-  if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
+  if (state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT)
     return false;
 #else
   return false;
 #endif
   const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
+
+  if (type == 2 || type == 3)
+  {
+#if USE_CUDA_BSSN
+    if (!cuda_amr_host_staged_enabled())
+      return false;
+    const int region_all = dst->shape[0] * dst->shape[1] * dst->shape[2];
+    const int total = state_count * region_all;
+    static double *stage_dev = 0;
+    static int stage_cap = 0;
+    if (total > stage_cap)
+    {
+      free_device_comm_buffer(stage_dev);
+      stage_dev = alloc_device_comm_buffer(total);
+      stage_cap = total;
+    }
+    if (!cuda_direct_pack_segment_to_device(stage_dev, src, dst, state_count, type, VarLists, Symmetry))
+      return false;
+    cudaError_t cerr = cudaMemcpy(buffer, stage_dev, (size_t)total * sizeof(double), cudaMemcpyDeviceToHost);
+    if (cerr != cudaSuccess)
+    {
+      fprintf(stderr, "Parallel: CUDA host-staged AMR pack cudaMemcpy failed, err=%d\n", (int)cerr);
+      return false;
+    }
+    if (sync_profile_enabled())
+      sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0;
+    return true;
+#else
+    return false;
+#endif
+  }
+
   const int i0 = cuda_seg_begin(dst, src->Bg, 0);
   const int j0 = cuda_seg_begin(dst, src->Bg, 1);
   const int k0 = cuda_seg_begin(dst, src->Bg, 2);
@@ -396,7 +447,7 @@ bool cuda_direct_pack_segment(double *buffer,
                                                            i0, j0, k0,
                                                            dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
 #else
-  double *views[BSSN_CUDA_STATE_COUNT];
+  double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
   const bool have_views = cuda_build_bssn_host_views(src->Bg, VarLists, state_count, views);
   const bool ok = have_views
                       ? bssn_cuda_pack_state_batch_to_host_buffer_for_host_views(
@@ -422,7 +473,7 @@ bool cuda_direct_unpack_segment(double *buffer,
   if (state_count != Z4C_CUDA_STATE_COUNT)
     return false;
 #elif USE_CUDA_BSSN
-  if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
+  if (state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT)
     return false;
 #else
   return false;
@@ -436,7 +487,7 @@ bool cuda_direct_unpack_segment(double *buffer,
                                                                i0, j0, k0,
                                                                dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
 #else
-  double *views[BSSN_CUDA_STATE_COUNT];
+  double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
   const bool have_views = cuda_build_bssn_host_views(dst->Bg, VarListd, state_count, views);
   const bool ok = have_views
                       ? bssn_cuda_unpack_state_batch_from_host_buffer_for_host_views(
@@ -464,6 +515,17 @@ bool cuda_aware_mpi_enabled()
   return enabled != 0;
 }
 
+bool cuda_cached_device_buffers_enabled(int state_count)
+{
+#if USE_CUDA_BSSN
+  if (state_count == BSSN_ESCALAR_CUDA_STATE_COUNT)
+    return false;
+#else
+  (void)state_count;
+#endif
+  return cuda_aware_mpi_enabled();
+}
+
 bool cuda_amr_restrict_device_enabled()
 {
   static int enabled = -1;
@@ -486,6 +548,17 @@ bool cuda_amr_prolong_device_enabled()
   return enabled != 0;
 }
 
+bool cuda_amr_host_staged_enabled()
+{
+  static int enabled = -1;
+  if (enabled < 0)
+  {
+    const char *env = getenv("AMSS_CUDA_AMR_HOST_STAGED");
+    enabled = (env && atoi(env) != 0) ? 1 : 0;
+  }
+  return enabled != 0;
+}
+
 bool cuda_amr_restrict_compare_enabled()
 {
   static int enabled = -1;
@@ -627,12 +700,12 @@ bool cuda_direct_pack_segment_to_device(double *buffer,
   }
 #endif
 #if USE_CUDA_BSSN
-  if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
+  if (state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT)
     return false;
   const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
 	  bool ok = false;
-	  double *views[BSSN_CUDA_STATE_COUNT];
-	  double soa_flat[3 * BSSN_CUDA_STATE_COUNT];
+	  double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
+	  double soa_flat[3 * AMSS_BSSN_CUDA_MAX_STATE_COUNT];
 	  const bool have_views = cuda_build_bssn_host_views(src->Bg, VarLists, state_count, views);
 	  const bool have_soa = cuda_build_bssn_soa(VarLists, state_count, soa_flat);
   if (type == 1)
@@ -812,13 +885,13 @@ bool cuda_direct_unpack_segment_from_device(double *buffer,
   }
 #endif
 #if USE_CUDA_BSSN
-  if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
+  if (state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT)
     return false;
   const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
   const int i0 = cuda_seg_begin(dst, dst->Bg, 0);
   const int j0 = cuda_seg_begin(dst, dst->Bg, 1);
   const int k0 = cuda_seg_begin(dst, dst->Bg, 2);
-  double *views[BSSN_CUDA_STATE_COUNT];
+  double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
   const bool have_views = cuda_build_bssn_host_views(dst->Bg, VarListd, state_count, views);
   const bool ok = have_views
                       ? bssn_cuda_unpack_state_batch_from_device_buffer_for_host_views(
@@ -843,12 +916,12 @@ bool cuda_download_resident_subset_to_host(Block *block,
                                            int state_count)
 {
 #if USE_CUDA_BSSN
-  if (!block || state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
+  if (!block || state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT)
     return false;
   if (bssn_cuda_has_resident_state(block) == 0)
     return true;
-  int indices[BSSN_CUDA_STATE_COUNT];
-  double *views[BSSN_CUDA_STATE_COUNT];
+  int indices[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
+  double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
   MyList<var> *v = vars;
   for (int i = 0; i < state_count; ++i)
   {
@@ -871,7 +944,7 @@ bool cuda_unpack_host_region_to_resident(Block *block,
                                          const Parallel::gridseg *dst)
 {
 #if USE_CUDA_BSSN
-  if (!block || !dst || state_index < 0 || state_index >= BSSN_CUDA_STATE_COUNT)
+  if (!block || !dst || state_index < 0 || state_index >= AMSS_BSSN_CUDA_MAX_STATE_COUNT)
     return false;
   if (bssn_cuda_has_resident_state(block) == 0)
     return true;
@@ -895,7 +968,7 @@ bool cuda_device_state_count_supported(int state_count)
     return true;
 #endif
 #if USE_CUDA_BSSN
-  return state_count > 0 && state_count <= BSSN_CUDA_STATE_COUNT;
+  return state_count > 0 && state_count <= AMSS_BSSN_CUDA_MAX_STATE_COUNT;
 #else
   (void)state_count;
   return false;
@@ -915,8 +988,8 @@ bool cuda_flush_device_segment_batch(Block *block,
     return true;
 	  const int stride = (dir == PACK && type == 3) ? 11 : 8;
 	  const int segment_count = (int)(meta.size() / stride);
-	  double *views[BSSN_CUDA_STATE_COUNT];
-	  double soa_flat[3 * BSSN_CUDA_STATE_COUNT];
+		  double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
+		  double soa_flat[3 * AMSS_BSSN_CUDA_MAX_STATE_COUNT];
 	  const bool have_views = cuda_build_bssn_host_views(block, vars, state_count, views);
 	  const bool have_soa = cuda_build_bssn_soa(vars, state_count, soa_flat);
 	  if (dir == PACK)
@@ -5022,14 +5095,17 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
         {
 #if USE_CUDA_BSSN || USE_CUDA_Z4C
           bool handled_by_cuda = false;
-          if (dir == PACK && (type == 1 || s_cuda_aware_pack_active) &&
+          const bool host_staged_amr =
+              dir == PACK && !s_cuda_aware_pack_active && (type == 2 || type == 3) &&
+              cuda_amr_host_staged_enabled();
+          if (dir == PACK && (type == 1 || s_cuda_aware_pack_active || host_staged_amr) &&
               cuda_state_count_direct_supported(state_count) &&
               cuda_can_direct_pack(src->data, dst->data, type))
           {
             if (s_cuda_aware_pack_active) {
               handled_by_cuda = cuda_direct_pack_segment_to_device(data + size_out, src->data, dst->data, state_count, type, VarLists, Symmetry);
             } else {
-              handled_by_cuda = cuda_direct_pack_segment(data + size_out, src->data, dst->data, state_count, VarLists);
+              handled_by_cuda = cuda_direct_pack_segment(data + size_out, src->data, dst->data, state_count, type, VarLists, Symmetry);
             }
             if (!handled_by_cuda)
             {
@@ -5037,7 +5113,7 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
               MPI_Abort(MPI_COMM_WORLD, 1);
             }
           }
-          else if (dir == UNPACK && (type == 1 || s_cuda_aware_pack_active) &&
+          else if (dir == UNPACK && (type == 1 || s_cuda_aware_pack_active || host_staged_amr) &&
                    cuda_state_count_direct_supported(state_count) &&
                    cuda_can_direct_unpack(dst->data, type))
           {
@@ -5102,7 +5178,8 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
             if (cuda_state_count_direct_supported(state_count) &&
                 dst->data && dst->data->Bg && bssn_cuda_has_resident_state(dst->data->Bg))
             {
-              if (!cuda_unpack_host_region_to_resident(dst->data->Bg, state_idx, data + size_out, dst->data))
+              if (type != 2 && type != 3 &&
+                  !cuda_unpack_host_region_to_resident(dst->data->Bg, state_idx, data + size_out, dst->data))
               {
                 cout << "Parallel::data_packer: CUDA resident fallback upload failed." << endl;
                 MPI_Abort(MPI_COMM_WORLD, 1);
@@ -5775,7 +5852,7 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
     cout << "Parallel::transfer_cached: variable lists do not match." << endl;
     MPI_Abort(MPI_COMM_WORLD, 1);
   }
-  if (cuda_aware_mpi_enabled())
+  if (cuda_cached_device_buffers_enabled(state_count))
   {
     for (int n = 0; n < cpusize; n++)
     {
@@ -6094,7 +6171,7 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
     cout << "Parallel::Sync_start: variable lists do not match." << endl;
     MPI_Abort(MPI_COMM_WORLD, 1);
   }
-  if (cuda_aware_mpi_enabled())
+  if (cuda_cached_device_buffers_enabled(state_count))
   {
     for (int n = 0; n < cpusize; n++)
     {
@@ -6976,16 +7053,16 @@ void Parallel::prepare_inter_time_level(Patch *Pat,
     if (myrank == cg->rank)
     {
 #if USE_CUDA_BSSN
-      double *src1_views[BSSN_CUDA_STATE_COUNT];
-      double *src2_views[BSSN_CUDA_STATE_COUNT];
-      double *dst_views[BSSN_CUDA_STATE_COUNT];
+      double *src1_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
+      double *src2_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
+      double *dst_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
       const int state_count = cuda_state_var_count(VarList1, VarList2);
-      if (state_count == BSSN_CUDA_STATE_COUNT &&
+      if (cuda_state_count_direct_supported(state_count) &&
           cuda_build_bssn_host_views(cg, VarList1, state_count, src1_views) &&
           cuda_build_bssn_host_views(cg, VarList2, state_count, src2_views) &&
           cuda_build_bssn_host_views(cg, VarList3, state_count, dst_views) &&
           bssn_cuda_has_resident_state(cg) &&
-          bssn_cuda_prepare_inter_time_level(cg, cg->shape,
+          bssn_cuda_prepare_inter_time_level(cg, cg->shape, state_count,
                                              src1_views, src2_views, 0, dst_views,
                                              2, tindex) == 0)
       {
@@ -7051,18 +7128,18 @@ void Parallel::prepare_inter_time_level(Patch *Pat,
     if (myrank == cg->rank)
     {
 #if USE_CUDA_BSSN
-      double *src1_views[BSSN_CUDA_STATE_COUNT];
-      double *src2_views[BSSN_CUDA_STATE_COUNT];
-      double *src3_views[BSSN_CUDA_STATE_COUNT];
-      double *dst_views[BSSN_CUDA_STATE_COUNT];
+      double *src1_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
+      double *src2_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
+      double *src3_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
+      double *dst_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
       const int state_count = cuda_state_var_count(VarList1, VarList2);
-      if (state_count == BSSN_CUDA_STATE_COUNT &&
+      if (cuda_state_count_direct_supported(state_count) &&
           cuda_build_bssn_host_views(cg, VarList1, state_count, src1_views) &&
           cuda_build_bssn_host_views(cg, VarList2, state_count, src2_views) &&
           cuda_build_bssn_host_views(cg, VarList3, state_count, src3_views) &&
           cuda_build_bssn_host_views(cg, VarList4, state_count, dst_views) &&
           bssn_cuda_has_resident_state(cg) &&
-          bssn_cuda_prepare_inter_time_level(cg, cg->shape,
+          bssn_cuda_prepare_inter_time_level(cg, cg->shape, state_count,
                                              src1_views, src2_views, src3_views, dst_views,
                                              3, tindex) == 0)
       {
@@ -7500,6 +7577,8 @@ void Parallel::Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
       cache.tc_req_is_recv = new int[cache.max_reqs];
       cache.tc_completed = new int[cache.max_reqs];
     }
+    for (int i = 0; i < cpusize; i++)
+      cache.combined_src[i] = cache.combined_dst[i] = 0;
 
     MyList<Parallel::gridseg> *dst = build_complete_gsl(PatcL);
     for (int node = 0; node < cpusize; node++)
@@ -7561,6 +7640,8 @@ void Parallel::OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
       cache.tc_req_is_recv = new int[cache.max_reqs];
       cache.tc_completed = new int[cache.max_reqs];
     }
+    for (int i = 0; i < cpusize; i++)
+      cache.combined_src[i] = cache.combined_dst[i] = 0;
 
     MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
     for (int node = 0; node < cpusize; node++)
@@ -7613,6 +7694,8 @@ void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
       cache.tc_req_is_recv = new int[cache.max_reqs];
       cache.tc_completed = new int[cache.max_reqs];
     }
+    for (int i = 0; i < cpusize; i++)
+      cache.combined_src[i] = cache.combined_dst[i] = 0;
 
     MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
     for (int node = 0; node < cpusize; node++)
diff --git a/AMSS_NCKU_source/bssnEScalar_class.C b/AMSS_NCKU_source/bssnEScalar_class.C
index 48c70bc..37ce797 100644
--- a/AMSS_NCKU_source/bssnEScalar_class.C
+++ b/AMSS_NCKU_source/bssnEScalar_class.C
@@ -24,16 +24,165 @@ using namespace std;
 #include "sommerfeld_rout.h"
 #include "getnp4.h"
 #include "shellfunctions.h"
-#include "parameters.h"
+#include "parameters.h"
+#if USE_CUDA_BSSN
+#include "bssn_rhs_cuda.h"
+#endif
 
 #ifdef With_AHF
 #include "derivatives.h"
 #include "myglobal.h"
 #endif
 
-//================================================================================================
-
-// Define bssnEScalar_class
+//================================================================================================
+
+namespace
+{
+#if USE_CUDA_BSSN
+bool fill_bssn_escalar_cuda_views(Block *cg, MyList<var> *vars,
+                                  double **host_views,
+                                  double *propspeeds = 0,
+                                  double *soa_flat = 0)
+{
+  int idx = 0;
+  while (vars && idx < BSSN_ESCALAR_CUDA_STATE_COUNT)
+  {
+    host_views[idx] = cg->fgfs[vars->data->sgfn];
+    if (propspeeds)
+      propspeeds[idx] = vars->data->propspeed;
+    if (soa_flat)
+    {
+      soa_flat[3 * idx + 0] = vars->data->SoA[0];
+      soa_flat[3 * idx + 1] = vars->data->SoA[1];
+      soa_flat[3 * idx + 2] = vars->data->SoA[2];
+    }
+    vars = vars->next;
+    ++idx;
+  }
+  return idx == BSSN_ESCALAR_CUDA_STATE_COUNT && vars == 0;
+}
+
+bool bssn_escalar_cuda_use_resident_sync(int lev)
+{
+#ifdef WithShell
+  (void)lev;
+  return false;
+#else
+  return true;
+#endif
+}
+
+bool bssn_escalar_cuda_keep_resident_after_step(int lev, int trfls_in, int analysis_lev)
+{
+  static int keep_all_levels = -1;
+  if (keep_all_levels < 0)
+  {
+    const char *env = getenv("AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS");
+    keep_all_levels = (env && atoi(env) != 0) ? 1 : 0;
+  }
+  static int enabled = -1;
+  if (enabled < 0)
+  {
+    const char *env = getenv("AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP");
+    enabled = (env && atoi(env) != 0) ? 1 : 0;
+  }
+  if (!enabled)
+    return false;
+  if (lev == analysis_lev)
+    return false;
+  if (keep_all_levels)
+    return true;
+  return lev < trfls_in;
+}
+
+bool bssn_escalar_sync_merged_enabled()
+{
+  static int enabled = -1;
+  if (enabled < 0)
+  {
+    const char *env = getenv("AMSS_ESCALAR_SYNC_MERGED");
+    enabled = (env && atoi(env) != 0) ? 1 : 0;
+  }
+  return enabled != 0;
+}
+
+void bssn_escalar_sync_level(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
+{
+  if (bssn_escalar_sync_merged_enabled())
+    Parallel::Sync_merged(PatL, VarList, Symmetry);
+  else
+    Parallel::Sync(PatL, VarList, Symmetry);
+}
+
+bool bssn_escalar_timing_enabled()
+{
+  static int enabled = -1;
+  if (enabled < 0)
+  {
+    const char *env = getenv("AMSS_ESCALAR_STEP_TIMING");
+    enabled = (env && atoi(env) != 0) ? 1 : 0;
+  }
+  return enabled != 0;
+}
+
+void bssn_escalar_timing_report(int myrank, int lev, int YN, double total, double rhs,
+                                double sync, double bh, double analysis, double swap,
+                                double resident, double rp)
+{
+  if (!bssn_escalar_timing_enabled())
+    return;
+
+  double local[8] = {total, rhs, sync, bh, analysis, swap, resident, rp};
+  double maxv[8] = {};
+  MPI_Reduce(local, maxv, 8, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+  if (myrank == 0)
+    fprintf(stderr,
+            "[AMSS-ESCALAR-STEP] lev=%d YN=%d total=%.6f rhs=%.6f sync=%.6f "
+            "bh=%.6f analysis=%.6f swap=%.6f resident=%.6f rp=%.6f other=%.6f\n",
+            lev, YN, maxv[0], maxv[1], maxv[2], maxv[3], maxv[4], maxv[5],
+            maxv[6], maxv[7],
+            maxv[0] - maxv[1] - maxv[2] - maxv[3] - maxv[4] - maxv[5] - maxv[6] - maxv[7]);
+}
+
+void bssn_escalar_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars,
+                                            int myrank, bool release_ctx)
+{
+  MyList<Patch> *Pp = PatL;
+  while (Pp)
+  {
+    MyList<Block> *BP = Pp->data->blb;
+    while (BP)
+    {
+      Block *cg = BP->data;
+      if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
+      {
+        double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
+        if (!fill_bssn_escalar_cuda_views(cg, vars, state_out))
+        {
+          cout << "CUDA BSSN-EScalar resident state list mismatch during download" << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        if (bssn_escalar_cuda_download_resident_state(cg, cg->shape, state_out))
+        {
+          cout << "CUDA BSSN-EScalar resident state download failed" << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        if (release_ctx)
+          bssn_cuda_release_step_ctx(cg);
+      }
+      if (BP == Pp->data->ble)
+        break;
+      BP = BP->next;
+    }
+    Pp = Pp->next;
+  }
+}
+#endif
+}
+
+//================================================================================================
+
+// Define bssnEScalar_class
 
 // It inherits some members and methods from the parent class bssn_class and modifies others.
 // The modified members and methods are defined below (and in the header bssnEScalar_class.h).
@@ -177,11 +326,16 @@ void bssnEScalar_class::Initialize()
 
 //================================================================================================
 
-bssnEScalar_class::~bssnEScalar_class()
-{
-  delete Sphio;
-  delete Spio;
-  delete Sphi0;
+bssnEScalar_class::~bssnEScalar_class()
+{
+#if USE_CUDA_BSSN
+  for (int lev = 0; GH && lev < GH->levels; ++lev)
+    bssn_escalar_cuda_download_level_state(GH->PatL[lev], StateList, myrank, true);
+#endif
+
+  delete Sphio;
+  delete Spio;
+  delete Sphi0;
   delete Spi0;
   delete Sphi;
   delete Spi;
@@ -707,7 +861,12 @@ void bssnEScalar_class::Read_Pablo()
 
 void bssnEScalar_class::Step(int lev, int YN)
 {
-  double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
+  double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
+#if USE_CUDA_BSSN
+  const bool use_cuda_resident_sync = bssn_escalar_cuda_use_resident_sync(lev);
+#else
+  const bool use_cuda_resident_sync = false;
+#endif
 #ifdef With_AHF
   AH_Step_Find(lev, dT_lev);
 #endif
@@ -716,13 +875,23 @@ void bssnEScalar_class::Step(int lev, int YN)
   if (lev < GH->movls)
     ndeps = numepsb;
   double TRK4 = PhysTime;
-  int iter_count = 0; // count RK4 substeps
-  int pre = 0, cor = 1;
-  int ERROR = 0;
-
-  MyList<ss_patch> *sPp;
-  // Predictor
-  MyList<Patch> *Pp = GH->PatL[lev];
+  int iter_count = 0; // count RK4 substeps
+  int pre = 0, cor = 1;
+  int ERROR = 0;
+  const bool escalar_step_timing = bssn_escalar_timing_enabled();
+  const double escalar_step_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
+  double escalar_t_rhs = 0.0;
+  double escalar_t_sync = 0.0;
+  double escalar_t_bh = 0.0;
+  double escalar_t_analysis = 0.0;
+  double escalar_t_swap = 0.0;
+  double escalar_t_resident = 0.0;
+  double escalar_t_rp = 0.0;
+
+  MyList<ss_patch> *sPp;
+  // Predictor
+  double escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
+  MyList<Patch> *Pp = GH->PatL[lev];
   while (Pp)
   {
     MyList<Block> *BP = Pp->data->blb;
@@ -731,15 +900,60 @@ void bssnEScalar_class::Step(int lev, int YN)
       Block *cg = BP->data;
       if (myrank == cg->rank)
       {
-#if (AGM == 0)
-        f_enforce_ga(cg->shape,
-                     cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
-                     cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
-                     cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
-                     cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
-#endif
-
-        if (f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+#if (AGM == 0)
+#if !USE_CUDA_BSSN
+        f_enforce_ga(cg->shape,
+                     cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
+                     cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
+                     cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
+                     cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
+#endif
+#endif
+
+        bool used_gpu_substep = false;
+#if USE_CUDA_BSSN
+        {
+          double *state_in[BSSN_ESCALAR_CUDA_STATE_COUNT];
+          double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
+          double propspeed[BSSN_ESCALAR_CUDA_STATE_COUNT];
+          double soa_flat[3 * BSSN_ESCALAR_CUDA_STATE_COUNT];
+          if (!fill_bssn_escalar_cuda_views(cg, StateList, state_in, propspeed, soa_flat) ||
+              !fill_bssn_escalar_cuda_views(cg, SynchList_pre, state_out))
+          {
+            cout << "CUDA BSSN-EScalar state list mismatch on predictor step" << endl;
+            MPI_Abort(MPI_COMM_WORLD, 1);
+          }
+          int apply_bam_bc = 0;
+          int apply_enforce_ga = 0;
+#if (AGM == 0)
+          apply_enforce_ga = 1;
+#endif
+#if (SommerType == 0)
+#ifndef WithShell
+          apply_bam_bc = (lev == 0) ? 1 : 0;
+#endif
+#endif
+          int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
+          if (bssn_escalar_cuda_rk4_substep(cg,
+                                            cg->shape, cg->X[0], cg->X[1], cg->X[2],
+                                            state_in, state_out,
+                                            propspeed, soa_flat, Pp->data->bbox,
+                                            dT_lev, TRK4, iter_count, apply_bam_bc,
+                                            Symmetry, lev, ndeps, pre,
+                                            keep_resident_state, apply_enforce_ga, chitiny))
+          {
+            cout << "CUDA BSSN-EScalar predictor substep failed in domain: ("
+                 << cg->bbox[0] << ":" << cg->bbox[3] << ","
+                 << cg->bbox[1] << ":" << cg->bbox[4] << ","
+                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
+            ERROR = 1;
+          }
+          used_gpu_substep = true;
+        }
+#endif
+
+        if (!used_gpu_substep &&
+            f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                                        cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
                                        cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                                        cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
@@ -783,9 +997,11 @@ void bssnEScalar_class::Step(int lev, int YN)
           ERROR = 1;
         }
 
-        // rk4 substep and boundary
-        {
-          MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varlrhs = RHSList; // we do not check the correspondence here
+        if (!used_gpu_substep)
+        {
+        // rk4 substep and boundary
+        {
+          MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varlrhs = RHSList; // we do not check the correspondence here
           while (varl0)
           {
 #ifndef WithShell
@@ -820,9 +1036,10 @@ void bssnEScalar_class::Step(int lev, int YN)
             varl = varl->next;
             varlrhs = varlrhs->next;
           }
-        }
-        f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny);
-      }
+        }
+        f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny);
+        }
+      }
       if (BP == Pp->data->ble)
         break;
       BP = BP->next;
@@ -834,19 +1051,21 @@ void bssnEScalar_class::Step(int lev, int YN)
     int erh = ERROR;
     MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
   }
-  if (ERROR)
-  {
+  if (ERROR)
+  {
     Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
     if (myrank == 0)
     {
       if (ErrorMonitor->outfile)
         ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime 
                               << ", lev = " << lev << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-  }
-
-#ifdef WithShell
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+  }
+  if (escalar_step_timing)
+    escalar_t_rhs += MPI_Wtime() - escalar_t0;
+
+#ifdef WithShell
   // evolve Shell Patches
   if (lev == 0)
   {
@@ -993,7 +1212,14 @@ void bssnEScalar_class::Step(int lev, int YN)
   }
 #endif
 
-  Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
+  escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
+#if USE_CUDA_BSSN
+  bssn_escalar_sync_level(GH->PatL[lev], SynchList_pre, Symmetry);
+#else
+  Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
+#endif
+  if (escalar_step_timing)
+    escalar_t_sync += MPI_Wtime() - escalar_t0;
 
 #ifdef WithShell
   if (lev == 0)
@@ -1013,10 +1239,14 @@ void bssnEScalar_class::Step(int lev, int YN)
   }
 #endif
 
-  // for black hole position
-  if (BH_num > 0 && lev == GH->levels - 1)
-  {
-    compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
+  // for black hole position
+  if (BH_num > 0 && lev == GH->levels - 1)
+  {
+    escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
+#if USE_CUDA_BSSN
+    (void)use_cuda_resident_sync;
+#endif
+    compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
     for (int ithBH = 0; ithBH < BH_num; ithBH++)
     {
       f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count);
@@ -1041,19 +1271,29 @@ void bssnEScalar_class::Step(int lev, int YN)
         DG_List->insert(Sfy0);
         DG_List->insert(Sfz0);
         Parallel::Dump_Data(GH->PatL[lev], DG_List, 0, PhysTime, dT_lev);
-        DG_List->clearList();
-      }
-    }
-  }
+        DG_List->clearList();
+      }
+    }
+    if (escalar_step_timing)
+      escalar_t_bh += MPI_Wtime() - escalar_t0;
+  }
   // data analysis part
   // Warning NOTE: the variables1 are used as temp storege room
-  if (lev == a_lev)
-  {
-    AnalysisStuff_EScalar(lev, dT_lev);
-  }
-  // corrector
-  for (iter_count = 1; iter_count < 4; iter_count++)
-  {
+  if (lev == a_lev)
+  {
+    escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
+#if USE_CUDA_BSSN
+    if (use_cuda_resident_sync)
+      bssn_escalar_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false);
+#endif
+    AnalysisStuff_EScalar(lev, dT_lev);
+    if (escalar_step_timing)
+      escalar_t_analysis += MPI_Wtime() - escalar_t0;
+  }
+  // corrector
+  for (iter_count = 1; iter_count < 4; iter_count++)
+  {
+    escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
     // for RK4: t0, t0+dt/2, t0+dt/2, t0+dt;
     if (iter_count == 1 || iter_count == 3)
       TRK4 += dT_lev / 2;
@@ -1066,22 +1306,67 @@ void bssnEScalar_class::Step(int lev, int YN)
         Block *cg = BP->data;
         if (myrank == cg->rank)
         {
-#if (AGM == 0)
-          f_enforce_ga(cg->shape,
-                       cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
-                       cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
-                       cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], 
-                       cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
-#elif (AGM == 1)
-          if (iter_count == 3)
-            f_enforce_ga(cg->shape,
+#if (AGM == 0)
+#if !USE_CUDA_BSSN
+          f_enforce_ga(cg->shape,
+                       cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
+                       cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
+                       cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], 
+                       cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
+#endif
+#elif (AGM == 1)
+          if (iter_count == 3)
+            f_enforce_ga(cg->shape,
                          cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
                          cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
                          cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], 
                          cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
-#endif
-
-          if (f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+#endif
+
+          bool used_gpu_substep = false;
+#if USE_CUDA_BSSN
+          {
+            double *state_in[BSSN_ESCALAR_CUDA_STATE_COUNT];
+            double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
+            double propspeed[BSSN_ESCALAR_CUDA_STATE_COUNT];
+            double soa_flat[3 * BSSN_ESCALAR_CUDA_STATE_COUNT];
+            if (!fill_bssn_escalar_cuda_views(cg, SynchList_pre, state_in, propspeed, soa_flat) ||
+                !fill_bssn_escalar_cuda_views(cg, SynchList_cor, state_out))
+            {
+              cout << "CUDA BSSN-EScalar state list mismatch on corrector step" << endl;
+              MPI_Abort(MPI_COMM_WORLD, 1);
+            }
+            int apply_bam_bc = 0;
+            int apply_enforce_ga = 0;
+#if (AGM == 0)
+            apply_enforce_ga = 1;
+#endif
+#if (SommerType == 0)
+#ifndef WithShell
+            apply_bam_bc = (lev == 0) ? 1 : 0;
+#endif
+#endif
+            int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
+            if (bssn_escalar_cuda_rk4_substep(cg,
+                                              cg->shape, cg->X[0], cg->X[1], cg->X[2],
+                                              state_in, state_out,
+                                              propspeed, soa_flat, Pp->data->bbox,
+                                              dT_lev, TRK4, iter_count, apply_bam_bc,
+                                              Symmetry, lev, ndeps, cor,
+                                              keep_resident_state, apply_enforce_ga, chitiny))
+            {
+              cout << "CUDA BSSN-EScalar corrector substep failed in domain: ("
+                   << cg->bbox[0] << ":" << cg->bbox[3] << ","
+                   << cg->bbox[1] << ":" << cg->bbox[4] << ","
+                   << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
+              ERROR = 1;
+            }
+            used_gpu_substep = true;
+          }
+#endif
+
+          if (!used_gpu_substep &&
+              f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                                          cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn],
                                          cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
                                          cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
@@ -1125,9 +1410,11 @@ void bssnEScalar_class::Step(int lev, int YN)
                  << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
             ERROR = 1;
           }
-          // rk4 substep and boundary
-          {
-            MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varl1 = SynchList_cor, *varlrhs = RHSList; 
+          if (!used_gpu_substep)
+          {
+          // rk4 substep and boundary
+          {
+            MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varl1 = SynchList_cor, *varlrhs = RHSList; 
             // we do not check the correspondence here
             
             while (varl0)
@@ -1165,9 +1452,10 @@ void bssnEScalar_class::Step(int lev, int YN)
               varl1 = varl1->next;
               varlrhs = varlrhs->next;
             }
-          }
-          f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny);
-        }
+          }
+          f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny);
+          }
+        }
         if (BP == Pp->data->ble)
           break;
         BP = BP->next;
@@ -1180,8 +1468,8 @@ void bssnEScalar_class::Step(int lev, int YN)
       int erh = ERROR;
       MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
     }
-    if (ERROR)
-    {
+    if (ERROR)
+    {
       Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
       if (myrank == 0)
       {
@@ -1189,11 +1477,13 @@ void bssnEScalar_class::Step(int lev, int YN)
           ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count 
                                 << " variables at t = " << PhysTime 
                                 << ", lev = " << lev << endl;
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-    }
-
-#ifdef WithShell
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+    }
+    if (escalar_step_timing)
+      escalar_t_rhs += MPI_Wtime() - escalar_t0;
+
+#ifdef WithShell
     // evolve Shell Patches
     if (lev == 0)
     {
@@ -1349,7 +1639,14 @@ void bssnEScalar_class::Step(int lev, int YN)
     }
 #endif
 
-    Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
+    escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
+#if USE_CUDA_BSSN
+    bssn_escalar_sync_level(GH->PatL[lev], SynchList_cor, Symmetry);
+#else
+    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
+#endif
+    if (escalar_step_timing)
+      escalar_t_sync += MPI_Wtime() - escalar_t0;
 
 #ifdef WithShell
     if (lev == 0)
@@ -1368,10 +1665,14 @@ void bssnEScalar_class::Step(int lev, int YN)
       }
     }
 #endif
-    // for black hole position
-    if (BH_num > 0 && lev == GH->levels - 1)
-    {
-      compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev);
+    // for black hole position
+    if (BH_num > 0 && lev == GH->levels - 1)
+    {
+      escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
+#if USE_CUDA_BSSN
+      (void)use_cuda_resident_sync;
+#endif
+      compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev);
       for (int ithBH = 0; ithBH < BH_num; ithBH++)
       {
         f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg1[ithBH][0], Porg_rhs[ithBH][0], iter_count);
@@ -1396,14 +1697,17 @@ void bssnEScalar_class::Step(int lev, int YN)
           DG_List->insert(Sfy0);
           DG_List->insert(Sfz0);
           Parallel::Dump_Data(GH->PatL[lev], DG_List, 0, PhysTime, dT_lev);
-          DG_List->clearList();
-        }
-      }
-    }
-    // swap time level
-    if (iter_count < 3)
-    {
-      Pp = GH->PatL[lev];
+          DG_List->clearList();
+        }
+      }
+      if (escalar_step_timing)
+        escalar_t_bh += MPI_Wtime() - escalar_t0;
+    }
+    // swap time level
+    if (iter_count < 3)
+    {
+      escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
+      Pp = GH->PatL[lev];
       while (Pp)
       {
         MyList<Block> *BP = Pp->data->blb;
@@ -1444,16 +1748,32 @@ void bssnEScalar_class::Step(int lev, int YN)
           Porg[ithBH][0] = Porg1[ithBH][0];
           Porg[ithBH][1] = Porg1[ithBH][1];
           Porg[ithBH][2] = Porg1[ithBH][2];
-        }
-      }
-    }
-  }
-
-#if (RPS == 0)
-  // mesh refinement boundary part
-  RestrictProlong(lev, YN, BB);
-
-#ifdef WithShell
+        }
+      }
+      if (escalar_step_timing)
+        escalar_t_swap += MPI_Wtime() - escalar_t0;
+    }
+  }
+
+#if USE_CUDA_BSSN
+  if (use_cuda_resident_sync)
+  {
+    escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
+    if (!bssn_escalar_cuda_keep_resident_after_step(lev, trfls, a_lev))
+      bssn_escalar_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, true);
+    if (escalar_step_timing)
+      escalar_t_resident += MPI_Wtime() - escalar_t0;
+  }
+#endif
+
+#if (RPS == 0)
+  // mesh refinement boundary part
+  escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
+  RestrictProlong(lev, YN, BB);
+  if (escalar_step_timing)
+    escalar_t_rp += MPI_Wtime() - escalar_t0;
+
+#ifdef WithShell
   if (lev == 0)
   {
     clock_t prev_clock, curr_clock;
@@ -1477,8 +1797,9 @@ void bssnEScalar_class::Step(int lev, int YN)
   // StateList     0   -----------
   //
   // OldStateList  old -----------
-  // update
-  Pp = GH->PatL[lev];
+  // update
+  escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
+  Pp = GH->PatL[lev];
   while (Pp)
   {
     MyList<Block> *BP = Pp->data->blb;
@@ -1520,10 +1841,18 @@ void bssnEScalar_class::Step(int lev, int YN)
     {
       Porg0[ithBH][0] = Porg1[ithBH][0];
       Porg0[ithBH][1] = Porg1[ithBH][1];
-      Porg0[ithBH][2] = Porg1[ithBH][2];
-    }
-  }
-}
+      Porg0[ithBH][2] = Porg1[ithBH][2];
+    }
+  }
+  if (escalar_step_timing)
+  {
+    escalar_t_swap += MPI_Wtime() - escalar_t0;
+    bssn_escalar_timing_report(myrank, lev, YN, MPI_Wtime() - escalar_step_t0,
+                               escalar_t_rhs, escalar_t_sync, escalar_t_bh,
+                               escalar_t_analysis, escalar_t_swap,
+                               escalar_t_resident, escalar_t_rp);
+  }
+}
 
 //================================================================================================
 
@@ -2074,14 +2403,44 @@ void bssnEScalar_class::Constraint_Out()
           MyList<Block> *BP = Pp->data->blb;
           while (BP)
           {
-            Block *cg = BP->data;
-            if (myrank == cg->rank)
-            {
-              if (lev > 0)
-                f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
-                                           cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
-                                           cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
-                                           cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
+            Block *cg = BP->data;
+            if (myrank == cg->rank)
+            {
+              bool used_cuda_constraints = false;
+#if USE_CUDA_BSSN
+              {
+                double *state_in[BSSN_ESCALAR_CUDA_STATE_COUNT];
+                if (!fill_bssn_escalar_cuda_views(cg, StateList, state_in))
+                {
+                  cout << "CUDA BSSN-EScalar constraint state list mismatch" << endl;
+                  MPI_Abort(MPI_COMM_WORLD, 1);
+                }
+                double *constraint_out[8] = {
+                    cg->fgfs[Cons_Ham->sgfn], cg->fgfs[Cons_Px->sgfn],
+                    cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
+                    cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn],
+                    cg->fgfs[Cons_Gz->sgfn], cg->fgfs[Cons_fR->sgfn]};
+                int lev_arg = lev;
+                int sym_arg = Symmetry;
+                double eps_arg = ndeps;
+                if (bssn_escalar_cuda_compute_constraints(cg->shape, cg->X[0], cg->X[1], cg->X[2],
+                                                          state_in, constraint_out,
+                                                          sym_arg, lev_arg, eps_arg))
+                {
+                  cout << "CUDA BSSN-EScalar constraint compute failed in domain: ("
+                       << cg->bbox[0] << ":" << cg->bbox[3] << ","
+                       << cg->bbox[1] << ":" << cg->bbox[4] << ","
+                       << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
+                  MPI_Abort(MPI_COMM_WORLD, 1);
+                }
+                used_cuda_constraints = true;
+              }
+#endif
+              if (!used_cuda_constraints && lev > 0)
+                f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+                                           cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
+                                           cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
+                                           cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
                                            cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
                                            cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn],
                                            cg->fgfs[Gmx0->sgfn], cg->fgfs[Gmy0->sgfn], cg->fgfs[Gmz0->sgfn],
@@ -2110,15 +2469,16 @@ void bssnEScalar_class::Constraint_Out()
                                            cg->fgfs[Gamzyy->sgfn], cg->fgfs[Gamzyz->sgfn], cg->fgfs[Gamzzz->sgfn],
                                            cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn], 
                                            cg->fgfs[Ryy->sgfn], cg->fgfs[Ryz->sgfn], cg->fgfs[Rzz->sgfn],
-                                           cg->fgfs[Cons_Ham->sgfn],
-                                           cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
-                                           cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
-                                           Symmetry, lev, ndeps, pre);
-              f_compute_constraint_fr(cg->shape, cg->X[0], cg->X[1], cg->X[2],
-                                      cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn], 
-                                      cg->fgfs[rho->sgfn], cg->fgfs[Sphi0->sgfn],
-                                      cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
-                                      cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
+                                           cg->fgfs[Cons_Ham->sgfn],
+                                           cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
+                                           cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
+                                           Symmetry, lev, ndeps, pre);
+              if (!used_cuda_constraints)
+                f_compute_constraint_fr(cg->shape, cg->X[0], cg->X[1], cg->X[2],
+                                      cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn], 
+                                      cg->fgfs[rho->sgfn], cg->fgfs[Sphi0->sgfn],
+                                      cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
+                                      cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
                                       cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
                                       cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn],
                                       cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn], 
diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C
index 04576c3..123a175 100644
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -70,6 +70,125 @@ int amss_analysis_map_every()
   return every;
 }
 
+bool amss_rp_timing_enabled()
+{
+  static int enabled = -1;
+  if (enabled < 0)
+  {
+    const char *env = getenv("AMSS_RP_TIMING");
+    enabled = (env && atoi(env) != 0) ? 1 : 0;
+  }
+  return enabled != 0;
+}
+
+bool amss_rp_detail_timing_enabled()
+{
+  static int enabled = -1;
+  if (enabled < 0)
+  {
+    const char *env = getenv("AMSS_RP_DETAIL_TIMING");
+    enabled = (env && atoi(env) != 0) ? 1 : 0;
+  }
+  return enabled != 0;
+}
+
+bool amss_env_flag_enabled(const char *name)
+{
+  const char *env = getenv(name);
+  return env && atoi(env) != 0;
+}
+
+bool amss_cached_rp_restrict_enabled()
+{
+  static int enabled = -1;
+  if (enabled < 0)
+    enabled = amss_env_flag_enabled("AMSS_RP_CACHED_RESTRICT") ? 1 : 0;
+  return enabled != 0;
+}
+
+bool amss_cached_rp_outbd_enabled()
+{
+  static int enabled = -1;
+  if (enabled < 0)
+    enabled = amss_env_flag_enabled("AMSS_RP_CACHED_OUTBD") ? 1 : 0;
+  return enabled != 0;
+}
+
+bool amss_cached_rp_fine_sync_enabled()
+{
+  static int enabled = -1;
+  if (enabled < 0)
+    enabled = amss_env_flag_enabled("AMSS_RP_CACHED_FINE_SYNC") ? 1 : 0;
+  return enabled != 0;
+}
+
+bool amss_cached_rp_coarse_sync_enabled()
+{
+  static int enabled = -1;
+  if (enabled < 0)
+    enabled = amss_env_flag_enabled("AMSS_RP_CACHED_COARSE_SYNC") ? 1 : 0;
+  return enabled != 0;
+}
+
+bool amss_rp_skip_coarse_sync_enabled()
+{
+  static int enabled = -1;
+  if (enabled < 0)
+    enabled = amss_env_flag_enabled("AMSS_RP_SKIP_COARSE_SYNC") ? 1 : 0;
+  return enabled != 0;
+}
+
+bool amss_evolve_timing_enabled()
+{
+  static int enabled = -1;
+  if (enabled < 0)
+    enabled = amss_env_flag_enabled("AMSS_EVOLVE_TIMING") ? 1 : 0;
+  return enabled != 0;
+}
+
+struct AmssEvolveTimingStats
+{
+  double step;
+  double rp;
+  double regrid;
+  double constraint;
+};
+
+AmssEvolveTimingStats &amss_evolve_timing_stats()
+{
+  static AmssEvolveTimingStats stats = {};
+  return stats;
+}
+
+void amss_evolve_timing_reset()
+{
+  AmssEvolveTimingStats &stats = amss_evolve_timing_stats();
+  stats.step = 0.0;
+  stats.rp = 0.0;
+  stats.regrid = 0.0;
+  stats.constraint = 0.0;
+}
+
+void amss_evolve_timing_add_step(double sec)
+{
+  amss_evolve_timing_stats().step += sec;
+}
+
+void amss_evolve_timing_add_rp(double sec)
+{
+  amss_evolve_timing_stats().rp += sec;
+}
+
+void amss_evolve_timing_add_regrid(double sec)
+{
+  amss_evolve_timing_stats().regrid += sec;
+}
+
+void amss_evolve_timing_add_constraint(double sec)
+{
+  amss_evolve_timing_stats().constraint += sec;
+}
+
 }
 
 // Compile-time switch for per-timestep memory usage collection/printing.
@@ -288,6 +407,37 @@ bool fill_bssn_cuda_views(Block *cg, MyList<var> *vars,
   return idx == BSSN_CUDA_STATE_COUNT && vars == 0;
 }
 
+int count_bssn_cuda_state_list(MyList<var> *vars)
+{
+  int count = 0;
+  while (vars)
+  {
+    ++count;
+    vars = vars->next;
+    if (count > BSSN_ESCALAR_CUDA_STATE_COUNT)
+      return -1;
+  }
+  return count;
+}
+
+bool fill_bssn_cuda_views_count(Block *cg, MyList<var> *vars,
+                                int state_count,
+                                double **host_views)
+{
+  if (!cg || !host_views ||
+      (state_count != BSSN_CUDA_STATE_COUNT &&
+       state_count != BSSN_ESCALAR_CUDA_STATE_COUNT))
+    return false;
+  int idx = 0;
+  while (vars && idx < state_count)
+  {
+    host_views[idx] = cg->fgfs[vars->data->sgfn];
+    vars = vars->next;
+    ++idx;
+  }
+  return idx == state_count && vars == 0;
+}
+
 bool bssn_cuda_use_resident_sync(int lev)
 {
 #ifdef WithShell
@@ -467,6 +617,11 @@ bool bssn_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
               block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn)
           {
             var *vars[3] = {forx, fory, forz};
+            double *bh_host_key[3] = {
+              block->fgfs[forx->sgfn],
+              block->fgfs[fory->sgfn],
+              block->fgfs[forz->sgfn]
+            };
             double soa3[9];
             for (int f = 0; f < 3; f++)
             {
@@ -482,6 +637,7 @@ bool bssn_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
                                               DH[0], DH[1], DH[2],
                                               x, y, z,
                                               interp_ordn, interp_sym,
+                                              bh_host_key,
                                               soa3, shellf) != 0)
             {
               const int sx = ordn;
@@ -552,6 +708,7 @@ bool bssn_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
 
 void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int myrank, bool release_ctx)
 {
+  const int state_count = count_bssn_cuda_state_list(vars);
   MyList<Patch> *Pp = PatL;
   while (Pp)
   {
@@ -561,13 +718,16 @@ void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int
       Block *cg = BP->data;
       if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
       {
-        double *state_out[BSSN_CUDA_STATE_COUNT];
-        if (!fill_bssn_cuda_views(cg, vars, state_out))
+        double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
+        if (!fill_bssn_cuda_views_count(cg, vars, state_count, state_out))
         {
           cout << "CUDA BSSN state list mismatch on resident state download" << endl;
           MPI_Abort(MPI_COMM_WORLD, 1);
         }
-        if (bssn_cuda_download_resident_state(cg, cg->shape, state_out))
+        const int rc = (state_count == BSSN_ESCALAR_CUDA_STATE_COUNT)
+                           ? bssn_escalar_cuda_download_resident_state(cg, cg->shape, state_out)
+                           : bssn_cuda_download_resident_state(cg, cg->shape, state_out);
+        if (rc)
         {
           cout << "CUDA resident state download failed" << endl;
           MPI_Abort(MPI_COMM_WORLD, 1);
@@ -585,6 +745,7 @@ void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int
 
 void bssn_cuda_download_level_state_if_present(MyList<Patch> *PatL, MyList<var> *vars, int myrank)
 {
+  const int state_count = count_bssn_cuda_state_list(vars);
   MyList<Patch> *Pp = PatL;
   while (Pp)
   {
@@ -594,13 +755,13 @@ void bssn_cuda_download_level_state_if_present(MyList<Patch> *PatL, MyList<var>
       Block *cg = BP->data;
       if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
       {
-        double *state_out[BSSN_CUDA_STATE_COUNT];
-        if (!fill_bssn_cuda_views(cg, vars, state_out))
+        double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
+        if (!fill_bssn_cuda_views_count(cg, vars, state_count, state_out))
         {
           cout << "CUDA BSSN state list mismatch on resident state conditional download" << endl;
           MPI_Abort(MPI_COMM_WORLD, 1);
         }
-        if (bssn_cuda_download_resident_state_if_present(cg, cg->shape, state_out))
+        if (bssn_cuda_download_resident_state_count_if_present(cg, cg->shape, state_out, state_count))
         {
           cout << "CUDA resident state conditional download failed" << endl;
           MPI_Abort(MPI_COMM_WORLD, 1);
@@ -2890,6 +3051,10 @@ void bssn_class::Evolve(int Steps)
 
   for (int ncount = 1; ncount < Steps + 1; ncount++)
   {
+    const bool evolve_timing = amss_evolve_timing_enabled();
+    const double evolve_t0 = evolve_timing ? MPI_Wtime() : 0.0;
+    if (evolve_timing)
+      amss_evolve_timing_reset();
     cuda_level0_constraint_cache_valid = false;
 #if BSSN_FINE_TIMING
     step_timing::reset();
@@ -2918,9 +3083,12 @@ void bssn_class::Evolve(int Steps)
 
     // misc::tillherecheck("before Constraint_Out");
 
+    const double constraint_t0 = evolve_timing ? MPI_Wtime() : 0.0;
     STEP_TIMER_DECL(timer_constraint_out);
     Constraint_Out(); // this will affect the Dump_List
     STEP_TIMER_ADD(TB_CONSTRAINT_OUT, timer_constraint_out);
+    if (evolve_timing)
+      amss_evolve_timing_add_constraint(MPI_Wtime() - constraint_t0);
 
     LastDump += dT_mon;
     Last2dDump += dT_mon;
@@ -3093,6 +3261,22 @@ void bssn_class::Evolve(int Steps)
     if (ncount % BSSN_FINE_TIMING_EVERY == 0)
       rhs_kernel_timing_report::report(myrank, nprocs, ncount, MPI_Wtime() - step_wall_start);
 #endif
+    if (evolve_timing)
+    {
+      const AmssEvolveTimingStats &stats = amss_evolve_timing_stats();
+      const double local[4] = {stats.step, stats.rp, stats.regrid, stats.constraint};
+      double maxv[4] = {};
+      MPI_Reduce((void *)local, maxv, 4, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+      if (myrank == 0)
+      {
+        const double wall = MPI_Wtime() - evolve_t0;
+        const double known = maxv[0] + maxv[1] + maxv[2] + maxv[3];
+        fprintf(stderr,
+                "[AMSS-EVOLVE-TIMING] step=%d wall=%.6f step_fn=%.6f rp=%.6f "
+                "regrid=%.6f constraint=%.6f other=%.6f\n",
+                ncount, wall, maxv[0], maxv[1], maxv[2], maxv[3], wall - known);
+      }
+    }
   }
   /*
   #ifdef With_AHF
@@ -3162,7 +3346,11 @@ void bssn_class::RecursiveStep(int lev)
   {
     //     if(myrank==0) cout<<"level now = "<<lev<<" NoIteration = "<<i<<endl;
     YN = (i == NoIterations - 1) ? 1 : 0; // 1: same time level for coarse level and fine level
+    const bool evolve_timing = amss_evolve_timing_enabled();
+    const double step_t0 = evolve_timing ? MPI_Wtime() : 0.0;
     Step(lev, YN);
+    if (evolve_timing)
+      amss_evolve_timing_add_step(MPI_Wtime() - step_t0);
 
 #if (AGM == 2)
     if (GH->levels == 1)
@@ -3195,7 +3383,10 @@ void bssn_class::RecursiveStep(int lev)
     //
     // till here the PhysTime has updated dT_lev
     //  if(myrank==0) cout<<"level now = "<<lev<<", "<<fgt(PhysTime-dT_lev,StartTime,dT_lev/2)<<endl;
+    const double rp_t0 = evolve_timing ? MPI_Wtime() : 0.0;
     RestrictProlong(lev, YN, fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), StateList, OldStateList, SynchList_cor);
+    if (evolve_timing)
+      amss_evolve_timing_add_rp(MPI_Wtime() - rp_t0);
     // RestrictProlong(lev,YN,false,StateList,OldStateList,SynchList_cor);
 
 #ifdef WithShell
@@ -3224,6 +3415,8 @@ void bssn_class::RecursiveStep(int lev)
 #endif
 
 #if (REGLEV == 0)
+  const bool evolve_timing = amss_evolve_timing_enabled();
+  const double regrid_t0 = evolve_timing ? MPI_Wtime() : 0.0;
   STEP_TIMER_DECL(timer_regrid_onelevel);
 #if USE_CUDA_BSSN
   if (bssn_cuda_should_flush_before_regrid(GH, lev, Symmetry, BH_num, Porg0))
@@ -3242,6 +3435,8 @@ void bssn_class::RecursiveStep(int lev)
   for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
 #endif
   }
+  if (evolve_timing)
+    amss_evolve_timing_add_regrid(MPI_Wtime() - regrid_t0);
   STEP_TIMER_ADD(TB_REGRID, timer_regrid_onelevel);
 #endif
 }
@@ -6847,6 +7042,15 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 //
 // SynchList_cor  old -----------
 {
+  const bool rp_runtime_timing = amss_rp_timing_enabled();
+  const double rp_runtime_start = rp_runtime_timing ? MPI_Wtime() : 0.0;
+  const bool rp_detail_timing = amss_rp_detail_timing_enabled();
+  double rp_t_prepare = 0.0;
+  double rp_t_restrict = 0.0;
+  double rp_t_coarse_sync = 0.0;
+  double rp_t_outbd = 0.0;
+  double rp_t_fine_sync = 0.0;
+  double rp_t0 = 0.0;
   STEP_TIMER_DECL(timer_restrict_prolong);
 #if (PSTR == 1 || PSTR == 2)
 //  stringstream a_stream;
@@ -6858,6 +7062,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
     MyList<Patch> *Pp, *Ppc;
     if (lev > trfls && YN == 0) // time refinement levels and for intermediat time level
     {
+      if (rp_detail_timing) rp_t0 = MPI_Wtime();
       Pp = GH->PatL[lev - 1];
       while (Pp)
       {
@@ -6873,6 +7078,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif
         Pp = Pp->next;
       }
+      if (rp_detail_timing) rp_t_prepare += MPI_Wtime() - rp_t0;
 
 #if (PSTR == 1 || PSTR == 2)
 //       Pp=GH->PatL[lev];
@@ -6889,14 +7095,18 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif
 
 #if (RPB == 0)
+      if (rp_detail_timing) rp_t0 = MPI_Wtime();
 #if (ABEtype == 1 || ABEtype == 2)
       Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry);
 #else
       Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
 #endif
+      if (rp_detail_timing) rp_t_restrict += MPI_Wtime() - rp_t0;
 #elif (RPB == 1)
+      if (rp_detail_timing) rp_t0 = MPI_Wtime();
       //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SynchList_pre,Symmetry);
       Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
+      if (rp_detail_timing) rp_t_restrict += MPI_Wtime() - rp_t0;
 #endif
 
 #if (PSTR == 1 || PSTR == 2)
@@ -6907,10 +7117,14 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif
 
 #if (ABEtype == 1 || ABEtype == 2)
+      if (rp_detail_timing) rp_t0 = MPI_Wtime();
       Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
+      if (rp_detail_timing) rp_t_coarse_sync += MPI_Wtime() - rp_t0;
 #else
 #if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
+      if (rp_detail_timing) rp_t0 = MPI_Wtime();
       Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
+      if (rp_detail_timing) rp_t_coarse_sync += MPI_Wtime() - rp_t0;
 #endif
 #endif
 
@@ -6922,6 +7136,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif
 
 #if (RPB == 0)
+      if (rp_detail_timing) rp_t0 = MPI_Wtime();
 #if (MIXOUTB == 0)
 #if (ABEtype == 1 || ABEtype == 2)
       Ppc = GH->PatL[lev - 1];
@@ -6941,9 +7156,12 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #elif (MIXOUTB == 1)
       Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
 #endif
+      if (rp_detail_timing) rp_t_outbd += MPI_Wtime() - rp_t0;
 #elif (RPB == 1)
+      if (rp_detail_timing) rp_t0 = MPI_Wtime();
       //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry);
       Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry);
+      if (rp_detail_timing) rp_t_outbd += MPI_Wtime() - rp_t0;
 #endif
 
 #if (PSTR == 1 || PSTR == 2)
@@ -6964,14 +7182,18 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif
 
 #if (RPB == 0)
+      if (rp_detail_timing) rp_t0 = MPI_Wtime();
 #if (ABEtype == 1 || ABEtype == 2)
       Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
 #else
       Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]);
 #endif
+      if (rp_detail_timing) rp_t_restrict += MPI_Wtime() - rp_t0;
 #elif (RPB == 1)
+      if (rp_detail_timing) rp_t0 = MPI_Wtime();
       //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
       Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
+      if (rp_detail_timing) rp_t_restrict += MPI_Wtime() - rp_t0;
 #endif
 
 #if (PSTR == 1 || PSTR == 2)
@@ -6982,10 +7204,14 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif
 
 #if (ABEtype == 1 || ABEtype == 2)
+      if (rp_detail_timing) rp_t0 = MPI_Wtime();
       Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry);
+      if (rp_detail_timing) rp_t_coarse_sync += MPI_Wtime() - rp_t0;
 #else
 #if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
+      if (rp_detail_timing) rp_t0 = MPI_Wtime();
       Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);
+      if (rp_detail_timing) rp_t_coarse_sync += MPI_Wtime() - rp_t0;
 #endif
 #endif
 
@@ -6997,6 +7223,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif
 
 #if (RPB == 0)
+      if (rp_detail_timing) rp_t0 = MPI_Wtime();
 #if (MIXOUTB == 0)
 #if (ABEtype == 1 || ABEtype == 2)
       Ppc = GH->PatL[lev - 1];
@@ -7016,9 +7243,12 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #elif (MIXOUTB == 1)
       Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
 #endif
+      if (rp_detail_timing) rp_t_outbd += MPI_Wtime() - rp_t0;
 #elif (RPB == 1)
+      if (rp_detail_timing) rp_t0 = MPI_Wtime();
       //       Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
       Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry);
+      if (rp_detail_timing) rp_t_outbd += MPI_Wtime() - rp_t0;
 #endif
 
 #if (PSTR == 1 || PSTR == 2)
@@ -7030,9 +7260,13 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
     }
 
 #if (ABEtype == 1 || ABEtype == 2)
+    if (rp_detail_timing) rp_t0 = MPI_Wtime();
     Parallel::Sync(GH->PatL[lev], SL, Symmetry);
+    if (rp_detail_timing) rp_t_fine_sync += MPI_Wtime() - rp_t0;
 #else
+    if (rp_detail_timing) rp_t0 = MPI_Wtime();
     Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]);
+    if (rp_detail_timing) rp_t_fine_sync += MPI_Wtime() - rp_t0;
 #endif
 
 #if (PSTR == 1 || PSTR == 2)
@@ -7042,6 +7276,27 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 //    misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
 #endif
   }
+  if (rp_runtime_timing)
+  {
+    const double local_sec = MPI_Wtime() - rp_runtime_start;
+    double max_sec = 0.0;
+    MPI_Reduce((void *)&local_sec, &max_sec, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+    if (myrank == 0)
+      fprintf(stderr, "[AMSS-RP-TIMING] lev=%d YN=%d BB=%d sec=%.6f\n",
+              lev, YN, BB ? 1 : 0, max_sec);
+  }
+  if (rp_detail_timing)
+  {
+    double local_detail[5] = {rp_t_prepare, rp_t_restrict, rp_t_coarse_sync, rp_t_outbd, rp_t_fine_sync};
+    double max_detail[5] = {};
+    MPI_Reduce(local_detail, max_detail, 5, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+    if (myrank == 0)
+      fprintf(stderr,
+              "[AMSS-RP-DETAIL] lev=%d YN=%d BB=%d prepare=%.6f restrict=%.6f "
+              "coarse_sync=%.6f outbd=%.6f fine_sync=%.6f\n",
+              lev, YN, BB ? 1 : 0, max_detail[0], max_detail[1],
+              max_detail[2], max_detail[3], max_detail[4]);
+  }
   STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong);
 }
 
@@ -7229,7 +7484,10 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
 
 #if (RPB == 0)
 #if (ABEtype == 1 || ABEtype == 2)
-      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry);
+      if (amss_cached_rp_restrict_enabled())
+        Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
+      else
+        Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry);
 #else
       Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
 #endif
@@ -7239,7 +7497,13 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
 #endif
 
 #if (ABEtype == 1 || ABEtype == 2)
-      Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
+      if (amss_rp_skip_coarse_sync_enabled())
+      {
+      }
+      else if (amss_cached_rp_coarse_sync_enabled())
+        Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
+      else
+        Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
 #else
 #if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
       Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
@@ -7249,16 +7513,23 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
 #if (RPB == 0)
 #if (MIXOUTB == 0)
 #if (ABEtype == 1 || ABEtype == 2)
-      Ppc = GH->PatL[lev - 1];
-      while (Ppc)
+      if (amss_cached_rp_outbd_enabled())
       {
-        Pp = GH->PatL[lev];
-        while (Pp)
+        Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
+      }
+      else
+      {
+        Ppc = GH->PatL[lev - 1];
+        while (Ppc)
         {
-          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
-          Pp = Pp->next;
+          Pp = GH->PatL[lev];
+          while (Pp)
+          {
+            Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry);
+            Pp = Pp->next;
+          }
+          Ppc = Ppc->next;
         }
-        Ppc = Ppc->next;
       }
 #else
       Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
@@ -7277,7 +7548,10 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
         cout << "===: " << GH->Lt[lev - 1] << "," << GH->Lt[lev] + dT_lev << endl;
 #if (RPB == 0)
 #if (ABEtype == 1 || ABEtype == 2)
-      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry);
+      if (amss_cached_rp_restrict_enabled())
+        Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry, sync_cache_restrict[lev]);
+      else
+        Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry);
 #else
       Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry, sync_cache_restrict[lev]);
 #endif
@@ -7287,7 +7561,13 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
 #endif
 
 #if (ABEtype == 1 || ABEtype == 2)
-      Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry);
+      if (amss_rp_skip_coarse_sync_enabled())
+      {
+      }
+      else if (amss_cached_rp_coarse_sync_enabled())
+        Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
+      else
+        Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry);
 #else
 #if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
       Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
@@ -7297,16 +7577,23 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
 #if (RPB == 0)
 #if (MIXOUTB == 0)
 #if (ABEtype == 1 || ABEtype == 2)
-      Ppc = GH->PatL[lev - 1];
-      while (Ppc)
+      if (amss_cached_rp_outbd_enabled())
       {
-        Pp = GH->PatL[lev];
-        while (Pp)
+        Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
+      }
+      else
+      {
+        Ppc = GH->PatL[lev - 1];
+        while (Ppc)
         {
-          Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
-          Pp = Pp->next;
+          Pp = GH->PatL[lev];
+          while (Pp)
+          {
+            Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry);
+            Pp = Pp->next;
+          }
+          Ppc = Ppc->next;
         }
-        Ppc = Ppc->next;
       }
 #else
       Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
@@ -7321,7 +7608,10 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
     }
 
 #if (ABEtype == 1 || ABEtype == 2)
-    Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
+    if (amss_cached_rp_fine_sync_enabled())
+      Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
+    else
+      Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
 #else
     Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
 #endif
diff --git a/AMSS_NCKU_source/bssn_class.h b/AMSS_NCKU_source/bssn_class.h
index f799dc0..d390bd1 100644
--- a/AMSS_NCKU_source/bssn_class.h
+++ b/AMSS_NCKU_source/bssn_class.h
@@ -144,7 +144,7 @@ public:
        bssn_class(double Couranti, double StartTimei, double TotalTimei, double DumpTimei, double d2DumpTimei, double CheckTimei, double AnasTimei,
                   int Symmetryi, int checkruni, char *checkfilenamei, double numepssi, double numepsbi, double numepshi,
                   int a_levi, int maxli, int decni, double maxrexi, double drexi);
-       ~bssn_class();
+       virtual ~bssn_class();
 
        void Evolve(int Steps);
        void RecursiveStep(int lev);
diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu
index 378c79e..da0ccd3 100644
--- a/AMSS_NCKU_source/bssn_rhs_cuda.cu
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu
@@ -17,6 +17,18 @@
 #include "macrodef.h"
 #include "bssn_rhs.h"
 
+extern "C" {
+#ifdef fortran1
+void set_escalar_parameter(double &, double &, double &, double &, double &);
+#endif
+#ifdef fortran2
+void SET_ESCALAR_PARAMETER(double &, double &, double &, double &, double &);
+#endif
+#ifdef fortran3
+void set_escalar_parameter_(double &, double &, double &, double &, double &);
+#endif
+}
+
 /* ------------------------------------------------------------------ */
 /*  Multi-GPU dispatch: distribute ranks across available GPUs          */
 /* ------------------------------------------------------------------ */
@@ -401,7 +413,7 @@ __device__ __forceinline__ double fetch_sym_ord3_direct(const double *src,
  */
 
 /* Total number of "all"-sized slots */
-#define NUM_SLOTS 160
+#define NUM_SLOTS 192
 
 struct GpuBuffers {
     double *d_mem;          /* single big allocation */
@@ -468,6 +480,12 @@ enum {
     S_Gamxa, S_Gamya, S_Gamza,
     S_gupxx, S_gupxy, S_gupxz,
     S_gupyy, S_gupyz, S_gupzz,
+    S_Sphi, S_Spi, S_Sphi_rhs, S_Spi_rhs,
+    S_Sphi0, S_Spi0, S_Sphi_accum, S_Spi_accum,
+    S_Sphi_next, S_Spi_next,
+    S_Sphi_x, S_Sphi_y, S_Sphi_z,
+    S_Sphi_xx, S_Sphi_xy, S_Sphi_xz, S_Sphi_yy, S_Sphi_yz, S_Sphi_zz,
+    S_trK_x, S_trK_y, S_trK_z,
     NUM_USED_SLOTS
 };
 
@@ -485,6 +503,8 @@ static constexpr int BSSN_STATE_COUNT = 24;
 static constexpr int BSSN_MATTER_COUNT = 10;
 static constexpr int BSSN_LK_FIELD_COUNT = 24;
 static constexpr int BSSN_RESIDENT_BANK_COUNT = 4;
+static constexpr int BSSN_ESCALAR_STATE_COUNT = 26;
+static constexpr int BSSN_RESIDENT_STATE_CAPACITY = BSSN_ESCALAR_STATE_COUNT;
 
 static const int k_state_input_slots[BSSN_STATE_COUNT] = {
     S_chi, S_trK, S_dxx, S_gxy, S_gxz, S_dyy, S_gyz, S_dzz,
@@ -503,6 +523,25 @@ static const int k_state_rhs_slots[BSSN_STATE_COUNT] = {
     S_dtSfx_rhs, S_dtSfy_rhs, S_dtSfz_rhs
 };
 
+static const int k_escalar_state_input_slots[BSSN_ESCALAR_STATE_COUNT] = {
+    S_chi, S_trK, S_dxx, S_gxy, S_gxz, S_dyy, S_gyz, S_dzz,
+    S_Axx, S_Axy, S_Axz, S_Ayy, S_Ayz, S_Azz,
+    S_Gamx, S_Gamy, S_Gamz,
+    S_Lap, S_betax, S_betay, S_betaz,
+    S_dtSfx, S_dtSfy, S_dtSfz,
+    S_Sphi, S_Spi
+};
+
+static const int k_escalar_state_rhs_slots[BSSN_ESCALAR_STATE_COUNT] = {
+    S_chi_rhs, S_trK_rhs,
+    S_gxx_rhs, S_gxy_rhs, S_gxz_rhs, S_gyy_rhs, S_gyz_rhs, S_gzz_rhs,
+    S_Axx_rhs, S_Axy_rhs, S_Axz_rhs, S_Ayy_rhs, S_Ayz_rhs, S_Azz_rhs,
+    S_Gamx_rhs, S_Gamy_rhs, S_Gamz_rhs,
+    S_Lap_rhs, S_betax_rhs, S_betay_rhs, S_betaz_rhs,
+    S_dtSfx_rhs, S_dtSfy_rhs, S_dtSfz_rhs,
+    S_Sphi_rhs, S_Spi_rhs
+};
+
 static const int k_matter_slots[BSSN_MATTER_COUNT] = {
     S_rho, S_Sx, S_Sy, S_Sz, S_Sxx, S_Sxy, S_Sxz, S_Syy, S_Syz, S_Szz
 };
@@ -526,8 +565,8 @@ static const int k_lk_rhs_slots[BSSN_LK_FIELD_COUNT] = {
     S_Ayz_rhs, S_Azz_rhs, S_chi_rhs, S_trK_rhs, S_Gamx_rhs, S_Gamy_rhs
 };
 
-__constant__ int d_subset_state_indices[BSSN_STATE_COUNT];
-__constant__ double d_comm_state_soa[3 * BSSN_STATE_COUNT];
+__constant__ int d_subset_state_indices[BSSN_RESIDENT_STATE_CAPACITY];
+__constant__ double d_comm_state_soa[3 * BSSN_RESIDENT_STATE_CAPACITY];
 
 static const int k_lk_soa_signs[3 * BSSN_LK_FIELD_COUNT] = {
      1,  1,  1,
@@ -565,13 +604,13 @@ struct StepContext {
     double *d_matter_mem;
     double *d_comm_mem;
     double *h_comm_mem;
-    std::array<double *, BSSN_STATE_COUNT> d_state0;
-    std::array<double *, BSSN_STATE_COUNT> d_accum;
-    std::array<double *, BSSN_STATE_COUNT> d_state_curr;
-    std::array<double *, BSSN_STATE_COUNT> d_state_next;
-    std::array<std::array<double *, BSSN_STATE_COUNT>, BSSN_RESIDENT_BANK_COUNT> d_resident;
-    std::array<std::array<double *, BSSN_STATE_COUNT>, BSSN_RESIDENT_BANK_COUNT> resident_host;
-    std::array<std::array<unsigned char, BSSN_STATE_COUNT>, BSSN_RESIDENT_BANK_COUNT> resident_host_clean;
+    std::array<double *, BSSN_RESIDENT_STATE_CAPACITY> d_state0;
+    std::array<double *, BSSN_RESIDENT_STATE_CAPACITY> d_accum;
+    std::array<double *, BSSN_RESIDENT_STATE_CAPACITY> d_state_curr;
+    std::array<double *, BSSN_RESIDENT_STATE_CAPACITY> d_state_next;
+    std::array<std::array<double *, BSSN_RESIDENT_STATE_CAPACITY>, BSSN_RESIDENT_BANK_COUNT> d_resident;
+    std::array<std::array<double *, BSSN_RESIDENT_STATE_CAPACITY>, BSSN_RESIDENT_BANK_COUNT> resident_host;
+    std::array<std::array<unsigned char, BSSN_RESIDENT_STATE_CAPACITY>, BSSN_RESIDENT_BANK_COUNT> resident_host_clean;
     std::array<unsigned long long, BSSN_RESIDENT_BANK_COUNT> resident_age;
     std::array<bool, BSSN_RESIDENT_BANK_COUNT> resident_valid;
     std::array<double *, BSSN_MATTER_COUNT> d_matter;
@@ -802,18 +841,18 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all)
 
         StepAllocation alloc = acquire_step_allocation(all);
         if (!has_step_allocation(alloc)) {
-            CUDA_CHECK(cudaMalloc(&alloc.d_state0_mem, BSSN_STATE_COUNT * all * sizeof(double)));
-            CUDA_CHECK(cudaMalloc(&alloc.d_accum_mem, BSSN_STATE_COUNT * all * sizeof(double)));
+            CUDA_CHECK(cudaMalloc(&alloc.d_state0_mem, BSSN_RESIDENT_STATE_CAPACITY * all * sizeof(double)));
+            CUDA_CHECK(cudaMalloc(&alloc.d_accum_mem, BSSN_RESIDENT_STATE_CAPACITY * all * sizeof(double)));
             for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
                 CUDA_CHECK(cudaMalloc(&alloc.d_resident_mem[b],
-                                      BSSN_STATE_COUNT * all * sizeof(double)));
+                                      BSSN_RESIDENT_STATE_CAPACITY * all * sizeof(double)));
             }
             CUDA_CHECK(cudaMalloc(&alloc.d_matter_mem, BSSN_MATTER_COUNT * all * sizeof(double)));
             alloc.cap_all = all;
         }
         attach_step_allocation(ctx, alloc);
     }
-    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
+    for (int i = 0; i < BSSN_RESIDENT_STATE_CAPACITY; ++i) {
         ctx.d_state0[i] = ctx.d_state0_mem + (size_t)i * all;
         ctx.d_accum[i] = ctx.d_accum_mem + (size_t)i * all;
         for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
@@ -897,14 +936,14 @@ static int *ensure_comm_segment_meta_buffer(size_t needed_ints)
 
 static void upload_comm_state_soa(const double *state_soa, int state_count)
 {
-    double soa[3 * BSSN_STATE_COUNT];
-    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
+    double soa[3 * BSSN_RESIDENT_STATE_CAPACITY];
+    for (int i = 0; i < BSSN_RESIDENT_STATE_CAPACITY; ++i) {
         soa[3 * i + 0] = 1.0;
         soa[3 * i + 1] = 1.0;
         soa[3 * i + 2] = 1.0;
     }
     if (state_soa) {
-        const int n = (state_count < BSSN_STATE_COUNT) ? state_count : BSSN_STATE_COUNT;
+        const int n = (state_count < BSSN_RESIDENT_STATE_CAPACITY) ? state_count : BSSN_RESIDENT_STATE_CAPACITY;
         for (int i = 0; i < n; ++i) {
             soa[3 * i + 0] = state_soa[3 * i + 0];
             soa[3 * i + 1] = state_soa[3 * i + 1];
@@ -2535,6 +2574,138 @@ static void gpu_lopsided_kodis_state_batch(double eps_val, int all)
         g_buf.slot[S_betax], g_buf.slot[S_betay], g_buf.slot[S_betaz], tables, eps_val);
 }
 
+__global__ __launch_bounds__(128, 4)
+void kern_escalar_sources(
+    const double * __restrict__ Sphi,
+    const double * __restrict__ Spi,
+    const double * __restrict__ alpn1,
+    const double * __restrict__ chin1,
+    const double * __restrict__ gxx,
+    const double * __restrict__ gxy,
+    const double * __restrict__ gxz,
+    const double * __restrict__ gyy,
+    const double * __restrict__ gyz,
+    const double * __restrict__ gzz,
+    const double * __restrict__ gupxx,
+    const double * __restrict__ gupxy,
+    const double * __restrict__ gupxz,
+    const double * __restrict__ gupyy,
+    const double * __restrict__ gupyz,
+    const double * __restrict__ gupzz,
+    const double * __restrict__ chix,
+    const double * __restrict__ chiy,
+    const double * __restrict__ chiz,
+    const double * __restrict__ Lapx,
+    const double * __restrict__ Lapy,
+    const double * __restrict__ Lapz,
+    const double * __restrict__ trK,
+    const double * __restrict__ Gamx,
+    const double * __restrict__ Gamy,
+    const double * __restrict__ Gamz,
+    const double * __restrict__ Kx,
+    const double * __restrict__ Ky,
+    const double * __restrict__ Kz,
+    const double * __restrict__ fxx,
+    const double * __restrict__ fxy,
+    const double * __restrict__ fxz,
+    const double * __restrict__ fyy,
+    const double * __restrict__ fyz,
+    const double * __restrict__ fzz,
+    double * __restrict__ Sphi_rhs,
+    double * __restrict__ Spi_rhs,
+    double * __restrict__ rho,
+    double * __restrict__ Sx,
+    double * __restrict__ Sy,
+    double * __restrict__ Sz,
+    double * __restrict__ Sxx,
+    double * __restrict__ Sxy,
+    double * __restrict__ Sxz,
+    double * __restrict__ Syy,
+    double * __restrict__ Syz,
+    double * __restrict__ Szz)
+{
+    constexpr double PI_V = 3.141592653589793238462643383279502884;
+    constexpr double TWO = 2.0;
+    constexpr double HALF = 0.5;
+    constexpr double A2 = 3.0;
+
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
+         i < d_gp.all;
+         i += blockDim.x * gridDim.x) {
+        const double c1 = chin1[i];
+        const double a = alpn1[i];
+        const double sx = Kx[i];
+        const double sy = Ky[i];
+        const double sz = Kz[i];
+        const double sp = Spi[i];
+
+        const double uxx = gupxx[i];
+        const double uxy = gupxy[i];
+        const double uxz = gupxz[i];
+        const double uyy = gupyy[i];
+        const double uyz = gupyz[i];
+        const double uzz = gupzz[i];
+
+        const double sqpi3 = sqrt(PI_V / 3.0);
+        const double e4 = exp(4.0 * sqpi3 * Sphi[i]);
+        const double em8 = exp(-8.0 * sqpi3 * Sphi[i]);
+        const double V = em8 * (1.0 - e4) * (1.0 - e4) / (32.0 * PI_V * A2);
+        const double dV = (1.0 / A2 / 12.0) * sqrt(3.0 / PI_V) * em8 * (-1.0 + e4);
+
+        Sphi_rhs[i] = a * sp;
+
+        double pi_rhs = uxx * fxx[i] + uyy * fyy[i] + uzz * fzz[i]
+                      + TWO * (uxy * fxy[i] + uxz * fxz[i] + uyz * fyz[i]);
+        pi_rhs -= (Gamx[i] + (uxx * chix[i] + uxy * chiy[i] + uxz * chiz[i]) / (TWO * c1)) * sx
+                + (Gamy[i] + (uxy * chix[i] + uyy * chiy[i] + uyz * chiz[i]) / (TWO * c1)) * sy
+                + (Gamz[i] + (uxz * chix[i] + uyz * chiy[i] + uzz * chiz[i]) / (TWO * c1)) * sz;
+        pi_rhs = pi_rhs * a
+               + (uxx * Lapx[i] * sx + uxy * Lapx[i] * sy + uxz * Lapx[i] * sz
+               +  uxy * Lapy[i] * sx + uyy * Lapy[i] * sy + uyz * Lapy[i] * sz
+               +  uxz * Lapz[i] * sx + uyz * Lapz[i] * sy + uzz * Lapz[i] * sz);
+        Spi_rhs[i] = pi_rhs * c1 + a * (trK[i] * sp - dV);
+
+        const double grad = HALF * (uxx * sx * sx + uyy * sy * sy + uzz * sz * sz)
+                          + uxy * sx * sy + uxz * sx * sz + uyz * sy * sz;
+        const double rho_v = c1 * grad + HALF * sp * sp + V;
+        rho[i] = rho_v;
+        Sx[i] = -sp * sx;
+        Sy[i] = -sp * sy;
+        Sz[i] = -sp * sz;
+        const double f = (rho_v - sp * sp) / c1;
+        Sxx[i] = sx * sx - f * gxx[i];
+        Sxy[i] = sx * sy - f * gxy[i];
+        Sxz[i] = sx * sz - f * gxz[i];
+        Syy[i] = sy * sy - f * gyy[i];
+        Syz[i] = sy * sz - f * gyz[i];
+        Szz[i] = sz * sz - f * gzz[i];
+    }
+}
+
+static void gpu_escalar_sources(int all)
+{
+    #define D(s) g_buf.slot[s]
+    gpu_fderivs(D(S_Sphi), D(S_Sphi_x), D(S_Sphi_y), D(S_Sphi_z), 1.0, 1.0, 1.0, all);
+    gpu_fdderivs(D(S_Sphi), D(S_Sphi_xx), D(S_Sphi_xy), D(S_Sphi_xz),
+                 D(S_Sphi_yy), D(S_Sphi_yz), D(S_Sphi_zz), 1.0, 1.0, 1.0, all);
+
+    kern_escalar_sources<<<grid((size_t)all), BLK>>>(
+        D(S_Sphi), D(S_Spi),
+        D(S_alpn1), D(S_chin1),
+        D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
+        D(S_gupxx), D(S_gupxy), D(S_gupxz), D(S_gupyy), D(S_gupyz), D(S_gupzz),
+        D(S_chix), D(S_chiy), D(S_chiz),
+        D(S_Lapx), D(S_Lapy), D(S_Lapz),
+        D(S_trK), D(S_Gamx), D(S_Gamy), D(S_Gamz),
+        D(S_Sphi_x), D(S_Sphi_y), D(S_Sphi_z),
+        D(S_Sphi_xx), D(S_Sphi_xy), D(S_Sphi_xz),
+        D(S_Sphi_yy), D(S_Sphi_yz), D(S_Sphi_zz),
+        D(S_Sphi_rhs), D(S_Spi_rhs),
+        D(S_rho), D(S_Sx), D(S_Sy), D(S_Sz),
+        D(S_Sxx), D(S_Sxy), D(S_Sxz), D(S_Syy), D(S_Syz), D(S_Szz));
+    #undef D
+}
+
 __global__ void kern_rk4_finalize(const double * __restrict__ f0,
                                   double * __restrict__ frhs,
                                   double * __restrict__ accum,
@@ -2619,6 +2790,27 @@ static void gpu_rk4_finalize_batch(const StepContext &ctx,
     kern_rk4_finalize_batched<<<launch_grid, BLK>>>(tables, dT, rk4_stage, chitiny);
 }
 
+static void gpu_escalar_rk4_finalize_batch(const StepContext &ctx,
+                                           size_t all,
+                                           double dT,
+                                           int rk4_stage,
+                                           double chitiny)
+{
+    Rk4FinalizeTables tables = {};
+    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
+        tables.f0_fields[i] = ctx.d_state0[i];
+        tables.rhs_fields[i] = g_buf.slot[k_state_rhs_slots[i]];
+        tables.accum_fields[i] = ctx.d_accum[i];
+    }
+
+    dim3 launch_grid((unsigned int)grid(all), (unsigned int)BSSN_STATE_COUNT);
+    kern_rk4_finalize_batched<<<launch_grid, BLK>>>(tables, dT, rk4_stage, chitiny);
+    kern_rk4_finalize<<<grid(all), BLK>>>(ctx.d_state0[24], g_buf.slot[S_Sphi_rhs],
+                                          ctx.d_accum[24], dT, rk4_stage);
+    kern_rk4_finalize<<<grid(all), BLK>>>(ctx.d_state0[25], g_buf.slot[S_Spi_rhs],
+                                          ctx.d_accum[25], dT, rk4_stage);
+}
+
 __global__ __launch_bounds__(128, 4)
 void kern_copy_patch_boundary_batched(PatchBoundaryTables tables,
                                       int touch_xmin, int touch_xmax,
@@ -2669,6 +2861,49 @@ static void gpu_copy_patch_boundary_batch(int all,
         touch_zmin, touch_zmax);
 }
 
+__global__ __launch_bounds__(128, 4)
+void kern_copy_patch_boundary_scalar(const double * __restrict__ src,
+                                     double * __restrict__ dst,
+                                     int touch_xmin, int touch_xmax,
+                                     int touch_ymin, int touch_ymax,
+                                     int touch_zmin, int touch_zmax)
+{
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid >= d_gp.all) return;
+
+    const int nx = d_gp.ex[0];
+    const int ny = d_gp.ex[1];
+    const int i0 = tid % nx;
+    const int j0 = (tid / nx) % ny;
+    const int k0 = tid / (nx * ny);
+    const bool on_boundary =
+        (touch_xmin && i0 == 0) ||
+        (touch_xmax && i0 == nx - 1) ||
+        (touch_ymin && j0 == 0) ||
+        (touch_ymax && j0 == ny - 1) ||
+        (touch_zmin && k0 == 0) ||
+        (touch_zmax && k0 == d_gp.ex[2] - 1);
+    if (on_boundary) dst[tid] = src[tid];
+}
+
+static void gpu_escalar_copy_patch_boundary_batch(int all,
+                                                  int touch_xmin, int touch_xmax,
+                                                  int touch_ymin, int touch_ymax,
+                                                  int touch_zmin, int touch_zmax)
+{
+    if (!(touch_xmin || touch_xmax || touch_ymax || touch_ymin || touch_zmin || touch_zmax))
+        return;
+    gpu_copy_patch_boundary_batch(all, touch_xmin, touch_xmax,
+                                  touch_ymin, touch_ymax,
+                                  touch_zmin, touch_zmax);
+    kern_copy_patch_boundary_scalar<<<grid((size_t)all), BLK>>>(
+        g_buf.slot[S_Sphi], g_buf.slot[S_Sphi_rhs],
+        touch_xmin, touch_xmax, touch_ymin, touch_ymax, touch_zmin, touch_zmax);
+    kern_copy_patch_boundary_scalar<<<grid((size_t)all), BLK>>>(
+        g_buf.slot[S_Spi], g_buf.slot[S_Spi_rhs],
+        touch_xmin, touch_xmax, touch_ymin, touch_ymax, touch_zmin, touch_zmax);
+}
+
 __global__ void kern_enforce_ga_cuda(double * __restrict__ dxx,
                                      double * __restrict__ gxy,
                                      double * __restrict__ gxz,
@@ -4773,6 +5008,72 @@ void kern_phase18_constraints(
     }
 }
 
+__global__ __launch_bounds__(128, 4)
+void kern_escalar_constraint_fr(
+    const double* __restrict__ chin1,
+    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
+    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
+    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
+    const double* __restrict__ trK,
+    const double* __restrict__ Axx, const double* __restrict__ Axy,
+    const double* __restrict__ Axz, const double* __restrict__ Ayy,
+    const double* __restrict__ Ayz, const double* __restrict__ Azz,
+    const double* __restrict__ Rxx, const double* __restrict__ Rxy,
+    const double* __restrict__ Rxz, const double* __restrict__ Ryy,
+    const double* __restrict__ Ryz, const double* __restrict__ Rzz,
+    const double* __restrict__ rho,
+    const double* __restrict__ Sxx, const double* __restrict__ Sxy,
+    const double* __restrict__ Sxz, const double* __restrict__ Syy,
+    const double* __restrict__ Syz, const double* __restrict__ Szz,
+    const double* __restrict__ Sphi,
+    double a2,
+    double* __restrict__ Cons_fR)
+{
+    const double TWO = 2.0;
+    const double F2o3 = 2.0 / 3.0;
+    const double F8 = 8.0;
+    const double PI_V = 3.14159265358979323846;
+    const double SQRT3OPI_OVER4 = 0.25 * sqrt(3.0 / PI_V);
+
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < d_gp.all;
+         i += blockDim.x * gridDim.x)
+    {
+        const double uxx = gupxx[i], uxy = gupxy[i], uxz = gupxz[i];
+        const double uyy = gupyy[i], uyz = gupyz[i], uzz = gupzz[i];
+        const double c1 = chin1[i];
+
+        const double AijAij =
+            uxx * (uxx * Axx[i] * Axx[i] + uyy * Axy[i] * Axy[i] + uzz * Axz[i] * Axz[i]
+                 + TWO * (uxy * Axx[i] * Axy[i] + uxz * Axx[i] * Axz[i] + uyz * Axy[i] * Axz[i]))
+          + uyy * (uxx * Axy[i] * Axy[i] + uyy * Ayy[i] * Ayy[i] + uzz * Ayz[i] * Ayz[i]
+                 + TWO * (uxy * Axy[i] * Ayy[i] + uxz * Axy[i] * Ayz[i] + uyz * Ayy[i] * Ayz[i]))
+          + uzz * (uxx * Axz[i] * Axz[i] + uyy * Ayz[i] * Ayz[i] + uzz * Azz[i] * Azz[i]
+                 + TWO * (uxy * Axz[i] * Ayz[i] + uxz * Axz[i] * Azz[i] + uyz * Ayz[i] * Azz[i]))
+          + TWO * (uxy * (uxx * Axx[i] * Axy[i] + uyy * Axy[i] * Ayy[i] + uzz * Axz[i] * Ayz[i]
+                         + uxy * (Axx[i] * Ayy[i] + Axy[i] * Axy[i])
+                         + uxz * (Axx[i] * Ayz[i] + Axz[i] * Axy[i])
+                         + uyz * (Axy[i] * Ayz[i] + Axz[i] * Ayy[i]))
+                 + uxz * (uxx * Axx[i] * Axz[i] + uyy * Axy[i] * Ayz[i] + uzz * Axz[i] * Azz[i]
+                         + uxy * (Axx[i] * Ayz[i] + Axy[i] * Axz[i])
+                         + uxz * (Axx[i] * Azz[i] + Axz[i] * Axz[i])
+                         + uyz * (Axy[i] * Azz[i] + Axz[i] * Ayz[i]))
+                 + uyz * (uxx * Axy[i] * Axz[i] + uyy * Ayy[i] * Ayz[i] + uzz * Ayz[i] * Azz[i]
+                         + uxy * (Axy[i] * Ayz[i] + Ayy[i] * Axz[i])
+                         + uxz * (Axy[i] * Azz[i] + Ayz[i] * Axz[i])
+                         + uyz * (Ayy[i] * Azz[i] + Ayz[i] * Ayz[i])));
+
+        const double R_sc = uxx * Rxx[i] + uyy * Ryy[i] + uzz * Rzz[i]
+                          + TWO * (uxy * Rxy[i] + uxz * Rxz[i] + uyz * Ryz[i]);
+        const double trS = uxx * Sxx[i] + uyy * Syy[i] + uzz * Szz[i]
+                         + TWO * (uxy * Sxy[i] + uxz * Sxz[i] + uyz * Syz[i]);
+        const double RR = AijAij - F2o3 * trK[i] * trK[i]
+                        - R_sc * c1
+                        - F8 * PI_V * (3.0 * rho[i] - trS * c1);
+        const double fprim = 1.0 + 2.0 * a2 * RR;
+        Cons_fR[i] = Sphi[i] - SQRT3OPI_OVER4 * log(fprim);
+    }
+}
+
 static void setup_grid_params(int *ex,
                               double *X, double *Y, double *Z,
                               int Symmetry, double eps, int co)
@@ -4933,50 +5234,109 @@ static void bind_matter_slots(const StepContext &ctx)
     }
 }
 
-static void bind_state_input_slots(const std::array<double *, BSSN_STATE_COUNT> &state)
+static void bind_state_input_slots(const std::array<double *, BSSN_RESIDENT_STATE_CAPACITY> &state)
 {
     for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
         g_buf.slot[k_state_input_slots[i]] = state[i];
     }
 }
 
-static void bind_state_output_slots(const std::array<double *, BSSN_STATE_COUNT> &state)
+static void bind_state_output_slots(const std::array<double *, BSSN_RESIDENT_STATE_CAPACITY> &state)
 {
     for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
         g_buf.slot[k_state_rhs_slots[i]] = state[i];
     }
 }
 
-static bool resident_key_matches(const StepContext &ctx, int bank, double **host_key)
+static void bind_escalar_state_input_slots(const std::array<double *, BSSN_RESIDENT_STATE_CAPACITY> &state)
+{
+    for (int i = 0; i < BSSN_ESCALAR_STATE_COUNT; ++i) {
+        g_buf.slot[k_escalar_state_input_slots[i]] = state[i];
+    }
+}
+
+static void bind_escalar_state_output_slots(const std::array<double *, BSSN_RESIDENT_STATE_CAPACITY> &state)
+{
+    for (int i = 0; i < BSSN_ESCALAR_STATE_COUNT; ++i) {
+        g_buf.slot[k_escalar_state_rhs_slots[i]] = state[i];
+    }
+}
+
+static void upload_escalar_state_inputs(double **state_host, size_t all);
+
+static bool resident_key_matches_count(const StepContext &ctx, int bank, double **host_key, int state_count)
 {
     if (!host_key || bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT)
         return false;
-    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
+    if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY)
+        return false;
+    for (int i = 0; i < state_count; ++i) {
         if (!host_key[i] || ctx.resident_host[bank][i] != host_key[i])
             return false;
     }
     return true;
 }
 
-static int find_resident_bank(const StepContext &ctx, double **host_key)
+static bool resident_key_matches(const StepContext &ctx, int bank, double **host_key)
+{
+    return resident_key_matches_count(ctx, bank, host_key, BSSN_STATE_COUNT);
+}
+
+static int find_resident_bank_count(const StepContext &ctx, double **host_key, int state_count)
 {
     if (!host_key) return -1;
     for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
-        if (resident_key_matches(ctx, b, host_key))
+        if (resident_key_matches_count(ctx, b, host_key, state_count))
             return b;
     }
     return -1;
 }
 
-static bool resident_key_usable(double **host_key)
+static int find_resident_bank_subset(const StepContext &ctx,
+                                     double **host_key,
+                                     const int *state_indices,
+                                     int subset_count)
+{
+    if (!host_key || !state_indices || subset_count <= 0)
+        return -1;
+    for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
+        bool match = true;
+        for (int i = 0; i < subset_count; ++i) {
+            const int state_index = state_indices[i];
+            if (state_index < 0 || state_index >= BSSN_RESIDENT_STATE_CAPACITY ||
+                !host_key[i] ||
+                ctx.resident_host[b][state_index] != host_key[i]) {
+                match = false;
+                break;
+            }
+        }
+        if (match)
+            return b;
+    }
+    return -1;
+}
+
+static int find_resident_bank(const StepContext &ctx, double **host_key)
+{
+    return find_resident_bank_count(ctx, host_key, BSSN_STATE_COUNT);
+}
+
+static bool resident_key_usable_count(double **host_key, int state_count)
 {
     if (!host_key) return false;
-    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
+    if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY)
+        return false;
+    for (int i = 0; i < state_count; ++i) {
         if (!host_key[i]) return false;
     }
     return true;
 }
 
+static bool resident_key_usable(double **host_key)
+{
+    return resident_key_usable_count(host_key, BSSN_STATE_COUNT);
+}
+
 static void set_resident_host_clean(StepContext &ctx, int bank, bool clean)
 {
     if (bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT) return;
@@ -4991,7 +5351,7 @@ static bool resident_host_subset_clean(const StepContext &ctx,
     if (bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT) return false;
     for (int i = 0; i < subset_count; ++i) {
         const int state_index = state_indices ? state_indices[i] : i;
-        if (state_index < 0 || state_index >= BSSN_STATE_COUNT)
+        if (state_index < 0 || state_index >= BSSN_RESIDENT_STATE_CAPACITY)
             return false;
         if (!ctx.resident_host_clean[bank][state_index])
             return false;
@@ -5008,7 +5368,7 @@ static void mark_resident_host_subset_clean(StepContext &ctx,
     if (bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT) return;
     for (int i = 0; i < subset_count; ++i) {
         const int state_index = state_indices ? state_indices[i] : i;
-        if (state_index >= 0 && state_index < BSSN_STATE_COUNT)
+        if (state_index >= 0 && state_index < BSSN_RESIDENT_STATE_CAPACITY)
             ctx.resident_host_clean[bank][state_index] = clean ? 1 : 0;
     }
 }
@@ -5042,17 +5402,18 @@ static void update_state_ready(StepContext &ctx)
     ctx.state_ready = any_resident_bank_valid(ctx);
 }
 
-static void writeback_resident_bank(StepContext &ctx, int bank, size_t all)
+static void writeback_resident_bank_count(StepContext &ctx, int bank, size_t all, int state_count)
 {
     if (bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT) return;
     if (!ctx.resident_valid[bank]) return;
-    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
+    if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return;
+    for (int i = 0; i < state_count; ++i) {
         if (!ctx.resident_host[bank][i]) return;
     }
     const bool profile = cuda_aux_profile_enabled();
     const double t0 = profile ? cuda_profile_now_ms() : 0.0;
     const size_t bytes = all * sizeof(double);
-    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
+    for (int i = 0; i < state_count; ++i) {
         CUDA_CHECK(cudaMemcpyAsync(ctx.resident_host[bank][i],
                                    ctx.d_resident[bank][i],
                                    bytes, cudaMemcpyDeviceToHost));
@@ -5063,11 +5424,16 @@ static void writeback_resident_bank(StepContext &ctx, int bank, size_t all)
         CudaAuxProfileStats &stats = cuda_aux_profile_stats();
         stats.writeback_calls++;
         stats.writeback_ms += cuda_profile_now_ms() - t0;
-        stats.writeback_gb += (double)((size_t)BSSN_STATE_COUNT * bytes) / 1.0e9;
+        stats.writeback_gb += (double)((size_t)state_count * bytes) / 1.0e9;
         cuda_aux_profile_maybe_log();
     }
 }
 
+static void writeback_resident_bank(StepContext &ctx, int bank, size_t all)
+{
+    writeback_resident_bank_count(ctx, bank, all, BSSN_STATE_COUNT);
+}
+
 static int choose_resident_bank_for_reuse(StepContext &ctx, int avoid_bank, size_t all)
 {
     for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
@@ -5099,15 +5465,51 @@ static int choose_resident_bank_for_reuse(StepContext &ctx, int avoid_bank, size
     return best;
 }
 
-static void assign_resident_key(StepContext &ctx, int bank, double **host_key)
+static int choose_escalar_resident_bank_for_reuse(StepContext &ctx, int avoid_bank, size_t all)
 {
-    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
+    for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
+        if (b != avoid_bank && !ctx.resident_valid[b])
+            return b;
+    }
+
+    int best = -1;
+    unsigned long long best_age = 0;
+    for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
+        if (b == avoid_bank) continue;
+        if (best < 0 || ctx.resident_age[b] < best_age) {
+            best = b;
+            best_age = ctx.resident_age[b];
+        }
+    }
+    if (best < 0) best = 0;
+    writeback_resident_bank_count(ctx, best, all, BSSN_ESCALAR_STATE_COUNT);
+    ctx.resident_valid[best] = false;
+    ctx.resident_host[best].fill(nullptr);
+    ctx.resident_host_clean[best].fill(0);
+    ctx.resident_age[best] = 0;
+    if (ctx.current_bank == best) {
+        ctx.current_bank = -1;
+        ctx.d_state_curr_mem = nullptr;
+        ctx.d_state_curr.fill(nullptr);
+    }
+    update_state_ready(ctx);
+    return best;
+}
+
+static void assign_resident_key_count(StepContext &ctx, int bank, double **host_key, int state_count)
+{
+    for (int i = 0; i < state_count; ++i) {
         ctx.resident_host[bank][i] = host_key[i];
     }
     set_resident_host_clean(ctx, bank, false);
     ctx.resident_age[bank] = ++ctx.resident_clock;
 }
 
+static void assign_resident_key(StepContext &ctx, int bank, double **host_key)
+{
+    assign_resident_key_count(ctx, bank, host_key, BSSN_STATE_COUNT);
+}
+
 static int ensure_resident_bank(StepContext &ctx,
                                 double **host_key,
                                 size_t all,
@@ -5147,6 +5549,47 @@ static int ensure_resident_bank(StepContext &ctx,
     return bank;
 }
 
+static int ensure_escalar_resident_bank(StepContext &ctx,
+                                        double **host_key,
+                                        size_t all,
+                                        bool upload_if_missing,
+                                        int avoid_bank = -1)
+{
+    if (!resident_key_usable_count(host_key, BSSN_ESCALAR_STATE_COUNT)) {
+        if (ctx.current_bank >= 0)
+            return ctx.current_bank;
+        return 0;
+    }
+
+    int bank = find_resident_bank_count(ctx, host_key, BSSN_ESCALAR_STATE_COUNT);
+    if (bank >= 0) {
+        ctx.resident_age[bank] = ++ctx.resident_clock;
+        if (!ctx.resident_valid[bank] && upload_if_missing) {
+            bind_escalar_state_input_slots(ctx.d_resident[bank]);
+            upload_escalar_state_inputs(host_key, all);
+            CUDA_CHECK(cudaDeviceSynchronize());
+            ctx.resident_valid[bank] = true;
+            set_resident_host_clean(ctx, bank, true);
+        }
+        return bank;
+    }
+
+    bank = choose_escalar_resident_bank_for_reuse(ctx, avoid_bank, all);
+    assign_resident_key_count(ctx, bank, host_key, BSSN_ESCALAR_STATE_COUNT);
+    if (upload_if_missing) {
+        bind_escalar_state_input_slots(ctx.d_resident[bank]);
+        upload_escalar_state_inputs(host_key, all);
+        CUDA_CHECK(cudaDeviceSynchronize());
+        ctx.resident_valid[bank] = true;
+        set_resident_host_clean(ctx, bank, true);
+    } else {
+        ctx.resident_valid[bank] = false;
+        set_resident_host_clean(ctx, bank, false);
+    }
+    update_state_ready(ctx);
+    return bank;
+}
+
 static int reserve_resident_output_bank(StepContext &ctx,
                                         double **host_key,
                                         size_t all,
@@ -5167,6 +5610,62 @@ static int reserve_resident_output_bank(StepContext &ctx,
     return bank;
 }
 
+static int reserve_escalar_resident_output_bank(StepContext &ctx,
+                                                double **host_key,
+                                                size_t all,
+                                                int input_bank)
+{
+    if (!resident_key_usable_count(host_key, BSSN_ESCALAR_STATE_COUNT))
+        return (ctx.current_bank >= 0) ? ctx.current_bank : 0;
+    if (resident_key_matches_count(ctx, input_bank, host_key, BSSN_ESCALAR_STATE_COUNT))
+        return input_bank;
+
+    int bank = find_resident_bank_count(ctx, host_key, BSSN_ESCALAR_STATE_COUNT);
+    if (bank < 0)
+        bank = choose_escalar_resident_bank_for_reuse(ctx, input_bank, all);
+    assign_resident_key_count(ctx, bank, host_key, BSSN_ESCALAR_STATE_COUNT);
+    ctx.resident_valid[bank] = false;
+    ctx.resident_age[bank] = ++ctx.resident_clock;
+    update_state_ready(ctx);
+    return bank;
+}
+
+static bool bank_is_avoided(int bank, int avoid_a, int avoid_b, int avoid_c);
+
+static int reserve_escalar_resident_output_bank_avoiding(StepContext &ctx,
+                                                         double **host_key,
+                                                         size_t all,
+                                                         int avoid_a,
+                                                         int avoid_b,
+                                                         int avoid_c)
+{
+    if (!resident_key_usable_count(host_key, BSSN_ESCALAR_STATE_COUNT))
+        return (ctx.current_bank >= 0) ? ctx.current_bank : 0;
+    if (resident_key_matches_count(ctx, avoid_a, host_key, BSSN_ESCALAR_STATE_COUNT))
+        return avoid_a;
+    if (resident_key_matches_count(ctx, avoid_b, host_key, BSSN_ESCALAR_STATE_COUNT))
+        return avoid_b;
+    if (resident_key_matches_count(ctx, avoid_c, host_key, BSSN_ESCALAR_STATE_COUNT))
+        return avoid_c;
+
+    int bank = find_resident_bank_count(ctx, host_key, BSSN_ESCALAR_STATE_COUNT);
+    if (bank < 0) {
+        for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
+            if (!bank_is_avoided(b, avoid_a, avoid_b, avoid_c) && !ctx.resident_valid[b]) {
+                bank = b;
+                break;
+            }
+        }
+    }
+    if (bank < 0)
+        bank = choose_escalar_resident_bank_for_reuse(ctx, avoid_a, all);
+    assign_resident_key_count(ctx, bank, host_key, BSSN_ESCALAR_STATE_COUNT);
+    ctx.resident_valid[bank] = false;
+    ctx.resident_age[bank] = ++ctx.resident_clock;
+    update_state_ready(ctx);
+    return bank;
+}
+
 static bool bank_is_avoided(int bank, int avoid_a, int avoid_b, int avoid_c)
 {
     return bank == avoid_a || bank == avoid_b || bank == avoid_c;
@@ -5256,7 +5755,7 @@ static int active_or_keyed_bank(StepContext &ctx,
     return 0;
 }
 
-static void launch_rhs_pipeline(int all, double eps, int co)
+static void launch_rhs_pipeline(int all, double eps, int co, bool compute_escalar = false)
 {
     const double SYM = 1.0;
     const double ANTI = -1.0;
@@ -5336,6 +5835,11 @@ static void launch_rhs_pipeline(int all, double eps, int co)
         D(S_gupxx), D(S_gupxy), D(S_gupxz),
         D(S_gupyy), D(S_gupyz), D(S_gupzz));
 
+    if (compute_escalar) {
+        gpu_escalar_sources(all);
+        gpu_fderivs(D(S_trK), D(S_trK_x), D(S_trK_y), D(S_trK_z), SYM, SYM, SYM, all);
+    }
+
     if (co == 0) {
         kern_phase3_gamma_constraint<<<grid(all),BLK>>>(
             D(S_Gamx), D(S_Gamy), D(S_Gamz),
@@ -5367,7 +5871,9 @@ static void launch_rhs_pipeline(int all, double eps, int co)
         D(S_gupxx), D(S_gupxy), D(S_gupxz),
         D(S_gupyy), D(S_gupyz), D(S_gupzz),
         D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz),
-        D(S_Kx), D(S_Ky), D(S_Kz),
+        compute_escalar ? D(S_trK_x) : D(S_Kx),
+        compute_escalar ? D(S_trK_y) : D(S_Ky),
+        compute_escalar ? D(S_trK_z) : D(S_Kz),
         D(S_Sx), D(S_Sy), D(S_Sz),
         D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
         D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
@@ -5552,7 +6058,9 @@ static void launch_rhs_pipeline(int all, double eps, int co)
             D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz),
             D(S_Rxx), D(S_Rxy), D(S_Rxz), D(S_Ryy), D(S_Ryz), D(S_Rzz),
             D(S_rho), D(S_Sx), D(S_Sy), D(S_Sz),
-            D(S_Kx), D(S_Ky), D(S_Kz),
+            compute_escalar ? D(S_trK_x) : D(S_Kx),
+            compute_escalar ? D(S_trK_y) : D(S_Ky),
+            compute_escalar ? D(S_trK_z) : D(S_Kz),
             D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
             D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
             D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
@@ -5585,6 +6093,26 @@ static void download_state_outputs(double **state_host_out, size_t all)
     }
 }
 
+static void download_escalar_state_outputs(double **state_host_out, size_t all)
+{
+    const size_t bytes = all * sizeof(double);
+    for (int i = 0; i < BSSN_ESCALAR_STATE_COUNT; ++i) {
+        CUDA_CHECK(cudaMemcpyAsync(state_host_out[i],
+                                   g_buf.slot[k_escalar_state_rhs_slots[i]],
+                                   bytes, cudaMemcpyDeviceToHost));
+    }
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
+static void upload_escalar_state_inputs(double **state_host, size_t all)
+{
+    const size_t bytes = all * sizeof(double);
+    for (int i = 0; i < BSSN_ESCALAR_STATE_COUNT; ++i) {
+        CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[k_escalar_state_input_slots[i]],
+                                   state_host[i], bytes, cudaMemcpyHostToDevice));
+    }
+}
+
 static void download_constraint_outputs(double **constraint_host_out, size_t all)
 {
     const size_t bytes = all * sizeof(double);
@@ -5596,6 +6124,186 @@ static void download_constraint_outputs(double **constraint_host_out, size_t all
     }
 }
 
+extern "C"
+int bssn_escalar_cuda_rk4_substep(void *block_tag,
+                                  int *ex, double *X, double *Y, double *Z,
+                                  double **state_host_in,
+                                  double **state_host_out,
+                                  const double *propspeed,
+                                  const double *soa_flat,
+                                  const double *bbox,
+                                  double &dT,
+                                  double &T,
+                                  int &RK4,
+                                  int &apply_bam_bc,
+                                  int &Symmetry,
+	                                  int &Lev,
+	                                  double &eps,
+	                                  int &co,
+	                                  int &keep_resident_state,
+	                                  int &apply_enforce_ga,
+	    double &chitiny)
+{
+    (void)T;
+    if (RK4 < 0 || RK4 > 3) return 1;
+
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+
+    double escalar_a2 = 3.0, escalar_phi0 = 0.0, escalar_r0 = 0.0, escalar_sigma0 = 0.0, escalar_l2 = 0.0;
+#ifdef fortran1
+    set_escalar_parameter(escalar_a2, escalar_phi0, escalar_r0, escalar_sigma0, escalar_l2);
+#endif
+#ifdef fortran2
+    SET_ESCALAR_PARAMETER(escalar_a2, escalar_phi0, escalar_r0, escalar_sigma0, escalar_l2);
+#endif
+#ifdef fortran3
+    set_escalar_parameter_(escalar_a2, escalar_phi0, escalar_r0, escalar_sigma0, escalar_l2);
+#endif
+    if (fabs(escalar_a2 - 3.0) > 1.0e-12 && g_dispatch.my_rank == 0) {
+        fprintf(stderr, "CUDA BSSN-EScalar currently supports FR a2=3 for EScalar_CC=2/3; got %.17g\n",
+                escalar_a2);
+        return 1;
+    }
+
+    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
+    const size_t bytes = all * sizeof(double);
+    int touch_xmin = 0, touch_xmax = 0;
+    int touch_ymin = 0, touch_ymax = 0;
+    int touch_zmin = 0, touch_zmax = 0;
+
+    setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);
+    if (Lev > 0) {
+        compute_patch_boundary_flags(ex, X, Y, Z, bbox, Symmetry,
+                                     touch_xmin, touch_xmax,
+                                     touch_ymin, touch_ymax,
+                                     touch_zmin, touch_zmax);
+    }
+
+    StepContext &ctx = ensure_step_ctx(block_tag, all);
+    const bool use_resident_state = (keep_resident_state != 0);
+    int input_bank = -1;
+    int output_bank = -1;
+    if (use_resident_state) {
+        input_bank = ensure_escalar_resident_bank(ctx, state_host_in, all, true);
+        output_bank = reserve_escalar_resident_output_bank(ctx, state_host_out, all, input_bank);
+        mark_resident_current_bank(ctx, input_bank);
+        mark_resident_next_bank(ctx, output_bank);
+        bind_escalar_state_input_slots(ctx.d_resident[input_bank]);
+        bind_escalar_state_output_slots(ctx.d_resident[output_bank]);
+    } else {
+        upload_escalar_state_inputs(state_host_in, all);
+    }
+
+    if (apply_enforce_ga) {
+        kern_enforce_ga_cuda<<<grid(all), BLK>>>(g_buf.slot[S_dxx], g_buf.slot[S_gxy], g_buf.slot[S_gxz],
+                                                 g_buf.slot[S_dyy], g_buf.slot[S_gyz], g_buf.slot[S_dzz],
+                                                 g_buf.slot[S_Axx], g_buf.slot[S_Axy], g_buf.slot[S_Axz],
+                                                 g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]);
+    }
+
+    if (RK4 == 0) {
+        if (use_resident_state) {
+            CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, ctx.d_resident_mem[input_bank],
+                                  (size_t)BSSN_ESCALAR_STATE_COUNT * bytes,
+                                  cudaMemcpyDeviceToDevice));
+        } else {
+            CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi],
+                                  (size_t)BSSN_STATE_COUNT * bytes,
+                                  cudaMemcpyDeviceToDevice));
+            CUDA_CHECK(cudaMemcpy(ctx.d_state0[24], g_buf.slot[S_Sphi],
+                                  bytes, cudaMemcpyDeviceToDevice));
+            CUDA_CHECK(cudaMemcpy(ctx.d_state0[25], g_buf.slot[S_Spi],
+                                  bytes, cudaMemcpyDeviceToDevice));
+        }
+    }
+
+    launch_rhs_pipeline((int)all, eps, co, true);
+
+    if (apply_bam_bc) {
+        for (int i = 0; i < BSSN_ESCALAR_STATE_COUNT; ++i) {
+            gpu_sommerfeld_routbam(g_buf.slot[k_escalar_state_input_slots[i]],
+                                   g_buf.slot[k_escalar_state_rhs_slots[i]],
+                                   propspeed[i],
+                                   soa_flat[3 * i + 0],
+                                   soa_flat[3 * i + 1],
+                                   soa_flat[3 * i + 2],
+                                   X, Y, Z, bbox, Symmetry);
+        }
+    }
+
+    gpu_escalar_rk4_finalize_batch(ctx, all, dT, RK4, chitiny);
+    if (Lev > 0) {
+        gpu_escalar_copy_patch_boundary_batch((int)all,
+                                              touch_xmin, touch_xmax,
+                                              touch_ymin, touch_ymax,
+                                              touch_zmin, touch_zmax);
+    }
+    if (use_resident_state) {
+        ctx.resident_valid[output_bank] = true;
+        ctx.resident_age[output_bank] = ++ctx.resident_clock;
+        set_resident_host_clean(ctx, output_bank, false);
+        mark_resident_current_bank(ctx, output_bank);
+        update_state_ready(ctx);
+    } else {
+        download_escalar_state_outputs(state_host_out, all);
+    }
+    if (RK4 == 3)
+        ctx.matter_ready = false;
+    return 0;
+}
+
+extern "C"
+int bssn_escalar_cuda_compute_constraints(int *ex, double *X, double *Y, double *Z,
+                                          double **state_host_in,
+                                          double **constraint_host_out,
+                                          int &Symmetry,
+                                          int &Lev,
+                                          double &eps)
+{
+    if (!state_host_in || !constraint_host_out) return 1;
+
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+
+    double escalar_a2 = 3.0, escalar_phi0 = 0.0, escalar_r0 = 0.0, escalar_sigma0 = 0.0, escalar_l2 = 0.0;
+#ifdef fortran1
+    set_escalar_parameter(escalar_a2, escalar_phi0, escalar_r0, escalar_sigma0, escalar_l2);
+#endif
+#ifdef fortran2
+    SET_ESCALAR_PARAMETER(escalar_a2, escalar_phi0, escalar_r0, escalar_sigma0, escalar_l2);
+#endif
+#ifdef fortran3
+    set_escalar_parameter_(escalar_a2, escalar_phi0, escalar_r0, escalar_sigma0, escalar_l2);
+#endif
+
+    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
+    const size_t bytes = all * sizeof(double);
+    setup_grid_params(ex, X, Y, Z, Symmetry, eps, 0);
+    upload_escalar_state_inputs(state_host_in, all);
+    launch_rhs_pipeline((int)all, eps, 0, true);
+
+    #define D(s) g_buf.slot[s]
+    kern_escalar_constraint_fr<<<grid(all), BLK>>>(
+        D(S_chin1),
+        D(S_gupxx), D(S_gupxy), D(S_gupxz), D(S_gupyy), D(S_gupyz), D(S_gupzz),
+        D(S_trK),
+        D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz),
+        D(S_Rxx), D(S_Rxy), D(S_Rxz), D(S_Ryy), D(S_Ryz), D(S_Rzz),
+        D(S_rho),
+        D(S_Sxx), D(S_Sxy), D(S_Sxz), D(S_Syy), D(S_Syz), D(S_Szz),
+        D(S_Sphi),
+        escalar_a2,
+        D(S_f_arr));
+    #undef D
+
+    download_constraint_outputs(constraint_host_out, all);
+    CUDA_CHECK(cudaMemcpy(constraint_host_out[7], g_buf.slot[S_f_arr],
+                          bytes, cudaMemcpyDeviceToHost));
+    (void)Lev;
+    return 0;
+}
+
 __global__ void kern_prepare_inter_time_level(const double * __restrict__ src1,
                                               const double * __restrict__ src2,
                                               const double * __restrict__ src3,
@@ -6222,7 +6930,7 @@ static void copy_state_region_cuda(void *block_tag,
                                    cudaMemcpyKind kind,
                                    double **state_host_key = nullptr)
 {
-    if (state_index < 0 || state_index >= BSSN_STATE_COUNT) return;
+    if (state_index < 0 || state_index >= BSSN_RESIDENT_STATE_CAPACITY) return;
     if (sx <= 0 || sy <= 0 || sz <= 0) return;
 
     const size_t pitch = (size_t)ex[0] * sizeof(double);
@@ -6264,7 +6972,7 @@ static void copy_state_region_packed_cuda(void *block_tag,
                                           cudaMemcpyKind kind,
                                           double **state_host_key = nullptr)
 {
-    if (state_index < 0 || state_index >= BSSN_STATE_COUNT) return;
+    if (state_index < 0 || state_index >= BSSN_RESIDENT_STATE_CAPACITY) return;
     if (sx <= 0 || sy <= 0 || sz <= 0) return;
 
     const size_t src_pitch = (size_t)ex[0] * sizeof(double);
@@ -6310,7 +7018,7 @@ static void copy_state_region_packed_batch_cuda(void *block_tag,
                                                 cudaMemcpyKind kind,
                                                 double **state_host_key = nullptr)
 {
-    if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return;
+    if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return;
     if (sx <= 0 || sy <= 0 || sz <= 0) return;
 
     StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
@@ -6353,12 +7061,12 @@ static void copy_state_region_packed_batch_cuda(void *block_tag,
     }
 }
 
-static void download_resident_state(void *block_tag, int *ex, double **state_host_out)
+static void download_resident_state_count(void *block_tag, int *ex, double **state_host_out, int state_count)
 {
     const size_t all = (size_t)ex[0] * ex[1] * ex[2];
     const size_t bytes = all * sizeof(double);
     StepContext &ctx = ensure_step_ctx(block_tag, all);
-    int bank = find_resident_bank(ctx, state_host_out);
+    int bank = find_resident_bank_count(ctx, state_host_out, state_count);
     if (bank < 0) {
         bank = (ctx.current_bank >= 0) ? ctx.current_bank : active_or_keyed_bank(ctx, nullptr, all, false);
     }
@@ -6371,9 +7079,9 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos
         direct_download = env ? ((atoi(env) != 0) ? 1 : 0) : 1;
     }
     if (direct_download) {
-        if (resident_host_subset_clean(ctx, bank, BSSN_STATE_COUNT, nullptr))
+        if (resident_host_subset_clean(ctx, bank, state_count, nullptr))
             return;
-        for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
+        for (int i = 0; i < state_count; ++i) {
             CUDA_CHECK(cudaMemcpyAsync(state_host_out[i], ctx.d_resident[bank][i],
                                        bytes, cudaMemcpyDeviceToHost));
         }
@@ -6383,16 +7091,16 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos
             CudaProfileStats &stats = cuda_profile_stats();
             stats.resident_download_calls++;
             stats.resident_download_ms += cuda_profile_now_ms() - t0;
-            stats.resident_download_gb += (double)((size_t)BSSN_STATE_COUNT * bytes) / 1.0e9;
+            stats.resident_download_gb += (double)((size_t)state_count * bytes) / 1.0e9;
         }
         return;
     }
-    if (resident_host_subset_clean(ctx, bank, BSSN_STATE_COUNT, nullptr))
+    if (resident_host_subset_clean(ctx, bank, state_count, nullptr))
         return;
     CUDA_CHECK(cudaMemcpy(g_buf.h_stage, ctx.d_resident_mem[bank],
-                          (size_t)BSSN_STATE_COUNT * bytes,
+                          (size_t)state_count * bytes,
                           cudaMemcpyDeviceToHost));
-    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
+    for (int i = 0; i < state_count; ++i) {
         std::memcpy(state_host_out[i], g_buf.h_stage + (size_t)i * all, bytes);
     }
     set_resident_host_clean(ctx, bank, true);
@@ -6400,24 +7108,44 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos
         CudaProfileStats &stats = cuda_profile_stats();
         stats.resident_download_calls++;
         stats.resident_download_ms += cuda_profile_now_ms() - t0;
-        stats.resident_download_gb += (double)((size_t)BSSN_STATE_COUNT * bytes) / 1.0e9;
+        stats.resident_download_gb += (double)((size_t)state_count * bytes) / 1.0e9;
     }
 }
 
+static void download_resident_state(void *block_tag, int *ex, double **state_host_out)
+{
+    download_resident_state_count(block_tag, ex, state_host_out, BSSN_STATE_COUNT);
+}
+
+static bool download_resident_state_count_if_present(void *block_tag,
+                                                     int *ex,
+                                                     double **state_host_out,
+                                                     int state_count);
+
 static bool download_resident_state_if_present(void *block_tag, int *ex, double **state_host_out)
 {
+    return download_resident_state_count_if_present(block_tag, ex, state_host_out, BSSN_STATE_COUNT);
+}
+
+static bool download_resident_state_count_if_present(void *block_tag,
+                                                     int *ex,
+                                                     double **state_host_out,
+                                                     int state_count)
+{
+    if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY)
+        return false;
     auto it = g_step_ctx.find(block_tag);
     if (it == g_step_ctx.end()) return false;
 
     StepContext &ctx = it->second;
-    const int bank = find_resident_bank(ctx, state_host_out);
+    const int bank = find_resident_bank_count(ctx, state_host_out, state_count);
     if (bank < 0 || !ctx.resident_valid[bank])
         return false;
 
     const size_t all = (size_t)ex[0] * ex[1] * ex[2];
     const size_t bytes = all * sizeof(double);
     mark_resident_current_bank(ctx, bank);
-    if (resident_host_subset_clean(ctx, bank, BSSN_STATE_COUNT, nullptr))
+    if (resident_host_subset_clean(ctx, bank, state_count, nullptr))
         return true;
 
     static int direct_download = -1;
@@ -6426,16 +7154,16 @@ static bool download_resident_state_if_present(void *block_tag, int *ex, double
         direct_download = env ? ((atoi(env) != 0) ? 1 : 0) : 1;
     }
     if (direct_download) {
-        for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
+        for (int i = 0; i < state_count; ++i) {
             CUDA_CHECK(cudaMemcpyAsync(state_host_out[i], ctx.d_resident[bank][i],
                                        bytes, cudaMemcpyDeviceToHost));
         }
         CUDA_CHECK(cudaDeviceSynchronize());
     } else {
         CUDA_CHECK(cudaMemcpy(g_buf.h_stage, ctx.d_resident_mem[bank],
-                              (size_t)BSSN_STATE_COUNT * bytes,
+                              (size_t)state_count * bytes,
                               cudaMemcpyDeviceToHost));
-        for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
+        for (int i = 0; i < state_count; ++i) {
             std::memcpy(state_host_out[i], g_buf.h_stage + (size_t)i * all, bytes);
         }
     }
@@ -6454,17 +7182,17 @@ static void copy_state_subset(void *block_tag,
     const size_t all = (size_t)ex[0] * ex[1] * ex[2];
     const size_t bytes = all * sizeof(double);
     StepContext &ctx = ensure_step_ctx(block_tag, all);
-    double **full_key = (subset_count == BSSN_STATE_COUNT) ? state_host : nullptr;
+    double **full_key = (subset_count == BSSN_RESIDENT_STATE_CAPACITY) ? state_host : nullptr;
     const int bank = active_or_keyed_bank(ctx, full_key, all,
                                           kind == cudaMemcpyHostToDevice);
     double *base_mem = ctx.d_resident_mem[bank];
-    int active_state_indices[BSSN_STATE_COUNT];
-    double *active_state_host[BSSN_STATE_COUNT];
+    int active_state_indices[BSSN_RESIDENT_STATE_CAPACITY];
+    double *active_state_host[BSSN_RESIDENT_STATE_CAPACITY];
     int active_count = 0;
 
     for (int i = 0; i < subset_count; ++i) {
         const int state_index = state_indices[i];
-        if (state_index < 0 || state_index >= BSSN_STATE_COUNT) continue;
+        if (state_index < 0 || state_index >= BSSN_RESIDENT_STATE_CAPACITY) continue;
         if (kind == cudaMemcpyDeviceToHost &&
             ctx.resident_host_clean[bank][state_index])
             continue;
@@ -7095,6 +7823,31 @@ int bssn_cuda_download_resident_state(void *block_tag,
     return 0;
 }
 
+extern "C"
+int bssn_escalar_cuda_download_resident_state(void *block_tag,
+                                              int *ex,
+                                              double **state_host_out)
+{
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    download_resident_state_count(block_tag, ex, state_host_out, BSSN_ESCALAR_STATE_COUNT);
+    return 0;
+}
+
+extern "C"
+int bssn_cuda_download_resident_state_count_if_present(void *block_tag,
+                                                       int *ex,
+                                                       double **state_host_out,
+                                                       int state_count)
+{
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    if (state_count != BSSN_STATE_COUNT && state_count != BSSN_ESCALAR_STATE_COUNT)
+        return 1;
+    download_resident_state_count_if_present(block_tag, ex, state_host_out, state_count);
+    return 0;
+}
+
 extern "C"
 int bssn_cuda_download_resident_state_if_present(void *block_tag,
                                                 int *ex,
@@ -7150,6 +7903,7 @@ int bssn_cuda_interp_state_point3(void *block_tag,
                                   double pz,
                                   int ordn,
                                   int symmetry,
+                                  double **state_host_key,
                                   const double *soa3,
                                   double *out3)
 {
@@ -7157,9 +7911,9 @@ int bssn_cuda_interp_state_point3(void *block_tag,
     CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
     if (!block_tag || !ex || !out3 || !soa3)
         return 1;
-    if (state0 < 0 || state0 >= BSSN_STATE_COUNT ||
-        state1 < 0 || state1 >= BSSN_STATE_COUNT ||
-        state2 < 0 || state2 >= BSSN_STATE_COUNT)
+    if (state0 < 0 || state0 >= BSSN_RESIDENT_STATE_CAPACITY ||
+        state1 < 0 || state1 >= BSSN_RESIDENT_STATE_CAPACITY ||
+        state2 < 0 || state2 >= BSSN_RESIDENT_STATE_CAPACITY)
         return 1;
     if (ex[0] <= 0 || ex[1] <= 0 || ex[2] <= 0 ||
         ordn <= 0 || ordn > 8 ||
@@ -7168,7 +7922,8 @@ int bssn_cuda_interp_state_point3(void *block_tag,
 
     StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
     const size_t all = (size_t)ex[0] * ex[1] * ex[2];
-    const int bank = active_or_keyed_bank(ctx, nullptr, all, false);
+    const int interp_states[3] = {state0, state1, state2};
+    const int bank = find_resident_bank_subset(ctx, state_host_key, interp_states, 3);
     if (bank < 0 || !ctx.resident_valid[bank])
         return 1;
 
@@ -7354,7 +8109,7 @@ static void copy_state_device_batch(void *block_tag,
                                     int pack_not_unpack,
                                     double **state_host_key = nullptr)
 {
-    if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return;
+    if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return;
     if (sx <= 0 || sy <= 0 || sz <= 0) return;
 
     StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
@@ -7395,7 +8150,7 @@ static void copy_state_device_segments(void *block_tag,
                                        int pack_not_unpack,
                                        double **state_host_key = nullptr)
 {
-    if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return;
+    if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return;
     if (segment_count <= 0 || !segment_meta) return;
 
     int max_region_all = 0;
@@ -7445,7 +8200,7 @@ static void restrict_state_device_segments(void *block_tag,
                                            double **state_host_key = nullptr,
                                            const double *state_soa = nullptr)
 {
-    if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return;
+    if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return;
     if (segment_count <= 0 || !segment_meta || !device_buffer) return;
 
     int max_region_all = 0;
@@ -7484,7 +8239,7 @@ static void prolong_state_device_segments(void *block_tag,
                                           double **state_host_key = nullptr,
                                           const double *state_soa = nullptr)
 {
-    if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return;
+    if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return;
     if (segment_count <= 0 || !segment_meta || !device_buffer) return;
 
     int max_region_all = 0;
@@ -7712,7 +8467,7 @@ int bssn_cuda_restrict_state_batch_to_device_buffer(void *block_tag,
 {
     init_gpu_dispatch();
     CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
-    if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1;
+    if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return 1;
     if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1;
     StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
     const int region_all = sx * sy * sz;
@@ -7739,7 +8494,7 @@ int bssn_cuda_restrict_state_batch_to_device_buffer_for_host_views(void *block_t
 {
     init_gpu_dispatch();
     CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
-    if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1;
+    if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return 1;
     if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1;
     StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
     const size_t all = (size_t)ex[0] * ex[1] * ex[2];
@@ -7767,7 +8522,7 @@ int bssn_cuda_prolong_state_batch_to_device_buffer(void *block_tag,
 {
     init_gpu_dispatch();
     CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
-    if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1;
+    if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return 1;
     if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1;
     StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
     const int region_all = sx * sy * sz;
@@ -7796,7 +8551,7 @@ int bssn_cuda_prolong_state_batch_to_device_buffer_for_host_views(void *block_ta
 {
     init_gpu_dispatch();
     CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
-    if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1;
+    if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return 1;
     if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1;
     StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
     const size_t all = (size_t)ex[0] * ex[1] * ex[2];
@@ -7845,6 +8600,7 @@ int bssn_cuda_upload_state_subset(void *block_tag,
 extern "C"
 int bssn_cuda_prepare_inter_time_level(void *block_tag,
                                        int *ex,
+                                       int state_count,
                                        double **src1_host_key,
                                        double **src2_host_key,
                                        double **src3_host_key,
@@ -7856,12 +8612,14 @@ int bssn_cuda_prepare_inter_time_level(void *block_tag,
     CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
     const bool profile = cuda_aux_profile_enabled();
     const double t0 = profile ? cuda_profile_now_ms() : 0.0;
-    if (source_count != 2 && source_count != 3) return 1;
-    if (!resident_key_usable(src1_host_key) ||
-        !resident_key_usable(src2_host_key) ||
-        !resident_key_usable(dst_host_key))
+    if (state_count != BSSN_STATE_COUNT && state_count != BSSN_ESCALAR_STATE_COUNT)
         return 1;
-    if (source_count == 3 && !resident_key_usable(src3_host_key))
+    if (source_count != 2 && source_count != 3) return 1;
+    if (!resident_key_usable_count(src1_host_key, state_count) ||
+        !resident_key_usable_count(src2_host_key, state_count) ||
+        !resident_key_usable_count(dst_host_key, state_count))
+        return 1;
+    if (source_count == 3 && !resident_key_usable_count(src3_host_key, state_count))
         return 1;
 
     double c1 = 0.0, c2 = 0.0, c3 = 0.0;
@@ -7887,21 +8645,32 @@ int bssn_cuda_prepare_inter_time_level(void *block_tag,
 
     const size_t all = (size_t)ex[0] * ex[1] * ex[2];
     StepContext &ctx = ensure_step_ctx(block_tag, all);
-    const int src1_bank = ensure_resident_bank(ctx, src1_host_key, all, true);
-    const int src2_bank = ensure_resident_bank(ctx, src2_host_key, all, true, src1_bank);
-    const int src3_bank = (source_count == 3)
-                        ? ensure_resident_bank(ctx, src3_host_key, all, true, src1_bank)
-                        : -1;
-    const int dst_bank = reserve_resident_output_bank_avoiding(ctx, dst_host_key, all,
-                                                               src1_bank, src2_bank, src3_bank);
+    int src1_bank, src2_bank, src3_bank, dst_bank;
+    if (state_count == BSSN_ESCALAR_STATE_COUNT) {
+        src1_bank = ensure_escalar_resident_bank(ctx, src1_host_key, all, true);
+        src2_bank = ensure_escalar_resident_bank(ctx, src2_host_key, all, true, src1_bank);
+        src3_bank = (source_count == 3)
+                  ? ensure_escalar_resident_bank(ctx, src3_host_key, all, true, src1_bank)
+                  : -1;
+        dst_bank = reserve_escalar_resident_output_bank_avoiding(ctx, dst_host_key, all,
+                                                                 src1_bank, src2_bank, src3_bank);
+    } else {
+        src1_bank = ensure_resident_bank(ctx, src1_host_key, all, true);
+        src2_bank = ensure_resident_bank(ctx, src2_host_key, all, true, src1_bank);
+        src3_bank = (source_count == 3)
+                  ? ensure_resident_bank(ctx, src3_host_key, all, true, src1_bank)
+                  : -1;
+        dst_bank = reserve_resident_output_bank_avoiding(ctx, dst_host_key, all,
+                                                         src1_bank, src2_bank, src3_bank);
+    }
 
-    dim3 launch_grid((unsigned int)grid(all), (unsigned int)BSSN_STATE_COUNT);
+    dim3 launch_grid((unsigned int)grid(all), (unsigned int)state_count);
     kern_prepare_inter_time_level<<<launch_grid, BLK>>>(
         ctx.d_resident_mem[src1_bank],
         ctx.d_resident_mem[src2_bank],
         (source_count == 3) ? ctx.d_resident_mem[src3_bank] : nullptr,
         ctx.d_resident_mem[dst_bank],
-        c1, c2, c3, BSSN_STATE_COUNT, (int)all);
+        c1, c2, c3, state_count, (int)all);
     if (profile)
         cuda_profile_sync();
     ctx.resident_valid[dst_bank] = true;
diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.h b/AMSS_NCKU_source/bssn_rhs_cuda.h
index 41e92f8..38f945a 100644
--- a/AMSS_NCKU_source/bssn_rhs_cuda.h
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.h
@@ -7,6 +7,7 @@ extern "C" {
 
 enum {
     BSSN_CUDA_STATE_COUNT = 24,
+    BSSN_ESCALAR_CUDA_STATE_COUNT = 26,
     BSSN_CUDA_MATTER_COUNT = 10
 };
 
@@ -55,6 +56,32 @@ int bssn_cuda_rk4_substep(void *block_tag,
                           int &apply_enforce_ga,
                           double &chitiny);
 
+int bssn_escalar_cuda_rk4_substep(void *block_tag,
+                                  int *ex, double *X, double *Y, double *Z,
+                                  double **state_host_in,
+                                  double **state_host_out,
+                                  const double *propspeed,
+                                  const double *soa_flat,
+                                  const double *bbox,
+                                  double &dT,
+                                  double &T,
+                                  int &RK4,
+                                  int &apply_bam_bc,
+                                  int &Symmetry,
+	                                  int &Lev,
+	                                  double &eps,
+	                                  int &co,
+	                                  int &keep_resident_state,
+	                                  int &apply_enforce_ga,
+	                                  double &chitiny);
+
+int bssn_escalar_cuda_compute_constraints(int *ex, double *X, double *Y, double *Z,
+                                          double **state_host_in,
+                                          double **constraint_host_out,
+                                          int &Symmetry,
+                                          int &Lev,
+                                          double &eps);
+
 int bssn_cuda_copy_state_region_to_host(void *block_tag,
                                         int state_index,
                                         double *host_state,
@@ -73,6 +100,15 @@ int bssn_cuda_download_resident_state(void *block_tag,
                                       int *ex,
                                       double **state_host_out);
 
+int bssn_escalar_cuda_download_resident_state(void *block_tag,
+                                              int *ex,
+                                              double **state_host_out);
+
+int bssn_cuda_download_resident_state_count_if_present(void *block_tag,
+                                                       int *ex,
+                                                       double **state_host_out,
+                                                       int state_count);
+
 int bssn_cuda_download_resident_state_if_present(void *block_tag,
                                                 int *ex,
                                                 double **state_host_out);
@@ -103,6 +139,7 @@ int bssn_cuda_interp_state_point3(void *block_tag,
                                   double pz,
                                   int ordn,
                                   int symmetry,
+                                  double **state_host_key,
                                   const double *soa3,
                                   double *out3);
 
@@ -302,6 +339,7 @@ int bssn_cuda_upload_state_subset(void *block_tag,
 
 int bssn_cuda_prepare_inter_time_level(void *block_tag,
                                        int *ex,
+                                       int state_count,
                                        double **src1_host_key,
                                        double **src2_host_key,
                                        double **src3_host_key,
diff --git a/AMSS_NCKU_source/makefile b/AMSS_NCKU_source/makefile
index 84fd4c0..93cf81c 100644
--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -35,7 +35,6 @@ f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
 endif
 
 TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-              -fprofile-instr-use=$(TP_PROFDATA) \
               -Dfortran3 -Dnewc $(MKL_INC)
 else
 ## NVHPC defaults: mpicc/mpicxx/mpifort wrappers
diff --git a/makefile_and_run.py b/makefile_and_run.py
index d4ce33b..f539764 100755
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -146,6 +146,7 @@ def _gpu_runtime_env():
         "AMSS_CUDA_AWARE_MPI": "1",
         "AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP": "1",
         "AMSS_CUDA_KEEP_ALL_LEVELS": "1",
+        "AMSS_CUDA_AMR_HOST_STAGED": "1",
         "AMSS_CUDA_AMR_RESTRICT_DEVICE": "1",
         "AMSS_CUDA_AMR_RESTRICT_BATCH": "0",
         "AMSS_CUDA_DEVICE_SEGMENT_BATCH": "0",
@@ -277,6 +278,7 @@ def run_ABE():
         print(f"   AMSS_CUDA_AWARE_MPI={mpi_env.get('AMSS_CUDA_AWARE_MPI', '')}")
         print(f"   AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP={mpi_env.get('AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP', '')}")
         print(f"   AMSS_CUDA_KEEP_ALL_LEVELS={mpi_env.get('AMSS_CUDA_KEEP_ALL_LEVELS', '')}")
+        print(f"   AMSS_CUDA_AMR_HOST_STAGED={mpi_env.get('AMSS_CUDA_AMR_HOST_STAGED', '')}")
         print(f"   AMSS_CUDA_AMR_RESTRICT_DEVICE={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_DEVICE', '')}")
         print(f"   AMSS_CUDA_AMR_RESTRICT_BATCH={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_BATCH', '')}")
         print(f"   AMSS_CUDA_DEVICE_SEGMENT_BATCH={mpi_env.get('AMSS_CUDA_DEVICE_SEGMENT_BATCH', '')}")