diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C
index c98bfd8..fa14a2f 100644
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -89,6 +89,16 @@ bool bssn_cuda_use_resident_sync(int lev)
 #endif
 }
 
+bool bssn_constraint_recompute_from_state(int lev, bool level0_cache_valid)
+{
+#if USE_CUDA_BSSN
+  return lev > 0 || !level0_cache_valid;
+#else
+  (void)level0_cache_valid;
+  return lev > 0;
+#endif
+}
+
 bool bssn_cuda_sync_subset(Block *cg,
                            int subset_count,
                            const int *state_indices,
@@ -361,6 +371,7 @@ bssn_class::bssn_class(double Couranti, double StartTimei, double TotalTimei,
                        int a_levi, int maxli, int decni, double maxrexi, double drexi) 
                        : Courant(Couranti), StartTime(StartTimei), TotalTime(TotalTimei), 
                          DumpTime(DumpTimei), d2DumpTime(d2DumpTimei), CheckTime(CheckTimei), AnasTime(AnasTimei),
+                         cuda_level0_constraint_cache_valid(false),
                          Symmetry(Symmetryi), checkrun(checkruni), numepss(numepssi), numepsb(numepsbi), numepsh(numepshi),
 #ifdef With_AHF
                          xc(0), yc(0), zc(0), xr(0), yr(0), zr(0), trigger(0), dTT(0), dumpid(0),
@@ -2448,6 +2459,8 @@ void bssn_class::Evolve(int Steps)
 
   for (int ncount = 1; ncount < Steps + 1; ncount++)
   {
+    cuda_level0_constraint_cache_valid = false;
+
     // special for large mass ratio consideration
     //     if(fabs(Porg0[0][0]-Porg0[1][0])+fabs(Porg0[0][1]-Porg0[1][1])+fabs(Porg0[0][2]-Porg0[1][2])<1e-6) 
     //     { GH->levels=GH->movls; }
@@ -3337,6 +3350,8 @@ void bssn_class::Step(int lev, int YN)
 #else
   const bool use_cuda_resident_sync = false;
 #endif
+  const bool need_cuda_level0_constraint_cache =
+      (lev == 0) && (LastConsOut + dT * pow(0.5, Mymax(0, trfls)) >= AnasTime);
 
 // new code 2013-2-15, zjcao
 #if (MAPBH == 1)
@@ -3457,6 +3472,22 @@ void bssn_class::Step(int lev, int YN)
                  << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
             ERROR = 1;
           }
+          if (need_cuda_level0_constraint_cache)
+          {
+            double *constraint_out[7] = {
+                cg->fgfs[Cons_Ham->sgfn], cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
+                cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn]};
+            if (bssn_cuda_download_constraint_outputs(cg->shape, constraint_out))
+            {
+              cout << "CUDA predictor constraint download failed in domain: ("
+                   << cg->bbox[0] << ":" << cg->bbox[3] << ","
+                   << cg->bbox[1] << ":" << cg->bbox[4] << ","
+                   << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
+              ERROR = 1;
+            }
+            else
+              cuda_level0_constraint_cache_valid = true;
+          }
           used_gpu_substep = true;
           used_gpu_resident_state = (keep_resident_state != 0);
         }
@@ -7734,7 +7765,7 @@ void bssn_class::Constraint_Out()
     for (int lev = 0; lev < GH->levels; lev++)
     {
       // make sure the data consistent for higher levels
-      if (lev > 0) // if the constrait quantities can be reused from the step rhs calculation
+      if (bssn_constraint_recompute_from_state(lev, cuda_level0_constraint_cache_valid))
       {
         double TRK4 = PhysTime;
         double ndeps = numepsb;
@@ -8237,7 +8268,7 @@ void bssn_class::Interp_Constraint(bool infg)
     for (int lev = 0; lev < GH->levels; lev++)
     {
       // make sure the data consistent for higher levels
-      if (lev > 0) // if the constrait quantities can be reused from the step rhs calculation
+      if (bssn_constraint_recompute_from_state(lev, cuda_level0_constraint_cache_valid))
       {
         double TRK4 = PhysTime;
         double ndeps = numepsb;
diff --git a/AMSS_NCKU_source/bssn_class.h b/AMSS_NCKU_source/bssn_class.h
index 5a8eb2a..2609238 100644
--- a/AMSS_NCKU_source/bssn_class.h
+++ b/AMSS_NCKU_source/bssn_class.h
@@ -45,10 +45,11 @@ public:
        int checkrun;
        char checkfilename[50];
        int Steps;
-       double StartTime, TotalTime;
-       double AnasTime, DumpTime, d2DumpTime, CheckTime;
-       double LastAnas, LastConsOut;
-       double Courant;
+       double StartTime, TotalTime;
+       double AnasTime, DumpTime, d2DumpTime, CheckTime;
+       double LastAnas, LastConsOut;
+       bool cuda_level0_constraint_cache_valid;
+       double Courant;
        double numepss, numepsb, numepsh;
        int Symmetry;
        int maxl, decn;
diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu
index 310f014..ad31c7f 100644
--- a/AMSS_NCKU_source/bssn_rhs_cuda.cu
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu
@@ -4997,6 +4997,17 @@ static void download_state_outputs(double **state_host_out, size_t all)
     }
 }
 
+static void download_constraint_outputs(double **constraint_host_out, size_t all)
+{
+    const size_t bytes = all * sizeof(double);
+    CUDA_CHECK(cudaMemcpy(g_buf.h_stage, g_buf.slot[S_ham_Res],
+                          (size_t)D2H_CONSTRAINT_SLOT_COUNT * bytes,
+                          cudaMemcpyDeviceToHost));
+    for (int i = 0; i < D2H_CONSTRAINT_SLOT_COUNT; ++i) {
+        std::memcpy(constraint_host_out[i], g_buf.h_stage + (size_t)i * all, bytes);
+    }
+}
+
 __global__ void kern_pack_state_region_batch(const double * __restrict__ src_mem,
                                              double * __restrict__ dst,
                                              int nx, int ny,
@@ -5818,6 +5829,17 @@ int bssn_cuda_download_resident_state(void *block_tag,
     return 0;
 }
 
+extern "C"
+int bssn_cuda_download_constraint_outputs(int *ex,
+                                          double **constraint_host_out)
+{
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
+    download_constraint_outputs(constraint_host_out, all);
+    return 0;
+}
+
 extern "C"
 int bssn_cuda_pack_state_region_to_host_buffer(void *block_tag,
                                                int state_index,
diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.h b/AMSS_NCKU_source/bssn_rhs_cuda.h
index ce8a334..55b6380 100644
--- a/AMSS_NCKU_source/bssn_rhs_cuda.h
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.h
@@ -73,6 +73,9 @@ int bssn_cuda_download_resident_state(void *block_tag,
                                       int *ex,
                                       double **state_host_out);
 
+int bssn_cuda_download_constraint_outputs(int *ex,
+                                          double **constraint_host_out);
+
 int bssn_cuda_pack_state_region_to_host_buffer(void *block_tag,
                                                int state_index,
                                                double *host_buffer,