Add two-node MPI launch configuration

Isolate TwoPuncture from ABE OMP settings
Use wall time for timestep logging
2026-03-30 21:13:46 +08:00 · 2026-03-30 21:00:20 +08:00 · 2026-03-30 20:38:41 +08:00 · 2026-03-30 20:34:34 +08:00
28 changed files with 1455 additions and 6935 deletions
--- a/AMSS_NCKU_Input.py
+++ b/AMSS_NCKU_Input.py
@@ -13,14 +13,17 @@ import numpy

 ## Setting MPI processes and the output file directory

-File_directory   = "GW150914"                    ## output file directory
-Output_directory = "binary_output"               ## binary data file directory
-                                                 ## The file directory name should not be too long
-MPI_processes    = 64                             ## number of mpi processes used in the simulation
-
-GPU_Calculation  = "no"                          ## Use GPU or not 
-                                                 ## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
-CPU_Part         = 1.0
+File_directory   = "GW150914"                    ## output file directory
+Output_directory = "binary_output"               ## binary data file directory
+                                                 ## The file directory name should not be too long
+MPI_processes    = 64                            ## number of mpi processes used in the simulation
+OMP_Threads      = 3                             ## number of OpenMP threads used by each MPI process
+MPI_hosts        = ["localhost", "192.168.20.102"] ## MPI hosts for multi-node runs
+MPI_processes_per_node = 32                      ## MPI ranks launched on each node in MPI_hosts
+
+GPU_Calculation  = "no"                          ## Use GPU or not
+                                                 ## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
+CPU_Part         = 1.0
 GPU_Part         = 0.0

 #################################################
@@ -50,7 +53,7 @@ Check_Time               = 100.0
 Dump_Time                = 100.0                  ## time inteval dT for dumping binary data
 D2_Dump_Time             = 100.0                  ## dump the ascii data for 2d surface after dT'
 Analysis_Time            = 0.1                    ## dump the puncture position and GW psi4 after dT"
-Evolution_Step_Number    = 10000000               ## stop the calculation after the maximal step number
+Evolution_Step_Number    = 10000000               ## stop the calculation after the maximal step number
 Courant_Factor           = 0.5                    ## Courant Factor
 Dissipation              = 0.15                   ## Kreiss-Oliger Dissipation Strength

--- a/AMSS_NCKU_source/ABE.C
+++ b/AMSS_NCKU_source/ABE.C
@@ -23,20 +23,22 @@ using namespace std;
 #include <mpi.h>

 #include "misc.h"
-#include "macrodef.h"
-#ifdef USE_GPU
-extern void bssn_cuda_dump_stage_profile();
-#endif
+#include "macrodef.h"

 #ifndef ABEtype
 #error "not define ABEtype"
 #endif

-#if (ABEtype == 0)
-#include "bssn_class.h"
-
-#elif (ABEtype == 1)
-#include "bssnEScalar_class.h"
+#if (ABEtype == 0)
+
+#ifdef USE_GPU
+#include "bssn_gpu_class.h"
+#else
+#include "bssn_class.h"
+#endif
+
+#elif (ABEtype == 1)
+#include "bssnEScalar_class.h"

 #elif (ABEtype == 2)
 #include "Z4c_class.h"
@@ -472,13 +474,10 @@ int main(int argc, char *argv[])
            cout << endl;
      }

-	      ADM->Evolve(Steps);
-#ifdef USE_GPU
-	      bssn_cuda_dump_stage_profile();
-#endif
-	
-	      if (myrank == 0)
-	      {
+      ADM->Evolve(Steps);
+
+      if (myrank == 0)
+      {
            cout << endl;
            cout << " Total Evolve Time: "  << MPI_Wtime() - End_clock   << " seconds!" << endl;
            cout << " Total Running Time: " << MPI_Wtime() - Begin_clock << " seconds!" << endl;
--- a/AMSS_NCKU_source/Block.C
+++ b/AMSS_NCKU_source/Block.C
@@ -9,12 +9,8 @@
 #include <new>
 using namespace std;

-#include "Block.h"
-#include "misc.h"
-#ifdef USE_GPU
-#include "bssn_gpu.h"
-#include "bssn_cuda_ops.h"
-#endif
+#include "Block.h"
+#include "misc.h"

 Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fngfsi, int levi, const int cgpui) : rank(ranki), ingfs(ingfsi), fngfs(fngfsi), lev(levi), cgpu(cgpui)
 {
@@ -99,19 +95,14 @@ Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fng
  }
 #endif
 }
-Block::~Block()
-{
-  int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  if (myrank == rank)
-  {
-#ifdef USE_GPU
-    bssn_gpu_clear_cached_device_buffers();
-    bssn_cuda_release_rk4_caches();
-    bssn_cuda_release_interp_caches();
-#endif
-    for (int i = 0; i < dim; i++)
-      delete[] X[i];
+Block::~Block()
+{
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  if (myrank == rank)
+  {
+    for (int i = 0; i < dim; i++)
+      delete[] X[i];
    for (int i = 0; i < ingfs; i++)
      free(igfs[i]);
    delete[] igfs;
--- a/AMSS_NCKU_source/MPatch.C
+++ b/AMSS_NCKU_source/MPatch.C
@@ -2,100 +2,29 @@
 #include <iostream>
 #include <iomanip>
 #include <fstream>
-#include <cstdlib>
-#include <cstdio>
-#include <string>
-#include <cmath>
-#include <new>
-#include <map>
-#include <vector>
-using namespace std;
+#include <cstdlib>
+#include <cstdio>
+#include <string>
+#include <cmath>
+#include <new>
+#include <vector>
+using namespace std;

-#include "misc.h"
-#include "MPatch.h"
-#include "Parallel.h"
-#include "fmisc.h"
-#include "bssn_cuda_ops.h"
-#ifdef INTERP_LB_PROFILE
-#include "interp_lb_profile.h"
-#endif
-
-#if defined(__GNUC__) || defined(__clang__)
-extern int bssn_cuda_interp_points_batch(const int *ex,
-                                         const double *X, const double *Y, const double *Z,
-                                         const double *const *fields,
-                                         const double *soa_flat,
-                                         int num_var,
-                                         const double *px, const double *py, const double *pz,
-                                         int num_points,
-                                         int ordn,
-                                         int symmetry,
-                                         double *out) __attribute__((weak));
-#endif
-
-namespace
-{
-struct InterpVarDesc
-{
-  int sgfn;
-  double soa[dim];
-};
-
-struct InterpPlanKey
-{
-  const Patch *patch;
-  const double *x;
-  const double *y;
-  const double *z;
-  int NN;
-  int Symmetry;
-  int myrank;
-};
-
-struct InterpPlanKeyLess
-{
-  bool operator()(const InterpPlanKey &lhs, const InterpPlanKey &rhs) const
-  {
-    if (lhs.patch != rhs.patch) return lhs.patch < rhs.patch;
-    if (lhs.x != rhs.x) return lhs.x < rhs.x;
-    if (lhs.y != rhs.y) return lhs.y < rhs.y;
-    if (lhs.z != rhs.z) return lhs.z < rhs.z;
-    if (lhs.NN != rhs.NN) return lhs.NN < rhs.NN;
-    if (lhs.Symmetry != rhs.Symmetry) return lhs.Symmetry < rhs.Symmetry;
-    return lhs.myrank < rhs.myrank;
-  }
-};
-
-struct CachedInterpPlan
-{
-  int nblocks;
-  vector<int> owner_rank;
-  vector<int> owner_block;
-  vector<vector<int> > block_points;
-  vector<vector<double> > block_px;
-  vector<vector<double> > block_py;
-  vector<vector<double> > block_pz;
-
-  CachedInterpPlan() : nblocks(0) {}
-};
-
-struct CachedInterpPlanEntry
-{
-  bool valid;
-  InterpPlanKey key;
-  vector<double> xvals;
-  vector<double> yvals;
-  vector<double> zvals;
-  CachedInterpPlan plan;
-
-  CachedInterpPlanEntry() : valid(false) {}
-};
-
-struct InterpBlockView
-{
-  Block *bp;
-  double llb[dim];
-  double uub[dim];
+#include "misc.h"
+#include "MPatch.h"
+#include "Parallel.h"
+#include "fmisc.h"
+#ifdef INTERP_LB_PROFILE
+#include "interp_lb_profile.h"
+#endif
+
+namespace
+{
+struct InterpBlockView
+{
+  Block *bp;
+  double llb[dim];
+  double uub[dim];
 };

 struct BlockBinIndex
@@ -225,10 +154,10 @@ void build_block_bin_index(Patch *patch, const double *DH, BlockBinIndex &index)
  index.valid = true;
 }

-int find_block_index_for_point(const BlockBinIndex &index, const double *pox, const double *DH)
-{
-  if (!index.valid)
-    return -1;
+int find_block_index_for_point(const BlockBinIndex &index, const double *pox, const double *DH)
+{
+  if (!index.valid)
+    return -1;

  const int bx = coord_to_bin(pox[0], index.lo[0], index.inv[0], index.bins[0]);
  const int by = coord_to_bin(pox[1], index.lo[1], index.inv[1], index.bins[1]);
@@ -246,314 +175,13 @@ int find_block_index_for_point(const BlockBinIndex &index, const double *pox, co
  for (size_t bi = 0; bi < index.views.size(); bi++)
    if (point_in_block_view(index.views[bi], pox, DH))
      return int(bi);
-
-  return -1;
-}
-
-void collect_interp_vars(MyList<var> *VarList, vector<InterpVarDesc> &vars)
-{
-  vars.clear();
-  MyList<var> *varl = VarList;
-  while (varl)
-  {
-    InterpVarDesc desc;
-    desc.sgfn = varl->data->sgfn;
-    for (int d = 0; d < dim; ++d)
-      desc.soa[d] = varl->data->SoA[d];
-    vars.push_back(desc);
-    varl = varl->next;
-  }
-}
-
-bool should_try_cuda_interp(int ordn, int num_points, int num_var)
-{
-#if defined(__GNUC__) || defined(__clang__)
-  if (!bssn_cuda_interp_points_batch)
-    return false;
-#else
-  return false;
-#endif
-  if (ordn != 6)
-    return false;
-  if (num_points < 32)
-    return false;
-  return num_points * num_var >= 256;
-}
-
-CachedInterpPlanEntry &interp_plan_cache_entry()
-{
-  static CachedInterpPlanEntry cache;
-  return cache;
-}
-
-bool same_interp_plan_key(const InterpPlanKey &lhs, const InterpPlanKey &rhs)
-{
-  return lhs.patch == rhs.patch &&
-         lhs.NN == rhs.NN &&
-         lhs.Symmetry == rhs.Symmetry &&
-         lhs.myrank == rhs.myrank;
-}
-
-bool same_interp_plan_points(const CachedInterpPlanEntry &cache, int NN, double **XX)
-{
-  if (static_cast<int>(cache.xvals.size()) != NN ||
-      static_cast<int>(cache.yvals.size()) != NN ||
-      static_cast<int>(cache.zvals.size()) != NN)
-    return false;
-
-  for (int j = 0; j < NN; ++j)
-  {
-    if (cache.xvals[j] != XX[0][j] ||
-        cache.yvals[j] != XX[1][j] ||
-        cache.zvals[j] != XX[2][j])
-      return false;
-  }
-  return true;
-}
-
-CachedInterpPlan &get_cached_interp_plan(Patch *patch,
-                                         int NN, double **XX,
-                                         int Symmetry, int myrank,
-                                         const double *DH,
-                                         const BlockBinIndex &block_index,
-                                         bool report_bounds_here,
-                                         bool allow_missing_points)
-{
-  InterpPlanKey key;
-  key.patch = patch;
-  key.x = XX[0];
-  key.y = XX[1];
-  key.z = XX[2];
-  key.NN = NN;
-  key.Symmetry = Symmetry;
-  key.myrank = myrank;
-
-  CachedInterpPlanEntry &cache = interp_plan_cache_entry();
-  if (cache.valid &&
-      same_interp_plan_key(cache.key, key) &&
-      same_interp_plan_points(cache, NN, XX) &&
-      cache.plan.nblocks == static_cast<int>(block_index.views.size()))
-    return cache.plan;
-
-  cache.valid = true;
-  cache.key = key;
-  cache.xvals.assign(XX[0], XX[0] + NN);
-  cache.yvals.assign(XX[1], XX[1] + NN);
-  cache.zvals.assign(XX[2], XX[2] + NN);
-  cache.plan = CachedInterpPlan();
-  CachedInterpPlan &plan = cache.plan;
-  plan.nblocks = static_cast<int>(block_index.views.size());
-  plan.owner_rank.assign(NN, -1);
-  plan.owner_block.assign(NN, -1);
-  plan.block_points.resize(plan.nblocks);
-  plan.block_px.resize(plan.nblocks);
-  plan.block_py.resize(plan.nblocks);
-  plan.block_pz.resize(plan.nblocks);
-
-  for (int j = 0; j < NN; ++j)
-  {
-    double pox[dim];
-    for (int i = 0; i < dim; ++i)
-    {
-      pox[i] = XX[i][j];
-      if (report_bounds_here &&
-          (XX[i][j] < patch->bbox[i] + patch->lli[i] * DH[i] ||
-           XX[i][j] > patch->bbox[dim + i] - patch->uui[i] * DH[i]))
-      {
-        cout << "Patch::Interp_Points: point (";
-        for (int k = 0; k < dim; ++k)
-        {
-          cout << XX[k][j];
-          if (k < dim - 1)
-            cout << ",";
-          else
-            cout << ") is out of current Patch." << endl;
-        }
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-    }
-
-    const int block_i = find_block_index_for_point(block_index, pox, DH);
-    if (block_i >= 0)
-    {
-      Block *BP = block_index.views[block_i].bp;
-      plan.owner_rank[j] = BP->rank;
-      plan.owner_block[j] = block_i;
-      if (BP->rank == myrank)
-      {
-        plan.block_points[block_i].push_back(j);
-        plan.block_px[block_i].push_back(XX[0][j]);
-        plan.block_py[block_i].push_back(XX[1][j]);
-        plan.block_pz[block_i].push_back(XX[2][j]);
-      }
-    }
-  }
-
-  if (!allow_missing_points && report_bounds_here)
-  {
-    for (int j = 0; j < NN; ++j)
-    {
-      if (plan.owner_rank[j] >= 0)
-        continue;
-      cout << "ERROR: Patch::Interp_Points fails to find point (";
-      for (int d = 0; d < dim; ++d)
-      {
-        cout << XX[d][j];
-        if (d < dim - 1)
-          cout << ",";
-        else
-          cout << ")";
-      }
-      cout << " on Patch (";
-      for (int d = 0; d < dim; ++d)
-      {
-        cout << patch->bbox[d] << "+" << patch->lli[d] * DH[d];
-        if (d < dim - 1)
-          cout << ",";
-        else
-          cout << ")--";
-      }
-      cout << "(";
-      for (int d = 0; d < dim; ++d)
-      {
-        cout << patch->bbox[dim + d] << "-" << patch->uui[d] * DH[d];
-        if (d < dim - 1)
-          cout << ",";
-        else
-          cout << ")" << endl;
-      }
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-  }
-
-  return plan;
-}
-
-void release_interp_plan_cache_internal()
-{
-  CachedInterpPlanEntry &cache = interp_plan_cache_entry();
-  cache.valid = false;
-  cache.xvals.clear();
-  cache.yvals.clear();
-  cache.zvals.clear();
-  cache.plan = CachedInterpPlan();
-}
-
-bool run_cuda_interp_for_block(Block *BP,
-                               const vector<InterpVarDesc> &vars,
-                               const vector<int> &point_ids,
-                               const vector<double> &px,
-                               const vector<double> &py,
-                               const vector<double> &pz,
-                               double *Shellf,
-                               int num_var,
-                               int ordn,
-                               int Symmetry)
-{
-  if (!should_try_cuda_interp(ordn, static_cast<int>(point_ids.size()), num_var))
-    return false;
-
-  vector<const double *> field_ptrs(num_var);
-  vector<double> soa_flat(3 * num_var);
-  for (int v = 0; v < num_var; ++v)
-  {
-    field_ptrs[v] = BP->fgfs[vars[v].sgfn];
-    for (int d = 0; d < dim; ++d)
-      soa_flat[3 * v + d] = vars[v].soa[d];
-  }
-
-  const int npts = static_cast<int>(point_ids.size());
-  vector<double> out(static_cast<size_t>(npts) * static_cast<size_t>(num_var));
-  if (bssn_cuda_interp_points_batch(BP->shape,
-                                    BP->X[0], BP->X[1], BP->X[2],
-                                    field_ptrs.data(),
-                                    soa_flat.data(),
-                                    num_var,
-                                    px.data(), py.data(), pz.data(),
-                                    npts,
-                                    ordn,
-                                    Symmetry,
-                                    out.data()) != 0)
-  {
-    return false;
-  }
-
-  for (int p = 0; p < npts; ++p)
-  {
-    const int j = point_ids[p];
-    memcpy(Shellf + j * num_var, out.data() + p * num_var, sizeof(double) * num_var);
-  }
-  return true;
-}
-
-void run_cpu_interp_for_block(Block *BP,
-                              const vector<InterpVarDesc> &vars,
-                              const vector<int> &point_ids,
-                              const vector<double> &px,
-                              const vector<double> &py,
-                              const vector<double> &pz,
-                              double *Shellf,
-                              int num_var,
-                              int ordn,
-                              int Symmetry)
-{
-  for (size_t p = 0; p < point_ids.size(); ++p)
-  {
-    const int j = point_ids[p];
-    double x = px[p];
-    double y = py[p];
-    double z = pz[p];
-    int ordn_local = ordn;
-    int symmetry_local = Symmetry;
-    for (int v = 0; v < num_var; ++v)
-    {
-      f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2],
-                      BP->fgfs[vars[v].sgfn], Shellf[j * num_var + v],
-                      x, y, z, ordn_local, const_cast<double *>(vars[v].soa), symmetry_local);
-    }
-  }
-}
-
-void interpolate_owned_points(MyList<var> *VarList,
-                              double *Shellf, int Symmetry,
-                              int ordn,
-                              const BlockBinIndex &block_index,
-                              const CachedInterpPlan &plan)
-{
-  vector<InterpVarDesc> vars;
-  collect_interp_vars(VarList, vars);
-  const int num_var = static_cast<int>(vars.size());
-
-  for (size_t bi = 0; bi < plan.block_points.size(); ++bi)
-  {
-    if (plan.block_points[bi].empty())
-      continue;
-
-    Block *BP = block_index.views[bi].bp;
-    bool done = run_cuda_interp_for_block(BP, vars,
-                                          plan.block_points[bi],
-                                          plan.block_px[bi],
-                                          plan.block_py[bi],
-                                          plan.block_pz[bi],
-                                          Shellf, num_var, ordn, Symmetry);
-    if (!done)
-      run_cpu_interp_for_block(BP, vars,
-                               plan.block_points[bi],
-                               plan.block_px[bi],
-                               plan.block_py[bi],
-                               plan.block_pz[bi],
-                               Shellf, num_var, ordn, Symmetry);
-  }
-}
-} // namespace
-
-void patch_release_interp_plan_cache()
-{
-  release_interp_plan_cache_internal();
-}
-
-Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
-{
+
+  return -1;
+}
+} // namespace
+
+Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
+{

  int hbuffer_width = buffer_width;
  if (lev == 0)
@@ -895,15 +523,60 @@ void Patch::Interp_Points(MyList<var> *VarList,

  memset(Shellf, 0, sizeof(double) * NN * num_var);

-  double DH[dim];
-  for (int i = 0; i < dim; i++)
-    DH[i] = getdX(i);
-  BlockBinIndex block_index;
-  build_block_bin_index(this, DH, block_index);
-  CachedInterpPlan &plan = get_cached_interp_plan(this, NN, XX, Symmetry, myrank, DH, block_index, myrank == 0, false);
-  const int *owner_rank = plan.owner_rank.data();
-
-  interpolate_owned_points(VarList, Shellf, Symmetry, ordn, block_index, plan);
+  // owner_rank[j] records which MPI rank owns point j
+  // All ranks traverse the same block list so they all agree on ownership
+  int *owner_rank;
+  owner_rank = new int[NN];
+  for (int j = 0; j < NN; j++)
+    owner_rank[j] = -1;
+
+  double DH[dim];
+  for (int i = 0; i < dim; i++)
+    DH[i] = getdX(i);
+  BlockBinIndex block_index;
+  build_block_bin_index(this, DH, block_index);
+
+  for (int j = 0; j < NN; j++) // run along points
+  {
+    double pox[dim];
+    for (int i = 0; i < dim; i++)
+    {
+      pox[i] = XX[i][j];
+      if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
+      {
+        cout << "Patch::Interp_Points: point (";
+        for (int k = 0; k < dim; k++)
+        {
+          cout << XX[k][j];
+          if (k < dim - 1)
+            cout << ",";
+          else
+            cout << ") is out of current Patch." << endl;
+        }
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+    }
+
+    const int block_i = find_block_index_for_point(block_index, pox, DH);
+    if (block_i >= 0)
+    {
+      Block *BP = block_index.views[block_i].bp;
+      owner_rank[j] = BP->rank;
+      if (myrank == BP->rank)
+      {
+        //---> interpolation
+        varl = VarList;
+        int k = 0;
+        while (varl) // run along variables
+        {
+          f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
+                          pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
+          varl = varl->next;
+          k++;
+        }
+      }
+    }
+  }

  // Replace MPI_Allreduce with per-owner MPI_Bcast:
  // Group consecutive points by owner rank and broadcast each group.
@@ -958,8 +631,9 @@ void Patch::Interp_Points(MyList<var> *VarList,
      MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner, MPI_COMM_WORLD);
    }
  }
-
-}
+
+  delete[] owner_rank;
+}
 void Patch::Interp_Points(MyList<var> *VarList,
                          int NN, double **XX,
                          double *Shellf, int Symmetry,
@@ -987,22 +661,102 @@ void Patch::Interp_Points(MyList<var> *VarList,

  memset(Shellf, 0, sizeof(double) * NN * num_var);

-  double DH[dim];
-  for (int i = 0; i < dim; i++)
-    DH[i] = getdX(i);
-  BlockBinIndex block_index;
-  build_block_bin_index(this, DH, block_index);
-  CachedInterpPlan &plan = get_cached_interp_plan(this, NN, XX, Symmetry, myrank, DH, block_index, myrank == 0, false);
-  const int *owner_rank = plan.owner_rank.data();
-
-  interpolate_owned_points(VarList, Shellf, Symmetry, ordn, block_index, plan);
+  // owner_rank[j] records which MPI rank owns point j
+  int *owner_rank;
+  owner_rank = new int[NN];
+  for (int j = 0; j < NN; j++)
+    owner_rank[j] = -1;
+
+  double DH[dim];
+  for (int i = 0; i < dim; i++)
+    DH[i] = getdX(i);
+  BlockBinIndex block_index;
+  build_block_bin_index(this, DH, block_index);
+
+  // --- Interpolation phase (identical to original) ---
+  for (int j = 0; j < NN; j++)
+  {
+    double pox[dim];
+    for (int i = 0; i < dim; i++)
+    {
+      pox[i] = XX[i][j];
+      if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
+      {
+        cout << "Patch::Interp_Points: point (";
+        for (int k = 0; k < dim; k++)
+        {
+          cout << XX[k][j];
+          if (k < dim - 1)
+            cout << ",";
+          else
+            cout << ") is out of current Patch." << endl;
+        }
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+    }
+
+    const int block_i = find_block_index_for_point(block_index, pox, DH);
+    if (block_i >= 0)
+    {
+      Block *BP = block_index.views[block_i].bp;
+      owner_rank[j] = BP->rank;
+      if (myrank == BP->rank)
+      {
+        varl = VarList;
+        int k = 0;
+        while (varl)
+        {
+          f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
+                          pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
+          varl = varl->next;
+          k++;
+        }
+      }
+    }
+  }

 #ifdef INTERP_LB_PROFILE
  double t_interp_end = MPI_Wtime();
  double t_interp_local = t_interp_end - t_interp_start;
 #endif

-  // --- Targeted point-to-point communication phase ---
+  // --- Error check for unfound points ---
+  for (int j = 0; j < NN; j++)
+  {
+    if (owner_rank[j] < 0 && myrank == 0)
+    {
+      cout << "ERROR: Patch::Interp_Points fails to find point (";
+      for (int d = 0; d < dim; d++)
+      {
+        cout << XX[d][j];
+        if (d < dim - 1)
+          cout << ",";
+        else
+          cout << ")";
+      }
+      cout << " on Patch (";
+      for (int d = 0; d < dim; d++)
+      {
+        cout << bbox[d] << "+" << lli[d] * DH[d];
+        if (d < dim - 1)
+          cout << ",";
+        else
+          cout << ")--";
+      }
+      cout << "(";
+      for (int d = 0; d < dim; d++)
+      {
+        cout << bbox[dim + d] << "-" << uui[d] * DH[d];
+        if (d < dim - 1)
+          cout << ",";
+        else
+          cout << ")" << endl;
+      }
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+  }
+
+  // --- Targeted point-to-point communication phase ---
  // Compute consumer_rank[j] using the same deterministic formula as surface_integral
  int *consumer_rank = new int[NN];
  {
@@ -1119,8 +873,9 @@ void Patch::Interp_Points(MyList<var> *VarList,
  delete[] send_offset;
  delete[] recv_offset;
  delete[] send_count;
-  delete[] recv_count;
-  delete[] consumer_rank;
+  delete[] recv_count;
+  delete[] consumer_rank;
+  delete[] owner_rank;

 #ifdef INTERP_LB_PROFILE
  {
@@ -1168,20 +923,64 @@ void Patch::Interp_Points(MyList<var> *VarList,

  memset(Shellf, 0, sizeof(double) * NN * num_var);

-  // Build global-to-local rank translation for Comm_here
-  MPI_Group world_group, local_group;
-  MPI_Comm_group(MPI_COMM_WORLD, &world_group);
-  MPI_Comm_group(Comm_here, &local_group);
+  // owner_rank[j] stores the global rank that owns point j
+  int *owner_rank;
+  owner_rank = new int[NN];
+  for (int j = 0; j < NN; j++)
+    owner_rank[j] = -1;

-  double DH[dim];
-  for (int i = 0; i < dim; i++)
-    DH[i] = getdX(i);
-  BlockBinIndex block_index;
-  build_block_bin_index(this, DH, block_index);
-  CachedInterpPlan &plan = get_cached_interp_plan(this, NN, XX, Symmetry, myrank, DH, block_index, lmyrank == 0, true);
-  const int *owner_rank = plan.owner_rank.data();
-
-  interpolate_owned_points(VarList, Shellf, Symmetry, ordn, block_index, plan);
+  // Build global-to-local rank translation for Comm_here
+  MPI_Group world_group, local_group;
+  MPI_Comm_group(MPI_COMM_WORLD, &world_group);
+  MPI_Comm_group(Comm_here, &local_group);
+
+  double DH[dim];
+  for (int i = 0; i < dim; i++)
+    DH[i] = getdX(i);
+  BlockBinIndex block_index;
+  build_block_bin_index(this, DH, block_index);
+
+  for (int j = 0; j < NN; j++) // run along points
+  {
+    double pox[dim];
+    for (int i = 0; i < dim; i++)
+    {
+      pox[i] = XX[i][j];
+      if (lmyrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
+      {
+        cout << "Patch::Interp_Points: point (";
+        for (int k = 0; k < dim; k++)
+        {
+          cout << XX[k][j];
+          if (k < dim - 1)
+            cout << ",";
+          else
+            cout << ") is out of current Patch." << endl;
+        }
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+    }
+
+    const int block_i = find_block_index_for_point(block_index, pox, DH);
+    if (block_i >= 0)
+    {
+      Block *BP = block_index.views[block_i].bp;
+      owner_rank[j] = BP->rank;
+      if (myrank == BP->rank)
+      {
+        //---> interpolation
+        varl = VarList;
+        int k = 0;
+        while (varl) // run along variables
+        {
+          f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
+                          pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
+          varl = varl->next;
+          k++;
+        }
+      }
+    }
+  }

  // Collect unique global owner ranks and translate to local ranks in Comm_here
  // Then broadcast each owner's points via MPI_Bcast on Comm_here
@@ -1209,9 +1008,10 @@ void Patch::Interp_Points(MyList<var> *VarList,
    }
  }

-  MPI_Group_free(&world_group);
-  MPI_Group_free(&local_group);
-}
+  MPI_Group_free(&world_group);
+  MPI_Group_free(&local_group);
+  delete[] owner_rank;
+}
 void Patch::checkBlock()
 {
  int myrank;
--- a/AMSS_NCKU_source/MPatch.h
+++ b/AMSS_NCKU_source/MPatch.h
@@ -8,7 +8,7 @@
 #include "var.h"
 #include "macrodef.h" //need dim here; Vertex or Cell; ghost_width

-class Patch
+class Patch
 {

 public:
@@ -50,8 +50,6 @@ public:
                         double *Shellf, int Symmetry, MPI_Comm Comm_here);
   void Find_Maximum(MyList<var> *VarList, double *XX,
                     double *Shellf, MPI_Comm Comm_here);
-};
-
-void patch_release_interp_plan_cache();
-
-#endif /* PATCH_H */
+};
+
+#endif /* PATCH_H */
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
--- a/AMSS_NCKU_source/Parallel.h
+++ b/AMSS_NCKU_source/Parallel.h
@@ -89,12 +89,9 @@ namespace Parallel
  void transfermix(MyList<gridseg> **src, MyList<gridseg> **dst,
                   MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
                   int Symmetry);
-  void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry);
-  void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry, const char *context);
-  void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
-  void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, const char *context);
-  void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
-  void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, const char *context);
+  void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry);
+  void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
+  void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);

  struct SyncCache {
    bool valid;
@@ -108,13 +105,12 @@ namespace Parallel
    int *send_buf_caps;
    int *recv_buf_caps;
    MPI_Request *reqs;
-    MPI_Status *stats;
-    int max_reqs;
-    bool lengths_valid;
-    int lengths_var_count;
-    int *tc_req_node;
-    int *tc_req_is_recv;
-    int *tc_completed;
+    MPI_Status *stats;
+    int max_reqs;
+    bool lengths_valid;
+    int *tc_req_node;
+    int *tc_req_is_recv;
+    int *tc_completed;
    SyncCache();
    void invalidate();
    void destroy();
@@ -125,20 +121,19 @@ namespace Parallel
                       MyList<var> *VarList1, MyList<var> *VarList2,
                       int Symmetry, SyncCache &cache);

-  struct AsyncSyncState {
-    int req_no;
-    bool active;
-    int mpi_tag;
-    int *req_node;
-    int *req_is_recv;
-    int pending_recv;
-    AsyncSyncState() : req_no(0), active(false), mpi_tag(0), req_node(0), req_is_recv(0), pending_recv(0) {}
-  };
+  struct AsyncSyncState {
+    int req_no;
+    bool active;
+    int *req_node;
+    int *req_is_recv;
+    int pending_recv;
+    AsyncSyncState() : req_no(0), active(false), req_node(0), req_is_recv(0), pending_recv(0) {}
+  };

-  void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
-                  SyncCache &cache, AsyncSyncState &state);
-  void Sync_finish(SyncCache &cache, AsyncSyncState &state,
-                   MyList<var> *VarList, int Symmetry, bool unpack_to_host = true);
+  void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
+                  SyncCache &cache, AsyncSyncState &state);
+  void Sync_finish(SyncCache &cache, AsyncSyncState &state,
+                   MyList<var> *VarList, int Symmetry);
  void OutBdLow2Hi(Patch *Patc, Patch *Patf,
                   MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
                   int Symmetry);
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -14,8 +14,7 @@ using namespace std;
 #include <string.h>
 #endif

-#include <time.h>
-#include <unistd.h>
+#include <time.h>

 #include "macrodef.h"
 #include "misc.h"
@@ -29,12 +28,8 @@ using namespace std;
 #include "rungekutta4_rout.h"
 #include "sommerfeld_rout.h"
 #include "getnp4.h"
-#include "shellfunctions.h"
-#include "parameters.h"
-#ifdef USE_GPU
-#include "bssn_macro.h"
-#include "bssn_gpu.h"
-#endif
+#include "shellfunctions.h"
+#include "parameters.h"

 #ifdef With_AHF
 #include "derivatives.h"
@@ -745,12 +740,11 @@ void bssn_class::Initialize()
  // Initialize sync caches (per-level, for predictor and corrector)
  sync_cache_pre = new Parallel::SyncCache[GH->levels];
  sync_cache_cor = new Parallel::SyncCache[GH->levels];
-  sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
-  sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
-  sync_cache_restrict = new Parallel::SyncCache[GH->levels];
-  sync_cache_outbd = new Parallel::SyncCache[GH->levels];
-  sync_cache_psi4 = new Parallel::SyncCache[GH->levels];
-}
+  sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
+  sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
+  sync_cache_restrict = new Parallel::SyncCache[GH->levels];
+  sync_cache_outbd = new Parallel::SyncCache[GH->levels];
+}

 //================================================================================================

@@ -762,8 +756,8 @@ void bssn_class::Initialize()

 //================================================================================================

-bssn_class::~bssn_class()
-{
+bssn_class::~bssn_class()
+{
 #ifdef With_AHF
  AHList->clearList();
  AHDList->clearList();
@@ -1020,30 +1014,12 @@ bssn_class::~bssn_class()
      sync_cache_rp_coarse[i].destroy();
    delete[] sync_cache_rp_coarse;
  }
-  if (sync_cache_rp_fine)
-  {
-    for (int i = 0; i < GH->levels; i++)
-      sync_cache_rp_fine[i].destroy();
-    delete[] sync_cache_rp_fine;
-  }
-  if (sync_cache_restrict)
-  {
-    for (int i = 0; i < GH->levels; i++)
-      sync_cache_restrict[i].destroy();
-    delete[] sync_cache_restrict;
-  }
-  if (sync_cache_outbd)
-  {
-    for (int i = 0; i < GH->levels; i++)
-      sync_cache_outbd[i].destroy();
-    delete[] sync_cache_outbd;
-  }
-  if (sync_cache_psi4)
-  {
-    for (int i = 0; i < GH->levels; i++)
-      sync_cache_psi4[i].destroy();
-    delete[] sync_cache_psi4;
-  }
+  if (sync_cache_rp_fine)
+  {
+    for (int i = 0; i < GH->levels; i++)
+      sync_cache_rp_fine[i].destroy();
+    delete[] sync_cache_rp_fine;
+  }

  delete GH;
 #ifdef WithShell
@@ -1076,25 +1052,8 @@ bssn_class::~bssn_class()
  delete ConVMonitor;
  delete Waveshell;

-  delete CheckPoint;
-}
-
-void bssn_class::InvalidateSyncCaches()
-{
-  if (!GH)
-    return;
-
-  for (int il = 0; il < GH->levels; il++)
-  {
-    sync_cache_pre[il].invalidate();
-    sync_cache_cor[il].invalidate();
-    sync_cache_rp_coarse[il].invalidate();
-    sync_cache_rp_fine[il].invalidate();
-    sync_cache_restrict[il].invalidate();
-    sync_cache_outbd[il].invalidate();
-    sync_cache_psi4[il].invalidate();
-  }
-}
+  delete CheckPoint;
+}

 //================================================================================================

@@ -2075,12 +2034,11 @@ void bssn_class::Read_Ansorg()

 //================================================================================================

-void bssn_class::Evolve(int Steps)
-{
-  clock_t prev_clock, curr_clock;
-  double LastDump = 0.0, LastCheck = 0.0, Last2dDump = 0.0;
-  LastAnas = 0;
-  LastConsOut = 0;
+void bssn_class::Evolve(int Steps)
+{
+  double prev_clock = 0.0, curr_clock = 0.0;
+  double LastDump = 0.0, LastCheck = 0.0, Last2dDump = 0.0;
+  LastAnas = 0;
 #if 0
 //initial checkpoint for special uasge
     {
@@ -2187,14 +2145,14 @@ void bssn_class::Evolve(int Steps)

  GH->settrfls(trfls);

-  for (int ncount = 1; ncount < Steps + 1; ncount++)
-  {
+  for (int ncount = 1; ncount < Steps + 1; ncount++)
+  {
    // special for large mass ratio consideration
    //     if(fabs(Porg0[0][0]-Porg0[1][0])+fabs(Porg0[0][1]-Porg0[1][1])+fabs(Porg0[0][2]-Porg0[1][2])<1e-6) 
    //     { GH->levels=GH->movls; }

-    if (myrank == 0)
-      curr_clock = clock();
+    if (myrank == 0)
+      curr_clock = MPI_Wtime();
 #if (PSTR == 0)
    RecursiveStep(0);
 #elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
@@ -2247,16 +2205,16 @@ void bssn_class::Evolve(int Steps)
      }
    }

-    if (myrank == 0)
-    {
-      prev_clock = curr_clock;
-      curr_clock = clock();
-      cout << endl;
-      cout << " Timestep # " << ncount << ": integrating to time: " << PhysTime << "   "
-           << " Computer used " << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) 
-           << " seconds! " << endl;
-      // cout << endl;
-    }
+    if (myrank == 0)
+    {
+      prev_clock = curr_clock;
+      curr_clock = MPI_Wtime();
+      cout << endl;
+      cout << " Timestep # " << ncount << ": integrating to time: " << PhysTime << "   "
+           << " Computer used " << (curr_clock - prev_clock)
+           << " seconds! " << endl;
+      // cout << endl;
+    }

    if (PhysTime >= TotalTime)
      break;
@@ -2265,7 +2223,7 @@ void bssn_class::Evolve(int Steps)
    GH->Regrid(Symmetry, BH_num, Porgbr, Porg0,
               SynchList_cor, OldStateList, StateList, SynchList_pre,
               fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor);
-    InvalidateSyncCaches();
+    for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
 #endif

 #if (REGLEV == 0 && (PSTR == 1 || PSTR == 2))
@@ -2338,21 +2296,18 @@ void bssn_class::Evolve(int Steps)
    ////////////////////////////////////////////////////////////

    // When LastCheck >= CheckTime, perform runtime checks and output status data
-    if (LastCheck >= CheckTime)
-    {
-      LastCheck = 0;
-
-      CheckPoint->write_Black_Hole_position(BH_num_input, BH_num, Porg0, Porgbr, Mass);
+    if (LastCheck >= CheckTime)
+    {
+      LastCheck = 0;
+
+      CheckPoint->write_Black_Hole_position(BH_num_input, BH_num, Porg0, Porgbr, Mass);
      CheckPoint->writecheck_cgh(PhysTime, GH);
 #ifdef WithShell
      CheckPoint->writecheck_sh(PhysTime, SH);
-#endif
-      CheckPoint->write_bssn(LastDump, Last2dDump, LastAnas);
-    }
-
-    // Keep output/analysis phases aligned across ranks before the next coarse step.
-    MPI_Barrier(MPI_COMM_WORLD);
-  }
+#endif
+      CheckPoint->write_bssn(LastDump, Last2dDump, LastAnas);
+    }
+  }
  /*
  #ifdef With_AHF
  // final apparent horizon finding
@@ -2486,7 +2441,7 @@ void bssn_class::RecursiveStep(int lev)
  if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
                      SynchList_cor, OldStateList, StateList, SynchList_pre,
                      fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
-  InvalidateSyncCaches();
+  for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
 #endif
 }

@@ -2665,7 +2620,7 @@ void bssn_class::ParallelStep()
  if (GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
                      SynchList_cor, OldStateList, StateList, SynchList_pre,
                      fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
-  InvalidateSyncCaches();
+  for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
 #endif
 }

@@ -2832,7 +2787,7 @@ void bssn_class::ParallelStep()
        if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
                            SynchList_cor, OldStateList, StateList, SynchList_pre,
                            fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor))
-        InvalidateSyncCaches();
+        for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }

        //               a_stream.clear();
        //               a_stream.str("");
@@ -2847,7 +2802,7 @@ void bssn_class::ParallelStep()
      if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
                          SynchList_cor, OldStateList, StateList, SynchList_pre,
                          fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
-      InvalidateSyncCaches();
+      for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }

      //               a_stream.clear();
      //               a_stream.str("");
@@ -2866,7 +2821,7 @@ void bssn_class::ParallelStep()
          if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
                              SynchList_cor, OldStateList, StateList, SynchList_pre,
                              fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
-          InvalidateSyncCaches();
+          for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }

          //               a_stream.clear();
          //               a_stream.str("");
@@ -2882,7 +2837,7 @@ void bssn_class::ParallelStep()
          if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
                              SynchList_cor, OldStateList, StateList, SynchList_pre,
                              fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
-          InvalidateSyncCaches();
+          for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }

          //               a_stream.clear();
          //               a_stream.str("");
@@ -3071,14 +3026,9 @@ void bssn_class::RecursiveStep(int lev, int num) // in all 2^(lev+1)-1 steps

 #if (PSTR == 0)
 #if 1
-void bssn_class::Step(int lev, int YN)
-{
-#ifdef USE_GPU
-  Step_MainPath_GPU(lev, YN);
-  return;
-#endif
-
-  setpbh(BH_num, Porg0, Mass, BH_num_input);
+void bssn_class::Step(int lev, int YN)
+{
+  setpbh(BH_num, Porg0, Mass, BH_num_input);

  double dT_lev = dT * pow(0.5, Mymax(lev, trfls));

@@ -6298,7 +6248,7 @@ for(int ilev = GH->levels-1;ilev>=lev;ilev--)
 for(int ilev=GH->levels-1;ilev>lev;ilev--)
    RestrictProlong(ilev,1,false,DG_List,DG_List,DG_List);
 #else
-  Parallel::Sync_cached(GH->PatL[lev], DG_List, Symmetry, sync_cache_psi4[lev]);
+  Parallel::Sync(GH->PatL[lev], DG_List, Symmetry);
 #endif

 #ifdef WithShell
@@ -6953,10 +6903,10 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)
 {
  LastAnas += dT_lev;

-  if (LastAnas >= AnasTime)
-  {
-#ifdef Point_Psi4
-#error "not support parallel levels yet"
+  if (LastAnas >= AnasTime)
+  {
+#ifdef Point_Psi4
+#error "not support parallel levels yet"
    // Gam_ijk and R_ij have been calculated in Interp_Constraint()
    double SYM = 1, ANT = -1;
    for (int levh = lev; levh < GH->levels; levh++)
@@ -7300,9 +7250,9 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)

 //================================================================================================

-void bssn_class::Constraint_Out()
-{
-  LastConsOut += dT * pow(0.5, Mymax(0, trfls));
+void bssn_class::Constraint_Out()
+{
+  LastConsOut += dT * pow(0.5, Mymax(0, trfls));

  if (LastConsOut >= AnasTime)
  // Constraint violation
@@ -7322,15 +7272,12 @@ void bssn_class::Constraint_Out()
          MyList<Block> *BP = Pp->data->blb;
          while (BP)
          {
-            Block *cg = BP->data;
-            if (myrank == cg->rank)
-            {
-#ifdef USE_GPU
-              gpu_rhs(CALLED_BY_CONSTRAINT_CONS_ONLY, myrank, RHS_PARA_CALLED_Constraint_Out);
-#else
-              f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
-                                 cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
-                                 cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
+            Block *cg = BP->data;
+            if (myrank == cg->rank)
+            {
+              f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+                                 cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
+                                 cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                                 cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
                                 cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
                                 cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn],
@@ -7358,12 +7305,11 @@ void bssn_class::Constraint_Out()
                                 cg->fgfs[Gamzyy->sgfn], cg->fgfs[Gamzyz->sgfn], cg->fgfs[Gamzzz->sgfn],
                                 cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn], 
                                 cg->fgfs[Ryy->sgfn], cg->fgfs[Ryz->sgfn], cg->fgfs[Rzz->sgfn],
-                                 cg->fgfs[Cons_Ham->sgfn],
-                                 cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
-                                 cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
-                                 Symmetry, lev, ndeps, pre);
-#endif
-            }
+                                 cg->fgfs[Cons_Ham->sgfn],
+                                 cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
+                                 cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
+                                 Symmetry, lev, ndeps, pre);
+            }
            if (BP == Pp->data->ble)
              break;
            BP = BP->next;
@@ -7371,7 +7317,7 @@ void bssn_class::Constraint_Out()
          Pp = Pp->next;
        }
      }
-      Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry, "bssn_class::Constraint_Out[level]");
+      Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry);
    }
 #ifdef WithShell
    if (0) // if the constrait quantities can be reused from the step rhs calculation
@@ -7593,7 +7539,7 @@ void bssn_class::AH_Prepare_derivatives()
      }
      Pp = Pp->next;
    }
-    Parallel::Sync(GH->PatL[lev], AHDList, Symmetry, "bssn_class::AH_Prepare_derivatives");
+    Parallel::Sync(GH->PatL[lev], AHDList, Symmetry);
  }
 }

@@ -7829,15 +7775,12 @@ void bssn_class::Interp_Constraint(bool infg)
          MyList<Block> *BP = Pp->data->blb;
          while (BP)
          {
-            Block *cg = BP->data;
-            if (myrank == cg->rank)
-            {
-#ifdef USE_GPU
-              gpu_rhs(CALLED_BY_CONSTRAINT_CONS_ONLY, myrank, RHS_PARA_CALLED_Interp_Constraint);
-#else
-              f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
-                                 cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
-                                 cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
+            Block *cg = BP->data;
+            if (myrank == cg->rank)
+            {
+              f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+                                 cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
+                                 cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                                 cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
                                 cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
                                 cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn],
@@ -7865,12 +7808,11 @@ void bssn_class::Interp_Constraint(bool infg)
                                 cg->fgfs[Gamzyy->sgfn], cg->fgfs[Gamzyz->sgfn], cg->fgfs[Gamzzz->sgfn],
                                 cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn], 
                                 cg->fgfs[Ryy->sgfn], cg->fgfs[Ryz->sgfn], cg->fgfs[Rzz->sgfn],
-                                 cg->fgfs[Cons_Ham->sgfn],
-                                 cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
-                                 cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
-                                 Symmetry, lev, ndeps, pre);
-#endif
-            }
+                                 cg->fgfs[Cons_Ham->sgfn],
+                                 cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
+                                 cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
+                                 Symmetry, lev, ndeps, pre);
+            }
            if (BP == Pp->data->ble)
              break;
            BP = BP->next;
@@ -7878,7 +7820,7 @@ void bssn_class::Interp_Constraint(bool infg)
          Pp = Pp->next;
        }
      }
-      Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry, "bssn_class::Interp_Constraint[level]");
+      Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry);
    }
 #ifdef WithShell
    if (0) // if the constrait quantities can be reused from the step rhs calculation
@@ -8136,7 +8078,7 @@ void bssn_class::Compute_Constraint()
        Pp = Pp->next;
      }
    }
-    Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry, "bssn_class::Compute_Constraint[level]");
+    Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry);
  }
  // prolong restrict constraint quantities
  for (lev = GH->levels - 1; lev > 0; lev--)
@@ -8449,18 +8391,12 @@ void bssn_class::Enforce_algcon(int lev, int fg)

 //================================================================================================

-bool bssn_class::check_Stdin_Abort() 
-{
-    // Non-interactive launches (mpirun via Python/subprocess, batch jobs, redirected stdin)
-    // should not probe stdin. Some MPI runtimes treat stdin as a managed channel and can
-    // fail when rank 0 polls/consumes it.
-    if (!isatty(STDIN_FILENO)) {
-        return false;
-    }
-
-    fd_set readfds;
-
-    struct timeval timeout;
+bool bssn_class::check_Stdin_Abort() 
+{
+
+    fd_set readfds;
+
+    struct timeval timeout;

    FD_ZERO(&readfds);
    FD_SET(STDIN_FILENO, &readfds);
@@ -8469,17 +8405,14 @@ bool bssn_class::check_Stdin_Abort()
    timeout.tv_sec = 0;
    timeout.tv_usec = 0;

-    int activity = select(STDIN_FILENO + 1, &readfds, nullptr, nullptr, &timeout);
-    if (activity <= 0) {
-        return false;
-    }
-
-    if (FD_ISSET(STDIN_FILENO, &readfds)) {
-        string input_abort;
-        if (cin >> input_abort) {
-            if (input_abort == "stop") {
-                return true;
-            }
+    int activity = select(STDIN_FILENO + 1, &readfds, nullptr, nullptr, &timeout);
+
+    if (activity > 0 && FD_ISSET(STDIN_FILENO, &readfds)) {
+        string input_abort;
+        if (cin >> input_abort) {
+            if (input_abort == "stop") {
+                return true;
+            }
        }
    }

--- a/AMSS_NCKU_source/bssn_class.h
+++ b/AMSS_NCKU_source/bssn_class.h
@@ -128,11 +128,10 @@ public:

       Parallel::SyncCache *sync_cache_pre;  // per-level cache for predictor sync
       Parallel::SyncCache *sync_cache_cor;  // per-level cache for corrector sync
-       Parallel::SyncCache *sync_cache_rp_coarse;  // RestrictProlong sync on PatL[lev-1]
-       Parallel::SyncCache *sync_cache_rp_fine;    // RestrictProlong sync on PatL[lev]
-       Parallel::SyncCache *sync_cache_restrict;   // cached Restrict in RestrictProlong
-       Parallel::SyncCache *sync_cache_outbd;      // cached OutBdLow2Hi in RestrictProlong
-       Parallel::SyncCache *sync_cache_psi4;       // cached Psi4 sync on PatL[lev]
+       Parallel::SyncCache *sync_cache_rp_coarse;  // RestrictProlong sync on PatL[lev-1]
+       Parallel::SyncCache *sync_cache_rp_fine;    // RestrictProlong sync on PatL[lev]
+       Parallel::SyncCache *sync_cache_restrict;   // cached Restrict in RestrictProlong
+       Parallel::SyncCache *sync_cache_outbd;      // cached OutBdLow2Hi in RestrictProlong

       monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
       monitor *ConVMonitor;
@@ -172,20 +171,16 @@ public:
       
       bool check_Stdin_Abort(); 

-       virtual void Setup_Initial_Data_Cao();
-       virtual void Setup_Initial_Data_Lousto();
-       virtual void Initialize();
-       virtual void Read_Ansorg();
-       virtual void Read_Pablo() {};
-       void InvalidateSyncCaches();
-       virtual void Compute_Psi4(int lev);
-       virtual void Step(int lev, int YN);
-#ifdef USE_GPU
-       void Step_MainPath_GPU(int lev, int YN);
-#endif
-       virtual void Interp_Constraint(bool infg);
-       virtual void Constraint_Out();
-       virtual void Compute_Constraint();
+       virtual void Setup_Initial_Data_Cao();
+       virtual void Setup_Initial_Data_Lousto();
+       virtual void Initialize();
+       virtual void Read_Ansorg();
+       virtual void Read_Pablo() {};
+       virtual void Compute_Psi4(int lev);
+       virtual void Step(int lev, int YN);
+       virtual void Interp_Constraint(bool infg);
+       virtual void Constraint_Out();
+       virtual void Compute_Constraint();

 #ifdef With_AHF
 protected:
--- a/AMSS_NCKU_source/bssn_cuda_ops.cu
+++ b/AMSS_NCKU_source/bssn_cuda_ops.cu
--- a/AMSS_NCKU_source/bssn_cuda_ops.h
+++ b/AMSS_NCKU_source/bssn_cuda_ops.h
@@ -1,68 +0,0 @@
-#ifndef BSSN_CUDA_OPS_H
-#define BSSN_CUDA_OPS_H
-
-int bssn_cuda_enforce_ga(int *ex,
-                         double *dxx, double *gxy, double *gxz,
-                         double *dyy, double *gyz, double *dzz,
-                         double *Axx, double *Axy, double *Axz,
-                         double *Ayy, double *Ayz, double *Azz);
-
-int bssn_cuda_rk4_boundary_var(int *ex, double dT,
-                               const double *X, const double *Y, const double *Z,
-                               double xmin, double ymin, double zmin,
-                               double xmax, double ymax, double zmax,
-                               const double *state0,
-                               const double *phi_field,
-                               const double *lap_field,
-                               const double *boundary_src,
-                               double *stage_data,
-                               double *rhs_accum,
-                               double propspeed,
-                               const double SoA[3],
-                               int symmetry,
-                               int lev,
-                               int rk_stage,
-                               bool force_host_boundary_fix,
-                               bool download_to_host = true);
-
-int bssn_cuda_rk4_boundary_batch(int *ex, double dT,
-                                 const double *X, const double *Y, const double *Z,
-                                 double xmin, double ymin, double zmin,
-                                 double xmax, double ymax, double zmax,
-                                 int symmetry,
-                                 const double *const *state0_list,
-                                 double *const *stage_data_list,
-                                 double *const *rhs_accum_list,
-                                 int num_var,
-                                 int rk_stage,
-                                 bool download_to_host = false);
-
-int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_host = true);
-int bssn_cuda_download_buffer(int *ex, double *host_ptr);
-void bssn_cuda_release_rk4_caches();
-void bssn_cuda_release_interp_caches();
-
-int bssn_cuda_prolong3_pack(int wei,
-                            const double *llbc, const double *uubc, const int *extc, const double *func,
-                            const double *llbf, const double *uubf, const int *extf, double *funf,
-                            const double *llbp, const double *uubp,
-                            const double *SoA, int symmetry);
-
-int bssn_cuda_restrict3_pack(int wei,
-                             const double *llbc, const double *uubc, const int *extc, double *func,
-                             const double *llbf, const double *uubf, const int *extf, const double *funf,
-                             const double *llbr, const double *uubr,
-                             const double *SoA, int symmetry);
-
-int bssn_cuda_interp_points_batch(const int *ex,
-                                  const double *X, const double *Y, const double *Z,
-                                  const double *const *fields,
-                                  const double *soa_flat,
-                                  int num_var,
-                                  const double *px, const double *py, const double *pz,
-                                  int num_points,
-                                  int ordn,
-                                  int symmetry,
-                                  double *out);
-
-#endif
--- a/AMSS_NCKU_source/bssn_cuda_step.C
+++ b/AMSS_NCKU_source/bssn_cuda_step.C
@@ -1,936 +0,0 @@
-#include "macrodef.h"
-
-#ifdef USE_GPU
-
-#include <algorithm>
-#include <cmath>
-#include <cstring>
-#include <cstdlib>
-#include <iomanip>
-#include <vector>
-
-#include "bssn_class.h"
-#include "bssn_cuda_ops.h"
-#include "bssn_gpu.h"
-#include "bssn_macro.h"
-
-namespace
-{
-enum StageProfileMetric
-{
-  STAGE_PROFILE_TOTAL = 0,
-  STAGE_PROFILE_RHS,
-  STAGE_PROFILE_RUN_STAGE,
-  STAGE_PROFILE_RUN_STAGE_DEVICE,
-  STAGE_PROFILE_RUN_STAGE_HOST_FIX,
-  STAGE_PROFILE_LOWERBOUND,
-  STAGE_PROFILE_ENSURE,
-  STAGE_PROFILE_DOWNLOAD,
-  STAGE_PROFILE_CLEAR_CACHE,
-  STAGE_PROFILE_SYNC_START,
-  STAGE_PROFILE_SYNC_FINISH,
-  STAGE_PROFILE_REFRESH,
-  STAGE_PROFILE_COUNT
-};
-
-static const int kStageProfileMaxLevels = 32;
-
-struct StageProfileStore
-{
-  bool env_checked;
-  bool enabled;
-  int calls[kStageProfileMaxLevels];
-  double metric[kStageProfileMaxLevels][STAGE_PROFILE_COUNT];
-};
-
-StageProfileStore &stage_profile_store()
-{
-  static StageProfileStore store = {};
-  return store;
-}
-
-bool stage_profile_enabled()
-{
-  StageProfileStore &store = stage_profile_store();
-  if (!store.env_checked)
-  {
-    const char *env = getenv("AMSS_GPU_STAGE_TIMING");
-    store.enabled = (env && env[0] && strcmp(env, "0") != 0);
-    store.env_checked = true;
-  }
-  return store.enabled;
-}
-
-void stage_profile_note_call(int lev)
-{
-  if (lev >= 0 && lev < kStageProfileMaxLevels)
-    stage_profile_store().calls[lev]++;
-}
-
-void stage_profile_add(int lev, StageProfileMetric metric, double seconds)
-{
-  if (lev >= 0 && lev < kStageProfileMaxLevels)
-    stage_profile_store().metric[lev][metric] += seconds;
-}
-
-const char *stage_profile_metric_name(StageProfileMetric metric)
-{
-  switch (metric)
-  {
-  case STAGE_PROFILE_TOTAL:
-    return "total";
-  case STAGE_PROFILE_RHS:
-    return "rhs";
-  case STAGE_PROFILE_RUN_STAGE:
-    return "run_stage";
-  case STAGE_PROFILE_RUN_STAGE_DEVICE:
-    return "run_stage_dev";
-  case STAGE_PROFILE_RUN_STAGE_HOST_FIX:
-    return "run_stage_host";
-  case STAGE_PROFILE_LOWERBOUND:
-    return "lower";
-  case STAGE_PROFILE_ENSURE:
-    return "ensure";
-  case STAGE_PROFILE_DOWNLOAD:
-    return "download";
-  case STAGE_PROFILE_CLEAR_CACHE:
-    return "clear_cache";
-  case STAGE_PROFILE_SYNC_START:
-    return "sync_start";
-  case STAGE_PROFILE_SYNC_FINISH:
-    return "sync_finish";
-  case STAGE_PROFILE_REFRESH:
-    return "refresh";
-  default:
-    return "unknown";
-  }
-}
-} // namespace
-
-void bssn_cuda_dump_stage_profile()
-{
-  if (!stage_profile_enabled())
-    return;
-
-  int myrank = 0;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
-  StageProfileStore &store = stage_profile_store();
-  int global_calls_sum[kStageProfileMaxLevels] = {};
-  double global_metric_sum[kStageProfileMaxLevels][STAGE_PROFILE_COUNT] = {};
-  double global_metric_max[kStageProfileMaxLevels][STAGE_PROFILE_COUNT] = {};
-
-  MPI_Reduce(store.calls, global_calls_sum, kStageProfileMaxLevels, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
-  MPI_Reduce(store.metric[0], global_metric_sum[0],
-             kStageProfileMaxLevels * STAGE_PROFILE_COUNT,
-             MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
-  MPI_Reduce(store.metric[0], global_metric_max[0],
-             kStageProfileMaxLevels * STAGE_PROFILE_COUNT,
-             MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
-
-  if (myrank != 0)
-    return;
-
-  cout << endl;
-  cout << " GPU stage timing summary (sum/max over MPI ranks) " << endl;
-  cout << " lev  calls";
-  for (int metric = 0; metric < STAGE_PROFILE_COUNT; ++metric)
-    cout << "  " << setw(22) << stage_profile_metric_name(static_cast<StageProfileMetric>(metric));
-  cout << endl;
-
-  for (int lev = 0; lev < kStageProfileMaxLevels; ++lev)
-  {
-    if (global_calls_sum[lev] == 0)
-      continue;
-
-    cout << setw(4) << lev << "  " << setw(5) << global_calls_sum[lev];
-    for (int metric = 0; metric < STAGE_PROFILE_COUNT; ++metric)
-    {
-      cout << "  "
-           << setw(10) << setprecision(6) << fixed << global_metric_sum[lev][metric]
-           << "/"
-           << setw(10) << setprecision(6) << fixed << global_metric_max[lev][metric];
-    }
-    cout << endl;
-  }
-  cout << endl;
-}
-
-void bssn_class::Step_MainPath_GPU(int lev, int YN)
-{
-#ifdef WithShell
-#error "Step_MainPath_GPU currently supports Patch grids only."
-#endif
-
-  const bool profile_enabled = stage_profile_enabled();
-  const double step_total_begin = profile_enabled ? MPI_Wtime() : 0.0;
-  if (profile_enabled)
-    stage_profile_note_call(lev);
-
-  if (bssn_gpu_bind_process_device(myrank))
-  {
-    cerr << "GPU device bind failure on MPI rank " << myrank << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-  if (profile_enabled)
-  {
-    const double t0 = MPI_Wtime();
-    bssn_gpu_clear_cached_device_buffers();
-    stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0);
-  }
-  else
-    bssn_gpu_clear_cached_device_buffers();
-
-  setpbh(BH_num, Porg0, Mass, BH_num_input);
-
-  const double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
-
-#if (MAPBH == 1)
-  if (BH_num > 0 && lev == GH->levels - 1)
-  {
-    compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
-    for (int ithBH = 0; ithBH < BH_num; ithBH++)
-    {
-      for (int ith = 0; ith < 3; ith++)
-        Porg1[ithBH][ith] = Porg0[ithBH][ith] + Porg_rhs[ithBH][ith] * dT_lev;
-      if (Symmetry > 0)
-        Porg1[ithBH][2] = fabs(Porg1[ithBH][2]);
-      if (Symmetry == 2)
-      {
-        Porg1[ithBH][0] = fabs(Porg1[ithBH][0]);
-        Porg1[ithBH][1] = fabs(Porg1[ithBH][1]);
-      }
-    }
-  }
-
-  if (lev == a_lev)
-    AnalysisStuff(lev, dT_lev);
-#endif
-
-#ifdef With_AHF
-  AH_Step_Find(lev, dT_lev);
-#endif
-
-  const bool BB = fgt(PhysTime, StartTime, dT_lev / 2);
-  (void)BB;
-  double ndeps = (lev < GH->movls) ? numepsb : numepss;
-  double TRK4 = PhysTime;
-  int iter_count = 0;
-  int pre = 0, cor = 1;
-  int ERROR = 0;
-  const bool keep_stage_sync_on_device = (RPS == 1) && (MAPBH == 1) && (REGLEV == 0);
-
-  auto run_stage_on_block =
-      [&](Block *cg, Patch *patch, MyList<var> *state0_list,
-          MyList<var> *boundary_src_list, MyList<var> *stage_data_list,
-          MyList<var> *rhs_list, int rk_stage) {
-        MyList<var> *varl0 = state0_list;
-        MyList<var> *varlb = boundary_src_list;
-        MyList<var> *varls = stage_data_list;
-        MyList<var> *varlr = rhs_list;
-        std::vector<const double *> batch_state0;
-        std::vector<double *> batch_stage;
-        std::vector<double *> batch_rhs;
-
-        while (varl0)
-        {
-          const bool force_host_boundary_fix = false;
-          const bool can_batch_device_path = (lev > 0) && !force_host_boundary_fix;
-          if (can_batch_device_path)
-          {
-            batch_state0.push_back(cg->fgfs[varl0->data->sgfn]);
-            batch_stage.push_back(cg->fgfs[varls->data->sgfn]);
-            batch_rhs.push_back(cg->fgfs[varlr->data->sgfn]);
-            varl0 = varl0->next;
-            varlb = varlb->next;
-            varls = varls->next;
-            varlr = varlr->next;
-            continue;
-          }
-
-          const double var_begin = profile_enabled ? MPI_Wtime() : 0.0;
-          if (bssn_cuda_rk4_boundary_var(cg->shape, dT_lev,
-                                         cg->X[0], cg->X[1], cg->X[2],
-                                         patch->bbox[0], patch->bbox[1], patch->bbox[2],
-                                         patch->bbox[3], patch->bbox[4], patch->bbox[5],
-                                         cg->fgfs[varl0->data->sgfn],
-                                         cg->fgfs[phi0->sgfn],
-                                         cg->fgfs[Lap0->sgfn],
-                                         cg->fgfs[varlb->data->sgfn],
-                                         cg->fgfs[varls->data->sgfn],
-                                         cg->fgfs[varlr->data->sgfn],
-                                         varl0->data->propspeed,
-                                         varl0->data->SoA,
-                                         Symmetry, lev, rk_stage,
-                                         force_host_boundary_fix, false))
-          {
-            cerr << "GPU rk4/boundary failure: lev=" << lev
-                 << " rk_stage=" << rk_stage
-                 << " var=" << varl0->data->name
-                 << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
-                 << cg->bbox[1] << ":" << cg->bbox[4] << ","
-                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
-            ERROR = 1;
-            break;
-          }
-          if (profile_enabled)
-          {
-            stage_profile_add(lev,
-                              force_host_boundary_fix ? STAGE_PROFILE_RUN_STAGE_HOST_FIX
-                                                      : STAGE_PROFILE_RUN_STAGE_DEVICE,
-                              MPI_Wtime() - var_begin);
-          }
-          varl0 = varl0->next;
-          varlb = varlb->next;
-          varls = varls->next;
-          varlr = varlr->next;
-        }
-
-        if (!ERROR && !batch_state0.empty())
-        {
-          const double batch_begin = profile_enabled ? MPI_Wtime() : 0.0;
-          if (bssn_cuda_rk4_boundary_batch(cg->shape, dT_lev,
-                                           cg->X[0], cg->X[1], cg->X[2],
-                                           patch->bbox[0], patch->bbox[1], patch->bbox[2],
-                                           patch->bbox[3], patch->bbox[4], patch->bbox[5],
-                                           Symmetry,
-                                           &batch_state0[0],
-                                           &batch_stage[0],
-                                           &batch_rhs[0],
-                                           static_cast<int>(batch_state0.size()),
-                                           rk_stage, false))
-          {
-            cerr << "GPU rk4/boundary batch failure: lev=" << lev
-                 << " rk_stage=" << rk_stage
-                 << " vars=" << batch_state0.size()
-                 << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
-                 << cg->bbox[1] << ":" << cg->bbox[4] << ","
-                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
-            ERROR = 1;
-          }
-          else if (profile_enabled)
-          {
-            stage_profile_add(lev, STAGE_PROFILE_RUN_STAGE_DEVICE, MPI_Wtime() - batch_begin);
-          }
-        }
-      };
-
-  auto stage_download_var_list =
-      [&](Block *cg, MyList<var> *var_list, bool skip_unmapped) {
-        std::vector<double *> batch_host_ptrs;
-        std::vector<MyList<var> *> batch_vars;
-        while (var_list)
-        {
-          double *host_ptr = cg->fgfs[var_list->data->sgfn];
-          if (skip_unmapped && !bssn_gpu_find_device_buffer(host_ptr))
-          {
-            var_list = var_list->next;
-            continue;
-          }
-          batch_host_ptrs.push_back(host_ptr);
-          batch_vars.push_back(var_list);
-          var_list = var_list->next;
-        }
-        if (!batch_host_ptrs.empty() &&
-            bssn_gpu_download_buffer_batch(cg->shape, &batch_host_ptrs[0],
-                                           static_cast<int>(batch_host_ptrs.size())))
-        {
-          for (size_t i = 0; i < batch_host_ptrs.size(); ++i)
-          {
-            if (bssn_cuda_download_buffer(cg->shape, batch_host_ptrs[i]))
-            {
-              cerr << "GPU stage download failure: lev=" << lev
-                   << " var=" << batch_vars[i]->data->name
-                   << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
-                   << cg->bbox[1] << ":" << cg->bbox[4] << ","
-                   << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
-              ERROR = 1;
-              break;
-            }
-          }
-        }
-      };
-
-  auto stage_download_patch_list =
-      [&](MyList<var> *var_list, bool skip_unmapped) {
-        MyList<Patch> *patch_it = GH->PatL[lev];
-        while (patch_it)
-        {
-          MyList<Block> *block_it = patch_it->data->blb;
-          while (block_it)
-          {
-            Block *cg = block_it->data;
-            if (myrank == cg->rank)
-              stage_download_var_list(cg, var_list, skip_unmapped);
-
-            if (block_it == patch_it->data->ble)
-              break;
-            block_it = block_it->next;
-          }
-          if (ERROR)
-            break;
-          patch_it = patch_it->next;
-        }
-      };
-
-  auto ensure_stage_device_var_list =
-      [&](Block *cg, MyList<var> *var_list) {
-        const int n = cg->shape[0] * cg->shape[1] * cg->shape[2];
-        while (var_list)
-        {
-          double *host_ptr = cg->fgfs[var_list->data->sgfn];
-          if (!bssn_gpu_find_device_buffer(host_ptr) &&
-              bssn_gpu_stage_upload_buffer(host_ptr, n))
-          {
-            cerr << "GPU state ensure failure: lev=" << lev
-                 << " var=" << var_list->data->name
-                 << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
-                 << cg->bbox[1] << ":" << cg->bbox[4] << ","
-                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
-            ERROR = 1;
-            break;
-          }
-          var_list = var_list->next;
-        }
-      };
-
-  auto refresh_synced_device_regions =
-      [&](Block *cg, MyList<var> *var_list, Parallel::SyncCache &cache) {
-        std::vector<Parallel::gridseg *> local_segments;
-        for (int node = 0; node < cache.cpusize; ++node)
-        {
-          MyList<Parallel::gridseg> *seg = cache.combined_dst[node];
-          while (seg)
-          {
-            if (seg->data && seg->data->Bg == cg)
-              local_segments.push_back(seg->data);
-            seg = seg->next;
-          }
-        }
-
-        if (local_segments.empty())
-          return;
-
-        const int n = cg->shape[0] * cg->shape[1] * cg->shape[2];
-        while (var_list)
-        {
-          double *host_ptr = cg->fgfs[var_list->data->sgfn];
-          if (!bssn_gpu_find_device_buffer(host_ptr))
-          {
-            if (bssn_gpu_stage_upload_buffer(host_ptr, n))
-            {
-              cerr << "GPU sync refresh upload failure: lev=" << lev
-                   << " var=" << var_list->data->name
-                   << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
-                   << cg->bbox[1] << ":" << cg->bbox[4] << ","
-                   << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
-              ERROR = 1;
-              break;
-            }
-          }
-          else
-          {
-            for (size_t i = 0; i < local_segments.size(); ++i)
-            {
-              Parallel::gridseg *seg = local_segments[i];
-              if (bssn_gpu_stage_upload_region(host_ptr,
-                                               cg->shape,
-                                               cg->bbox,
-                                               cg->bbox + dim,
-                                               seg->shape,
-                                               seg->llb))
-              {
-                cerr << "GPU sync region refresh failure: lev=" << lev
-                     << " var=" << var_list->data->name
-                     << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
-                     << cg->bbox[1] << ":" << cg->bbox[4] << ","
-                     << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
-                ERROR = 1;
-                break;
-              }
-            }
-            if (ERROR)
-              break;
-          }
-          var_list = var_list->next;
-        }
-      };
-
-  auto refresh_stage_device_after_sync =
-      [&](MyList<var> *var_list, Parallel::SyncCache &cache) {
-        MyList<Patch> *patch_it = GH->PatL[lev];
-        while (patch_it)
-        {
-          MyList<Block> *block_it = patch_it->data->blb;
-          while (block_it)
-          {
-            Block *cg = block_it->data;
-            if (myrank == cg->rank)
-              refresh_synced_device_regions(cg, var_list, cache);
-
-            if (block_it == patch_it->data->ble)
-              break;
-            block_it = block_it->next;
-          }
-          if (ERROR)
-            break;
-          patch_it = patch_it->next;
-        }
-      };
-
-  auto refresh_stage_host_before_sync =
-      [&](MyList<var> *var_list, Parallel::SyncCache &cache) -> bool {
-        if (!cache.valid || !cache.combined_src || myrank < 0 || myrank >= cache.cpusize)
-          return false;
-
-        MyList<Patch> *patch_it = GH->PatL[lev];
-        while (patch_it)
-        {
-          MyList<Block> *block_it = patch_it->data->blb;
-          while (block_it)
-          {
-            Block *cg = block_it->data;
-            if (myrank == cg->rank)
-            {
-              std::vector<Parallel::gridseg *> local_segments;
-              MyList<Parallel::gridseg> *seg = cache.combined_src[myrank];
-              while (seg)
-              {
-                if (seg->data && seg->data->Bg == cg)
-                  local_segments.push_back(seg->data);
-                seg = seg->next;
-              }
-
-              if (!local_segments.empty())
-              {
-                MyList<var> *var_it = var_list;
-                while (var_it)
-                {
-                  double *host_ptr = cg->fgfs[var_it->data->sgfn];
-                  for (size_t i = 0; i < local_segments.size(); ++i)
-                  {
-                    Parallel::gridseg *src_seg = local_segments[i];
-                    if (bssn_gpu_stage_download_region(host_ptr,
-                                                       cg->shape,
-                                                       cg->bbox,
-                                                       cg->bbox + dim,
-                                                       src_seg->shape,
-                                                       src_seg->llb))
-                    {
-                      cerr << "GPU sync region download failure: lev=" << lev
-                           << " var=" << var_it->data->name
-                           << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
-                           << cg->bbox[1] << ":" << cg->bbox[4] << ","
-                           << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
-                      ERROR = 1;
-                      return true;
-                    }
-                  }
-                  var_it = var_it->next;
-                }
-              }
-            }
-
-            if (block_it == patch_it->data->ble)
-              break;
-            block_it = block_it->next;
-          }
-          patch_it = patch_it->next;
-        }
-
-        return true;
-      };
-
-  auto can_pack_sync_from_device =
-      [&](MyList<var> *var_list, Parallel::SyncCache &cache) -> bool {
-        if (!cache.valid || !cache.combined_src || myrank < 0 || myrank >= cache.cpusize)
-          return false;
-
-        MyList<Parallel::gridseg> *seg = cache.combined_src[myrank];
-        while (seg)
-        {
-          MyList<var> *var_it = var_list;
-          while (var_it)
-          {
-            if (!bssn_gpu_find_device_buffer(seg->data->Bg->fgfs[var_it->data->sgfn]))
-              return false;
-            var_it = var_it->next;
-          }
-          seg = seg->next;
-        }
-        return true;
-      };
-
-  MyList<Patch> *Pp = GH->PatL[lev];
-  while (Pp)
-  {
-    MyList<Block> *BP = Pp->data->blb;
-    while (BP)
-    {
-      Block *cg = BP->data;
-      if (myrank == cg->rank)
-      {
-        double t0 = 0.0;
-        if (profile_enabled)
-          t0 = MPI_Wtime();
-        if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_FIRST_TIME))
-          ERROR = 1;
-        if (profile_enabled)
-          stage_profile_add(lev, STAGE_PROFILE_RHS, MPI_Wtime() - t0);
-
-        if (profile_enabled)
-          t0 = MPI_Wtime();
-        run_stage_on_block(cg, Pp->data, StateList, StateList, SynchList_pre, RHSList, iter_count);
-        if (profile_enabled)
-          stage_profile_add(lev, STAGE_PROFILE_RUN_STAGE, MPI_Wtime() - t0);
-
-        if (profile_enabled)
-          t0 = MPI_Wtime();
-        if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi->sgfn], chitiny, false))
-        {
-          cerr << "GPU lowerbound failure: lev=" << lev
-               << " rk_stage=" << iter_count
-               << " var=" << phi->name
-               << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
-               << cg->bbox[1] << ":" << cg->bbox[4] << ","
-               << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
-          ERROR = 1;
-        }
-        if (profile_enabled)
-          stage_profile_add(lev, STAGE_PROFILE_LOWERBOUND, MPI_Wtime() - t0);
-      }
-      if (BP == Pp->data->ble)
-        break;
-      BP = BP->next;
-    }
-    Pp = Pp->next;
-  }
-
-  if (!ERROR)
-  {
-    if (!keep_stage_sync_on_device)
-    {
-      double t0 = 0.0;
-      if (profile_enabled)
-        t0 = MPI_Wtime();
-      stage_download_patch_list(SynchList_pre, false);
-      if (profile_enabled)
-        stage_profile_add(lev, STAGE_PROFILE_DOWNLOAD, MPI_Wtime() - t0);
-      if (!ERROR)
-      {
-        if (profile_enabled)
-          t0 = MPI_Wtime();
-        bssn_gpu_clear_cached_device_buffers();
-        if (profile_enabled)
-          stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0);
-      }
-    }
-  }
-
-  MPI_Request err_req_pre;
-  {
-    int erh = ERROR;
-    MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_pre);
-  }
-
-  Parallel::AsyncSyncState async_pre;
-  if (profile_enabled)
-  {
-    const double t0 = MPI_Wtime();
-    Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
-    stage_profile_add(lev, STAGE_PROFILE_SYNC_START, MPI_Wtime() - t0);
-  }
-  else
-    Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
-  if (profile_enabled)
-  {
-    const double t0 = MPI_Wtime();
-    Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry,
-                          !keep_stage_sync_on_device);
-    stage_profile_add(lev, STAGE_PROFILE_SYNC_FINISH, MPI_Wtime() - t0);
-  }
-  else
-    Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry,
-                          !keep_stage_sync_on_device);
-  if (!ERROR && !keep_stage_sync_on_device)
-  {
-    if (profile_enabled)
-    {
-      const double t0 = MPI_Wtime();
-      refresh_stage_device_after_sync(SynchList_pre, sync_cache_pre[lev]);
-      stage_profile_add(lev, STAGE_PROFILE_REFRESH, MPI_Wtime() - t0);
-    }
-    else
-      refresh_stage_device_after_sync(SynchList_pre, sync_cache_pre[lev]);
-  }
-
-  MPI_Wait(&err_req_pre, MPI_STATUS_IGNORE);
-  if (ERROR)
-  {
-    Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
-    if (myrank == 0)
-    {
-      if (ErrorMonitor->outfile)
-        ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
-                              << ", lev = " << lev << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-  }
-
-#if (MAPBH == 0)
-  if (BH_num > 0 && lev == GH->levels - 1)
-  {
-    compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
-    for (int ithBH = 0; ithBH < BH_num; ithBH++)
-    {
-      f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count);
-      f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg[ithBH][1], Porg_rhs[ithBH][1], iter_count);
-      f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg[ithBH][2], Porg_rhs[ithBH][2], iter_count);
-      if (Symmetry > 0)
-        Porg[ithBH][2] = fabs(Porg[ithBH][2]);
-      if (Symmetry == 2)
-      {
-        Porg[ithBH][0] = fabs(Porg[ithBH][0]);
-        Porg[ithBH][1] = fabs(Porg[ithBH][1]);
-      }
-    }
-  }
-
-  if (lev == a_lev)
-    AnalysisStuff(lev, dT_lev);
-#endif
-
-  for (iter_count = 1; iter_count < 4; iter_count++)
-  {
-    if (iter_count == 1 || iter_count == 3)
-      TRK4 += dT_lev / 2;
-
-    Pp = GH->PatL[lev];
-    while (Pp)
-    {
-      MyList<Block> *BP = Pp->data->blb;
-      while (BP)
-      {
-        Block *cg = BP->data;
-        if (myrank == cg->rank)
-        {
-          double t0 = 0.0;
-          if (profile_enabled)
-            t0 = MPI_Wtime();
-          ensure_stage_device_var_list(cg, SynchList_pre);
-          if (profile_enabled)
-            stage_profile_add(lev, STAGE_PROFILE_ENSURE, MPI_Wtime() - t0);
-
-          if (profile_enabled)
-            t0 = MPI_Wtime();
-          if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_THEN))
-            ERROR = 1;
-          if (profile_enabled)
-            stage_profile_add(lev, STAGE_PROFILE_RHS, MPI_Wtime() - t0);
-
-          if (profile_enabled)
-            t0 = MPI_Wtime();
-          run_stage_on_block(cg, Pp->data, StateList, SynchList_pre, SynchList_cor, RHSList, iter_count);
-          if (profile_enabled)
-            stage_profile_add(lev, STAGE_PROFILE_RUN_STAGE, MPI_Wtime() - t0);
-
-          if (profile_enabled)
-            t0 = MPI_Wtime();
-          if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi1->sgfn], chitiny, false))
-          {
-          cerr << "GPU lowerbound failure: lev=" << lev
-               << " rk_stage=" << iter_count
-               << " var=" << phi1->name
-                 << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
-                 << cg->bbox[1] << ":" << cg->bbox[4] << ","
-                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
-            ERROR = 1;
-          }
-          if (profile_enabled)
-            stage_profile_add(lev, STAGE_PROFILE_LOWERBOUND, MPI_Wtime() - t0);
-        }
-
-        if (BP == Pp->data->ble)
-          break;
-        BP = BP->next;
-      }
-      Pp = Pp->next;
-    }
-
-    if (!ERROR)
-    {
-      if (!keep_stage_sync_on_device)
-      {
-        double t0 = 0.0;
-        if (profile_enabled)
-          t0 = MPI_Wtime();
-        stage_download_patch_list(SynchList_cor, false);
-        if (profile_enabled)
-          stage_profile_add(lev, STAGE_PROFILE_DOWNLOAD, MPI_Wtime() - t0);
-        if (!ERROR)
-        {
-          if (profile_enabled)
-            t0 = MPI_Wtime();
-          bssn_gpu_clear_cached_device_buffers();
-          if (profile_enabled)
-            stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0);
-        }
-      }
-    }
-
-    MPI_Request err_req_cor;
-    {
-      int erh = ERROR;
-      MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
-    }
-
-    Parallel::AsyncSyncState async_cor;
-    if (profile_enabled)
-    {
-      const double t0 = MPI_Wtime();
-      Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
-      stage_profile_add(lev, STAGE_PROFILE_SYNC_START, MPI_Wtime() - t0);
-    }
-    else
-      Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
-    if (profile_enabled)
-    {
-      const double t0 = MPI_Wtime();
-      Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry,
-                            !keep_stage_sync_on_device);
-      stage_profile_add(lev, STAGE_PROFILE_SYNC_FINISH, MPI_Wtime() - t0);
-    }
-    else
-      Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry,
-                            !keep_stage_sync_on_device);
-    if (!ERROR && !keep_stage_sync_on_device && iter_count < 3)
-    {
-      if (profile_enabled)
-      {
-        const double t0 = MPI_Wtime();
-        refresh_stage_device_after_sync(SynchList_cor, sync_cache_cor[lev]);
-        stage_profile_add(lev, STAGE_PROFILE_REFRESH, MPI_Wtime() - t0);
-      }
-      else
-        refresh_stage_device_after_sync(SynchList_cor, sync_cache_cor[lev]);
-    }
-
-    MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
-    if (ERROR)
-    {
-      Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
-      if (myrank == 0)
-      {
-        if (ErrorMonitor->outfile)
-          ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
-                                << " variables at t = " << PhysTime
-                                << ", lev = " << lev << endl;
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-    }
-
-#if (MAPBH == 0)
-    if (BH_num > 0 && lev == GH->levels - 1)
-    {
-      compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev);
-      for (int ithBH = 0; ithBH < BH_num; ithBH++)
-      {
-        f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg1[ithBH][0], Porg_rhs[ithBH][0], iter_count);
-        f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg1[ithBH][1], Porg_rhs[ithBH][1], iter_count);
-        f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg1[ithBH][2], Porg_rhs[ithBH][2], iter_count);
-        if (Symmetry > 0)
-          Porg1[ithBH][2] = fabs(Porg1[ithBH][2]);
-        if (Symmetry == 2)
-        {
-          Porg1[ithBH][0] = fabs(Porg1[ithBH][0]);
-          Porg1[ithBH][1] = fabs(Porg1[ithBH][1]);
-        }
-      }
-    }
-#endif
-
-    if (iter_count < 3)
-    {
-      Pp = GH->PatL[lev];
-      while (Pp)
-      {
-        MyList<Block> *BP = Pp->data->blb;
-        while (BP)
-        {
-          BP->data->swapList(SynchList_pre, SynchList_cor, myrank);
-          if (BP == Pp->data->ble)
-            break;
-          BP = BP->next;
-        }
-        Pp = Pp->next;
-      }
-
-#if (MAPBH == 0)
-      if (BH_num > 0 && lev == GH->levels - 1)
-      {
-        for (int ithBH = 0; ithBH < BH_num; ithBH++)
-        {
-          Porg[ithBH][0] = Porg1[ithBH][0];
-          Porg[ithBH][1] = Porg1[ithBH][1];
-          Porg[ithBH][2] = Porg1[ithBH][2];
-        }
-      }
-#endif
-    }
-  }
-
-#if (RPS == 0)
-  RestrictProlong(lev, YN, BB);
-#endif
-
-  Pp = GH->PatL[lev];
-  while (Pp)
-  {
-    MyList<Block> *BP = Pp->data->blb;
-    while (BP)
-    {
-      Block *cg = BP->data;
-      cg->swapList(StateList, SynchList_cor, myrank);
-      cg->swapList(OldStateList, SynchList_cor, myrank);
-      if (BP == Pp->data->ble)
-        break;
-      BP = BP->next;
-    }
-    Pp = Pp->next;
-  }
-
-  if (!ERROR && keep_stage_sync_on_device)
-  {
-    // After the swaps above, only StateList points at arrays updated during this step.
-    // OldStateList/SynchList_cor remain valid on host because their backing arrays were
-    // read-only during the RK step, and SynchList_pre is reused only as scratch later.
-    const double t0 = profile_enabled ? MPI_Wtime() : 0.0;
-    stage_download_patch_list(StateList, true);
-    if (profile_enabled)
-      stage_profile_add(lev, STAGE_PROFILE_DOWNLOAD, MPI_Wtime() - t0);
-  }
-
-  if (profile_enabled)
-  {
-    const double t0 = MPI_Wtime();
-    bssn_gpu_clear_cached_device_buffers();
-    stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0);
-  }
-  else
-    bssn_gpu_clear_cached_device_buffers();
-
-  if (BH_num > 0 && lev == GH->levels - 1)
-  {
-    for (int ithBH = 0; ithBH < BH_num; ithBH++)
-    {
-      Porg0[ithBH][0] = Porg1[ithBH][0];
-      Porg0[ithBH][1] = Porg1[ithBH][1];
-      Porg0[ithBH][2] = Porg1[ithBH][2];
-    }
-  }
-
-  if (profile_enabled)
-    stage_profile_add(lev, STAGE_PROFILE_TOTAL, MPI_Wtime() - step_total_begin);
-}
-
-#endif
--- a/AMSS_NCKU_source/bssn_gpu.cu
+++ b/AMSS_NCKU_source/bssn_gpu.cu
--- a/AMSS_NCKU_source/bssn_gpu.h
+++ b/AMSS_NCKU_source/bssn_gpu.h
@@ -4,8 +4,10 @@
 #include "bssn_macro.h"
 #include "macrodef.fh"

-#define GRID_DIM 256
-#define BLOCK_DIM 128
+#define DEVICE_ID 0
+// #define DEVICE_ID_BY_MPI_RANK
+#define GRID_DIM 256
+#define BLOCK_DIM 128

 #define _FH2_(i, j, k) fh[(i) + (j) * _1D_SIZE[2] + (k) * _2D_SIZE[2]]
 #define _FH3_(i, j, k) fh[(i) + (j) * _1D_SIZE[3] + (k) * _2D_SIZE[3]]
@@ -63,45 +65,9 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,
            double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
            int &Symmetry, int &Lev, double &eps, int &co);

-int gpu_rhs_ss(RHS_SS_PARA);
-
-int bssn_gpu_bind_process_device(int mpi_rank);
-void bssn_gpu_clear_cached_device_buffers();
-void bssn_gpu_release_pinned_host_buffers();
-const double *bssn_gpu_find_device_buffer(const double *host_ptr);
-void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr);
-void bssn_gpu_prepare_host_buffer(const double *host_ptr, int count);
-int bssn_gpu_stage_upload_buffer(const double *host_ptr, int count);
-int bssn_gpu_stage_zero_buffer(const double *host_ptr, int count);
-int bssn_gpu_stage_upload_region(const double *host_ptr,
-                                 const int *full_shape,
-                                 const double *full_llb,
-                                 const double *full_uub,
-                                 const int *region_shape,
-                                 const double *region_llb);
-int bssn_gpu_stage_download_region(double *host_ptr,
-                                   const int *full_shape,
-                                   const double *full_llb,
-                                   const double *full_uub,
-                                   const int *region_shape,
-                                   const double *region_llb);
-int bssn_gpu_stage_download_region_to_buffer(const double *host_src_ptr,
-                                             const int *full_shape,
-                                             const double *full_llb,
-                                             const double *full_uub,
-                                             const int *region_shape,
-                                             const double *region_llb,
-                                             double *host_dst_ptr);
-int bssn_gpu_stage_upload_buffer_to_region(const double *host_src_ptr,
-                                           double *host_dst_ptr,
-                                           const int *full_shape,
-                                           const double *full_llb,
-                                           const double *full_uub,
-                                           const int *region_shape,
-                                           const double *region_llb);
-int bssn_gpu_download_buffer_batch(const int *ex, double **host_ptrs, int num_buffers);
-
-/** Init GPU side data in GPUMeta. */
-// void init_fluid_meta_gpu(GPUMeta *gpu_meta);
+int gpu_rhs_ss(RHS_SS_PARA);
+
+/** Init GPU side data in GPUMeta. */
+// void init_fluid_meta_gpu(GPUMeta *gpu_meta);

 #endif
--- a/AMSS_NCKU_source/bssn_gpu_class.C
+++ b/AMSS_NCKU_source/bssn_gpu_class.C
@@ -1891,7 +1891,7 @@ void bssn_class::Read_Ansorg()
 void bssn_class::Evolve(int Steps)
 {

-  clock_t prev_clock, curr_clock;
+  double prev_clock = 0.0, curr_clock = 0.0;
  double LastDump = 0.0, LastCheck = 0.0, Last2dDump = 0.0;
  LastAnas = 0;
 #if 0
@@ -2035,10 +2035,12 @@ void bssn_class::Evolve(int Steps)

  GH->settrfls(trfls);

-  for (int ncount = 1; ncount < Steps + 1; ncount++)
-  {
-    cout << "Before Step: " << ncount << " My Rank: " << myrank 
-         << " takes " << MPI_Wtime() - beg_time << " seconds!" << endl;
+  for (int ncount = 1; ncount < Steps + 1; ncount++)
+  {
+    if (myrank == 0)
+      curr_clock = MPI_Wtime();
+    cout << "Before Step: " << ncount << " My Rank: " << myrank 
+         << " takes " << MPI_Wtime() - beg_time << " seconds!" << endl;
    beg_time = MPI_Wtime();
 #if (PSTR == 0)
    RecursiveStep(0);
@@ -2095,10 +2097,10 @@ void bssn_class::Evolve(int Steps)

    if (myrank == 0)
    {
-      prev_clock = curr_clock;
-      curr_clock = clock();
-      cout << "Timestep # " << ncount << ": integrating to time: " << PhysTime << endl;
-      cout << "used " << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) << " seconds!" << endl;
+      prev_clock = curr_clock;
+      curr_clock = MPI_Wtime();
+      cout << "Timestep # " << ncount << ": integrating to time: " << PhysTime << endl;
+      cout << "used " << (curr_clock - prev_clock) << " seconds!" << endl;
    }

    if (PhysTime >= TotalTime)
--- a/AMSS_NCKU_source/bssn_macro.h
+++ b/AMSS_NCKU_source/bssn_macro.h
@@ -65,10 +65,9 @@ if(TIME_COUNT_EACH_RANK == 1){\
     }\
 }

-//3---------------------GPU---------------------
-#define CALLED_BY_STEP 0
-#define CALLED_BY_CONSTRAINT 1
-#define CALLED_BY_CONSTRAINT_CONS_ONLY 2
+//3---------------------GPU---------------------
+#define CALLED_BY_STEP 0
+#define CALLED_BY_CONSTRAINT 1

 #define RHS_PARA_CALLED_FIRST_TIME cg->shape,TRK4,cg->X[0],cg->X[1],cg->X[2],cg->fgfs[phi0->sgfn],cg->fgfs[trK0->sgfn],cg->fgfs[gxx0->sgfn],cg->fgfs[gxy0->sgfn],cg->fgfs[gxz0->sgfn],cg->fgfs[gyy0->sgfn],cg->fgfs[gyz0->sgfn],cg->fgfs[gzz0->sgfn],cg->fgfs[Axx0->sgfn],cg->fgfs[Axy0->sgfn],cg->fgfs[Axz0->sgfn],cg->fgfs[Ayy0->sgfn],cg->fgfs[Ayz0->sgfn],cg->fgfs[Azz0->sgfn],cg->fgfs[Gmx0->sgfn],cg->fgfs[Gmy0->sgfn],cg->fgfs[Gmz0->sgfn],cg->fgfs[Lap0->sgfn],cg->fgfs[Sfx0->sgfn],cg->fgfs[Sfy0->sgfn],cg->fgfs[Sfz0->sgfn],cg->fgfs[dtSfx0->sgfn],cg->fgfs[dtSfy0->sgfn],cg->fgfs[dtSfz0->sgfn],cg->fgfs[phi_rhs->sgfn],cg->fgfs[trK_rhs->sgfn],cg->fgfs[gxx_rhs->sgfn],cg->fgfs[gxy_rhs->sgfn],cg->fgfs[gxz_rhs->sgfn],cg->fgfs[gyy_rhs->sgfn],cg->fgfs[gyz_rhs->sgfn],cg->fgfs[gzz_rhs->sgfn],cg->fgfs[Axx_rhs->sgfn],cg->fgfs[Axy_rhs->sgfn],cg->fgfs[Axz_rhs->sgfn],cg->fgfs[Ayy_rhs->sgfn],cg->fgfs[Ayz_rhs->sgfn],cg->fgfs[Azz_rhs->sgfn],cg->fgfs[Gmx_rhs->sgfn],cg->fgfs[Gmy_rhs->sgfn],cg->fgfs[Gmz_rhs->sgfn],cg->fgfs[Lap_rhs->sgfn],cg->fgfs[Sfx_rhs->sgfn],cg->fgfs[Sfy_rhs->sgfn],cg->fgfs[Sfz_rhs->sgfn],cg->fgfs[dtSfx_rhs->sgfn],cg->fgfs[dtSfy_rhs->sgfn],cg->fgfs[dtSfz_rhs->sgfn],cg->fgfs[rho->sgfn],cg->fgfs[Sx->sgfn],cg->fgfs[Sy->sgfn],cg->fgfs[Sz->sgfn],cg->fgfs[Sxx->sgfn],cg->fgfs[Sxy->sgfn],cg->fgfs[Sxz->sgfn],cg->fgfs[Syy->sgfn],cg->fgfs[Syz->sgfn],cg->fgfs[Szz->sgfn],cg->fgfs[Gamxxx->sgfn],cg->fgfs[Gamxxy->sgfn],cg->fgfs[Gamxxz->sgfn],cg->fgfs[Gamxyy->sgfn],cg->fgfs[Gamxyz->sgfn],cg->fgfs[Gamxzz->sgfn],cg->fgfs[Gamyxx->sgfn],cg->fgfs[Gamyxy->sgfn],cg->fgfs[Gamyxz->sgfn],cg->fgfs[Gamyyy->sgfn],cg->fgfs[Gamyyz->sgfn],cg->fgfs[Gamyzz->sgfn],cg->fgfs[Gamzxx->sgfn],cg->fgfs[Gamzxy->sgfn],cg->fgfs[Gamzxz->sgfn],cg->fgfs[Gamzyy->sgfn],cg->fgfs[Gamzyz->sgfn],cg->fgfs[Gamzzz->sgfn],cg->fgfs[Rxx->sgfn],cg->fgfs[Rxy->sgfn],cg->fgfs[Rxz->sgfn],cg->fgfs[Ryy->sgfn],cg->fgfs[Ryz->sgfn],cg->fgfs[Rzz->sgfn],cg->fgfs[Cons_Ham->sgfn],cg->fgfs[Cons_Px->sgfn],cg->fgfs[Cons_Py->sgfn],cg->fgfs[Cons_Pz->sgfn],cg->fgfs[Cons_Gx->sgfn],cg->fgfs[Cons_Gy->sgfn],cg->fgfs[Cons_Gz->sgfn],Symmetry,lev,ndeps,pre

--- a/AMSS_NCKU_source/bssn_rhs_c.C
+++ b/AMSS_NCKU_source/bssn_rhs_c.C
@@ -2,6 +2,24 @@
 #include "bssn_rhs.h"
 #include "share_func.h"
 #include "tool.h"
+
+#ifdef _OPENMP
+#define BSSN_OMP_TASK_GROUP_BEGIN \
+    _Pragma("omp parallel")       \
+    {                             \
+        _Pragma("omp single nowait") \
+        {
+#define BSSN_OMP_TASK_CALL(...) \
+            _Pragma("omp task") { __VA_ARGS__; }
+#define BSSN_OMP_TASK_GROUP_END \
+            _Pragma("omp taskwait") \
+        } \
+    }
+#else
+#define BSSN_OMP_TASK_GROUP_BEGIN {
+#define BSSN_OMP_TASK_CALL(...) { __VA_ARGS__; }
+#define BSSN_OMP_TASK_GROUP_END }
+#endif
 // 0-based i,j,k
 // #define IDX_F(i,j,k,nx,ny) ((i) + (j)*(nx) + (k)*(nx)*(ny))
 // ex(1)=nx, ex(2)=ny, ex(3)=nz
@@ -108,18 +126,20 @@ int f_compute_rhs_bssn(int *ex, double &T,
            chin1[i] = chi[i] + 1.0;
        }
        // 9ms //
-        fderivs(ex,betax,betaxx,betaxy,betaxz,X,Y,Z,ANTI, SYM, SYM,Symmetry,Lev);
-        fderivs(ex,betay,betayx,betayy,betayz,X,Y,Z, SYM,ANTI, SYM,Symmetry,Lev);
-        fderivs(ex,betaz,betazx,betazy,betazz,X,Y,Z, SYM, SYM,ANTI,Symmetry,Lev);
-        fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
-        fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev);
-        fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev);
-        fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev);
-        fderivs(ex,dyy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev);
-        fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev);
-        fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev);
-        fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
-        fderivs(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
+        BSSN_OMP_TASK_GROUP_BEGIN
+        BSSN_OMP_TASK_CALL(fderivs(ex,betax,betaxx,betaxy,betaxz,X,Y,Z,ANTI, SYM, SYM,Symmetry,Lev))
+        BSSN_OMP_TASK_CALL(fderivs(ex,betay,betayx,betayy,betayz,X,Y,Z, SYM,ANTI, SYM,Symmetry,Lev))
+        BSSN_OMP_TASK_CALL(fderivs(ex,betaz,betazx,betazy,betazz,X,Y,Z, SYM, SYM,ANTI,Symmetry,Lev))
+        BSSN_OMP_TASK_CALL(fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev))
+        BSSN_OMP_TASK_CALL(fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev))
+        BSSN_OMP_TASK_CALL(fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev))
+        BSSN_OMP_TASK_CALL(fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev))
+        BSSN_OMP_TASK_CALL(fderivs(ex,dyy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev))
+        BSSN_OMP_TASK_CALL(fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev))
+        BSSN_OMP_TASK_CALL(fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev))
+        BSSN_OMP_TASK_CALL(fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev))
+        BSSN_OMP_TASK_CALL(fderivs(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev))
+        BSSN_OMP_TASK_GROUP_END

        // 3ms //
        for(int i=0;i<all;i+=1){
@@ -316,15 +336,14 @@ int f_compute_rhs_bssn(int *ex, double &T,
                        );
        }
        // 22.3ms //
-        fdderivs(ex,betax,gxxx,gxyx,gxzx,gyyx,gyzx,gzzx,
-        X,Y,Z,ANTI,SYM, SYM ,Symmetry,Lev);
-        fdderivs(ex,betay,gxxy,gxyy,gxzy,gyyy,gyzy,gzzy,
-        X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev);
-        fdderivs(ex,betaz,gxxz,gxyz,gxzz,gyyz,gyzz,gzzz,
-        X,Y,Z,SYM ,SYM, ANTI,Symmetry,Lev);
-        fderivs(ex,Gamx,Gamxx,Gamxy,Gamxz,X,Y,Z,ANTI,SYM ,SYM ,Symmetry,Lev);
-        fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev);
-        fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev);
+        BSSN_OMP_TASK_GROUP_BEGIN
+        BSSN_OMP_TASK_CALL(fdderivs(ex,betax,gxxx,gxyx,gxzx,gyyx,gyzx,gzzx,X,Y,Z,ANTI,SYM, SYM ,Symmetry,Lev))
+        BSSN_OMP_TASK_CALL(fdderivs(ex,betay,gxxy,gxyy,gxzy,gyyy,gyzy,gzzy,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev))
+        BSSN_OMP_TASK_CALL(fdderivs(ex,betaz,gxxz,gxyz,gxzz,gyyz,gyzz,gzzz,X,Y,Z,SYM ,SYM, ANTI,Symmetry,Lev))
+        BSSN_OMP_TASK_CALL(fderivs(ex,Gamx,Gamxx,Gamxy,Gamxz,X,Y,Z,ANTI,SYM ,SYM ,Symmetry,Lev))
+        BSSN_OMP_TASK_CALL(fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev))
+        BSSN_OMP_TASK_CALL(fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev))
+        BSSN_OMP_TASK_GROUP_END

        // Fused: fxx/Gamxa + Gamma_rhs part 2 (2 loops -> 1)
        for(int i=0;i<all;i+=1){
@@ -1022,16 +1041,9 @@ int f_compute_rhs_bssn(int *ex, double &T,
                        + gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] );

            #if (GAUGE == 2)
-            {
-                const double chi_sqrt = sqrt(chin1[i]);
-                const double damping = ONE - chi_sqrt;
-                reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping);
-            }
+            reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - sqrt(chin1[i])), 2.0 );
            #else
-            {
-                const double damping = ONE - chin1[i];
-                reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping);
-            }
+            reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - chin1[i]), 2.0 );
            #endif

            dtSfx_rhs[i] = Gamx_rhs[i] - reta[i] * dtSfx[i];
@@ -1047,16 +1059,9 @@ int f_compute_rhs_bssn(int *ex, double &T,
                        + gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] );

            #if (GAUGE == 4)
-            {
-                const double chi_sqrt = sqrt(chin1[i]);
-                const double damping = ONE - chi_sqrt;
-                reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping);
-            }
+            reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - sqrt(chin1[i])), 2.0 );
            #else
-            {
-                const double damping = ONE - chin1[i];
-                reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping);
-            }
+            reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - chin1[i]), 2.0 );
            #endif

            betax_rhs[i] = FF * Gamx[i] - reta[i] * betax[i];
@@ -1077,30 +1082,32 @@ int f_compute_rhs_bssn(int *ex, double &T,
            #endif
        }
        // advection + KO dissipation with shared symmetry buffer
-        lopsided_kodis(ex,X,Y,Z,dxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps);
-        lopsided_kodis(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA,eps);
-        lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps);
-        lopsided_kodis(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS,eps);
-        lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps);
-        lopsided_kodis(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS,eps);
-        lopsided_kodis(ex,X,Y,Z,dyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps);
-        lopsided_kodis(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS,eps);
-        lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps);
-        lopsided_kodis(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA,eps);
-        lopsided_kodis(ex,X,Y,Z,dzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps);
-        lopsided_kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS,eps);
-        lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps);
-        lopsided_kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS,eps);
-        lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps);
-        lopsided_kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA,eps);
-        lopsided_kodis(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA,eps);
-        lopsided_kodis(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS,eps);
-        lopsided_kodis(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA,eps);
-        lopsided_kodis(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS,eps);
-        lopsided_kodis(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS,eps);
-        lopsided_kodis(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS,eps);
-        lopsided_kodis(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS,eps);
-        lopsided_kodis(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS,eps);
+        BSSN_OMP_TASK_GROUP_BEGIN
+        BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,dxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps))
+        BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA,eps))
+        BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps))
+        BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS,eps))
+        BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps))
+        BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS,eps))
+        BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,dyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps))
+        BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS,eps))
+        BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps))
+        BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA,eps))
+        BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,dzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps))
+        BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS,eps))
+        BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps))
+        BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS,eps))
+        BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps))
+        BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA,eps))
+        BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA,eps))
+        BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS,eps))
+        BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA,eps))
+        BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS,eps))
+        BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS,eps))
+        BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS,eps))
+        BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS,eps))
+        BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS,eps))
+        BSSN_OMP_TASK_GROUP_END
        // 2ms //
        if(co==0){
            for (int i=0;i<all;i+=1) {
@@ -1147,12 +1154,14 @@ int f_compute_rhs_bssn(int *ex, double &T,
            }

            // 1ms //
-            fderivs(ex,Axx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0);
-            fderivs(ex,Axy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,0);
-            fderivs(ex,Axz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,0);
-            fderivs(ex,Ayy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0);
-            fderivs(ex,Ayz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,0);
-            fderivs(ex,Azz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0);
+            BSSN_OMP_TASK_GROUP_BEGIN
+            BSSN_OMP_TASK_CALL(fderivs(ex,Axx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0))
+            BSSN_OMP_TASK_CALL(fderivs(ex,Axy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,0))
+            BSSN_OMP_TASK_CALL(fderivs(ex,Axz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,0))
+            BSSN_OMP_TASK_CALL(fderivs(ex,Ayy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0))
+            BSSN_OMP_TASK_CALL(fderivs(ex,Ayz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,0))
+            BSSN_OMP_TASK_CALL(fderivs(ex,Azz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0))
+            BSSN_OMP_TASK_GROUP_END
            // 7ms //
            for (int i=0;i<all;i+=1) {
                gxxx[i] = gxxx[i] - (  Gamxxx[i] * Axx[i] + Gamyxx[i] * Axy[i] + Gamzxx[i] * Axz[i]
--- a/AMSS_NCKU_source/cgh.C
+++ b/AMSS_NCKU_source/cgh.C
@@ -23,14 +23,10 @@ using namespace std;
 #include <mpi.h>

 #include "macrodef.h"
-#include "misc.h"
-#include "cgh.h"
-#include "Parallel.h"
-#include "parameters.h"
-#ifdef USE_GPU
-#include "bssn_gpu.h"
-#include "bssn_cuda_ops.h"
-#endif
+#include "misc.h"
+#include "cgh.h"
+#include "Parallel.h"
+#include "parameters.h"

 //================================================================================================

@@ -885,20 +881,13 @@ void cgh::recompose_cgh(int nprocs, bool *lev_flag,
      tmPat = construct_patchlist(lev, Symmetry);
      // tmPat construction completes
      Parallel::distribute(tmPat, nprocs, ingfs, fngfs, false);
-      //    checkPatchList(tmPat,true);
-      bool CC = (lev > trfls);
-      Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
-
-#ifdef USE_GPU
-      bssn_gpu_clear_cached_device_buffers();
-      bssn_gpu_release_pinned_host_buffers();
-      bssn_cuda_release_rk4_caches();
-      bssn_cuda_release_interp_caches();
-      patch_release_interp_plan_cache();
-#endif
-      Parallel::KillBlocks(PatL[lev]);
-      PatL[lev]->destroyList();
-      PatL[lev] = tmPat;
+      //    checkPatchList(tmPat,true);
+      bool CC = (lev > trfls);
+      Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
+
+      Parallel::KillBlocks(PatL[lev]);
+      PatL[lev]->destroyList();
+      PatL[lev] = tmPat;
 #if (RPB == 1)
      Parallel::destroypsuList_bam(bdsul[lev]);
      Parallel::destroypsuList_bam(rsul[lev]);
@@ -921,20 +910,13 @@ void cgh::recompose_cgh(int nprocs, bool *lev_flag,
      tmPat = construct_patchlist(lev, Symmetry);
      // tmPat construction completes
      Parallel::distribute(tmPat, end_rank[lev] - start_rank[lev] + 1, ingfs, fngfs, false, start_rank[lev], end_rank[lev]);
-      //    checkPatchList(tmPat,true);
-      bool CC = (lev > trfls);
-      Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
-
-#ifdef USE_GPU
-      bssn_gpu_clear_cached_device_buffers();
-      bssn_gpu_release_pinned_host_buffers();
-      bssn_cuda_release_rk4_caches();
-      bssn_cuda_release_interp_caches();
-      patch_release_interp_plan_cache();
-#endif
-      Parallel::KillBlocks(PatL[lev]);
-      PatL[lev]->destroyList();
-      PatL[lev] = tmPat;
+      //    checkPatchList(tmPat,true);
+      bool CC = (lev > trfls);
+      Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
+
+      Parallel::KillBlocks(PatL[lev]);
+      PatL[lev]->destroyList();
+      PatL[lev] = tmPat;
 #if (RPB == 1)
 #error "not support yet"
 #endif
@@ -1536,20 +1518,13 @@ void cgh::recompose_cgh_Onelevel(int nprocs, int lev,
  tmPat = construct_patchlist(lev, Symmetry);
  // tmPat construction completes
  Parallel::distribute(tmPat, nprocs, ingfs, fngfs, false);
-  //    checkPatchList(tmPat,true);
-  bool CC = (lev > trfls);
-  Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
-
-#ifdef USE_GPU
-  bssn_gpu_clear_cached_device_buffers();
-  bssn_gpu_release_pinned_host_buffers();
-  bssn_cuda_release_rk4_caches();
-  bssn_cuda_release_interp_caches();
-  patch_release_interp_plan_cache();
-#endif
-  Parallel::KillBlocks(PatL[lev]);
-  PatL[lev]->destroyList();
-  PatL[lev] = tmPat;
+  //    checkPatchList(tmPat,true);
+  bool CC = (lev > trfls);
+  Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
+
+  Parallel::KillBlocks(PatL[lev]);
+  PatL[lev]->destroyList();
+  PatL[lev] = tmPat;
 }
 #elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
 #warning "recompose_cgh_Onelevel is not implimented yet"
@@ -1565,21 +1540,14 @@ void cgh::recompose_cgh_Onelevel(int nprocs, int lev,
  // tmPat construction completes
  Parallel::distribute(tmPat, end_rank[lev] - start_rank[lev] + 1, ingfs, fngfs, false, start_rank[lev], end_rank[lev]);
  misc::tillherecheck(Commlev[lev], start_rank[lev], "after distribute");
-  //    checkPatchList(tmPat,true);
-  bool CC = (lev > trfls);
-  Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
-  misc::tillherecheck(Commlev[lev], start_rank[lev], "after fill_level_data");
-
-#ifdef USE_GPU
-  bssn_gpu_clear_cached_device_buffers();
-  bssn_gpu_release_pinned_host_buffers();
-  bssn_cuda_release_rk4_caches();
-  bssn_cuda_release_interp_caches();
-  patch_release_interp_plan_cache();
-#endif
-  Parallel::KillBlocks(PatL[lev]);
-  PatL[lev]->destroyList();
-  PatL[lev] = tmPat;
+  //    checkPatchList(tmPat,true);
+  bool CC = (lev > trfls);
+  Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
+  misc::tillherecheck(Commlev[lev], start_rank[lev], "after fill_level_data");
+
+  Parallel::KillBlocks(PatL[lev]);
+  PatL[lev]->destroyList();
+  PatL[lev] = tmPat;
 }


--- a/AMSS_NCKU_source/fdderivs_c.C
+++ b/AMSS_NCKU_source/fdderivs_c.C
@@ -41,8 +41,8 @@ void fdderivs(const int ex[3],
    const size_t nz = (size_t)ex3 + 2;
    const size_t fh_size = nx * ny * nz;

-    static double *fh = NULL;
-    static size_t cap = 0;
+    static thread_local double *fh = NULL;
+    static thread_local size_t cap = 0;

    if (fh_size > cap) {
        free(fh);
--- a/AMSS_NCKU_source/fderivs_c.C
+++ b/AMSS_NCKU_source/fderivs_c.C
@@ -50,8 +50,8 @@ void fderivs(const int ex[3],
    const size_t ny = (size_t)ex2 + 2;
    const size_t nz = (size_t)ex3 + 2;
    const size_t fh_size = nx * ny * nz;
-    static double *fh = NULL;
-    static size_t cap = 0;
+    static thread_local double *fh = NULL;
+    static thread_local size_t cap = 0;

    if (fh_size > cap) {
        free(fh);
--- a/AMSS_NCKU_source/lopsided_kodis_c.C
+++ b/AMSS_NCKU_source/lopsided_kodis_c.C
@@ -43,7 +43,13 @@ void lopsided_kodis(const int ex[3],
    const size_t nz = (size_t)ex3 + 3;
    const size_t fh_size = nx * ny * nz;

-    double *fh = (double*)malloc(fh_size * sizeof(double));
+    static thread_local double *fh = NULL;
+    static thread_local size_t cap = 0;
+    if (fh_size > cap) {
+        free(fh);
+        fh = (double*)aligned_alloc(64, fh_size * sizeof(double));
+        cap = fh_size;
+    }
    if (!fh) return;

    symmetry_bd(3, ex, f, fh, SoA);
@@ -243,6 +249,4 @@ void lopsided_kodis(const int ex[3],
            }
        }
    }
-
-    free(fh);
 }
--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -16,7 +16,7 @@ PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
 ifeq ($(PGO_MODE),instrument)
 ## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
 CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
+              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) $(OPENMP_FLAGS)
 f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
              -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
 else
@@ -26,12 +26,12 @@ else


 CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
+              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) $(OPENMP_FLAGS)
 f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
 endif
-
-.SUFFIXES: .o .f90 .C .for .cu
+
+.SUFFIXES: .o .f90 .C .for .cu

 .f90.o:
 	$(f90) $(f90appflags) -c $< -o $@
@@ -105,12 +105,13 @@ C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
           Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
 	   NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o
 	   
-C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
-           cgh.o bssn_class.o surface_integral.o ShellPatch.o\
-	   bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
-	   bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
-           Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
-	   NullShellPatch2_Evo.o bssn_cuda_step.o writefile_f.o
+C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
+           cgh.o surface_integral.o ShellPatch.o\
+	   bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
+	   bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
+           Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
+	   NullShellPatch2_Evo.o \
+	   bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o

 F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
 	   prolongrestrict_cell.o prolongrestrict_vertex.o\
@@ -142,7 +143,7 @@ initial_guess.o Newton.o Jacobian.o ilucg.o IntPnts0.o IntPnts.o

 TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o 

-CUDAFILES = bssn_gpu.o bssn_cuda_ops.o
+CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o

 # file dependences
 $(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -9,7 +9,6 @@ filein  = -I/usr/include/ -I${MKLROOT}/include
 ## Using sequential MKL (OpenMP disabled for better single-threaded performance)
 ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
 LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
-CUDA_LDLIBS = -L/usr/local/cuda-12.9/targets/x86_64-linux/lib -lcudart

 ## Memory allocator switch
 ##   1 (default) : link Intel oneTBB allocator (libtbbmalloc)
@@ -25,13 +24,14 @@ ifeq ($(USE_TBBMALLOC),1)
 LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
 endif

-LDLIBS := $(CUDA_LDLIBS) $(LDLIBS)
-
 ## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
 ##   opt        : (default) maximum performance with PGO profile-guided optimization
 ##   instrument : PGO Phase 1 instrumentation to collect fresh profile data
 PGO_MODE ?= opt

+## OpenMP switch for C/C++ kernels
+OPENMP_FLAGS ?= -qopenmp
+
 ## Interp_Points load balance profiling mode
 ##   off        : (default) no load balance instrumentation
 ##   profile    : Pass 1 — instrument Interp_Points to collect timing profile
--- a/AMSS_NCKU_source/scalar_class.C
+++ b/AMSS_NCKU_source/scalar_class.C
@@ -317,9 +317,9 @@ void scalar_class::Setup_Initial_Data()
 #endif
  }
 }
-void scalar_class::Evolve(int Steps)
-{
-  clock_t prev_clock, curr_clock;
+void scalar_class::Evolve(int Steps)
+{
+  double prev_clock = 0.0, curr_clock = 0.0;
  double LastDump = 0.0, LastCheck = 0.0;
  LastAnas = 0;

@@ -327,8 +327,8 @@ void scalar_class::Evolve(int Steps)

  for (int ncount = 1; ncount < Steps + 1; ncount++)
  {
-    if (myrank == 0)
-      curr_clock = clock();
+    if (myrank == 0)
+      curr_clock = MPI_Wtime();
    RecursiveStep(0);

    LastDump += dT_mon;
@@ -343,13 +343,13 @@ void scalar_class::Evolve(int Steps)
 #endif
      LastDump = 0;
    }
-    if (myrank == 0)
-    {
-      prev_clock = curr_clock;
-      curr_clock = clock();
-      cout << " Timestep # " << ncount << ": integrating to time: " << PhysTime
-           << " Computer used " << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) << " seconds! " << endl;
-    }
+    if (myrank == 0)
+    {
+      prev_clock = curr_clock;
+      curr_clock = MPI_Wtime();
+      cout << " Timestep # " << ncount << ": integrating to time: " << PhysTime
+           << " Computer used " << (curr_clock - prev_clock) << " seconds! " << endl;
+    }
    if (PhysTime >= TotalTime)
      break;
  }
--- a/AMSS_NCKU_source/surface_integral.C
+++ b/AMSS_NCKU_source/surface_integral.C
@@ -180,64 +180,19 @@ surface_integral::surface_integral(int iSymmetry) : Symmetry(iSymmetry)
 //|============================================================================
 //| Destructor
 //|============================================================================
-surface_integral::~surface_integral()
-{
-  release_cached_buffers();
-  delete[] nx_g;
-  delete[] ny_g;
-  delete[] nz_g;
-  delete[] arcostheta;
-#ifdef GaussInt
-  delete[] wtcostheta;
-#endif
-}
-
-void surface_integral::get_surface_points(double rex, double **pox)
-{
-  SpherePointCache &cache = sphere_point_cache[rex];
-  if (!cache.pox[0])
-  {
-    for (int i = 0; i < 3; ++i)
-      cache.pox[i] = new double[n_tot];
-    for (int n = 0; n < n_tot; ++n)
-    {
-      cache.pox[0][n] = rex * nx_g[n];
-      cache.pox[1][n] = rex * ny_g[n];
-      cache.pox[2][n] = rex * nz_g[n];
-    }
-  }
-
-  pox[0] = cache.pox[0];
-  pox[1] = cache.pox[1];
-  pox[2] = cache.pox[2];
-}
-
-double *surface_integral::get_shellf_buffer(int num_var)
-{
-  double *&buffer = shellf_cache[num_var];
-  if (!buffer)
-    buffer = new double[n_tot * num_var];
-  return buffer;
-}
-
-void surface_integral::release_cached_buffers()
-{
-  for (map<double, SpherePointCache>::iterator it = sphere_point_cache.begin(); it != sphere_point_cache.end(); ++it)
-  {
-    delete[] it->second.pox[0];
-    delete[] it->second.pox[1];
-    delete[] it->second.pox[2];
-    it->second.pox[0] = it->second.pox[1] = it->second.pox[2] = 0;
-  }
-  sphere_point_cache.clear();
-
-  for (map<int, double *>::iterator it = shellf_cache.begin(); it != shellf_cache.end(); ++it)
-    delete[] it->second;
-  shellf_cache.clear();
-}
-//|----------------------------------------------------------------
-//  spin weighted spinw component of psi4, general routine
-//  l takes from spinw to maxl; m takes from -l to l
+surface_integral::~surface_integral()
+{
+  delete[] nx_g;
+  delete[] ny_g;
+  delete[] nz_g;
+  delete[] arcostheta;
+#ifdef GaussInt
+  delete[] wtcostheta;
+#endif
+}
+//|----------------------------------------------------------------
+//  spin weighted spinw component of psi4, general routine
+//  l takes from spinw to maxl; m takes from -l to l
 //|----------------------------------------------------------------
 void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,
                                 int spinw, int maxl, int NN, double *RP, double *IP,
@@ -254,9 +209,16 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
  MyList<var> *DG_List = new MyList<var>(Rpsi4);
  DG_List->insert(Ipsi4);

-  int n;
-  double *pox[3];
-  get_surface_points(rex, pox);
+  int n;
+  double *pox[3];
+  for (int i = 0; i < 3; i++)
+    pox[i] = new double[n_tot];
+  for (n = 0; n < n_tot; n++)
+  {
+    pox[0][n] = rex * nx_g[n];
+    pox[1][n] = rex * ny_g[n];
+    pox[2][n] = rex * nz_g[n];
+  }

  int mp, Lp, Nmin, Nmax;
  mp = n_tot / cpusize;
@@ -272,7 +234,8 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
    Nmax = Nmin + mp - 1;
  }

-  double *shellf = get_shellf_buffer(InList);
+  double *shellf;
+  shellf = new double[n_tot * InList];

  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);

@@ -412,10 +375,14 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *

  //|------= Free memory.

-  delete[] RP_out;
-  delete[] IP_out;
-  DG_List->clearList();
-}
+  delete[] pox[0];
+  delete[] pox[1];
+  delete[] pox[2];
+  delete[] shellf;
+  delete[] RP_out;
+  delete[] IP_out;
+  DG_List->clearList();
+}
 void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,
                                 int spinw, int maxl, int NN, double *RP, double *IP,
                                 monitor *Monitor, MPI_Comm Comm_here) // NN is the length of RP and IP
@@ -435,11 +402,19 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
  MyList<var> *DG_List = new MyList<var>(Rpsi4);
  DG_List->insert(Ipsi4);

-  int n;
-  double *pox[3];
-  get_surface_points(rex, pox);
-
-  double *shellf = get_shellf_buffer(InList);
+  int n;
+  double *pox[3];
+  for (int i = 0; i < 3; i++)
+    pox[i] = new double[n_tot];
+  for (n = 0; n < n_tot; n++)
+  {
+    pox[0][n] = rex * nx_g[n];
+    pox[1][n] = rex * ny_g[n];
+    pox[2][n] = rex * nz_g[n];
+  }
+
+  double *shellf;
+  shellf = new double[n_tot * InList];

  //    misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Interp_Points");

@@ -602,10 +577,14 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *

  //|------= Free memory.

-  delete[] RP_out;
-  delete[] IP_out;
-  DG_List->clearList();
-}
+  delete[] pox[0];
+  delete[] pox[1];
+  delete[] pox[2];
+  delete[] shellf;
+  delete[] RP_out;
+  delete[] IP_out;
+  DG_List->clearList();
+}
 //|----------------------------------------------------------------
 //  for shell patch
 //|----------------------------------------------------------------
@@ -618,11 +597,19 @@ void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH, var *Rpsi4
  MyList<var> *DG_List = new MyList<var>(Rpsi4);
  DG_List->insert(Ipsi4);

-  int n;
-  double *pox[3];
-  get_surface_points(rex, pox);
+  int n;
+  double *pox[3];
+  for (int i = 0; i < 3; i++)
+    pox[i] = new double[n_tot];
+  for (n = 0; n < n_tot; n++)
+  {
+    pox[0][n] = rex * nx_g[n];
+    pox[1][n] = rex * ny_g[n];
+    pox[2][n] = rex * nz_g[n];
+  }

-  double *shellf = get_shellf_buffer(InList);
+  double *shellf;
+  shellf = new double[n_tot * InList];

  GH->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);

@@ -2583,8 +2570,12 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
  Rout[5] = sy;
  Rout[6] = sz;

-  DG_List->clearList();
-}
+  delete[] pox[0];
+  delete[] pox[1];
+  delete[] pox[2];
+  delete[] shellf;
+  DG_List->clearList();
+}
 void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var *trK,
                                     var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
                                     var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
@@ -2646,11 +2637,19 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
  DG_List->insert(Ayz);
  DG_List->insert(Azz);

-  int n;
-  double *pox[3];
-  get_surface_points(rex, pox);
-
-  double *shellf = get_shellf_buffer(InList);
+  int n;
+  double *pox[3];
+  for (int i = 0; i < 3; i++)
+    pox[i] = new double[n_tot];
+  for (n = 0; n < n_tot; n++)
+  {
+    pox[0][n] = rex * nx_g[n];
+    pox[1][n] = rex * ny_g[n];
+    pox[2][n] = rex * nz_g[n];
+  }
+
+  double *shellf;
+  shellf = new double[n_tot * InList];

  // we have assumed there is only one box on this level,
  // so we do not need loop boxes
@@ -2840,8 +2839,12 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
  Rout[5] = sy;
  Rout[6] = sz;

-  DG_List->clearList();
-}
+  delete[] pox[0];
+  delete[] pox[1];
+  delete[] pox[2];
+  delete[] shellf;
+  DG_List->clearList();
+}
 //|----------------------------------------------------------------
 //  for shell patch
 //|----------------------------------------------------------------
--- a/AMSS_NCKU_source/surface_integral.h
+++ b/AMSS_NCKU_source/surface_integral.h
@@ -20,41 +20,25 @@ using namespace std;
 #include "cgh.h"
 #include "ShellPatch.h"
 #include "NullShellPatch.h"
-#include "NullShellPatch2.h"
-#include "var.h"
-#include "monitor.h"
-#include <map>
+#include "NullShellPatch2.h"
+#include "var.h"
+#include "monitor.h"

 class surface_integral
 {

-private:
-	struct SpherePointCache
-	{
-		double *pox[3];
-		SpherePointCache()
-		{
-			pox[0] = pox[1] = pox[2] = 0;
-		}
-	};
-
-	int Symmetry, factor;
-	int N_theta, N_phi; // Number of points in Theta & Phi directions
-	double dphi, dcostheta;
-	double *arcostheta, *wtcostheta;
-	int n_tot; // size of arrays
-
-	double *nx_g, *ny_g, *nz_g; // global list of unit normals
-	int myrank, cpusize;
-	map<double, SpherePointCache> sphere_point_cache;
-	map<int, double *> shellf_cache;
-
-	void get_surface_points(double rex, double **pox);
-	double *get_shellf_buffer(int num_var);
-	void release_cached_buffers();
-
-public:
-	surface_integral(int iSymmetry);
+private:
+	int Symmetry, factor;
+	int N_theta, N_phi; // Number of points in Theta & Phi directions
+	double dphi, dcostheta;
+	double *arcostheta, *wtcostheta;
+	int n_tot; // size of arrays
+
+	double *nx_g, *ny_g, *nz_g; // global list of unit normals
+	int myrank, cpusize;
+
+public:
+	surface_integral(int iSymmetry);
 	~surface_integral();

 	void surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -55,47 +55,43 @@ NUMACTL_CPU_BIND = get_last_n_cores_per_socket(n=32)
 BUILD_JOBS = 64


-##################################################################
+def build_abe_runtime_env():
+    """Inject OpenMP runtime settings only for the main ABE evolution run."""
+    runtime_env = os.environ.copy()
+    omp_threads = max(1, int(getattr(input_data, "OMP_Threads", 1)))
+    runtime_env["OMP_NUM_THREADS"] = str(omp_threads)
+    return runtime_env


-##################################################################
+def build_twopuncture_runtime_env():
+    """Let TwoPunctureABE use the runtime default instead of the ABE OMP override."""
+    runtime_env = os.environ.copy()
+    runtime_env.pop("OMP_NUM_THREADS", None)
+    runtime_env.pop("OMP_THREAD_LIMIT", None)
+    return runtime_env

-def prepare_gpu_runtime_env():
-    """
-    Create a user-private CUDA MPS environment for GPU runs.

-    On shared machines another user's daemon may already occupy the default
-    /tmp/nvidia-mps pipe directory, which makes plain cudaSetDevice/cudaMalloc
-    fail with cudaErrorMpsConnectionFailed.  Binding AMSS-NCKU to a private
-    pipe directory avoids cross-user interference.
-    """
-    env = os.environ.copy()
+def build_mpi_launch_args():
+    """Build optional host-distribution arguments for mpirun."""
+    hosts = list(getattr(input_data, "MPI_hosts", []))
+    ppn = int(getattr(input_data, "MPI_processes_per_node", 0))

-    pipe_dir = env.get("CUDA_MPS_PIPE_DIRECTORY", f"/tmp/amss-ncku-mps-{os.getuid()}")
-    log_dir  = env.get("CUDA_MPS_LOG_DIRECTORY",  f"/tmp/amss-ncku-mps-log-{os.getuid()}")
+    if not hosts:
+        return ""

-    os.makedirs(pipe_dir, exist_ok=True)
-    os.makedirs(log_dir, exist_ok=True)
+    if ppn > 0:
+        expected = len(hosts) * ppn
+        if int(input_data.MPI_processes) != expected:
+            raise ValueError(
+                f"MPI_processes={input_data.MPI_processes} does not match "
+                f"len(MPI_hosts) * MPI_processes_per_node = {expected}"
+            )

-    env["CUDA_MPS_PIPE_DIRECTORY"] = pipe_dir
-    env["CUDA_MPS_LOG_DIRECTORY"] = log_dir
+    launch_args = f"-hosts {','.join(hosts)}"
+    if ppn > 0:
+        launch_args += f" -ppn {ppn}"
+    return launch_args

-    control_socket = os.path.join(pipe_dir, "control")
-    if not os.path.exists(control_socket):
-        start = subprocess.run(
-            ["nvidia-cuda-mps-control", "-d"],
-            env=env,
-            stdout=subprocess.DEVNULL,
-            stderr=subprocess.DEVNULL,
-        )
-        if start.returncode != 0:
-            print(f" Warning: failed to start private CUDA MPS daemon in {pipe_dir}")
-        else:
-            print(f" Using private CUDA MPS pipe directory: {pipe_dir}")
-    else:
-        print(f" Using existing private CUDA MPS pipe directory: {pipe_dir}")
-
-    return env

 ##################################################################

@@ -186,21 +182,27 @@ def run_ABE():
    print(                                                      )
    print( " Running the AMSS-NCKU executable file ABE/ABEGPU " ) 
    print(                                                      )
+    print( f" MPI processes = {input_data.MPI_processes}, OMP threads per process = {max(1, int(getattr(input_data, 'OMP_Threads', 1)))}" )
+    if getattr(input_data, "MPI_hosts", []):
+        print( f" MPI hosts = {getattr(input_data, 'MPI_hosts', [])}, MPI ranks per node = {int(getattr(input_data, 'MPI_processes_per_node', 0))}" )
+        print( " Multi-node runs require the working directory to be visible on all MPI hosts. " )
+    print(                                                                                                      )

    ## Define the command to run; cast other values to strings as needed
+    mpi_launch_args = build_mpi_launch_args()
    
-    run_env = None
-
    if (input_data.GPU_Calculation == "no"):
-        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
+        mpi_command         = NUMACTL_CPU_BIND + " mpirun "
+        if mpi_launch_args:
+            mpi_command += mpi_launch_args + " "
+        mpi_command += "-np " + str(input_data.MPI_processes) + " ./ABE"
        #mpi_command         = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
        mpi_command_outfile = "ABE_out.log"
    elif (input_data.GPU_Calculation == "yes"):
-        run_env = prepare_gpu_runtime_env()
-        if int(input_data.MPI_processes) == 1:
-            mpi_command = "./ABEGPU"
-        else:
-            mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
+        mpi_command         = NUMACTL_CPU_BIND + " mpirun "
+        if mpi_launch_args:
+            mpi_command += mpi_launch_args + " "
+        mpi_command += "-np " + str(input_data.MPI_processes) + " ./ABEGPU"
        mpi_command_outfile = "ABEGPU_out.log"
 
    ## Execute the MPI command and stream output
@@ -210,7 +212,7 @@ def run_ABE():
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
-        env=run_env,
+        env=build_abe_runtime_env(),
    )

    ## Write ABE run output to file while printing to stdout
@@ -251,7 +253,14 @@ def run_TwoPunctureABE():
    TwoPuncture_command_outfile = "TwoPunctureABE_out.log"

    ## Execute the command with subprocess.Popen and stream output
-    TwoPuncture_process = subprocess.Popen(TwoPuncture_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
+    TwoPuncture_process = subprocess.Popen(
+        TwoPuncture_command,
+        shell=True,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+        env=build_twopuncture_runtime_env(),
+    )

    ## Write TwoPunctureABE run output to file while printing to stdout
    with open(TwoPuncture_command_outfile, 'w') as file0:  
--- a/setup.py
+++ b/setup.py
@@ -65,10 +65,14 @@ def print_input_data( File_directory ):
        
    print( "------------------------------------------------------------------------------------------" )
    print(                                                                                           )
-    print( " Printing the basic parameter and setting in the AMSS-NCKU simulation "                  )
-    print(                                                                                           )
-    print( " The number of MPI processes in the AMSS-NCKU simulation = ", input_data.MPI_processes   ) 
-    print(                                                                                           )
+    print( " Printing the basic parameter and setting in the AMSS-NCKU simulation "                  )
+    print(                                                                                           )
+    print( " The number of MPI processes in the AMSS-NCKU simulation = ", input_data.MPI_processes   )
+    print( " The number of OMP threads per MPI process         = ", input_data.OMP_Threads           )
+    if getattr(input_data, "MPI_hosts", []):
+        print( " The MPI host list in the AMSS-NCKU simulation   = ", input_data.MPI_hosts             )
+        print( " The number of MPI ranks launched per host       = ", input_data.MPI_processes_per_node )
+    print(                                                                                           )
    print( " The form of computational equation  = ",            input_data.Equation_Class           )
    print( " The initial data in this simulation = ",            input_data.Initial_Data_Method      )
    print(                                                                                           )
@@ -140,10 +144,14 @@ def print_input_data( File_directory ):
    file0    = open(filepath, 'w')
    
    print(                                                                                              file=file0 )
-    print( " Printing the basic parameter and setting in the AMSS-NCKU simulation ",                    file=file0 )
-    print(                                                                                              file=file0 )
-    print( " The number of MPI processes in the AMSS-NCKU simulation = ", input_data.MPI_processes,     file=file0 ) 
-    print(                                                                                              file=file0 )
+    print( " Printing the basic parameter and setting in the AMSS-NCKU simulation ",                    file=file0 )
+    print(                                                                                              file=file0 )
+    print( " The number of MPI processes in the AMSS-NCKU simulation = ", input_data.MPI_processes,     file=file0 )
+    print( " The number of OMP threads per MPI process         = ", input_data.OMP_Threads,             file=file0 )
+    if getattr(input_data, "MPI_hosts", []):
+        print( " The MPI host list in the AMSS-NCKU simulation   = ", input_data.MPI_hosts,               file=file0 )
+        print( " The number of MPI ranks launched per host       = ", input_data.MPI_processes_per_node,   file=file0 )
+    print(                                                                                              file=file0 )
    print( " The form of computational equation  = ",            input_data.Equation_Class,             file=file0 )
    print( " The initial data in this simulation = ",            input_data.Initial_Data_Method,        file=file0 )
    print(                                                                                              file=file0 )
Author	SHA1	Message	Date
CGH0S7	d96ca6ed2a	Add two-node MPI launch configuration	2026-03-30 21:13:46 +08:00
CGH0S7	60ad63e8cc	Isolate TwoPuncture from ABE OMP settings	2026-03-30 21:00:20 +08:00
CGH0S7	087d034ee3	Use wall time for timestep logging	2026-03-30 20:38:41 +08:00
CGH0S7	5f664716ab	Enable OpenMP task parallelism for C kernels	2026-03-30 20:34:34 +08:00