prolong3: 减少 Z-pass 冗余计算

prolong3: 提升 cache 命中率
修改prolong
2026-03-02 21:20:49 +08:00 · 2026-03-02 10:31:46 +08:00 · 2026-03-02 02:01:07 +08:00 · 2026-03-02 01:16:10 +08:00
25 changed files with 1877 additions and 7926 deletions
--- a/AMSS_NCKU_source/ABE.C
+++ b/AMSS_NCKU_source/ABE.C
@@ -23,20 +23,22 @@ using namespace std;
 #include <mpi.h>

 #include "misc.h"
-#include "macrodef.h"
-#ifdef USE_GPU
-extern void bssn_cuda_dump_stage_profile();
-#endif
+#include "macrodef.h"

 #ifndef ABEtype
 #error "not define ABEtype"
 #endif

-#if (ABEtype == 0)
-#include "bssn_class.h"
-
-#elif (ABEtype == 1)
-#include "bssnEScalar_class.h"
+#if (ABEtype == 0)
+
+#ifdef USE_GPU
+#include "bssn_gpu_class.h"
+#else
+#include "bssn_class.h"
+#endif
+
+#elif (ABEtype == 1)
+#include "bssnEScalar_class.h"

 #elif (ABEtype == 2)
 #include "Z4c_class.h"
@@ -472,13 +474,10 @@ int main(int argc, char *argv[])
            cout << endl;
      }

-	      ADM->Evolve(Steps);
-#ifdef USE_GPU
-	      bssn_cuda_dump_stage_profile();
-#endif
-	
-	      if (myrank == 0)
-	      {
+      ADM->Evolve(Steps);
+
+      if (myrank == 0)
+      {
            cout << endl;
            cout << " Total Evolve Time: "  << MPI_Wtime() - End_clock   << " seconds!" << endl;
            cout << " Total Running Time: " << MPI_Wtime() - Begin_clock << " seconds!" << endl;
--- a/AMSS_NCKU_source/Block.C
+++ b/AMSS_NCKU_source/Block.C
@@ -9,12 +9,8 @@
 #include <new>
 using namespace std;

-#include "Block.h"
-#include "misc.h"
-#ifdef USE_GPU
-#include "bssn_gpu.h"
-#include "bssn_cuda_ops.h"
-#endif
+#include "Block.h"
+#include "misc.h"

 Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fngfsi, int levi, const int cgpui) : rank(ranki), ingfs(ingfsi), fngfs(fngfsi), lev(levi), cgpu(cgpui)
 {
@@ -99,19 +95,14 @@ Block::Block(int DIM, int *shapei, double *bboxi, int ranki, int ingfsi, int fng
  }
 #endif
 }
-Block::~Block()
-{
-  int myrank;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-  if (myrank == rank)
-  {
-#ifdef USE_GPU
-    bssn_gpu_clear_cached_device_buffers();
-    bssn_cuda_release_rk4_caches();
-    bssn_cuda_release_interp_caches();
-#endif
-    for (int i = 0; i < dim; i++)
-      delete[] X[i];
+Block::~Block()
+{
+  int myrank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  if (myrank == rank)
+  {
+    for (int i = 0; i < dim; i++)
+      delete[] X[i];
    for (int i = 0; i < ingfs; i++)
      free(igfs[i]);
    delete[] igfs;
--- a/AMSS_NCKU_source/MPatch.C
+++ b/AMSS_NCKU_source/MPatch.C
@@ -2,558 +2,23 @@
 #include <iostream>
 #include <iomanip>
 #include <fstream>
-#include <cstdlib>
-#include <cstdio>
-#include <string>
-#include <cmath>
-#include <new>
-#include <map>
-#include <vector>
-using namespace std;
+#include <cstdlib>
+#include <cstdio>
+#include <string>
+#include <cmath>
+#include <new>
+using namespace std;

-#include "misc.h"
-#include "MPatch.h"
-#include "Parallel.h"
-#include "fmisc.h"
-#include "bssn_cuda_ops.h"
-#ifdef INTERP_LB_PROFILE
-#include "interp_lb_profile.h"
-#endif
-
-#if defined(__GNUC__) || defined(__clang__)
-extern int bssn_cuda_interp_points_batch(const int *ex,
-                                         const double *X, const double *Y, const double *Z,
-                                         const double *const *fields,
-                                         const double *soa_flat,
-                                         int num_var,
-                                         const double *px, const double *py, const double *pz,
-                                         int num_points,
-                                         int ordn,
-                                         int symmetry,
-                                         double *out) __attribute__((weak));
-#endif
-
-namespace
-{
-struct InterpVarDesc
-{
-  int sgfn;
-  double soa[dim];
-};
-
-struct InterpPlanKey
-{
-  const Patch *patch;
-  const double *x;
-  const double *y;
-  const double *z;
-  int NN;
-  int Symmetry;
-  int myrank;
-};
-
-struct InterpPlanKeyLess
-{
-  bool operator()(const InterpPlanKey &lhs, const InterpPlanKey &rhs) const
-  {
-    if (lhs.patch != rhs.patch) return lhs.patch < rhs.patch;
-    if (lhs.x != rhs.x) return lhs.x < rhs.x;
-    if (lhs.y != rhs.y) return lhs.y < rhs.y;
-    if (lhs.z != rhs.z) return lhs.z < rhs.z;
-    if (lhs.NN != rhs.NN) return lhs.NN < rhs.NN;
-    if (lhs.Symmetry != rhs.Symmetry) return lhs.Symmetry < rhs.Symmetry;
-    return lhs.myrank < rhs.myrank;
-  }
-};
-
-struct CachedInterpPlan
-{
-  int nblocks;
-  vector<int> owner_rank;
-  vector<int> owner_block;
-  vector<vector<int> > block_points;
-  vector<vector<double> > block_px;
-  vector<vector<double> > block_py;
-  vector<vector<double> > block_pz;
-
-  CachedInterpPlan() : nblocks(0) {}
-};
-
-struct CachedInterpPlanEntry
-{
-  bool valid;
-  InterpPlanKey key;
-  vector<double> xvals;
-  vector<double> yvals;
-  vector<double> zvals;
-  CachedInterpPlan plan;
-
-  CachedInterpPlanEntry() : valid(false) {}
-};
-
-struct InterpBlockView
-{
-  Block *bp;
-  double llb[dim];
-  double uub[dim];
-};
-
-struct BlockBinIndex
-{
-  int bins[dim];
-  double lo[dim];
-  double inv[dim];
-  vector<InterpBlockView> views;
-  vector<vector<int>> bin_to_blocks;
-  bool valid;
-
-  BlockBinIndex() : valid(false)
-  {
-    for (int i = 0; i < dim; i++)
-    {
-      bins[i] = 1;
-      lo[i] = 0.0;
-      inv[i] = 0.0;
-    }
-  }
-};
-
-inline int clamp_int(int v, int lo, int hi)
-{
-  return (v < lo) ? lo : ((v > hi) ? hi : v);
-}
-
-inline int coord_to_bin(double x, double lo, double inv, int nb)
-{
-  if (nb <= 1 || inv <= 0.0)
-    return 0;
-  int b = int(floor((x - lo) * inv));
-  return clamp_int(b, 0, nb - 1);
-}
-
-inline int bin_loc(const BlockBinIndex &index, int b0, int b1, int b2)
-{
-  return b0 + index.bins[0] * (b1 + index.bins[1] * b2);
-}
-
-inline bool point_in_block_view(const InterpBlockView &view, const double *pox, const double *DH)
-{
-  for (int i = 0; i < dim; i++)
-  {
-    if (pox[i] - view.llb[i] < -DH[i] / 2 || pox[i] - view.uub[i] > DH[i] / 2)
-      return false;
-  }
-  return true;
-}
-
-void build_block_bin_index(Patch *patch, const double *DH, BlockBinIndex &index)
-{
-  index = BlockBinIndex();
-
-  MyList<Block> *Bp = patch->blb;
-  while (Bp)
-  {
-    Block *BP = Bp->data;
-    InterpBlockView view;
-    view.bp = BP;
-    for (int i = 0; i < dim; i++)
-    {
-#ifdef Vertex
-#ifdef Cell
-#error Both Cell and Vertex are defined
+#include "misc.h"
+#include "MPatch.h"
+#include "Parallel.h"
+#include "fmisc.h"
+#ifdef INTERP_LB_PROFILE
+#include "interp_lb_profile.h"
 #endif
-      view.llb[i] = (feq(BP->bbox[i], patch->bbox[i], DH[i] / 2)) ? BP->bbox[i] + patch->lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
-      view.uub[i] = (feq(BP->bbox[dim + i], patch->bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - patch->uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
-#else
-#ifdef Cell
-      view.llb[i] = (feq(BP->bbox[i], patch->bbox[i], DH[i] / 2)) ? BP->bbox[i] + patch->lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
-      view.uub[i] = (feq(BP->bbox[dim + i], patch->bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - patch->uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
-#else
-#error Not define Vertex nor Cell
-#endif
-#endif
-    }
-    index.views.push_back(view);
-    if (Bp == patch->ble)
-      break;
-    Bp = Bp->next;
-  }

-  const int nblocks = int(index.views.size());
-  if (nblocks <= 0)
-    return;
-
-  int bins_1d = int(ceil(pow(double(nblocks), 1.0 / 3.0)));
-  bins_1d = clamp_int(bins_1d, 1, 32);
-  for (int i = 0; i < dim; i++)
-  {
-    index.bins[i] = bins_1d;
-    index.lo[i] = patch->bbox[i] + patch->lli[i] * DH[i];
-    const double hi = patch->bbox[dim + i] - patch->uui[i] * DH[i];
-    if (hi > index.lo[i] && bins_1d > 1)
-      index.inv[i] = bins_1d / (hi - index.lo[i]);
-    else
-      index.inv[i] = 0.0;
-  }
-
-  index.bin_to_blocks.resize(index.bins[0] * index.bins[1] * index.bins[2]);
-
-  for (int bi = 0; bi < nblocks; bi++)
-  {
-    const InterpBlockView &view = index.views[bi];
-    int bmin[dim], bmax[dim];
-    for (int d = 0; d < dim; d++)
-    {
-      const double low = view.llb[d] - DH[d] / 2;
-      const double up = view.uub[d] + DH[d] / 2;
-      bmin[d] = coord_to_bin(low, index.lo[d], index.inv[d], index.bins[d]);
-      bmax[d] = coord_to_bin(up, index.lo[d], index.inv[d], index.bins[d]);
-      if (bmax[d] < bmin[d])
-      {
-        int t = bmin[d];
-        bmin[d] = bmax[d];
-        bmax[d] = t;
-      }
-    }
-
-    for (int bz = bmin[2]; bz <= bmax[2]; bz++)
-      for (int by = bmin[1]; by <= bmax[1]; by++)
-        for (int bx = bmin[0]; bx <= bmax[0]; bx++)
-          index.bin_to_blocks[bin_loc(index, bx, by, bz)].push_back(bi);
-  }
-
-  index.valid = true;
-}
-
-int find_block_index_for_point(const BlockBinIndex &index, const double *pox, const double *DH)
-{
-  if (!index.valid)
-    return -1;
-
-  const int bx = coord_to_bin(pox[0], index.lo[0], index.inv[0], index.bins[0]);
-  const int by = coord_to_bin(pox[1], index.lo[1], index.inv[1], index.bins[1]);
-  const int bz = coord_to_bin(pox[2], index.lo[2], index.inv[2], index.bins[2]);
-  const vector<int> &cand = index.bin_to_blocks[bin_loc(index, bx, by, bz)];
-
-  for (size_t ci = 0; ci < cand.size(); ci++)
-  {
-    const int bi = cand[ci];
-    if (point_in_block_view(index.views[bi], pox, DH))
-      return bi;
-  }
-
-  // Fallback to full scan for numerical edge cases around bin boundaries.
-  for (size_t bi = 0; bi < index.views.size(); bi++)
-    if (point_in_block_view(index.views[bi], pox, DH))
-      return int(bi);
-
-  return -1;
-}
-
-void collect_interp_vars(MyList<var> *VarList, vector<InterpVarDesc> &vars)
-{
-  vars.clear();
-  MyList<var> *varl = VarList;
-  while (varl)
-  {
-    InterpVarDesc desc;
-    desc.sgfn = varl->data->sgfn;
-    for (int d = 0; d < dim; ++d)
-      desc.soa[d] = varl->data->SoA[d];
-    vars.push_back(desc);
-    varl = varl->next;
-  }
-}
-
-bool should_try_cuda_interp(int ordn, int num_points, int num_var)
-{
-#if defined(__GNUC__) || defined(__clang__)
-  if (!bssn_cuda_interp_points_batch)
-    return false;
-#else
-  return false;
-#endif
-  if (ordn != 6)
-    return false;
-  if (num_points < 32)
-    return false;
-  return num_points * num_var >= 256;
-}
-
-CachedInterpPlanEntry &interp_plan_cache_entry()
-{
-  static CachedInterpPlanEntry cache;
-  return cache;
-}
-
-bool same_interp_plan_key(const InterpPlanKey &lhs, const InterpPlanKey &rhs)
-{
-  return lhs.patch == rhs.patch &&
-         lhs.NN == rhs.NN &&
-         lhs.Symmetry == rhs.Symmetry &&
-         lhs.myrank == rhs.myrank;
-}
-
-bool same_interp_plan_points(const CachedInterpPlanEntry &cache, int NN, double **XX)
-{
-  if (static_cast<int>(cache.xvals.size()) != NN ||
-      static_cast<int>(cache.yvals.size()) != NN ||
-      static_cast<int>(cache.zvals.size()) != NN)
-    return false;
-
-  for (int j = 0; j < NN; ++j)
-  {
-    if (cache.xvals[j] != XX[0][j] ||
-        cache.yvals[j] != XX[1][j] ||
-        cache.zvals[j] != XX[2][j])
-      return false;
-  }
-  return true;
-}
-
-CachedInterpPlan &get_cached_interp_plan(Patch *patch,
-                                         int NN, double **XX,
-                                         int Symmetry, int myrank,
-                                         const double *DH,
-                                         const BlockBinIndex &block_index,
-                                         bool report_bounds_here,
-                                         bool allow_missing_points)
-{
-  InterpPlanKey key;
-  key.patch = patch;
-  key.x = XX[0];
-  key.y = XX[1];
-  key.z = XX[2];
-  key.NN = NN;
-  key.Symmetry = Symmetry;
-  key.myrank = myrank;
-
-  CachedInterpPlanEntry &cache = interp_plan_cache_entry();
-  if (cache.valid &&
-      same_interp_plan_key(cache.key, key) &&
-      same_interp_plan_points(cache, NN, XX) &&
-      cache.plan.nblocks == static_cast<int>(block_index.views.size()))
-    return cache.plan;
-
-  cache.valid = true;
-  cache.key = key;
-  cache.xvals.assign(XX[0], XX[0] + NN);
-  cache.yvals.assign(XX[1], XX[1] + NN);
-  cache.zvals.assign(XX[2], XX[2] + NN);
-  cache.plan = CachedInterpPlan();
-  CachedInterpPlan &plan = cache.plan;
-  plan.nblocks = static_cast<int>(block_index.views.size());
-  plan.owner_rank.assign(NN, -1);
-  plan.owner_block.assign(NN, -1);
-  plan.block_points.resize(plan.nblocks);
-  plan.block_px.resize(plan.nblocks);
-  plan.block_py.resize(plan.nblocks);
-  plan.block_pz.resize(plan.nblocks);
-
-  for (int j = 0; j < NN; ++j)
-  {
-    double pox[dim];
-    for (int i = 0; i < dim; ++i)
-    {
-      pox[i] = XX[i][j];
-      if (report_bounds_here &&
-          (XX[i][j] < patch->bbox[i] + patch->lli[i] * DH[i] ||
-           XX[i][j] > patch->bbox[dim + i] - patch->uui[i] * DH[i]))
-      {
-        cout << "Patch::Interp_Points: point (";
-        for (int k = 0; k < dim; ++k)
-        {
-          cout << XX[k][j];
-          if (k < dim - 1)
-            cout << ",";
-          else
-            cout << ") is out of current Patch." << endl;
-        }
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-    }
-
-    const int block_i = find_block_index_for_point(block_index, pox, DH);
-    if (block_i >= 0)
-    {
-      Block *BP = block_index.views[block_i].bp;
-      plan.owner_rank[j] = BP->rank;
-      plan.owner_block[j] = block_i;
-      if (BP->rank == myrank)
-      {
-        plan.block_points[block_i].push_back(j);
-        plan.block_px[block_i].push_back(XX[0][j]);
-        plan.block_py[block_i].push_back(XX[1][j]);
-        plan.block_pz[block_i].push_back(XX[2][j]);
-      }
-    }
-  }
-
-  if (!allow_missing_points && report_bounds_here)
-  {
-    for (int j = 0; j < NN; ++j)
-    {
-      if (plan.owner_rank[j] >= 0)
-        continue;
-      cout << "ERROR: Patch::Interp_Points fails to find point (";
-      for (int d = 0; d < dim; ++d)
-      {
-        cout << XX[d][j];
-        if (d < dim - 1)
-          cout << ",";
-        else
-          cout << ")";
-      }
-      cout << " on Patch (";
-      for (int d = 0; d < dim; ++d)
-      {
-        cout << patch->bbox[d] << "+" << patch->lli[d] * DH[d];
-        if (d < dim - 1)
-          cout << ",";
-        else
-          cout << ")--";
-      }
-      cout << "(";
-      for (int d = 0; d < dim; ++d)
-      {
-        cout << patch->bbox[dim + d] << "-" << patch->uui[d] * DH[d];
-        if (d < dim - 1)
-          cout << ",";
-        else
-          cout << ")" << endl;
-      }
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-  }
-
-  return plan;
-}
-
-void release_interp_plan_cache_internal()
-{
-  CachedInterpPlanEntry &cache = interp_plan_cache_entry();
-  cache.valid = false;
-  cache.xvals.clear();
-  cache.yvals.clear();
-  cache.zvals.clear();
-  cache.plan = CachedInterpPlan();
-}
-
-bool run_cuda_interp_for_block(Block *BP,
-                               const vector<InterpVarDesc> &vars,
-                               const vector<int> &point_ids,
-                               const vector<double> &px,
-                               const vector<double> &py,
-                               const vector<double> &pz,
-                               double *Shellf,
-                               int num_var,
-                               int ordn,
-                               int Symmetry)
-{
-  if (!should_try_cuda_interp(ordn, static_cast<int>(point_ids.size()), num_var))
-    return false;
-
-  vector<const double *> field_ptrs(num_var);
-  vector<double> soa_flat(3 * num_var);
-  for (int v = 0; v < num_var; ++v)
-  {
-    field_ptrs[v] = BP->fgfs[vars[v].sgfn];
-    for (int d = 0; d < dim; ++d)
-      soa_flat[3 * v + d] = vars[v].soa[d];
-  }
-
-  const int npts = static_cast<int>(point_ids.size());
-  vector<double> out(static_cast<size_t>(npts) * static_cast<size_t>(num_var));
-  if (bssn_cuda_interp_points_batch(BP->shape,
-                                    BP->X[0], BP->X[1], BP->X[2],
-                                    field_ptrs.data(),
-                                    soa_flat.data(),
-                                    num_var,
-                                    px.data(), py.data(), pz.data(),
-                                    npts,
-                                    ordn,
-                                    Symmetry,
-                                    out.data()) != 0)
-  {
-    return false;
-  }
-
-  for (int p = 0; p < npts; ++p)
-  {
-    const int j = point_ids[p];
-    memcpy(Shellf + j * num_var, out.data() + p * num_var, sizeof(double) * num_var);
-  }
-  return true;
-}
-
-void run_cpu_interp_for_block(Block *BP,
-                              const vector<InterpVarDesc> &vars,
-                              const vector<int> &point_ids,
-                              const vector<double> &px,
-                              const vector<double> &py,
-                              const vector<double> &pz,
-                              double *Shellf,
-                              int num_var,
-                              int ordn,
-                              int Symmetry)
-{
-  for (size_t p = 0; p < point_ids.size(); ++p)
-  {
-    const int j = point_ids[p];
-    double x = px[p];
-    double y = py[p];
-    double z = pz[p];
-    int ordn_local = ordn;
-    int symmetry_local = Symmetry;
-    for (int v = 0; v < num_var; ++v)
-    {
-      f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2],
-                      BP->fgfs[vars[v].sgfn], Shellf[j * num_var + v],
-                      x, y, z, ordn_local, const_cast<double *>(vars[v].soa), symmetry_local);
-    }
-  }
-}
-
-void interpolate_owned_points(MyList<var> *VarList,
-                              double *Shellf, int Symmetry,
-                              int ordn,
-                              const BlockBinIndex &block_index,
-                              const CachedInterpPlan &plan)
-{
-  vector<InterpVarDesc> vars;
-  collect_interp_vars(VarList, vars);
-  const int num_var = static_cast<int>(vars.size());
-
-  for (size_t bi = 0; bi < plan.block_points.size(); ++bi)
-  {
-    if (plan.block_points[bi].empty())
-      continue;
-
-    Block *BP = block_index.views[bi].bp;
-    bool done = run_cuda_interp_for_block(BP, vars,
-                                          plan.block_points[bi],
-                                          plan.block_px[bi],
-                                          plan.block_py[bi],
-                                          plan.block_pz[bi],
-                                          Shellf, num_var, ordn, Symmetry);
-    if (!done)
-      run_cpu_interp_for_block(BP, vars,
-                               plan.block_points[bi],
-                               plan.block_px[bi],
-                               plan.block_py[bi],
-                               plan.block_pz[bi],
-                               Shellf, num_var, ordn, Symmetry);
-  }
-}
-} // namespace
-
-void patch_release_interp_plan_cache()
-{
-  release_interp_plan_cache_internal();
-}
-
-Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
-{
+Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
+{

  int hbuffer_width = buffer_width;
  if (lev == 0)
@@ -895,15 +360,91 @@ void Patch::Interp_Points(MyList<var> *VarList,

  memset(Shellf, 0, sizeof(double) * NN * num_var);

-  double DH[dim];
-  for (int i = 0; i < dim; i++)
-    DH[i] = getdX(i);
-  BlockBinIndex block_index;
-  build_block_bin_index(this, DH, block_index);
-  CachedInterpPlan &plan = get_cached_interp_plan(this, NN, XX, Symmetry, myrank, DH, block_index, myrank == 0, false);
-  const int *owner_rank = plan.owner_rank.data();
-
-  interpolate_owned_points(VarList, Shellf, Symmetry, ordn, block_index, plan);
+  // owner_rank[j] records which MPI rank owns point j
+  // All ranks traverse the same block list so they all agree on ownership
+  int *owner_rank;
+  owner_rank = new int[NN];
+  for (int j = 0; j < NN; j++)
+    owner_rank[j] = -1;
+
+  double DH[dim], llb[dim], uub[dim];
+  for (int i = 0; i < dim; i++)
+    DH[i] = getdX(i);
+
+  for (int j = 0; j < NN; j++) // run along points
+  {
+    double pox[dim];
+    for (int i = 0; i < dim; i++)
+    {
+      pox[i] = XX[i][j];
+      if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
+      {
+        cout << "Patch::Interp_Points: point (";
+        for (int k = 0; k < dim; k++)
+        {
+          cout << XX[k][j];
+          if (k < dim - 1)
+            cout << ",";
+          else
+            cout << ") is out of current Patch." << endl;
+        }
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+    }
+
+    MyList<Block> *Bp = blb;
+    bool notfind = true;
+    while (notfind && Bp) // run along Blocks
+    {
+      Block *BP = Bp->data;
+
+      bool flag = true;
+      for (int i = 0; i < dim; i++)
+      {
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
+        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
+#else
+#ifdef Cell
+        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
+        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+        if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
+        {
+          flag = false;
+          break;
+        }
+      }
+
+      if (flag)
+      {
+        notfind = false;
+        owner_rank[j] = BP->rank;
+        if (myrank == BP->rank)
+        {
+          //---> interpolation
+          varl = VarList;
+          int k = 0;
+          while (varl) // run along variables
+          {
+            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
+                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
+            varl = varl->next;
+            k++;
+          }
+        }
+      }
+      if (Bp == ble)
+        break;
+      Bp = Bp->next;
+    }
+  }

  // Replace MPI_Allreduce with per-owner MPI_Bcast:
  // Group consecutive points by owner rank and broadcast each group.
@@ -958,8 +499,9 @@ void Patch::Interp_Points(MyList<var> *VarList,
      MPI_Bcast(Shellf + jstart * num_var, count, MPI_DOUBLE, cur_owner, MPI_COMM_WORLD);
    }
  }
-
-}
+
+  delete[] owner_rank;
+}
 void Patch::Interp_Points(MyList<var> *VarList,
                          int NN, double **XX,
                          double *Shellf, int Symmetry,
@@ -987,22 +529,133 @@ void Patch::Interp_Points(MyList<var> *VarList,

  memset(Shellf, 0, sizeof(double) * NN * num_var);

-  double DH[dim];
-  for (int i = 0; i < dim; i++)
-    DH[i] = getdX(i);
-  BlockBinIndex block_index;
-  build_block_bin_index(this, DH, block_index);
-  CachedInterpPlan &plan = get_cached_interp_plan(this, NN, XX, Symmetry, myrank, DH, block_index, myrank == 0, false);
-  const int *owner_rank = plan.owner_rank.data();
-
-  interpolate_owned_points(VarList, Shellf, Symmetry, ordn, block_index, plan);
+  // owner_rank[j] records which MPI rank owns point j
+  int *owner_rank;
+  owner_rank = new int[NN];
+  for (int j = 0; j < NN; j++)
+    owner_rank[j] = -1;
+
+  double DH[dim], llb[dim], uub[dim];
+  for (int i = 0; i < dim; i++)
+    DH[i] = getdX(i);
+
+  // --- Interpolation phase (identical to original) ---
+  for (int j = 0; j < NN; j++)
+  {
+    double pox[dim];
+    for (int i = 0; i < dim; i++)
+    {
+      pox[i] = XX[i][j];
+      if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
+      {
+        cout << "Patch::Interp_Points: point (";
+        for (int k = 0; k < dim; k++)
+        {
+          cout << XX[k][j];
+          if (k < dim - 1)
+            cout << ",";
+          else
+            cout << ") is out of current Patch." << endl;
+        }
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+    }
+
+    MyList<Block> *Bp = blb;
+    bool notfind = true;
+    while (notfind && Bp)
+    {
+      Block *BP = Bp->data;
+
+      bool flag = true;
+      for (int i = 0; i < dim; i++)
+      {
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
+        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
+#else
+#ifdef Cell
+        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
+        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+        if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
+        {
+          flag = false;
+          break;
+        }
+      }
+
+      if (flag)
+      {
+        notfind = false;
+        owner_rank[j] = BP->rank;
+        if (myrank == BP->rank)
+        {
+          varl = VarList;
+          int k = 0;
+          while (varl)
+          {
+            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
+                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
+            varl = varl->next;
+            k++;
+          }
+        }
+      }
+      if (Bp == ble)
+        break;
+      Bp = Bp->next;
+    }
+  }

 #ifdef INTERP_LB_PROFILE
  double t_interp_end = MPI_Wtime();
  double t_interp_local = t_interp_end - t_interp_start;
 #endif

-  // --- Targeted point-to-point communication phase ---
+  // --- Error check for unfound points ---
+  for (int j = 0; j < NN; j++)
+  {
+    if (owner_rank[j] < 0 && myrank == 0)
+    {
+      cout << "ERROR: Patch::Interp_Points fails to find point (";
+      for (int d = 0; d < dim; d++)
+      {
+        cout << XX[d][j];
+        if (d < dim - 1)
+          cout << ",";
+        else
+          cout << ")";
+      }
+      cout << " on Patch (";
+      for (int d = 0; d < dim; d++)
+      {
+        cout << bbox[d] << "+" << lli[d] * DH[d];
+        if (d < dim - 1)
+          cout << ",";
+        else
+          cout << ")--";
+      }
+      cout << "(";
+      for (int d = 0; d < dim; d++)
+      {
+        cout << bbox[dim + d] << "-" << uui[d] * DH[d];
+        if (d < dim - 1)
+          cout << ",";
+        else
+          cout << ")" << endl;
+      }
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+  }
+
+  // --- Targeted point-to-point communication phase ---
  // Compute consumer_rank[j] using the same deterministic formula as surface_integral
  int *consumer_rank = new int[NN];
  {
@@ -1119,8 +772,9 @@ void Patch::Interp_Points(MyList<var> *VarList,
  delete[] send_offset;
  delete[] recv_offset;
  delete[] send_count;
-  delete[] recv_count;
-  delete[] consumer_rank;
+  delete[] recv_count;
+  delete[] consumer_rank;
+  delete[] owner_rank;

 #ifdef INTERP_LB_PROFILE
  {
@@ -1168,20 +822,95 @@ void Patch::Interp_Points(MyList<var> *VarList,

  memset(Shellf, 0, sizeof(double) * NN * num_var);

-  // Build global-to-local rank translation for Comm_here
-  MPI_Group world_group, local_group;
-  MPI_Comm_group(MPI_COMM_WORLD, &world_group);
-  MPI_Comm_group(Comm_here, &local_group);
+  // owner_rank[j] stores the global rank that owns point j
+  int *owner_rank;
+  owner_rank = new int[NN];
+  for (int j = 0; j < NN; j++)
+    owner_rank[j] = -1;

-  double DH[dim];
-  for (int i = 0; i < dim; i++)
-    DH[i] = getdX(i);
-  BlockBinIndex block_index;
-  build_block_bin_index(this, DH, block_index);
-  CachedInterpPlan &plan = get_cached_interp_plan(this, NN, XX, Symmetry, myrank, DH, block_index, lmyrank == 0, true);
-  const int *owner_rank = plan.owner_rank.data();
-
-  interpolate_owned_points(VarList, Shellf, Symmetry, ordn, block_index, plan);
+  // Build global-to-local rank translation for Comm_here
+  MPI_Group world_group, local_group;
+  MPI_Comm_group(MPI_COMM_WORLD, &world_group);
+  MPI_Comm_group(Comm_here, &local_group);
+
+  double DH[dim], llb[dim], uub[dim];
+  for (int i = 0; i < dim; i++)
+    DH[i] = getdX(i);
+
+  for (int j = 0; j < NN; j++) // run along points
+  {
+    double pox[dim];
+    for (int i = 0; i < dim; i++)
+    {
+      pox[i] = XX[i][j];
+      if (lmyrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
+      {
+        cout << "Patch::Interp_Points: point (";
+        for (int k = 0; k < dim; k++)
+        {
+          cout << XX[k][j];
+          if (k < dim - 1)
+            cout << ",";
+          else
+            cout << ") is out of current Patch." << endl;
+        }
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+    }
+
+    MyList<Block> *Bp = blb;
+    bool notfind = true;
+    while (notfind && Bp) // run along Blocks
+    {
+      Block *BP = Bp->data;
+
+      bool flag = true;
+      for (int i = 0; i < dim; i++)
+      {
+#ifdef Vertex
+#ifdef Cell
+#error Both Cell and Vertex are defined
+#endif
+        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + (ghost_width - 0.5) * DH[i];
+        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - (ghost_width - 0.5) * DH[i];
+#else
+#ifdef Cell
+        llb[i] = (feq(BP->bbox[i], bbox[i], DH[i] / 2)) ? BP->bbox[i] + lli[i] * DH[i] : BP->bbox[i] + ghost_width * DH[i];
+        uub[i] = (feq(BP->bbox[dim + i], bbox[dim + i], DH[i] / 2)) ? BP->bbox[dim + i] - uui[i] * DH[i] : BP->bbox[dim + i] - ghost_width * DH[i];
+#else
+#error Not define Vertex nor Cell
+#endif
+#endif
+        if (XX[i][j] - llb[i] < -DH[i] / 2 || XX[i][j] - uub[i] > DH[i] / 2)
+        {
+          flag = false;
+          break;
+        }
+      }
+
+      if (flag)
+      {
+        notfind = false;
+        owner_rank[j] = BP->rank;
+        if (myrank == BP->rank)
+        {
+          //---> interpolation
+          varl = VarList;
+          int k = 0;
+          while (varl) // run along variables
+          {
+            f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
+                            pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
+            varl = varl->next;
+            k++;
+          }
+        }
+      }
+      if (Bp == ble)
+        break;
+      Bp = Bp->next;
+    }
+  }

  // Collect unique global owner ranks and translate to local ranks in Comm_here
  // Then broadcast each owner's points via MPI_Bcast on Comm_here
@@ -1209,9 +938,10 @@ void Patch::Interp_Points(MyList<var> *VarList,
    }
  }

-  MPI_Group_free(&world_group);
-  MPI_Group_free(&local_group);
-}
+  MPI_Group_free(&world_group);
+  MPI_Group_free(&local_group);
+  delete[] owner_rank;
+}
 void Patch::checkBlock()
 {
  int myrank;
--- a/AMSS_NCKU_source/MPatch.h
+++ b/AMSS_NCKU_source/MPatch.h
@@ -8,7 +8,7 @@
 #include "var.h"
 #include "macrodef.h" //need dim here; Vertex or Cell; ghost_width

-class Patch
+class Patch
 {

 public:
@@ -50,8 +50,6 @@ public:
                         double *Shellf, int Symmetry, MPI_Comm Comm_here);
   void Find_Maximum(MyList<var> *VarList, double *XX,
                     double *Shellf, MPI_Comm Comm_here);
-};
-
-void patch_release_interp_plan_cache();
-
-#endif /* PATCH_H */
+};
+
+#endif /* PATCH_H */
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
--- a/AMSS_NCKU_source/Parallel.h
+++ b/AMSS_NCKU_source/Parallel.h
@@ -89,12 +89,9 @@ namespace Parallel
  void transfermix(MyList<gridseg> **src, MyList<gridseg> **dst,
                   MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
                   int Symmetry);
-  void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry);
-  void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry, const char *context);
-  void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
-  void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, const char *context);
-  void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
-  void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, const char *context);
+  void Sync(Patch *Pat, MyList<var> *VarList, int Symmetry);
+  void Sync(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);
+  void Sync_merged(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry);

  struct SyncCache {
    bool valid;
@@ -108,13 +105,9 @@ namespace Parallel
    int *send_buf_caps;
    int *recv_buf_caps;
    MPI_Request *reqs;
-    MPI_Status *stats;
-    int max_reqs;
-    bool lengths_valid;
-    int lengths_var_count;
-    int *tc_req_node;
-    int *tc_req_is_recv;
-    int *tc_completed;
+    MPI_Status *stats;
+    int max_reqs;
+    bool lengths_valid;
    SyncCache();
    void invalidate();
    void destroy();
@@ -125,20 +118,16 @@ namespace Parallel
                       MyList<var> *VarList1, MyList<var> *VarList2,
                       int Symmetry, SyncCache &cache);

-  struct AsyncSyncState {
-    int req_no;
-    bool active;
-    int mpi_tag;
-    int *req_node;
-    int *req_is_recv;
-    int pending_recv;
-    AsyncSyncState() : req_no(0), active(false), mpi_tag(0), req_node(0), req_is_recv(0), pending_recv(0) {}
-  };
+  struct AsyncSyncState { // state object taken by reference by both Sync_start() and Sync_finish()
+    int req_no; // starts at 0; presumably the number of outstanding MPI requests -- TODO confirm against Parallel.C
+    bool active; // starts false; presumably true while a started sync has not been finished -- TODO confirm
+    AsyncSyncState() : req_no(0), active(false) {} // default state: no requests recorded, not active
+  };

-  void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
-                  SyncCache &cache, AsyncSyncState &state);
-  void Sync_finish(SyncCache &cache, AsyncSyncState &state,
-                   MyList<var> *VarList, int Symmetry, bool unpack_to_host = true);
+  void Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry,
+                  SyncCache &cache, AsyncSyncState &state);
+  void Sync_finish(SyncCache &cache, AsyncSyncState &state,
+                   MyList<var> *VarList, int Symmetry);
  void OutBdLow2Hi(Patch *Patc, Patch *Patf,
                   MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /* target */,
                   int Symmetry);
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -14,8 +14,7 @@ using namespace std;
 #include <string.h>
 #endif

-#include <time.h>
-#include <unistd.h>
+#include <time.h>

 #include "macrodef.h"
 #include "misc.h"
@@ -29,30 +28,20 @@ using namespace std;
 #include "rungekutta4_rout.h"
 #include "sommerfeld_rout.h"
 #include "getnp4.h"
-#include "shellfunctions.h"
-#include "parameters.h"
-#ifdef USE_GPU
-#include "bssn_macro.h"
-#include "bssn_gpu.h"
-#endif
+#include "shellfunctions.h"
+#include "parameters.h"

 #ifdef With_AHF
 #include "derivatives.h"
 #include "myglobal.h"
 #endif

-#include "perf.h"
-
-#include "derivatives.h"
-#include "ricci_gamma.h"
-
-// Compile-time switch for per-timestep memory usage collection/printing.
-// Default is OFF to reduce overhead in production runs.
-#ifndef BSSN_ENABLE_MEM_USAGE_LOG
-#define BSSN_ENABLE_MEM_USAGE_LOG 0
-#endif
-
-//================================================================================================
+#include "perf.h"
+
+#include "derivatives.h"
+#include "ricci_gamma.h"
+
+//================================================================================================

 // define bssn_class

@@ -745,12 +734,9 @@ void bssn_class::Initialize()
  // Initialize sync caches (per-level, for predictor and corrector)
  sync_cache_pre = new Parallel::SyncCache[GH->levels];
  sync_cache_cor = new Parallel::SyncCache[GH->levels];
-  sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
-  sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
-  sync_cache_restrict = new Parallel::SyncCache[GH->levels];
-  sync_cache_outbd = new Parallel::SyncCache[GH->levels];
-  sync_cache_psi4 = new Parallel::SyncCache[GH->levels];
-}
+  sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
+  sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
+}

 //================================================================================================

@@ -762,8 +748,8 @@ void bssn_class::Initialize()

 //================================================================================================

-bssn_class::~bssn_class()
-{
+bssn_class::~bssn_class()
+{
 #ifdef With_AHF
  AHList->clearList();
  AHDList->clearList();
@@ -1020,30 +1006,12 @@ bssn_class::~bssn_class()
      sync_cache_rp_coarse[i].destroy();
    delete[] sync_cache_rp_coarse;
  }
-  if (sync_cache_rp_fine)
-  {
-    for (int i = 0; i < GH->levels; i++)
-      sync_cache_rp_fine[i].destroy();
-    delete[] sync_cache_rp_fine;
-  }
-  if (sync_cache_restrict)
-  {
-    for (int i = 0; i < GH->levels; i++)
-      sync_cache_restrict[i].destroy();
-    delete[] sync_cache_restrict;
-  }
-  if (sync_cache_outbd)
-  {
-    for (int i = 0; i < GH->levels; i++)
-      sync_cache_outbd[i].destroy();
-    delete[] sync_cache_outbd;
-  }
-  if (sync_cache_psi4)
-  {
-    for (int i = 0; i < GH->levels; i++)
-      sync_cache_psi4[i].destroy();
-    delete[] sync_cache_psi4;
-  }
+  if (sync_cache_rp_fine)
+  {
+    for (int i = 0; i < GH->levels; i++)
+      sync_cache_rp_fine[i].destroy();
+    delete[] sync_cache_rp_fine;
+  }

  delete GH;
 #ifdef WithShell
@@ -1076,25 +1044,8 @@ bssn_class::~bssn_class()
  delete ConVMonitor;
  delete Waveshell;

-  delete CheckPoint;
-}
-
-void bssn_class::InvalidateSyncCaches()
-{
-  if (!GH)
-    return;
-
-  for (int il = 0; il < GH->levels; il++)
-  {
-    sync_cache_pre[il].invalidate();
-    sync_cache_cor[il].invalidate();
-    sync_cache_rp_coarse[il].invalidate();
-    sync_cache_rp_fine[il].invalidate();
-    sync_cache_restrict[il].invalidate();
-    sync_cache_outbd[il].invalidate();
-    sync_cache_psi4[il].invalidate();
-  }
-}
+  delete CheckPoint;
+}

 //================================================================================================

@@ -2077,10 +2028,9 @@ void bssn_class::Read_Ansorg()

 void bssn_class::Evolve(int Steps)
 {
-  clock_t prev_clock, curr_clock;
-  double LastDump = 0.0, LastCheck = 0.0, Last2dDump = 0.0;
-  LastAnas = 0;
-  LastConsOut = 0;
+  clock_t prev_clock, curr_clock;
+  double LastDump = 0.0, LastCheck = 0.0, Last2dDump = 0.0;
+  LastAnas = 0;
 #if 0
 //initial checkpoint for special uasge
     {
@@ -2177,10 +2127,8 @@ void bssn_class::Evolve(int Steps)
  #endif
  */
  
-#if BSSN_ENABLE_MEM_USAGE_LOG
-  perf bssn_perf;
-  size_t current_min, current_avg, current_max, peak_min, peak_avg, peak_max;
-#endif
+  perf bssn_perf;
+  size_t current_min, current_avg, current_max, peak_min, peak_avg, peak_max;

  for (int lev = 0; lev < GH->levels; lev++)
    GH->Lt[lev] = PhysTime;
@@ -2265,7 +2213,7 @@ void bssn_class::Evolve(int Steps)
    GH->Regrid(Symmetry, BH_num, Porgbr, Porg0,
               SynchList_cor, OldStateList, StateList, SynchList_pre,
               fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor);
-    InvalidateSyncCaches();
+    for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
 #endif

 #if (REGLEV == 0 && (PSTR == 1 || PSTR == 2))
@@ -2274,23 +2222,21 @@ void bssn_class::Evolve(int Steps)
 //		fgt(PhysTime-dT_mon,StartTime,dT_mon/2),ErrorMonitor);
 #endif

-#if BSSN_ENABLE_MEM_USAGE_LOG
-    // Retrieve memory usage information used during computation; master process prints it
-    bssn_perf.MemoryUsage(&current_min, &current_avg, &current_max,
-                          &peak_min, &peak_avg, &peak_max, nprocs);
-    if (myrank == 0)
-    {
-      printf(" Memory usage: current %0.4lg/%0.4lg/%0.4lgMB, "
-             "peak %0.4lg/%0.4lg/%0.4lgMB\n",
-             (double)current_min / (1024.0 * 1024.0),
-             (double)current_avg / (1024.0 * 1024.0),
-             (double)current_max / (1024.0 * 1024.0),
-             (double)peak_min / (1024.0 * 1024.0),
-             (double)peak_avg / (1024.0 * 1024.0),
-             (double)peak_max / (1024.0 * 1024.0));
-      cout << endl;
-    }
-#endif
+    // Retrieve memory usage information used during computation; master process prints it
+    bssn_perf.MemoryUsage(&current_min, &current_avg, &current_max,
+                          &peak_min, &peak_avg, &peak_max, nprocs);
+    if (myrank == 0)
+    {
+      printf(" Memory usage: current %0.4lg/%0.4lg/%0.4lgMB, "
+             "peak %0.4lg/%0.4lg/%0.4lgMB\n",
+             (double)current_min / (1024.0 * 1024.0),
+             (double)current_avg / (1024.0 * 1024.0),
+             (double)current_max / (1024.0 * 1024.0),
+             (double)peak_min / (1024.0 * 1024.0),
+             (double)peak_avg / (1024.0 * 1024.0),
+             (double)peak_max / (1024.0 * 1024.0));
+      cout << endl;
+    }
    
    // Output puncture positions at each step
    if (myrank == 0)
@@ -2338,21 +2284,18 @@ void bssn_class::Evolve(int Steps)
    ////////////////////////////////////////////////////////////

    // When LastCheck >= CheckTime, perform runtime checks and output status data
-    if (LastCheck >= CheckTime)
-    {
-      LastCheck = 0;
-
-      CheckPoint->write_Black_Hole_position(BH_num_input, BH_num, Porg0, Porgbr, Mass);
+    if (LastCheck >= CheckTime)
+    {
+      LastCheck = 0;
+
+      CheckPoint->write_Black_Hole_position(BH_num_input, BH_num, Porg0, Porgbr, Mass);
      CheckPoint->writecheck_cgh(PhysTime, GH);
 #ifdef WithShell
      CheckPoint->writecheck_sh(PhysTime, SH);
-#endif
-      CheckPoint->write_bssn(LastDump, Last2dDump, LastAnas);
-    }
-
-    // Keep output/analysis phases aligned across ranks before the next coarse step.
-    MPI_Barrier(MPI_COMM_WORLD);
-  }
+#endif
+      CheckPoint->write_bssn(LastDump, Last2dDump, LastAnas);
+    }
+  }
  /*
  #ifdef With_AHF
  // final apparent horizon finding
@@ -2486,7 +2429,7 @@ void bssn_class::RecursiveStep(int lev)
  if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
                      SynchList_cor, OldStateList, StateList, SynchList_pre,
                      fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
-  InvalidateSyncCaches();
+  for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
 #endif
 }

@@ -2665,7 +2608,7 @@ void bssn_class::ParallelStep()
  if (GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0,
                      SynchList_cor, OldStateList, StateList, SynchList_pre,
                      fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
-  InvalidateSyncCaches();
+  for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }
 #endif
 }

@@ -2832,7 +2775,7 @@ void bssn_class::ParallelStep()
        if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0,
                            SynchList_cor, OldStateList, StateList, SynchList_pre,
                            fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor))
-        InvalidateSyncCaches();
+        for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }

        //               a_stream.clear();
        //               a_stream.str("");
@@ -2847,7 +2790,7 @@ void bssn_class::ParallelStep()
      if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
                          SynchList_cor, OldStateList, StateList, SynchList_pre,
                          fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
-      InvalidateSyncCaches();
+      for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }

      //               a_stream.clear();
      //               a_stream.str("");
@@ -2866,7 +2809,7 @@ void bssn_class::ParallelStep()
          if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
                              SynchList_cor, OldStateList, StateList, SynchList_pre,
                              fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
-          InvalidateSyncCaches();
+          for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }

          //               a_stream.clear();
          //               a_stream.str("");
@@ -2882,7 +2825,7 @@ void bssn_class::ParallelStep()
          if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0,
                              SynchList_cor, OldStateList, StateList, SynchList_pre,
                              fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor))
-          InvalidateSyncCaches();
+          for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); }

          //               a_stream.clear();
          //               a_stream.str("");
@@ -3071,14 +3014,9 @@ void bssn_class::RecursiveStep(int lev, int num) // in all 2^(lev+1)-1 steps

 #if (PSTR == 0)
 #if 1
-void bssn_class::Step(int lev, int YN)
-{
-#ifdef USE_GPU
-  Step_MainPath_GPU(lev, YN);
-  return;
-#endif
-
-  setpbh(BH_num, Porg0, Mass, BH_num_input);
+void bssn_class::Step(int lev, int YN)
+{
+  setpbh(BH_num, Porg0, Mass, BH_num_input);

  double dT_lev = dT * pow(0.5, Mymax(lev, trfls));

@@ -5858,7 +5796,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif

 #if (RPB == 0)
-      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
+      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry);
 #elif (RPB == 1)
      //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SynchList_pre,Symmetry);
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
@@ -5882,7 +5820,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,

 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry, sync_cache_outbd[lev]);
+      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
 #endif
@@ -5909,7 +5847,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
 #endif

 #if (RPB == 0)
-      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]);
+      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
 #elif (RPB == 1)
      //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
@@ -5933,7 +5871,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,

 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_outbd[lev]);
+      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
 #endif
@@ -6002,7 +5940,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
      }

 #if (RPB == 0)
-      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
+      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry);
 #elif (RPB == 1)
      //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SynchList_pre,Symmetry);
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
@@ -6012,7 +5950,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,

 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry, sync_cache_outbd[lev]);
+      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
 #endif
@@ -6024,7 +5962,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
    else // no time refinement levels and for all same time levels
    {
 #if (RPB == 0)
-      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]);
+      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
 #elif (RPB == 1)
      //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
@@ -6034,7 +5972,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,

 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_outbd[lev]);
+      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
 #endif
@@ -6089,7 +6027,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
      }

 #if (RPB == 0)
-      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
+      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry);
 #elif (RPB == 1)
      //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_cor,SynchList_pre,Symmetry);
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry);
@@ -6099,7 +6037,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)

 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
+      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
 #endif
@@ -6113,7 +6051,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
      if (myrank == 0)
        cout << "===: " << GH->Lt[lev - 1] << "," << GH->Lt[lev] + dT_lev << endl;
 #if (RPB == 0)
-      Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry, sync_cache_restrict[lev]);
+      Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry);
 #elif (RPB == 1)
      //       Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_cor,StateList,Symmetry);
      Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry);
@@ -6123,7 +6061,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)

 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
+      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
 #endif
@@ -6164,7 +6102,7 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)

 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
+      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry);
 #endif
@@ -6177,7 +6115,7 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB)
    {
 #if (RPB == 0)
 #if (MIXOUTB == 0)
-      Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
+      Parallel::OutBdLow2Hi(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
 #elif (MIXOUTB == 1)
      Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry);
 #endif
@@ -6298,7 +6236,7 @@ for(int ilev = GH->levels-1;ilev>=lev;ilev--)
 for(int ilev=GH->levels-1;ilev>lev;ilev--)
    RestrictProlong(ilev,1,false,DG_List,DG_List,DG_List);
 #else
-  Parallel::Sync_cached(GH->PatL[lev], DG_List, Symmetry, sync_cache_psi4[lev]);
+  Parallel::Sync(GH->PatL[lev], DG_List, Symmetry);
 #endif

 #ifdef WithShell
@@ -6953,10 +6891,10 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)
 {
  LastAnas += dT_lev;

-  if (LastAnas >= AnasTime)
-  {
-#ifdef Point_Psi4
-#error "not support parallel levels yet"
+  if (LastAnas >= AnasTime)
+  {
+#ifdef Point_Psi4
+#error "not support parallel levels yet"
    // Gam_ijk and R_ij have been calculated in Interp_Constraint()
    double SYM = 1, ANT = -1;
    for (int levh = lev; levh < GH->levels; levh++)
@@ -7300,9 +7238,9 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)

 //================================================================================================

-void bssn_class::Constraint_Out()
-{
-  LastConsOut += dT * pow(0.5, Mymax(0, trfls));
+void bssn_class::Constraint_Out()
+{
+  LastConsOut += dT * pow(0.5, Mymax(0, trfls));

  if (LastConsOut >= AnasTime)
  // Constraint violation
@@ -7322,15 +7260,12 @@ void bssn_class::Constraint_Out()
          MyList<Block> *BP = Pp->data->blb;
          while (BP)
          {
-            Block *cg = BP->data;
-            if (myrank == cg->rank)
-            {
-#ifdef USE_GPU
-              gpu_rhs(CALLED_BY_CONSTRAINT_CONS_ONLY, myrank, RHS_PARA_CALLED_Constraint_Out);
-#else
-              f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
-                                 cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
-                                 cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
+            Block *cg = BP->data;
+            if (myrank == cg->rank)
+            {
+              f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+                                 cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
+                                 cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                                 cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
                                 cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
                                 cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn],
@@ -7358,12 +7293,11 @@ void bssn_class::Constraint_Out()
                                 cg->fgfs[Gamzyy->sgfn], cg->fgfs[Gamzyz->sgfn], cg->fgfs[Gamzzz->sgfn],
                                 cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn], 
                                 cg->fgfs[Ryy->sgfn], cg->fgfs[Ryz->sgfn], cg->fgfs[Rzz->sgfn],
-                                 cg->fgfs[Cons_Ham->sgfn],
-                                 cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
-                                 cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
-                                 Symmetry, lev, ndeps, pre);
-#endif
-            }
+                                 cg->fgfs[Cons_Ham->sgfn],
+                                 cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
+                                 cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
+                                 Symmetry, lev, ndeps, pre);
+            }
            if (BP == Pp->data->ble)
              break;
            BP = BP->next;
@@ -7371,7 +7305,7 @@ void bssn_class::Constraint_Out()
          Pp = Pp->next;
        }
      }
-      Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry, "bssn_class::Constraint_Out[level]");
+      Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry);
    }
 #ifdef WithShell
    if (0) // if the constrait quantities can be reused from the step rhs calculation
@@ -7593,7 +7527,7 @@ void bssn_class::AH_Prepare_derivatives()
      }
      Pp = Pp->next;
    }
-    Parallel::Sync(GH->PatL[lev], AHDList, Symmetry, "bssn_class::AH_Prepare_derivatives");
+    Parallel::Sync(GH->PatL[lev], AHDList, Symmetry);
  }
 }

@@ -7829,15 +7763,12 @@ void bssn_class::Interp_Constraint(bool infg)
          MyList<Block> *BP = Pp->data->blb;
          while (BP)
          {
-            Block *cg = BP->data;
-            if (myrank == cg->rank)
-            {
-#ifdef USE_GPU
-              gpu_rhs(CALLED_BY_CONSTRAINT_CONS_ONLY, myrank, RHS_PARA_CALLED_Interp_Constraint);
-#else
-              f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
-                                 cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
-                                 cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
+            Block *cg = BP->data;
+            if (myrank == cg->rank)
+            {
+              f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+                                 cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
+                                 cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                                 cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
                                 cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
                                 cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn],
@@ -7865,12 +7796,11 @@ void bssn_class::Interp_Constraint(bool infg)
                                 cg->fgfs[Gamzyy->sgfn], cg->fgfs[Gamzyz->sgfn], cg->fgfs[Gamzzz->sgfn],
                                 cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn], 
                                 cg->fgfs[Ryy->sgfn], cg->fgfs[Ryz->sgfn], cg->fgfs[Rzz->sgfn],
-                                 cg->fgfs[Cons_Ham->sgfn],
-                                 cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
-                                 cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
-                                 Symmetry, lev, ndeps, pre);
-#endif
-            }
+                                 cg->fgfs[Cons_Ham->sgfn],
+                                 cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
+                                 cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
+                                 Symmetry, lev, ndeps, pre);
+            }
            if (BP == Pp->data->ble)
              break;
            BP = BP->next;
@@ -7878,7 +7808,7 @@ void bssn_class::Interp_Constraint(bool infg)
          Pp = Pp->next;
        }
      }
-      Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry, "bssn_class::Interp_Constraint[level]");
+      Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry);
    }
 #ifdef WithShell
    if (0) // if the constrait quantities can be reused from the step rhs calculation
@@ -8136,7 +8066,7 @@ void bssn_class::Compute_Constraint()
        Pp = Pp->next;
      }
    }
-    Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry, "bssn_class::Compute_Constraint[level]");
+    Parallel::Sync(GH->PatL[lev], ConstraintList, Symmetry);
  }
  // prolong restrict constraint quantities
  for (lev = GH->levels - 1; lev > 0; lev--)
@@ -8449,18 +8379,12 @@ void bssn_class::Enforce_algcon(int lev, int fg)

 //================================================================================================

-bool bssn_class::check_Stdin_Abort() 
-{
-    // Non-interactive launches (mpirun via Python/subprocess, batch jobs, redirected stdin)
-    // should not probe stdin. Some MPI runtimes treat stdin as a managed channel and can
-    // fail when rank 0 polls/consumes it.
-    if (!isatty(STDIN_FILENO)) {
-        return false;
-    }
-
-    fd_set readfds;
-
-    struct timeval timeout;
+bool bssn_class::check_Stdin_Abort() 
+{
+
+    fd_set readfds;
+
+    struct timeval timeout;

    FD_ZERO(&readfds);
    FD_SET(STDIN_FILENO, &readfds);
@@ -8469,17 +8393,14 @@ bool bssn_class::check_Stdin_Abort()
    timeout.tv_sec = 0;
    timeout.tv_usec = 0;

-    int activity = select(STDIN_FILENO + 1, &readfds, nullptr, nullptr, &timeout);
-    if (activity <= 0) {
-        return false;
-    }
-
-    if (FD_ISSET(STDIN_FILENO, &readfds)) {
-        string input_abort;
-        if (cin >> input_abort) {
-            if (input_abort == "stop") {
-                return true;
-            }
+    int activity = select(STDIN_FILENO + 1, &readfds, nullptr, nullptr, &timeout);
+
+    if (activity > 0 && FD_ISSET(STDIN_FILENO, &readfds)) {
+        string input_abort;
+        if (cin >> input_abort) {
+            if (input_abort == "stop") {
+                return true;
+            }
        }
    }

--- a/AMSS_NCKU_source/bssn_class.h
+++ b/AMSS_NCKU_source/bssn_class.h
@@ -128,11 +128,8 @@ public:

       Parallel::SyncCache *sync_cache_pre;  // per-level cache for predictor sync
       Parallel::SyncCache *sync_cache_cor;  // per-level cache for corrector sync
-       Parallel::SyncCache *sync_cache_rp_coarse;  // RestrictProlong sync on PatL[lev-1]
-       Parallel::SyncCache *sync_cache_rp_fine;    // RestrictProlong sync on PatL[lev]
-       Parallel::SyncCache *sync_cache_restrict;   // cached Restrict in RestrictProlong
-       Parallel::SyncCache *sync_cache_outbd;      // cached OutBdLow2Hi in RestrictProlong
-       Parallel::SyncCache *sync_cache_psi4;       // cached Psi4 sync on PatL[lev]
+       Parallel::SyncCache *sync_cache_rp_coarse;  // RestrictProlong sync on PatL[lev-1]
+       Parallel::SyncCache *sync_cache_rp_fine;    // RestrictProlong sync on PatL[lev]

       monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
       monitor *ConVMonitor;
@@ -172,20 +169,16 @@ public:
       
       bool check_Stdin_Abort(); 

-       virtual void Setup_Initial_Data_Cao();
-       virtual void Setup_Initial_Data_Lousto();
-       virtual void Initialize();
-       virtual void Read_Ansorg();
-       virtual void Read_Pablo() {};
-       void InvalidateSyncCaches();
-       virtual void Compute_Psi4(int lev);
-       virtual void Step(int lev, int YN);
-#ifdef USE_GPU
-       void Step_MainPath_GPU(int lev, int YN);
-#endif
-       virtual void Interp_Constraint(bool infg);
-       virtual void Constraint_Out();
-       virtual void Compute_Constraint();
+       virtual void Setup_Initial_Data_Cao();
+       virtual void Setup_Initial_Data_Lousto();
+       virtual void Initialize();
+       virtual void Read_Ansorg();
+       virtual void Read_Pablo() {};
+       virtual void Compute_Psi4(int lev);
+       virtual void Step(int lev, int YN);
+       virtual void Interp_Constraint(bool infg);
+       virtual void Constraint_Out();
+       virtual void Compute_Constraint();

 #ifdef With_AHF
 protected:
--- a/AMSS_NCKU_source/bssn_cuda_ops.cu
+++ b/AMSS_NCKU_source/bssn_cuda_ops.cu
--- a/AMSS_NCKU_source/bssn_cuda_ops.h
+++ b/AMSS_NCKU_source/bssn_cuda_ops.h
@@ -1,68 +0,0 @@
-#ifndef BSSN_CUDA_OPS_H
-#define BSSN_CUDA_OPS_H
-
-int bssn_cuda_enforce_ga(int *ex,
-                         double *dxx, double *gxy, double *gxz,
-                         double *dyy, double *gyz, double *dzz,
-                         double *Axx, double *Axy, double *Axz,
-                         double *Ayy, double *Ayz, double *Azz);
-
-int bssn_cuda_rk4_boundary_var(int *ex, double dT,
-                               const double *X, const double *Y, const double *Z,
-                               double xmin, double ymin, double zmin,
-                               double xmax, double ymax, double zmax,
-                               const double *state0,
-                               const double *phi_field,
-                               const double *lap_field,
-                               const double *boundary_src,
-                               double *stage_data,
-                               double *rhs_accum,
-                               double propspeed,
-                               const double SoA[3],
-                               int symmetry,
-                               int lev,
-                               int rk_stage,
-                               bool force_host_boundary_fix,
-                               bool download_to_host = true);
-
-int bssn_cuda_rk4_boundary_batch(int *ex, double dT,
-                                 const double *X, const double *Y, const double *Z,
-                                 double xmin, double ymin, double zmin,
-                                 double xmax, double ymax, double zmax,
-                                 int symmetry,
-                                 const double *const *state0_list,
-                                 double *const *stage_data_list,
-                                 double *const *rhs_accum_list,
-                                 int num_var,
-                                 int rk_stage,
-                                 bool download_to_host = false);
-
-int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_host = true);
-int bssn_cuda_download_buffer(int *ex, double *host_ptr);
-void bssn_cuda_release_rk4_caches();
-void bssn_cuda_release_interp_caches();
-
-int bssn_cuda_prolong3_pack(int wei,
-                            const double *llbc, const double *uubc, const int *extc, const double *func,
-                            const double *llbf, const double *uubf, const int *extf, double *funf,
-                            const double *llbp, const double *uubp,
-                            const double *SoA, int symmetry);
-
-int bssn_cuda_restrict3_pack(int wei,
-                             const double *llbc, const double *uubc, const int *extc, double *func,
-                             const double *llbf, const double *uubf, const int *extf, const double *funf,
-                             const double *llbr, const double *uubr,
-                             const double *SoA, int symmetry);
-
-int bssn_cuda_interp_points_batch(const int *ex,
-                                  const double *X, const double *Y, const double *Z,
-                                  const double *const *fields,
-                                  const double *soa_flat,
-                                  int num_var,
-                                  const double *px, const double *py, const double *pz,
-                                  int num_points,
-                                  int ordn,
-                                  int symmetry,
-                                  double *out);
-
-#endif
--- a/AMSS_NCKU_source/bssn_cuda_step.C
+++ b/AMSS_NCKU_source/bssn_cuda_step.C
@@ -1,936 +0,0 @@
-#include "macrodef.h"
-
-#ifdef USE_GPU
-
-#include <algorithm>
-#include <cmath>
-#include <cstring>
-#include <cstdlib>
-#include <iomanip>
-#include <vector>
-
-#include "bssn_class.h"
-#include "bssn_cuda_ops.h"
-#include "bssn_gpu.h"
-#include "bssn_macro.h"
-
-namespace
-{
-enum StageProfileMetric
-{
-  STAGE_PROFILE_TOTAL = 0,
-  STAGE_PROFILE_RHS,
-  STAGE_PROFILE_RUN_STAGE,
-  STAGE_PROFILE_RUN_STAGE_DEVICE,
-  STAGE_PROFILE_RUN_STAGE_HOST_FIX,
-  STAGE_PROFILE_LOWERBOUND,
-  STAGE_PROFILE_ENSURE,
-  STAGE_PROFILE_DOWNLOAD,
-  STAGE_PROFILE_CLEAR_CACHE,
-  STAGE_PROFILE_SYNC_START,
-  STAGE_PROFILE_SYNC_FINISH,
-  STAGE_PROFILE_REFRESH,
-  STAGE_PROFILE_COUNT
-};
-
-static const int kStageProfileMaxLevels = 32;
-
-struct StageProfileStore
-{
-  bool env_checked;
-  bool enabled;
-  int calls[kStageProfileMaxLevels];
-  double metric[kStageProfileMaxLevels][STAGE_PROFILE_COUNT];
-};
-
-StageProfileStore &stage_profile_store()
-{
-  static StageProfileStore store = {};
-  return store;
-}
-
-bool stage_profile_enabled()
-{
-  StageProfileStore &store = stage_profile_store();
-  if (!store.env_checked)
-  {
-    const char *env = getenv("AMSS_GPU_STAGE_TIMING");
-    store.enabled = (env && env[0] && strcmp(env, "0") != 0);
-    store.env_checked = true;
-  }
-  return store.enabled;
-}
-
-void stage_profile_note_call(int lev)
-{
-  if (lev >= 0 && lev < kStageProfileMaxLevels)
-    stage_profile_store().calls[lev]++;
-}
-
-void stage_profile_add(int lev, StageProfileMetric metric, double seconds)
-{
-  if (lev >= 0 && lev < kStageProfileMaxLevels)
-    stage_profile_store().metric[lev][metric] += seconds;
-}
-
-const char *stage_profile_metric_name(StageProfileMetric metric)
-{
-  switch (metric)
-  {
-  case STAGE_PROFILE_TOTAL:
-    return "total";
-  case STAGE_PROFILE_RHS:
-    return "rhs";
-  case STAGE_PROFILE_RUN_STAGE:
-    return "run_stage";
-  case STAGE_PROFILE_RUN_STAGE_DEVICE:
-    return "run_stage_dev";
-  case STAGE_PROFILE_RUN_STAGE_HOST_FIX:
-    return "run_stage_host";
-  case STAGE_PROFILE_LOWERBOUND:
-    return "lower";
-  case STAGE_PROFILE_ENSURE:
-    return "ensure";
-  case STAGE_PROFILE_DOWNLOAD:
-    return "download";
-  case STAGE_PROFILE_CLEAR_CACHE:
-    return "clear_cache";
-  case STAGE_PROFILE_SYNC_START:
-    return "sync_start";
-  case STAGE_PROFILE_SYNC_FINISH:
-    return "sync_finish";
-  case STAGE_PROFILE_REFRESH:
-    return "refresh";
-  default:
-    return "unknown";
-  }
-}
-} // namespace
-
-void bssn_cuda_dump_stage_profile()
-{
-  if (!stage_profile_enabled())
-    return;
-
-  int myrank = 0;
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-
-  StageProfileStore &store = stage_profile_store();
-  int global_calls_sum[kStageProfileMaxLevels] = {};
-  double global_metric_sum[kStageProfileMaxLevels][STAGE_PROFILE_COUNT] = {};
-  double global_metric_max[kStageProfileMaxLevels][STAGE_PROFILE_COUNT] = {};
-
-  MPI_Reduce(store.calls, global_calls_sum, kStageProfileMaxLevels, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
-  MPI_Reduce(store.metric[0], global_metric_sum[0],
-             kStageProfileMaxLevels * STAGE_PROFILE_COUNT,
-             MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
-  MPI_Reduce(store.metric[0], global_metric_max[0],
-             kStageProfileMaxLevels * STAGE_PROFILE_COUNT,
-             MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
-
-  if (myrank != 0)
-    return;
-
-  cout << endl;
-  cout << " GPU stage timing summary (sum/max over MPI ranks) " << endl;
-  cout << " lev  calls";
-  for (int metric = 0; metric < STAGE_PROFILE_COUNT; ++metric)
-    cout << "  " << setw(22) << stage_profile_metric_name(static_cast<StageProfileMetric>(metric));
-  cout << endl;
-
-  for (int lev = 0; lev < kStageProfileMaxLevels; ++lev)
-  {
-    if (global_calls_sum[lev] == 0)
-      continue;
-
-    cout << setw(4) << lev << "  " << setw(5) << global_calls_sum[lev];
-    for (int metric = 0; metric < STAGE_PROFILE_COUNT; ++metric)
-    {
-      cout << "  "
-           << setw(10) << setprecision(6) << fixed << global_metric_sum[lev][metric]
-           << "/"
-           << setw(10) << setprecision(6) << fixed << global_metric_max[lev][metric];
-    }
-    cout << endl;
-  }
-  cout << endl;
-}
-
-void bssn_class::Step_MainPath_GPU(int lev, int YN)
-{
-#ifdef WithShell
-#error "Step_MainPath_GPU currently supports Patch grids only."
-#endif
-
-  const bool profile_enabled = stage_profile_enabled();
-  const double step_total_begin = profile_enabled ? MPI_Wtime() : 0.0;
-  if (profile_enabled)
-    stage_profile_note_call(lev);
-
-  if (bssn_gpu_bind_process_device(myrank))
-  {
-    cerr << "GPU device bind failure on MPI rank " << myrank << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-  if (profile_enabled)
-  {
-    const double t0 = MPI_Wtime();
-    bssn_gpu_clear_cached_device_buffers();
-    stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0);
-  }
-  else
-    bssn_gpu_clear_cached_device_buffers();
-
-  setpbh(BH_num, Porg0, Mass, BH_num_input);
-
-  const double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
-
-#if (MAPBH == 1)
-  if (BH_num > 0 && lev == GH->levels - 1)
-  {
-    compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
-    for (int ithBH = 0; ithBH < BH_num; ithBH++)
-    {
-      for (int ith = 0; ith < 3; ith++)
-        Porg1[ithBH][ith] = Porg0[ithBH][ith] + Porg_rhs[ithBH][ith] * dT_lev;
-      if (Symmetry > 0)
-        Porg1[ithBH][2] = fabs(Porg1[ithBH][2]);
-      if (Symmetry == 2)
-      {
-        Porg1[ithBH][0] = fabs(Porg1[ithBH][0]);
-        Porg1[ithBH][1] = fabs(Porg1[ithBH][1]);
-      }
-    }
-  }
-
-  if (lev == a_lev)
-    AnalysisStuff(lev, dT_lev);
-#endif
-
-#ifdef With_AHF
-  AH_Step_Find(lev, dT_lev);
-#endif
-
-  const bool BB = fgt(PhysTime, StartTime, dT_lev / 2);
-  (void)BB;
-  double ndeps = (lev < GH->movls) ? numepsb : numepss;
-  double TRK4 = PhysTime;
-  int iter_count = 0;
-  int pre = 0, cor = 1;
-  int ERROR = 0;
-  const bool keep_stage_sync_on_device = (RPS == 1) && (MAPBH == 1) && (REGLEV == 0);
-
-  auto run_stage_on_block =
-      [&](Block *cg, Patch *patch, MyList<var> *state0_list,
-          MyList<var> *boundary_src_list, MyList<var> *stage_data_list,
-          MyList<var> *rhs_list, int rk_stage) {
-        MyList<var> *varl0 = state0_list;
-        MyList<var> *varlb = boundary_src_list;
-        MyList<var> *varls = stage_data_list;
-        MyList<var> *varlr = rhs_list;
-        std::vector<const double *> batch_state0;
-        std::vector<double *> batch_stage;
-        std::vector<double *> batch_rhs;
-
-        while (varl0)
-        {
-          const bool force_host_boundary_fix = false;
-          const bool can_batch_device_path = (lev > 0) && !force_host_boundary_fix;
-          if (can_batch_device_path)
-          {
-            batch_state0.push_back(cg->fgfs[varl0->data->sgfn]);
-            batch_stage.push_back(cg->fgfs[varls->data->sgfn]);
-            batch_rhs.push_back(cg->fgfs[varlr->data->sgfn]);
-            varl0 = varl0->next;
-            varlb = varlb->next;
-            varls = varls->next;
-            varlr = varlr->next;
-            continue;
-          }
-
-          const double var_begin = profile_enabled ? MPI_Wtime() : 0.0;
-          if (bssn_cuda_rk4_boundary_var(cg->shape, dT_lev,
-                                         cg->X[0], cg->X[1], cg->X[2],
-                                         patch->bbox[0], patch->bbox[1], patch->bbox[2],
-                                         patch->bbox[3], patch->bbox[4], patch->bbox[5],
-                                         cg->fgfs[varl0->data->sgfn],
-                                         cg->fgfs[phi0->sgfn],
-                                         cg->fgfs[Lap0->sgfn],
-                                         cg->fgfs[varlb->data->sgfn],
-                                         cg->fgfs[varls->data->sgfn],
-                                         cg->fgfs[varlr->data->sgfn],
-                                         varl0->data->propspeed,
-                                         varl0->data->SoA,
-                                         Symmetry, lev, rk_stage,
-                                         force_host_boundary_fix, false))
-          {
-            cerr << "GPU rk4/boundary failure: lev=" << lev
-                 << " rk_stage=" << rk_stage
-                 << " var=" << varl0->data->name
-                 << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
-                 << cg->bbox[1] << ":" << cg->bbox[4] << ","
-                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
-            ERROR = 1;
-            break;
-          }
-          if (profile_enabled)
-          {
-            stage_profile_add(lev,
-                              force_host_boundary_fix ? STAGE_PROFILE_RUN_STAGE_HOST_FIX
-                                                      : STAGE_PROFILE_RUN_STAGE_DEVICE,
-                              MPI_Wtime() - var_begin);
-          }
-          varl0 = varl0->next;
-          varlb = varlb->next;
-          varls = varls->next;
-          varlr = varlr->next;
-        }
-
-        if (!ERROR && !batch_state0.empty())
-        {
-          const double batch_begin = profile_enabled ? MPI_Wtime() : 0.0;
-          if (bssn_cuda_rk4_boundary_batch(cg->shape, dT_lev,
-                                           cg->X[0], cg->X[1], cg->X[2],
-                                           patch->bbox[0], patch->bbox[1], patch->bbox[2],
-                                           patch->bbox[3], patch->bbox[4], patch->bbox[5],
-                                           Symmetry,
-                                           &batch_state0[0],
-                                           &batch_stage[0],
-                                           &batch_rhs[0],
-                                           static_cast<int>(batch_state0.size()),
-                                           rk_stage, false))
-          {
-            cerr << "GPU rk4/boundary batch failure: lev=" << lev
-                 << " rk_stage=" << rk_stage
-                 << " vars=" << batch_state0.size()
-                 << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
-                 << cg->bbox[1] << ":" << cg->bbox[4] << ","
-                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
-            ERROR = 1;
-          }
-          else if (profile_enabled)
-          {
-            stage_profile_add(lev, STAGE_PROFILE_RUN_STAGE_DEVICE, MPI_Wtime() - batch_begin);
-          }
-        }
-      };
-
-  auto stage_download_var_list =
-      [&](Block *cg, MyList<var> *var_list, bool skip_unmapped) {
-        std::vector<double *> batch_host_ptrs;
-        std::vector<MyList<var> *> batch_vars;
-        while (var_list)
-        {
-          double *host_ptr = cg->fgfs[var_list->data->sgfn];
-          if (skip_unmapped && !bssn_gpu_find_device_buffer(host_ptr))
-          {
-            var_list = var_list->next;
-            continue;
-          }
-          batch_host_ptrs.push_back(host_ptr);
-          batch_vars.push_back(var_list);
-          var_list = var_list->next;
-        }
-        if (!batch_host_ptrs.empty() &&
-            bssn_gpu_download_buffer_batch(cg->shape, &batch_host_ptrs[0],
-                                           static_cast<int>(batch_host_ptrs.size())))
-        {
-          for (size_t i = 0; i < batch_host_ptrs.size(); ++i)
-          {
-            if (bssn_cuda_download_buffer(cg->shape, batch_host_ptrs[i]))
-            {
-              cerr << "GPU stage download failure: lev=" << lev
-                   << " var=" << batch_vars[i]->data->name
-                   << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
-                   << cg->bbox[1] << ":" << cg->bbox[4] << ","
-                   << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
-              ERROR = 1;
-              break;
-            }
-          }
-        }
-      };
-
-  auto stage_download_patch_list =
-      [&](MyList<var> *var_list, bool skip_unmapped) {
-        MyList<Patch> *patch_it = GH->PatL[lev];
-        while (patch_it)
-        {
-          MyList<Block> *block_it = patch_it->data->blb;
-          while (block_it)
-          {
-            Block *cg = block_it->data;
-            if (myrank == cg->rank)
-              stage_download_var_list(cg, var_list, skip_unmapped);
-
-            if (block_it == patch_it->data->ble)
-              break;
-            block_it = block_it->next;
-          }
-          if (ERROR)
-            break;
-          patch_it = patch_it->next;
-        }
-      };
-
-  auto ensure_stage_device_var_list =
-      [&](Block *cg, MyList<var> *var_list) {
-        const int n = cg->shape[0] * cg->shape[1] * cg->shape[2];
-        while (var_list)
-        {
-          double *host_ptr = cg->fgfs[var_list->data->sgfn];
-          if (!bssn_gpu_find_device_buffer(host_ptr) &&
-              bssn_gpu_stage_upload_buffer(host_ptr, n))
-          {
-            cerr << "GPU state ensure failure: lev=" << lev
-                 << " var=" << var_list->data->name
-                 << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
-                 << cg->bbox[1] << ":" << cg->bbox[4] << ","
-                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
-            ERROR = 1;
-            break;
-          }
-          var_list = var_list->next;
-        }
-      };
-
-  auto refresh_synced_device_regions =
-      [&](Block *cg, MyList<var> *var_list, Parallel::SyncCache &cache) {
-        std::vector<Parallel::gridseg *> local_segments;
-        for (int node = 0; node < cache.cpusize; ++node)
-        {
-          MyList<Parallel::gridseg> *seg = cache.combined_dst[node];
-          while (seg)
-          {
-            if (seg->data && seg->data->Bg == cg)
-              local_segments.push_back(seg->data);
-            seg = seg->next;
-          }
-        }
-
-        if (local_segments.empty())
-          return;
-
-        const int n = cg->shape[0] * cg->shape[1] * cg->shape[2];
-        while (var_list)
-        {
-          double *host_ptr = cg->fgfs[var_list->data->sgfn];
-          if (!bssn_gpu_find_device_buffer(host_ptr))
-          {
-            if (bssn_gpu_stage_upload_buffer(host_ptr, n))
-            {
-              cerr << "GPU sync refresh upload failure: lev=" << lev
-                   << " var=" << var_list->data->name
-                   << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
-                   << cg->bbox[1] << ":" << cg->bbox[4] << ","
-                   << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
-              ERROR = 1;
-              break;
-            }
-          }
-          else
-          {
-            for (size_t i = 0; i < local_segments.size(); ++i)
-            {
-              Parallel::gridseg *seg = local_segments[i];
-              if (bssn_gpu_stage_upload_region(host_ptr,
-                                               cg->shape,
-                                               cg->bbox,
-                                               cg->bbox + dim,
-                                               seg->shape,
-                                               seg->llb))
-              {
-                cerr << "GPU sync region refresh failure: lev=" << lev
-                     << " var=" << var_list->data->name
-                     << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
-                     << cg->bbox[1] << ":" << cg->bbox[4] << ","
-                     << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
-                ERROR = 1;
-                break;
-              }
-            }
-            if (ERROR)
-              break;
-          }
-          var_list = var_list->next;
-        }
-      };
-
-  auto refresh_stage_device_after_sync =
-      [&](MyList<var> *var_list, Parallel::SyncCache &cache) {
-        MyList<Patch> *patch_it = GH->PatL[lev];
-        while (patch_it)
-        {
-          MyList<Block> *block_it = patch_it->data->blb;
-          while (block_it)
-          {
-            Block *cg = block_it->data;
-            if (myrank == cg->rank)
-              refresh_synced_device_regions(cg, var_list, cache);
-
-            if (block_it == patch_it->data->ble)
-              break;
-            block_it = block_it->next;
-          }
-          if (ERROR)
-            break;
-          patch_it = patch_it->next;
-        }
-      };
-
-  auto refresh_stage_host_before_sync =
-      [&](MyList<var> *var_list, Parallel::SyncCache &cache) -> bool {
-        if (!cache.valid || !cache.combined_src || myrank < 0 || myrank >= cache.cpusize)
-          return false;
-
-        MyList<Patch> *patch_it = GH->PatL[lev];
-        while (patch_it)
-        {
-          MyList<Block> *block_it = patch_it->data->blb;
-          while (block_it)
-          {
-            Block *cg = block_it->data;
-            if (myrank == cg->rank)
-            {
-              std::vector<Parallel::gridseg *> local_segments;
-              MyList<Parallel::gridseg> *seg = cache.combined_src[myrank];
-              while (seg)
-              {
-                if (seg->data && seg->data->Bg == cg)
-                  local_segments.push_back(seg->data);
-                seg = seg->next;
-              }
-
-              if (!local_segments.empty())
-              {
-                MyList<var> *var_it = var_list;
-                while (var_it)
-                {
-                  double *host_ptr = cg->fgfs[var_it->data->sgfn];
-                  for (size_t i = 0; i < local_segments.size(); ++i)
-                  {
-                    Parallel::gridseg *src_seg = local_segments[i];
-                    if (bssn_gpu_stage_download_region(host_ptr,
-                                                       cg->shape,
-                                                       cg->bbox,
-                                                       cg->bbox + dim,
-                                                       src_seg->shape,
-                                                       src_seg->llb))
-                    {
-                      cerr << "GPU sync region download failure: lev=" << lev
-                           << " var=" << var_it->data->name
-                           << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
-                           << cg->bbox[1] << ":" << cg->bbox[4] << ","
-                           << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
-                      ERROR = 1;
-                      return true;
-                    }
-                  }
-                  var_it = var_it->next;
-                }
-              }
-            }
-
-            if (block_it == patch_it->data->ble)
-              break;
-            block_it = block_it->next;
-          }
-          patch_it = patch_it->next;
-        }
-
-        return true;
-      };
-
-  auto can_pack_sync_from_device =
-      [&](MyList<var> *var_list, Parallel::SyncCache &cache) -> bool {
-        if (!cache.valid || !cache.combined_src || myrank < 0 || myrank >= cache.cpusize)
-          return false;
-
-        MyList<Parallel::gridseg> *seg = cache.combined_src[myrank];
-        while (seg)
-        {
-          MyList<var> *var_it = var_list;
-          while (var_it)
-          {
-            if (!bssn_gpu_find_device_buffer(seg->data->Bg->fgfs[var_it->data->sgfn]))
-              return false;
-            var_it = var_it->next;
-          }
-          seg = seg->next;
-        }
-        return true;
-      };
-
-  MyList<Patch> *Pp = GH->PatL[lev];
-  while (Pp)
-  {
-    MyList<Block> *BP = Pp->data->blb;
-    while (BP)
-    {
-      Block *cg = BP->data;
-      if (myrank == cg->rank)
-      {
-        double t0 = 0.0;
-        if (profile_enabled)
-          t0 = MPI_Wtime();
-        if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_FIRST_TIME))
-          ERROR = 1;
-        if (profile_enabled)
-          stage_profile_add(lev, STAGE_PROFILE_RHS, MPI_Wtime() - t0);
-
-        if (profile_enabled)
-          t0 = MPI_Wtime();
-        run_stage_on_block(cg, Pp->data, StateList, StateList, SynchList_pre, RHSList, iter_count);
-        if (profile_enabled)
-          stage_profile_add(lev, STAGE_PROFILE_RUN_STAGE, MPI_Wtime() - t0);
-
-        if (profile_enabled)
-          t0 = MPI_Wtime();
-        if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi->sgfn], chitiny, false))
-        {
-          cerr << "GPU lowerbound failure: lev=" << lev
-               << " rk_stage=" << iter_count
-               << " var=" << phi->name
-               << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
-               << cg->bbox[1] << ":" << cg->bbox[4] << ","
-               << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
-          ERROR = 1;
-        }
-        if (profile_enabled)
-          stage_profile_add(lev, STAGE_PROFILE_LOWERBOUND, MPI_Wtime() - t0);
-      }
-      if (BP == Pp->data->ble)
-        break;
-      BP = BP->next;
-    }
-    Pp = Pp->next;
-  }
-
-  if (!ERROR)
-  {
-    if (!keep_stage_sync_on_device)
-    {
-      double t0 = 0.0;
-      if (profile_enabled)
-        t0 = MPI_Wtime();
-      stage_download_patch_list(SynchList_pre, false);
-      if (profile_enabled)
-        stage_profile_add(lev, STAGE_PROFILE_DOWNLOAD, MPI_Wtime() - t0);
-      if (!ERROR)
-      {
-        if (profile_enabled)
-          t0 = MPI_Wtime();
-        bssn_gpu_clear_cached_device_buffers();
-        if (profile_enabled)
-          stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0);
-      }
-    }
-  }
-
-  MPI_Request err_req_pre;
-  {
-    int erh = ERROR;
-    MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_pre);
-  }
-
-  Parallel::AsyncSyncState async_pre;
-  if (profile_enabled)
-  {
-    const double t0 = MPI_Wtime();
-    Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
-    stage_profile_add(lev, STAGE_PROFILE_SYNC_START, MPI_Wtime() - t0);
-  }
-  else
-    Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
-  if (profile_enabled)
-  {
-    const double t0 = MPI_Wtime();
-    Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry,
-                          !keep_stage_sync_on_device);
-    stage_profile_add(lev, STAGE_PROFILE_SYNC_FINISH, MPI_Wtime() - t0);
-  }
-  else
-    Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry,
-                          !keep_stage_sync_on_device);
-  if (!ERROR && !keep_stage_sync_on_device)
-  {
-    if (profile_enabled)
-    {
-      const double t0 = MPI_Wtime();
-      refresh_stage_device_after_sync(SynchList_pre, sync_cache_pre[lev]);
-      stage_profile_add(lev, STAGE_PROFILE_REFRESH, MPI_Wtime() - t0);
-    }
-    else
-      refresh_stage_device_after_sync(SynchList_pre, sync_cache_pre[lev]);
-  }
-
-  MPI_Wait(&err_req_pre, MPI_STATUS_IGNORE);
-  if (ERROR)
-  {
-    Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
-    if (myrank == 0)
-    {
-      if (ErrorMonitor->outfile)
-        ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
-                              << ", lev = " << lev << endl;
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-  }
-
-#if (MAPBH == 0)
-  if (BH_num > 0 && lev == GH->levels - 1)
-  {
-    compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
-    for (int ithBH = 0; ithBH < BH_num; ithBH++)
-    {
-      f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count);
-      f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg[ithBH][1], Porg_rhs[ithBH][1], iter_count);
-      f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg[ithBH][2], Porg_rhs[ithBH][2], iter_count);
-      if (Symmetry > 0)
-        Porg[ithBH][2] = fabs(Porg[ithBH][2]);
-      if (Symmetry == 2)
-      {
-        Porg[ithBH][0] = fabs(Porg[ithBH][0]);
-        Porg[ithBH][1] = fabs(Porg[ithBH][1]);
-      }
-    }
-  }
-
-  if (lev == a_lev)
-    AnalysisStuff(lev, dT_lev);
-#endif
-
-  for (iter_count = 1; iter_count < 4; iter_count++)
-  {
-    if (iter_count == 1 || iter_count == 3)
-      TRK4 += dT_lev / 2;
-
-    Pp = GH->PatL[lev];
-    while (Pp)
-    {
-      MyList<Block> *BP = Pp->data->blb;
-      while (BP)
-      {
-        Block *cg = BP->data;
-        if (myrank == cg->rank)
-        {
-          double t0 = 0.0;
-          if (profile_enabled)
-            t0 = MPI_Wtime();
-          ensure_stage_device_var_list(cg, SynchList_pre);
-          if (profile_enabled)
-            stage_profile_add(lev, STAGE_PROFILE_ENSURE, MPI_Wtime() - t0);
-
-          if (profile_enabled)
-            t0 = MPI_Wtime();
-          if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_THEN))
-            ERROR = 1;
-          if (profile_enabled)
-            stage_profile_add(lev, STAGE_PROFILE_RHS, MPI_Wtime() - t0);
-
-          if (profile_enabled)
-            t0 = MPI_Wtime();
-          run_stage_on_block(cg, Pp->data, StateList, SynchList_pre, SynchList_cor, RHSList, iter_count);
-          if (profile_enabled)
-            stage_profile_add(lev, STAGE_PROFILE_RUN_STAGE, MPI_Wtime() - t0);
-
-          if (profile_enabled)
-            t0 = MPI_Wtime();
-          if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi1->sgfn], chitiny, false))
-          {
-          cerr << "GPU lowerbound failure: lev=" << lev
-               << " rk_stage=" << iter_count
-               << " var=" << phi1->name
-                 << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
-                 << cg->bbox[1] << ":" << cg->bbox[4] << ","
-                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
-            ERROR = 1;
-          }
-          if (profile_enabled)
-            stage_profile_add(lev, STAGE_PROFILE_LOWERBOUND, MPI_Wtime() - t0);
-        }
-
-        if (BP == Pp->data->ble)
-          break;
-        BP = BP->next;
-      }
-      Pp = Pp->next;
-    }
-
-    if (!ERROR)
-    {
-      if (!keep_stage_sync_on_device)
-      {
-        double t0 = 0.0;
-        if (profile_enabled)
-          t0 = MPI_Wtime();
-        stage_download_patch_list(SynchList_cor, false);
-        if (profile_enabled)
-          stage_profile_add(lev, STAGE_PROFILE_DOWNLOAD, MPI_Wtime() - t0);
-        if (!ERROR)
-        {
-          if (profile_enabled)
-            t0 = MPI_Wtime();
-          bssn_gpu_clear_cached_device_buffers();
-          if (profile_enabled)
-            stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0);
-        }
-      }
-    }
-
-    MPI_Request err_req_cor;
-    {
-      int erh = ERROR;
-      MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
-    }
-
-    Parallel::AsyncSyncState async_cor;
-    if (profile_enabled)
-    {
-      const double t0 = MPI_Wtime();
-      Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
-      stage_profile_add(lev, STAGE_PROFILE_SYNC_START, MPI_Wtime() - t0);
-    }
-    else
-      Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
-    if (profile_enabled)
-    {
-      const double t0 = MPI_Wtime();
-      Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry,
-                            !keep_stage_sync_on_device);
-      stage_profile_add(lev, STAGE_PROFILE_SYNC_FINISH, MPI_Wtime() - t0);
-    }
-    else
-      Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry,
-                            !keep_stage_sync_on_device);
-    if (!ERROR && !keep_stage_sync_on_device && iter_count < 3)
-    {
-      if (profile_enabled)
-      {
-        const double t0 = MPI_Wtime();
-        refresh_stage_device_after_sync(SynchList_cor, sync_cache_cor[lev]);
-        stage_profile_add(lev, STAGE_PROFILE_REFRESH, MPI_Wtime() - t0);
-      }
-      else
-        refresh_stage_device_after_sync(SynchList_cor, sync_cache_cor[lev]);
-    }
-
-    MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
-    if (ERROR)
-    {
-      Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
-      if (myrank == 0)
-      {
-        if (ErrorMonitor->outfile)
-          ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
-                                << " variables at t = " << PhysTime
-                                << ", lev = " << lev << endl;
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-    }
-
-#if (MAPBH == 0)
-    if (BH_num > 0 && lev == GH->levels - 1)
-    {
-      compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev);
-      for (int ithBH = 0; ithBH < BH_num; ithBH++)
-      {
-        f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg1[ithBH][0], Porg_rhs[ithBH][0], iter_count);
-        f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg1[ithBH][1], Porg_rhs[ithBH][1], iter_count);
-        f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg1[ithBH][2], Porg_rhs[ithBH][2], iter_count);
-        if (Symmetry > 0)
-          Porg1[ithBH][2] = fabs(Porg1[ithBH][2]);
-        if (Symmetry == 2)
-        {
-          Porg1[ithBH][0] = fabs(Porg1[ithBH][0]);
-          Porg1[ithBH][1] = fabs(Porg1[ithBH][1]);
-        }
-      }
-    }
-#endif
-
-    if (iter_count < 3)
-    {
-      Pp = GH->PatL[lev];
-      while (Pp)
-      {
-        MyList<Block> *BP = Pp->data->blb;
-        while (BP)
-        {
-          BP->data->swapList(SynchList_pre, SynchList_cor, myrank);
-          if (BP == Pp->data->ble)
-            break;
-          BP = BP->next;
-        }
-        Pp = Pp->next;
-      }
-
-#if (MAPBH == 0)
-      if (BH_num > 0 && lev == GH->levels - 1)
-      {
-        for (int ithBH = 0; ithBH < BH_num; ithBH++)
-        {
-          Porg[ithBH][0] = Porg1[ithBH][0];
-          Porg[ithBH][1] = Porg1[ithBH][1];
-          Porg[ithBH][2] = Porg1[ithBH][2];
-        }
-      }
-#endif
-    }
-  }
-
-#if (RPS == 0)
-  RestrictProlong(lev, YN, BB);
-#endif
-
-  Pp = GH->PatL[lev];
-  while (Pp)
-  {
-    MyList<Block> *BP = Pp->data->blb;
-    while (BP)
-    {
-      Block *cg = BP->data;
-      cg->swapList(StateList, SynchList_cor, myrank);
-      cg->swapList(OldStateList, SynchList_cor, myrank);
-      if (BP == Pp->data->ble)
-        break;
-      BP = BP->next;
-    }
-    Pp = Pp->next;
-  }
-
-  if (!ERROR && keep_stage_sync_on_device)
-  {
-    // After the swaps above, only StateList points at arrays updated during this step.
-    // OldStateList/SynchList_cor remain valid on host because their backing arrays were
-    // read-only during the RK step, and SynchList_pre is reused only as scratch later.
-    const double t0 = profile_enabled ? MPI_Wtime() : 0.0;
-    stage_download_patch_list(StateList, true);
-    if (profile_enabled)
-      stage_profile_add(lev, STAGE_PROFILE_DOWNLOAD, MPI_Wtime() - t0);
-  }
-
-  if (profile_enabled)
-  {
-    const double t0 = MPI_Wtime();
-    bssn_gpu_clear_cached_device_buffers();
-    stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0);
-  }
-  else
-    bssn_gpu_clear_cached_device_buffers();
-
-  if (BH_num > 0 && lev == GH->levels - 1)
-  {
-    for (int ithBH = 0; ithBH < BH_num; ithBH++)
-    {
-      Porg0[ithBH][0] = Porg1[ithBH][0];
-      Porg0[ithBH][1] = Porg1[ithBH][1];
-      Porg0[ithBH][2] = Porg1[ithBH][2];
-    }
-  }
-
-  if (profile_enabled)
-    stage_profile_add(lev, STAGE_PROFILE_TOTAL, MPI_Wtime() - step_total_begin);
-}
-
-#endif
--- a/AMSS_NCKU_source/bssn_gpu.cu
+++ b/AMSS_NCKU_source/bssn_gpu.cu
--- a/AMSS_NCKU_source/bssn_gpu.h
+++ b/AMSS_NCKU_source/bssn_gpu.h
@@ -4,8 +4,10 @@
 #include "bssn_macro.h"
 #include "macrodef.fh"

-#define GRID_DIM 256
-#define BLOCK_DIM 128
+#define DEVICE_ID 0
+// #define DEVICE_ID_BY_MPI_RANK
+#define GRID_DIM 256
+#define BLOCK_DIM 128

 #define _FH2_(i, j, k) fh[(i) + (j) * _1D_SIZE[2] + (k) * _2D_SIZE[2]]
 #define _FH3_(i, j, k) fh[(i) + (j) * _1D_SIZE[3] + (k) * _2D_SIZE[3]]
@@ -63,45 +65,9 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,
            double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
            int &Symmetry, int &Lev, double &eps, int &co);

-int gpu_rhs_ss(RHS_SS_PARA);
-
-int bssn_gpu_bind_process_device(int mpi_rank);
-void bssn_gpu_clear_cached_device_buffers();
-void bssn_gpu_release_pinned_host_buffers();
-const double *bssn_gpu_find_device_buffer(const double *host_ptr);
-void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr);
-void bssn_gpu_prepare_host_buffer(const double *host_ptr, int count);
-int bssn_gpu_stage_upload_buffer(const double *host_ptr, int count);
-int bssn_gpu_stage_zero_buffer(const double *host_ptr, int count);
-int bssn_gpu_stage_upload_region(const double *host_ptr,
-                                 const int *full_shape,
-                                 const double *full_llb,
-                                 const double *full_uub,
-                                 const int *region_shape,
-                                 const double *region_llb);
-int bssn_gpu_stage_download_region(double *host_ptr,
-                                   const int *full_shape,
-                                   const double *full_llb,
-                                   const double *full_uub,
-                                   const int *region_shape,
-                                   const double *region_llb);
-int bssn_gpu_stage_download_region_to_buffer(const double *host_src_ptr,
-                                             const int *full_shape,
-                                             const double *full_llb,
-                                             const double *full_uub,
-                                             const int *region_shape,
-                                             const double *region_llb,
-                                             double *host_dst_ptr);
-int bssn_gpu_stage_upload_buffer_to_region(const double *host_src_ptr,
-                                           double *host_dst_ptr,
-                                           const int *full_shape,
-                                           const double *full_llb,
-                                           const double *full_uub,
-                                           const int *region_shape,
-                                           const double *region_llb);
-int bssn_gpu_download_buffer_batch(const int *ex, double **host_ptrs, int num_buffers);
-
-/** Init GPU side data in GPUMeta. */
-// void init_fluid_meta_gpu(GPUMeta *gpu_meta);
+int gpu_rhs_ss(RHS_SS_PARA);
+
+/** Init GPU side data in GPUMeta. */
+// void init_fluid_meta_gpu(GPUMeta *gpu_meta);

 #endif
--- a/AMSS_NCKU_source/bssn_macro.h
+++ b/AMSS_NCKU_source/bssn_macro.h
@@ -65,10 +65,9 @@ if(TIME_COUNT_EACH_RANK == 1){\
     }\
 }

-//3---------------------GPU---------------------
-#define CALLED_BY_STEP 0
-#define CALLED_BY_CONSTRAINT 1
-#define CALLED_BY_CONSTRAINT_CONS_ONLY 2
+//3---------------------GPU---------------------
+#define CALLED_BY_STEP 0
+#define CALLED_BY_CONSTRAINT 1

 #define RHS_PARA_CALLED_FIRST_TIME cg->shape,TRK4,cg->X[0],cg->X[1],cg->X[2],cg->fgfs[phi0->sgfn],cg->fgfs[trK0->sgfn],cg->fgfs[gxx0->sgfn],cg->fgfs[gxy0->sgfn],cg->fgfs[gxz0->sgfn],cg->fgfs[gyy0->sgfn],cg->fgfs[gyz0->sgfn],cg->fgfs[gzz0->sgfn],cg->fgfs[Axx0->sgfn],cg->fgfs[Axy0->sgfn],cg->fgfs[Axz0->sgfn],cg->fgfs[Ayy0->sgfn],cg->fgfs[Ayz0->sgfn],cg->fgfs[Azz0->sgfn],cg->fgfs[Gmx0->sgfn],cg->fgfs[Gmy0->sgfn],cg->fgfs[Gmz0->sgfn],cg->fgfs[Lap0->sgfn],cg->fgfs[Sfx0->sgfn],cg->fgfs[Sfy0->sgfn],cg->fgfs[Sfz0->sgfn],cg->fgfs[dtSfx0->sgfn],cg->fgfs[dtSfy0->sgfn],cg->fgfs[dtSfz0->sgfn],cg->fgfs[phi_rhs->sgfn],cg->fgfs[trK_rhs->sgfn],cg->fgfs[gxx_rhs->sgfn],cg->fgfs[gxy_rhs->sgfn],cg->fgfs[gxz_rhs->sgfn],cg->fgfs[gyy_rhs->sgfn],cg->fgfs[gyz_rhs->sgfn],cg->fgfs[gzz_rhs->sgfn],cg->fgfs[Axx_rhs->sgfn],cg->fgfs[Axy_rhs->sgfn],cg->fgfs[Axz_rhs->sgfn],cg->fgfs[Ayy_rhs->sgfn],cg->fgfs[Ayz_rhs->sgfn],cg->fgfs[Azz_rhs->sgfn],cg->fgfs[Gmx_rhs->sgfn],cg->fgfs[Gmy_rhs->sgfn],cg->fgfs[Gmz_rhs->sgfn],cg->fgfs[Lap_rhs->sgfn],cg->fgfs[Sfx_rhs->sgfn],cg->fgfs[Sfy_rhs->sgfn],cg->fgfs[Sfz_rhs->sgfn],cg->fgfs[dtSfx_rhs->sgfn],cg->fgfs[dtSfy_rhs->sgfn],cg->fgfs[dtSfz_rhs->sgfn],cg->fgfs[rho->sgfn],cg->fgfs[Sx->sgfn],cg->fgfs[Sy->sgfn],cg->fgfs[Sz->sgfn],cg->fgfs[Sxx->sgfn],cg->fgfs[Sxy->sgfn],cg->fgfs[Sxz->sgfn],cg->fgfs[Syy->sgfn],cg->fgfs[Syz->sgfn],cg->fgfs[Szz->sgfn],cg->fgfs[Gamxxx->sgfn],cg->fgfs[Gamxxy->sgfn],cg->fgfs[Gamxxz->sgfn],cg->fgfs[Gamxyy->sgfn],cg->fgfs[Gamxyz->sgfn],cg->fgfs[Gamxzz->sgfn],cg->fgfs[Gamyxx->sgfn],cg->fgfs[Gamyxy->sgfn],cg->fgfs[Gamyxz->sgfn],cg->fgfs[Gamyyy->sgfn],cg->fgfs[Gamyyz->sgfn],cg->fgfs[Gamyzz->sgfn],cg->fgfs[Gamzxx->sgfn],cg->fgfs[Gamzxy->sgfn],cg->fgfs[Gamzxz->sgfn],cg->fgfs[Gamzyy->sgfn],cg->fgfs[Gamzyz->sgfn],cg->fgfs[Gamzzz->sgfn],cg->fgfs[Rxx->sgfn],cg->fgfs[Rxy->sgfn],cg->fgfs[Rxz->sgfn],cg->fgfs[Ryy->sgfn],cg->fgfs[Ryz->sgfn],cg->fgfs[Rzz->sgfn],cg->fgfs[Cons_Ham->sgfn],cg->fgfs[Cons_Px->sgfn],cg->fgfs[Cons_Py->sgfn],cg->fgfs[Cons_Pz->sgfn],cg->fgfs[Cons_Gx->sgfn],cg->fgfs[Cons_Gy->sgfn],cg->fgfs[Cons_Gz->sgfn],Symmetry,lev,ndeps,pre

--- a/AMSS_NCKU_source/bssn_rhs.f90
+++ b/AMSS_NCKU_source/bssn_rhs.f90
@@ -59,10 +59,9 @@
  real*8, dimension(ex(1),ex(2),ex(3)),intent(out) :: Rxx,Rxy,Rxz,Ryy,Ryz,Rzz
  real*8,intent(in) :: eps
  real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: ham_Res, movx_Res, movy_Res, movz_Res
-  real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: Gmx_Res, Gmy_Res, Gmz_Res
-!  gont = 0: success; gont = 1: something wrong
-  integer::gont
-  integer :: i,j,k
+  real*8, dimension(ex(1),ex(2),ex(3)),intent(inout) :: Gmx_Res, Gmy_Res, Gmz_Res
+!  gont = 0: success; gont = 1: something wrong
+  integer::gont

 !~~~~~~> Other variables:

@@ -84,18 +83,11 @@
  real*8, dimension(ex(1),ex(2),ex(3)) :: gupxx,gupxy,gupxz
  real*8, dimension(ex(1),ex(2),ex(3)) :: gupyy,gupyz,gupzz

-  real*8,dimension(3) ::SSS,AAS,ASA,SAA,ASS,SAS,SSA
-  real*8            :: dX, dY, dZ, PI
-  real*8            :: divb_loc,det_loc
-  real*8            :: gupxx_loc,gupxy_loc,gupxz_loc,gupyy_loc,gupyz_loc,gupzz_loc
-  real*8            :: Rxx_loc,Rxy_loc,Rxz_loc,Ryy_loc,Ryz_loc,Rzz_loc
-  real*8            :: fxx_loc,fxy_loc,fxz_loc
-  real*8            :: Gamxa_loc,Gamya_loc,Gamza_loc
-  real*8            :: f_loc,chin_loc
-  real*8            :: l_fxx,l_fxy,l_fxz,l_fyy,l_fyz,l_fzz,S_loc
-  real*8, parameter :: ZEO = 0.d0,ONE = 1.D0, TWO = 2.D0, FOUR = 4.D0
-  real*8, parameter :: EIGHT = 8.D0, HALF = 0.5D0, THR = 3.d0
-  real*8, parameter :: SYM = 1.D0, ANTI= - 1.D0
+  real*8,dimension(3) ::SSS,AAS,ASA,SAA,ASS,SAS,SSA
+  real*8            :: dX, dY, dZ, PI
+  real*8, parameter :: ZEO = 0.d0,ONE = 1.D0, TWO = 2.D0, FOUR = 4.D0
+  real*8, parameter :: EIGHT = 8.D0, HALF = 0.5D0, THR = 3.d0
+  real*8, parameter :: SYM = 1.D0, ANTI= - 1.D0
  double precision,parameter::FF = 0.75d0,eta=2.d0
  real*8, parameter :: F1o3 = 1.D0/3.D0, F2o3 = 2.D0/3.D0,F3o2=1.5d0, F1o6 = 1.D0/6.D0
  real*8, parameter :: F16=1.6d1,F8=8.d0
@@ -104,11 +96,11 @@
  real*8, dimension(ex(1),ex(2),ex(3)) :: reta
 #endif

-#if (GAUGE == 6 || GAUGE == 7)
-  integer :: BHN
-  real*8, dimension(9) :: Porg
-  real*8, dimension(3) :: Mass
-  real*8 :: r1,r2,M,A,w1,w2,C1,C2
+#if (GAUGE == 6 || GAUGE == 7)
+  integer :: BHN,i,j,k
+  real*8, dimension(9) :: Porg
+  real*8, dimension(3) :: Mass
+  real*8 :: r1,r2,M,A,w1,w2,C1,C2
  real*8, dimension(ex(1),ex(2),ex(3)) :: reta

  call getpbh(BHN,Porg,Mass)
@@ -153,204 +145,174 @@
  dY = Y(2) - Y(1)
  dZ = Z(2) - Z(1)

-  do k=1,ex(3)
-  do j=1,ex(2)
-  do i=1,ex(1)
-    alpn1(i,j,k) = Lap(i,j,k) + ONE
-    chin1(i,j,k) = chi(i,j,k) + ONE
-    gxx(i,j,k) = dxx(i,j,k) + ONE
-    gyy(i,j,k) = dyy(i,j,k) + ONE
-    gzz(i,j,k) = dzz(i,j,k) + ONE
-  enddo
-  enddo
-  enddo
+  alpn1 = Lap + ONE
+  chin1 = chi + ONE
+  gxx = dxx + ONE
+  gyy = dyy + ONE
+  gzz = dzz + ONE

  call fderivs(ex,betax,betaxx,betaxy,betaxz,X,Y,Z,ANTI, SYM, SYM,Symmetry,Lev)
  call fderivs(ex,betay,betayx,betayy,betayz,X,Y,Z, SYM,ANTI, SYM,Symmetry,Lev)
  call fderivs(ex,betaz,betazx,betazy,betazz,X,Y,Z, SYM, SYM,ANTI,Symmetry,Lev)
  
-  call fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
+  div_beta = betaxx + betayy + betazz
+ 
+  call fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)

-  call fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
-  call fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev)
-  call fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev)
-  call fderivs(ex,dyy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
-  call fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev)
-  call fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
-
-  do k=1,ex(3)
-  do j=1,ex(2)
-  do i=1,ex(1)
-    divb_loc = betaxx(i,j,k) + betayy(i,j,k) + betazz(i,j,k)
-    div_beta(i,j,k) = divb_loc
-
-    chi_rhs(i,j,k) = F2o3 * chin1(i,j,k) * (alpn1(i,j,k) * trK(i,j,k) - divb_loc)
-
-    gxx_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Axx(i,j,k) - F2o3 * gxx(i,j,k) * divb_loc + &
-         TWO * ( gxx(i,j,k) * betaxx(i,j,k) + gxy(i,j,k) * betayx(i,j,k) + gxz(i,j,k) * betazx(i,j,k) )
-
-    gyy_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Ayy(i,j,k) - F2o3 * gyy(i,j,k) * divb_loc + &
-         TWO * ( gxy(i,j,k) * betaxy(i,j,k) + gyy(i,j,k) * betayy(i,j,k) + gyz(i,j,k) * betazy(i,j,k) )
-
-    gzz_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Azz(i,j,k) - F2o3 * gzz(i,j,k) * divb_loc + &
-         TWO * ( gxz(i,j,k) * betaxz(i,j,k) + gyz(i,j,k) * betayz(i,j,k) + gzz(i,j,k) * betazz(i,j,k) )
-
-    gxy_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Axy(i,j,k) + F1o3 * gxy(i,j,k) * divb_loc + &
-         gxx(i,j,k) * betaxy(i,j,k) + gxz(i,j,k) * betazy(i,j,k) + gyy(i,j,k) * betayx(i,j,k) + &
-         gyz(i,j,k) * betazx(i,j,k) - gxy(i,j,k) * betazz(i,j,k)
-
-    gyz_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Ayz(i,j,k) + F1o3 * gyz(i,j,k) * divb_loc + &
-         gxy(i,j,k) * betaxz(i,j,k) + gyy(i,j,k) * betayz(i,j,k) + gxz(i,j,k) * betaxy(i,j,k) + &
-         gzz(i,j,k) * betazy(i,j,k) - gyz(i,j,k) * betaxx(i,j,k)
-
-    gxz_rhs(i,j,k) = - TWO * alpn1(i,j,k) * Axz(i,j,k) + F1o3 * gxz(i,j,k) * divb_loc + &
-         gxx(i,j,k) * betaxz(i,j,k) + gxy(i,j,k) * betayz(i,j,k) + gyz(i,j,k) * betayx(i,j,k) + &
-         gzz(i,j,k) * betazx(i,j,k) - gxz(i,j,k) * betayy(i,j,k)
-
-    det_loc = gxx(i,j,k) * gyy(i,j,k) * gzz(i,j,k) + gxy(i,j,k) * gyz(i,j,k) * gxz(i,j,k) + &
-         gxz(i,j,k) * gxy(i,j,k) * gyz(i,j,k) - gxz(i,j,k) * gyy(i,j,k) * gxz(i,j,k) - &
-         gxy(i,j,k) * gxy(i,j,k) * gzz(i,j,k) - gxx(i,j,k) * gyz(i,j,k) * gyz(i,j,k)
-    gupxx_loc = ( gyy(i,j,k) * gzz(i,j,k) - gyz(i,j,k) * gyz(i,j,k) ) / det_loc
-    gupxy_loc = - ( gxy(i,j,k) * gzz(i,j,k) - gyz(i,j,k) * gxz(i,j,k) ) / det_loc
-    gupxz_loc = ( gxy(i,j,k) * gyz(i,j,k) - gyy(i,j,k) * gxz(i,j,k) ) / det_loc
-    gupyy_loc = ( gxx(i,j,k) * gzz(i,j,k) - gxz(i,j,k) * gxz(i,j,k) ) / det_loc
-    gupyz_loc = - ( gxx(i,j,k) * gyz(i,j,k) - gxy(i,j,k) * gxz(i,j,k) ) / det_loc
-    gupzz_loc = ( gxx(i,j,k) * gyy(i,j,k) - gxy(i,j,k) * gxy(i,j,k) ) / det_loc
-    gupxx(i,j,k) = gupxx_loc
-    gupxy(i,j,k) = gupxy_loc
-    gupxz(i,j,k) = gupxz_loc
-    gupyy(i,j,k) = gupyy_loc
-    gupyz(i,j,k) = gupyz_loc
-    gupzz(i,j,k) = gupzz_loc
-
-    if(co == 0)then
-      Gmx_Res(i,j,k) = Gamx(i,j,k) - ( &
-           gupxx_loc*(gupxx_loc*gxxx(i,j,k)+gupxy_loc*gxyx(i,j,k)+gupxz_loc*gxzx(i,j,k)) + &
-           gupxy_loc*(gupxx_loc*gxyx(i,j,k)+gupxy_loc*gyyx(i,j,k)+gupxz_loc*gyzx(i,j,k)) + &
-           gupxz_loc*(gupxx_loc*gxzx(i,j,k)+gupxy_loc*gyzx(i,j,k)+gupxz_loc*gzzx(i,j,k)) + &
-           gupxx_loc*(gupxy_loc*gxxy(i,j,k)+gupyy_loc*gxyy(i,j,k)+gupyz_loc*gxzy(i,j,k)) + &
-           gupxy_loc*(gupxy_loc*gxyy(i,j,k)+gupyy_loc*gyyy(i,j,k)+gupyz_loc*gyzy(i,j,k)) + &
-           gupxz_loc*(gupxy_loc*gxzy(i,j,k)+gupyy_loc*gyzy(i,j,k)+gupyz_loc*gzzy(i,j,k)) + &
-           gupxx_loc*(gupxz_loc*gxxz(i,j,k)+gupyz_loc*gxyz(i,j,k)+gupzz_loc*gxzz(i,j,k)) + &
-           gupxy_loc*(gupxz_loc*gxyz(i,j,k)+gupyz_loc*gyyz(i,j,k)+gupzz_loc*gyzz(i,j,k)) + &
-           gupxz_loc*(gupxz_loc*gxzz(i,j,k)+gupyz_loc*gyzz(i,j,k)+gupzz_loc*gzzz(i,j,k)))
-      Gmy_Res(i,j,k) = Gamy(i,j,k) - ( &
-           gupxx_loc*(gupxy_loc*gxxx(i,j,k)+gupyy_loc*gxyx(i,j,k)+gupyz_loc*gxzx(i,j,k)) + &
-           gupxy_loc*(gupxy_loc*gxyx(i,j,k)+gupyy_loc*gyyx(i,j,k)+gupyz_loc*gyzx(i,j,k)) + &
-           gupxz_loc*(gupxy_loc*gxzx(i,j,k)+gupyy_loc*gyzx(i,j,k)+gupyz_loc*gzzx(i,j,k)) + &
-           gupxy_loc*(gupxy_loc*gxxy(i,j,k)+gupyy_loc*gxyy(i,j,k)+gupyz_loc*gxzy(i,j,k)) + &
-           gupyy_loc*(gupxy_loc*gxyy(i,j,k)+gupyy_loc*gyyy(i,j,k)+gupyz_loc*gyzy(i,j,k)) + &
-           gupyz_loc*(gupxy_loc*gxzy(i,j,k)+gupyy_loc*gyzy(i,j,k)+gupyz_loc*gzzy(i,j,k)) + &
-           gupxy_loc*(gupxz_loc*gxxz(i,j,k)+gupyz_loc*gxyz(i,j,k)+gupzz_loc*gxzz(i,j,k)) + &
-           gupyy_loc*(gupxz_loc*gxyz(i,j,k)+gupyz_loc*gyyz(i,j,k)+gupzz_loc*gyzz(i,j,k)) + &
-           gupyz_loc*(gupxz_loc*gxzz(i,j,k)+gupyz_loc*gyzz(i,j,k)+gupzz_loc*gzzz(i,j,k)))
-      Gmz_Res(i,j,k) = Gamz(i,j,k) - ( &
-           gupxx_loc*(gupxz_loc*gxxx(i,j,k)+gupyz_loc*gxyx(i,j,k)+gupzz_loc*gxzx(i,j,k)) + &
-           gupxy_loc*(gupxz_loc*gxyx(i,j,k)+gupyz_loc*gyyx(i,j,k)+gupzz_loc*gyzx(i,j,k)) + &
-           gupxz_loc*(gupxz_loc*gxzx(i,j,k)+gupyz_loc*gyzx(i,j,k)+gupzz_loc*gzzx(i,j,k)) + &
-           gupxy_loc*(gupxz_loc*gxxy(i,j,k)+gupyz_loc*gxyy(i,j,k)+gupzz_loc*gxzy(i,j,k)) + &
-           gupyy_loc*(gupxz_loc*gxyy(i,j,k)+gupyz_loc*gyyy(i,j,k)+gupzz_loc*gyzy(i,j,k)) + &
-           gupyz_loc*(gupxz_loc*gxzy(i,j,k)+gupyz_loc*gyzy(i,j,k)+gupzz_loc*gzzy(i,j,k)) + &
-           gupxz_loc*(gupxz_loc*gxxz(i,j,k)+gupyz_loc*gxyz(i,j,k)+gupzz_loc*gxzz(i,j,k)) + &
-           gupyz_loc*(gupxz_loc*gxyz(i,j,k)+gupyz_loc*gyyz(i,j,k)+gupzz_loc*gyzz(i,j,k)) + &
-           gupzz_loc*(gupxz_loc*gxzz(i,j,k)+gupyz_loc*gyzz(i,j,k)+gupzz_loc*gzzz(i,j,k)))
-    endif
-
-    Gamxxx(i,j,k)=HALF*( gupxx_loc*gxxx(i,j,k) + gupxy_loc*(TWO*gxyx(i,j,k) - gxxy(i,j,k)) + gupxz_loc*(TWO*gxzx(i,j,k) - gxxz(i,j,k)))
-    Gamyxx(i,j,k)=HALF*( gupxy_loc*gxxx(i,j,k) + gupyy_loc*(TWO*gxyx(i,j,k) - gxxy(i,j,k)) + gupyz_loc*(TWO*gxzx(i,j,k) - gxxz(i,j,k)))
-    Gamzxx(i,j,k)=HALF*( gupxz_loc*gxxx(i,j,k) + gupyz_loc*(TWO*gxyx(i,j,k) - gxxy(i,j,k)) + gupzz_loc*(TWO*gxzx(i,j,k) - gxxz(i,j,k)))
-
-    Gamxyy(i,j,k)=HALF*( gupxx_loc*(TWO*gxyy(i,j,k) - gyyx(i,j,k)) + gupxy_loc*gyyy(i,j,k) + gupxz_loc*(TWO*gyzy(i,j,k) - gyyz(i,j,k)))
-    Gamyyy(i,j,k)=HALF*( gupxy_loc*(TWO*gxyy(i,j,k) - gyyx(i,j,k)) + gupyy_loc*gyyy(i,j,k) + gupyz_loc*(TWO*gyzy(i,j,k) - gyyz(i,j,k)))
-    Gamzyy(i,j,k)=HALF*( gupxz_loc*(TWO*gxyy(i,j,k) - gyyx(i,j,k)) + gupyz_loc*gyyy(i,j,k) + gupzz_loc*(TWO*gyzy(i,j,k) - gyyz(i,j,k)))
-
-    Gamxzz(i,j,k)=HALF*( gupxx_loc*(TWO*gxzz(i,j,k) - gzzx(i,j,k)) + gupxy_loc*(TWO*gyzz(i,j,k) - gzzy(i,j,k)) + gupxz_loc*gzzz(i,j,k))
-    Gamyzz(i,j,k)=HALF*( gupxy_loc*(TWO*gxzz(i,j,k) - gzzx(i,j,k)) + gupyy_loc*(TWO*gyzz(i,j,k) - gzzy(i,j,k)) + gupyz_loc*gzzz(i,j,k))
-    Gamzzz(i,j,k)=HALF*( gupxz_loc*(TWO*gxzz(i,j,k) - gzzx(i,j,k)) + gupyz_loc*(TWO*gyzz(i,j,k) - gzzy(i,j,k)) + gupzz_loc*gzzz(i,j,k))
-
-    Gamxxy(i,j,k)=HALF*( gupxx_loc*gxxy(i,j,k) + gupxy_loc*gyyx(i,j,k) + gupxz_loc*(gxzy(i,j,k) + gyzx(i,j,k) - gxyz(i,j,k)) )
-    Gamyxy(i,j,k)=HALF*( gupxy_loc*gxxy(i,j,k) + gupyy_loc*gyyx(i,j,k) + gupyz_loc*(gxzy(i,j,k) + gyzx(i,j,k) - gxyz(i,j,k)) )
-    Gamzxy(i,j,k)=HALF*( gupxz_loc*gxxy(i,j,k) + gupyz_loc*gyyx(i,j,k) + gupzz_loc*(gxzy(i,j,k) + gyzx(i,j,k) - gxyz(i,j,k)) )
-
-    Gamxxz(i,j,k)=HALF*( gupxx_loc*gxxz(i,j,k) + gupxy_loc*(gxyz(i,j,k) + gyzx(i,j,k) - gxzy(i,j,k)) + gupxz_loc*gzzx(i,j,k) )
-    Gamyxz(i,j,k)=HALF*( gupxy_loc*gxxz(i,j,k) + gupyy_loc*(gxyz(i,j,k) + gyzx(i,j,k) - gxzy(i,j,k)) + gupyz_loc*gzzx(i,j,k) )
-    Gamzxz(i,j,k)=HALF*( gupxz_loc*gxxz(i,j,k) + gupyz_loc*(gxyz(i,j,k) + gyzx(i,j,k) - gxzy(i,j,k)) + gupzz_loc*gzzx(i,j,k) )
-
-    Gamxyz(i,j,k)=HALF*( gupxx_loc*(gxyz(i,j,k) + gxzy(i,j,k) - gyzx(i,j,k)) + gupxy_loc*gyyz(i,j,k) + gupxz_loc*gzzy(i,j,k) )
-    Gamyyz(i,j,k)=HALF*( gupxy_loc*(gxyz(i,j,k) + gxzy(i,j,k) - gyzx(i,j,k)) + gupyy_loc*gyyz(i,j,k) + gupyz_loc*gzzy(i,j,k) )
-    Gamzyz(i,j,k)=HALF*( gupxz_loc*(gxyz(i,j,k) + gxzy(i,j,k) - gyzx(i,j,k)) + gupyz_loc*gyyz(i,j,k) + gupzz_loc*gzzy(i,j,k) )
-  enddo
-  enddo
-  enddo
-! Raise indices of \tilde A_{ij} and store in R_ij
-
-! Right hand side for Gam^i without shift terms...
-  call fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
-  call fderivs(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
-  do k=1,ex(3)
-  do j=1,ex(2)
-  do i=1,ex(1)
-    gupxx_loc = gupxx(i,j,k)
-    gupxy_loc = gupxy(i,j,k)
-    gupxz_loc = gupxz(i,j,k)
-    gupyy_loc = gupyy(i,j,k)
-    gupyz_loc = gupyz(i,j,k)
-    gupzz_loc = gupzz(i,j,k)
-
-    Rxx_loc = gupxx_loc * gupxx_loc * Axx(i,j,k) + gupxy_loc * gupxy_loc * Ayy(i,j,k) + gupxz_loc * gupxz_loc * Azz(i,j,k) + &
-         TWO * (gupxx_loc * gupxy_loc * Axy(i,j,k) + gupxx_loc * gupxz_loc * Axz(i,j,k) + gupxy_loc * gupxz_loc * Ayz(i,j,k))
-    Ryy_loc = gupxy_loc * gupxy_loc * Axx(i,j,k) + gupyy_loc * gupyy_loc * Ayy(i,j,k) + gupyz_loc * gupyz_loc * Azz(i,j,k) + &
-         TWO * (gupxy_loc * gupyy_loc * Axy(i,j,k) + gupxy_loc * gupyz_loc * Axz(i,j,k) + gupyy_loc * gupyz_loc * Ayz(i,j,k))
-    Rzz_loc = gupxz_loc * gupxz_loc * Axx(i,j,k) + gupyz_loc * gupyz_loc * Ayy(i,j,k) + gupzz_loc * gupzz_loc * Azz(i,j,k) + &
-         TWO * (gupxz_loc * gupyz_loc * Axy(i,j,k) + gupxz_loc * gupzz_loc * Axz(i,j,k) + gupyz_loc * gupzz_loc * Ayz(i,j,k))
-    Rxy_loc = gupxx_loc * gupxy_loc * Axx(i,j,k) + gupxy_loc * gupyy_loc * Ayy(i,j,k) + gupxz_loc * gupyz_loc * Azz(i,j,k) + &
-         (gupxx_loc * gupyy_loc + gupxy_loc * gupxy_loc) * Axy(i,j,k) + &
-         (gupxx_loc * gupyz_loc + gupxz_loc * gupxy_loc) * Axz(i,j,k) + &
-         (gupxy_loc * gupyz_loc + gupxz_loc * gupyy_loc) * Ayz(i,j,k)
-    Rxz_loc = gupxx_loc * gupxz_loc * Axx(i,j,k) + gupxy_loc * gupyz_loc * Ayy(i,j,k) + gupxz_loc * gupzz_loc * Azz(i,j,k) + &
-         (gupxx_loc * gupyz_loc + gupxy_loc * gupxz_loc) * Axy(i,j,k) + &
-         (gupxx_loc * gupzz_loc + gupxz_loc * gupxz_loc) * Axz(i,j,k) + &
-         (gupxy_loc * gupzz_loc + gupxz_loc * gupyz_loc) * Ayz(i,j,k)
-    Ryz_loc = gupxy_loc * gupxz_loc * Axx(i,j,k) + gupyy_loc * gupyz_loc * Ayy(i,j,k) + gupyz_loc * gupzz_loc * Azz(i,j,k) + &
-         (gupxy_loc * gupyz_loc + gupyy_loc * gupxz_loc) * Axy(i,j,k) + &
-         (gupxy_loc * gupzz_loc + gupyz_loc * gupxz_loc) * Axz(i,j,k) + &
-         (gupyy_loc * gupzz_loc + gupyz_loc * gupyz_loc) * Ayz(i,j,k)
-    Rxx(i,j,k) = Rxx_loc
-    Ryy(i,j,k) = Ryy_loc
-    Rzz(i,j,k) = Rzz_loc
-    Rxy(i,j,k) = Rxy_loc
-    Rxz(i,j,k) = Rxz_loc
-    Ryz(i,j,k) = Ryz_loc
-
-    Gamx_rhs(i,j,k) = - TWO * (Lapx(i,j,k) * Rxx_loc + Lapy(i,j,k) * Rxy_loc + Lapz(i,j,k) * Rxz_loc) + &
-         TWO * alpn1(i,j,k) * ( &
-         -F3o2/chin1(i,j,k) * (chix(i,j,k) * Rxx_loc + chiy(i,j,k) * Rxy_loc + chiz(i,j,k) * Rxz_loc) - &
-         gupxx_loc * (F2o3 * Kx(i,j,k) + EIGHT * PI * Sx(i,j,k)) - &
-         gupxy_loc * (F2o3 * Ky(i,j,k) + EIGHT * PI * Sy(i,j,k)) - &
-         gupxz_loc * (F2o3 * Kz(i,j,k) + EIGHT * PI * Sz(i,j,k)) + &
-         Gamxxx(i,j,k) * Rxx_loc + Gamxyy(i,j,k) * Ryy_loc + Gamxzz(i,j,k) * Rzz_loc + &
-         TWO * (Gamxxy(i,j,k) * Rxy_loc + Gamxxz(i,j,k) * Rxz_loc + Gamxyz(i,j,k) * Ryz_loc))
-
-    Gamy_rhs(i,j,k) = - TWO * (Lapx(i,j,k) * Rxy_loc + Lapy(i,j,k) * Ryy_loc + Lapz(i,j,k) * Ryz_loc) + &
-         TWO * alpn1(i,j,k) * ( &
-         -F3o2/chin1(i,j,k) * (chix(i,j,k) * Rxy_loc + chiy(i,j,k) * Ryy_loc + chiz(i,j,k) * Ryz_loc) - &
-         gupxy_loc * (F2o3 * Kx(i,j,k) + EIGHT * PI * Sx(i,j,k)) - &
-         gupyy_loc * (F2o3 * Ky(i,j,k) + EIGHT * PI * Sy(i,j,k)) - &
-         gupyz_loc * (F2o3 * Kz(i,j,k) + EIGHT * PI * Sz(i,j,k)) + &
-         Gamyxx(i,j,k) * Rxx_loc + Gamyyy(i,j,k) * Ryy_loc + Gamyzz(i,j,k) * Rzz_loc + &
-         TWO * (Gamyxy(i,j,k) * Rxy_loc + Gamyxz(i,j,k) * Rxz_loc + Gamyyz(i,j,k) * Ryz_loc))
-
-    Gamz_rhs(i,j,k) = - TWO * (Lapx(i,j,k) * Rxz_loc + Lapy(i,j,k) * Ryz_loc + Lapz(i,j,k) * Rzz_loc) + &
-         TWO * alpn1(i,j,k) * ( &
-         -F3o2/chin1(i,j,k) * (chix(i,j,k) * Rxz_loc + chiy(i,j,k) * Ryz_loc + chiz(i,j,k) * Rzz_loc) - &
-         gupxz_loc * (F2o3 * Kx(i,j,k) + EIGHT * PI * Sx(i,j,k)) - &
-         gupyz_loc * (F2o3 * Ky(i,j,k) + EIGHT * PI * Sy(i,j,k)) - &
-         gupzz_loc * (F2o3 * Kz(i,j,k) + EIGHT * PI * Sz(i,j,k)) + &
-         Gamzxx(i,j,k) * Rxx_loc + Gamzyy(i,j,k) * Ryy_loc + Gamzzz(i,j,k) * Rzz_loc + &
-         TWO * (Gamzxy(i,j,k) * Rxy_loc + Gamzxz(i,j,k) * Rxz_loc + Gamzyz(i,j,k) * Ryz_loc))
-  enddo
-  enddo
-  enddo
+  chi_rhs = F2o3 *chin1*( alpn1 * trK - div_beta ) !rhs for chi
+
+  call fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
+  call fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev)
+  call fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev)
+  call fderivs(ex,dyy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
+  call fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev)
+  call fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev)
+
+  gxx_rhs = - TWO * alpn1 * Axx    -  F2o3 * gxx * div_beta          + &
+              TWO *(  gxx * betaxx +   gxy * betayx +   gxz * betazx)
+
+  gyy_rhs = - TWO * alpn1 * Ayy    -  F2o3 * gyy * div_beta          + &
+              TWO *(  gxy * betaxy +   gyy * betayy +   gyz * betazy)
+
+  gzz_rhs = - TWO * alpn1 * Azz    -  F2o3 * gzz * div_beta          + &
+              TWO *(  gxz * betaxz +   gyz * betayz +   gzz * betazz)
+
+  gxy_rhs = - TWO * alpn1 * Axy    +  F1o3 * gxy    * div_beta       + &
+                      gxx * betaxy                  +   gxz * betazy + &
+                                       gyy * betayx +   gyz * betazx   &
+                                                    -   gxy * betazz
+
+  gyz_rhs = - TWO * alpn1 * Ayz    +  F1o3 * gyz    * div_beta       + &
+                      gxy * betaxz +   gyy * betayz                  + &
+                      gxz * betaxy                  +   gzz * betazy   &
+                                                    -   gyz * betaxx
+ 
+  gxz_rhs = - TWO * alpn1 * Axz    +  F1o3 * gxz    * div_beta       + &
+                      gxx * betaxz +   gxy * betayz                  + &
+                                       gyz * betayx +   gzz * betazx   &
+                                                    -   gxz * betayy     !rhs for gij
+
+! invert tilted metric
+  gupzz =  gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz - &
+           gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz
+  gupxx =   ( gyy * gzz - gyz * gyz ) / gupzz
+  gupxy = - ( gxy * gzz - gyz * gxz ) / gupzz
+  gupxz =   ( gxy * gyz - gyy * gxz ) / gupzz
+  gupyy =   ( gxx * gzz - gxz * gxz ) / gupzz
+  gupyz = - ( gxx * gyz - gxy * gxz ) / gupzz
+  gupzz =   ( gxx * gyy - gxy * gxy ) / gupzz
+
+  if(co == 0)then
+! Gam^i_Res = Gam^i + gup^ij_,j
+  Gmx_Res = Gamx - (gupxx*(gupxx*gxxx+gupxy*gxyx+gupxz*gxzx)&
+                   +gupxy*(gupxx*gxyx+gupxy*gyyx+gupxz*gyzx)&
+                   +gupxz*(gupxx*gxzx+gupxy*gyzx+gupxz*gzzx)&
+                   +gupxx*(gupxy*gxxy+gupyy*gxyy+gupyz*gxzy)&
+                   +gupxy*(gupxy*gxyy+gupyy*gyyy+gupyz*gyzy)&
+                   +gupxz*(gupxy*gxzy+gupyy*gyzy+gupyz*gzzy)&
+                   +gupxx*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)&
+                   +gupxy*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)&
+                   +gupxz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz))
+  Gmy_Res = Gamy - (gupxx*(gupxy*gxxx+gupyy*gxyx+gupyz*gxzx)&
+                   +gupxy*(gupxy*gxyx+gupyy*gyyx+gupyz*gyzx)&
+                   +gupxz*(gupxy*gxzx+gupyy*gyzx+gupyz*gzzx)&
+                   +gupxy*(gupxy*gxxy+gupyy*gxyy+gupyz*gxzy)&
+                   +gupyy*(gupxy*gxyy+gupyy*gyyy+gupyz*gyzy)&
+                   +gupyz*(gupxy*gxzy+gupyy*gyzy+gupyz*gzzy)&
+                   +gupxy*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)&
+                   +gupyy*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)&
+                   +gupyz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz))
+  Gmz_Res = Gamz - (gupxx*(gupxz*gxxx+gupyz*gxyx+gupzz*gxzx)&
+                   +gupxy*(gupxz*gxyx+gupyz*gyyx+gupzz*gyzx)&
+                   +gupxz*(gupxz*gxzx+gupyz*gyzx+gupzz*gzzx)&
+                   +gupxy*(gupxz*gxxy+gupyz*gxyy+gupzz*gxzy)&
+                   +gupyy*(gupxz*gxyy+gupyz*gyyy+gupzz*gyzy)&
+                   +gupyz*(gupxz*gxzy+gupyz*gyzy+gupzz*gzzy)&
+                   +gupxz*(gupxz*gxxz+gupyz*gxyz+gupzz*gxzz)&
+                   +gupyz*(gupxz*gxyz+gupyz*gyyz+gupzz*gyzz)&
+                   +gupzz*(gupxz*gxzz+gupyz*gyzz+gupzz*gzzz))
+  endif
+
+! second kind of connection
+  Gamxxx =HALF*( gupxx*gxxx + gupxy*(TWO*gxyx - gxxy ) + gupxz*(TWO*gxzx - gxxz ))
+  Gamyxx =HALF*( gupxy*gxxx + gupyy*(TWO*gxyx - gxxy ) + gupyz*(TWO*gxzx - gxxz ))
+  Gamzxx =HALF*( gupxz*gxxx + gupyz*(TWO*gxyx - gxxy ) + gupzz*(TWO*gxzx - gxxz ))
+ 
+  Gamxyy =HALF*( gupxx*(TWO*gxyy - gyyx ) + gupxy*gyyy + gupxz*(TWO*gyzy - gyyz ))
+  Gamyyy =HALF*( gupxy*(TWO*gxyy - gyyx ) + gupyy*gyyy + gupyz*(TWO*gyzy - gyyz ))
+  Gamzyy =HALF*( gupxz*(TWO*gxyy - gyyx ) + gupyz*gyyy + gupzz*(TWO*gyzy - gyyz ))
+
+  Gamxzz =HALF*( gupxx*(TWO*gxzz - gzzx ) + gupxy*(TWO*gyzz - gzzy ) + gupxz*gzzz)
+  Gamyzz =HALF*( gupxy*(TWO*gxzz - gzzx ) + gupyy*(TWO*gyzz - gzzy ) + gupyz*gzzz)
+  Gamzzz =HALF*( gupxz*(TWO*gxzz - gzzx ) + gupyz*(TWO*gyzz - gzzy ) + gupzz*gzzz)
+
+  Gamxxy =HALF*( gupxx*gxxy + gupxy*gyyx + gupxz*( gxzy + gyzx - gxyz ) )
+  Gamyxy =HALF*( gupxy*gxxy + gupyy*gyyx + gupyz*( gxzy + gyzx - gxyz ) )
+  Gamzxy =HALF*( gupxz*gxxy + gupyz*gyyx + gupzz*( gxzy + gyzx - gxyz ) )
+
+  Gamxxz =HALF*( gupxx*gxxz + gupxy*( gxyz + gyzx - gxzy ) + gupxz*gzzx )
+  Gamyxz =HALF*( gupxy*gxxz + gupyy*( gxyz + gyzx - gxzy ) + gupyz*gzzx )
+  Gamzxz =HALF*( gupxz*gxxz + gupyz*( gxyz + gyzx - gxzy ) + gupzz*gzzx )
+
+  Gamxyz =HALF*( gupxx*( gxyz + gxzy - gyzx ) + gupxy*gyyz + gupxz*gzzy )
+  Gamyyz =HALF*( gupxy*( gxyz + gxzy - gyzx ) + gupyy*gyyz + gupyz*gzzy )
+  Gamzyz =HALF*( gupxz*( gxyz + gxzy - gyzx ) + gupyz*gyyz + gupzz*gzzy )
+! Raise indices of \tilde A_{ij} and store in R_ij
+
+  Rxx =    gupxx * gupxx * Axx + gupxy * gupxy * Ayy + gupxz * gupxz * Azz + &
+      TWO*(gupxx * gupxy * Axy + gupxx * gupxz * Axz + gupxy * gupxz * Ayz)
+
+  Ryy =    gupxy * gupxy * Axx + gupyy * gupyy * Ayy + gupyz * gupyz * Azz + &
+      TWO*(gupxy * gupyy * Axy + gupxy * gupyz * Axz + gupyy * gupyz * Ayz)
+
+  Rzz =    gupxz * gupxz * Axx + gupyz * gupyz * Ayy + gupzz * gupzz * Azz + &
+      TWO*(gupxz * gupyz * Axy + gupxz * gupzz * Axz + gupyz * gupzz * Ayz)
+
+  Rxy =    gupxx * gupxy * Axx + gupxy * gupyy * Ayy + gupxz * gupyz * Azz + &
+          (gupxx * gupyy       + gupxy * gupxy)* Axy                       + &
+          (gupxx * gupyz       + gupxz * gupxy)* Axz                       + &
+          (gupxy * gupyz       + gupxz * gupyy)* Ayz
+
+  Rxz =    gupxx * gupxz * Axx + gupxy * gupyz * Ayy + gupxz * gupzz * Azz + &
+          (gupxx * gupyz       + gupxy * gupxz)* Axy                       + &
+          (gupxx * gupzz       + gupxz * gupxz)* Axz                       + &
+          (gupxy * gupzz       + gupxz * gupyz)* Ayz
+
+  Ryz =    gupxy * gupxz * Axx + gupyy * gupyz * Ayy + gupyz * gupzz * Azz + &
+          (gupxy * gupyz       + gupyy * gupxz)* Axy                       + &
+          (gupxy * gupzz       + gupyz * gupxz)* Axz                       + &
+          (gupyy * gupzz       + gupyz * gupyz)* Ayz
+
+! Right hand side for Gam^i without shift terms...
+  call fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
+  call fderivs(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,symmetry,Lev)
+
+   Gamx_rhs = - TWO * (   Lapx * Rxx +   Lapy * Rxy +   Lapz * Rxz ) + &
+        TWO * alpn1 * (                                                &
+        -F3o2/chin1 * (   chix * Rxx +   chiy * Rxy +   chiz * Rxz ) - &
+              gupxx * (   F2o3 * Kx  +  EIGHT * PI * Sx            ) - &
+              gupxy * (   F2o3 * Ky  +  EIGHT * PI * Sy            ) - &
+              gupxz * (   F2o3 * Kz  +  EIGHT * PI * Sz            ) + &
+                        Gamxxx * Rxx + Gamxyy * Ryy + Gamxzz * Rzz   + &
+                TWO * ( Gamxxy * Rxy + Gamxxz * Rxz + Gamxyz * Ryz ) )
+
+   Gamy_rhs = - TWO * (   Lapx * Rxy +   Lapy * Ryy +   Lapz * Ryz ) + &
+        TWO * alpn1 * (                                                &
+        -F3o2/chin1 * (   chix * Rxy +  chiy * Ryy +    chiz * Ryz ) - &
+              gupxy * (   F2o3 * Kx  +  EIGHT * PI * Sx            ) - &
+              gupyy * (   F2o3 * Ky  +  EIGHT * PI * Sy            ) - &
+              gupyz * (   F2o3 * Kz  +  EIGHT * PI * Sz            ) + &
+                        Gamyxx * Rxx + Gamyyy * Ryy + Gamyzz * Rzz   + &
+                TWO * ( Gamyxy * Rxy + Gamyxz * Rxz + Gamyyz * Ryz ) )
+
+   Gamz_rhs = - TWO * (   Lapx * Rxz +   Lapy * Ryz +   Lapz * Rzz ) + &
+        TWO * alpn1 * (                                                &
+        -F3o2/chin1 * (   chix * Rxz +  chiy * Ryz +    chiz * Rzz ) - &
+              gupxz * (   F2o3 * Kx  +  EIGHT * PI * Sx            ) - &
+              gupyz * (   F2o3 * Ky  +  EIGHT * PI * Sy            ) - &
+              gupzz * (   F2o3 * Kz  +  EIGHT * PI * Sz            ) + &
+                        Gamzxx * Rxx + Gamzyy * Ryy + Gamzzz * Rzz   + &
+                TWO * ( Gamzxy * Rxy + Gamzxz * Rxz + Gamzyz * Ryz ) )

  call fdderivs(ex,betax,gxxx,gxyx,gxzx,gyyx,gyzx,gzzx,&
                X,Y,Z,ANTI,SYM, SYM ,Symmetry,Lev)
@@ -359,54 +321,38 @@
  call fdderivs(ex,betaz,gxxz,gxyz,gxzz,gyyz,gyzz,gzzz,&
                X,Y,Z,SYM ,SYM, ANTI,Symmetry,Lev)

-  call fderivs(ex,Gamx,Gamxx,Gamxy,Gamxz,X,Y,Z,ANTI,SYM ,SYM ,Symmetry,Lev)
-  call fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev)
-  call fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev)
-  do k=1,ex(3)
-  do j=1,ex(2)
-  do i=1,ex(1)
-    divb_loc = div_beta(i,j,k)
-    fxx_loc = gxxx(i,j,k) + gxyy(i,j,k) + gxzz(i,j,k)
-    fxy_loc = gxyx(i,j,k) + gyyy(i,j,k) + gyzz(i,j,k)
-    fxz_loc = gxzx(i,j,k) + gyzy(i,j,k) + gzzz(i,j,k)
-
-    gupxx_loc = gupxx(i,j,k)
-    gupxy_loc = gupxy(i,j,k)
-    gupxz_loc = gupxz(i,j,k)
-    gupyy_loc = gupyy(i,j,k)
-    gupyz_loc = gupyz(i,j,k)
-    gupzz_loc = gupzz(i,j,k)
-
-    Gamxa_loc = gupxx_loc * Gamxxx(i,j,k) + gupyy_loc * Gamxyy(i,j,k) + gupzz_loc * Gamxzz(i,j,k) + &
-         TWO * (gupxy_loc * Gamxxy(i,j,k) + gupxz_loc * Gamxxz(i,j,k) + gupyz_loc * Gamxyz(i,j,k))
-    Gamya_loc = gupxx_loc * Gamyxx(i,j,k) + gupyy_loc * Gamyyy(i,j,k) + gupzz_loc * Gamyzz(i,j,k) + &
-         TWO * (gupxy_loc * Gamyxy(i,j,k) + gupxz_loc * Gamyxz(i,j,k) + gupyz_loc * Gamyyz(i,j,k))
-    Gamza_loc = gupxx_loc * Gamzxx(i,j,k) + gupyy_loc * Gamzyy(i,j,k) + gupzz_loc * Gamzzz(i,j,k) + &
-         TWO * (gupxy_loc * Gamzxy(i,j,k) + gupxz_loc * Gamzxz(i,j,k) + gupyz_loc * Gamzyz(i,j,k))
-    Gamxa(i,j,k) = Gamxa_loc
-    Gamya(i,j,k) = Gamya_loc
-    Gamza(i,j,k) = Gamza_loc
-
-    Gamx_rhs(i,j,k) = Gamx_rhs(i,j,k) + F2o3 * Gamxa_loc * divb_loc - &
-         Gamxa_loc * betaxx(i,j,k) - Gamya_loc * betaxy(i,j,k) - Gamza_loc * betaxz(i,j,k) + &
-         F1o3 * (gupxx_loc * fxx_loc + gupxy_loc * fxy_loc + gupxz_loc * fxz_loc) + &
-         gupxx_loc * gxxx(i,j,k) + gupyy_loc * gyyx(i,j,k) + gupzz_loc * gzzx(i,j,k) + &
-         TWO * (gupxy_loc * gxyx(i,j,k) + gupxz_loc * gxzx(i,j,k) + gupyz_loc * gyzx(i,j,k))
-
-    Gamy_rhs(i,j,k) = Gamy_rhs(i,j,k) + F2o3 * Gamya_loc * divb_loc - &
-         Gamxa_loc * betayx(i,j,k) - Gamya_loc * betayy(i,j,k) - Gamza_loc * betayz(i,j,k) + &
-         F1o3 * (gupxy_loc * fxx_loc + gupyy_loc * fxy_loc + gupyz_loc * fxz_loc) + &
-         gupxx_loc * gxxy(i,j,k) + gupyy_loc * gyyy(i,j,k) + gupzz_loc * gzzy(i,j,k) + &
-         TWO * (gupxy_loc * gxyy(i,j,k) + gupxz_loc * gxzy(i,j,k) + gupyz_loc * gyzy(i,j,k))
-
-    Gamz_rhs(i,j,k) = Gamz_rhs(i,j,k) + F2o3 * Gamza_loc * divb_loc - &
-         Gamxa_loc * betazx(i,j,k) - Gamya_loc * betazy(i,j,k) - Gamza_loc * betazz(i,j,k) + &
-         F1o3 * (gupxz_loc * fxx_loc + gupyz_loc * fxy_loc + gupzz_loc * fxz_loc) + &
-         gupxx_loc * gxxz(i,j,k) + gupyy_loc * gyyz(i,j,k) + gupzz_loc * gzzz(i,j,k) + &
-         TWO * (gupxy_loc * gxyz(i,j,k) + gupxz_loc * gxzz(i,j,k) + gupyz_loc * gyzz(i,j,k))
-  enddo
-  enddo
-  enddo
+  fxx = gxxx + gxyy + gxzz
+  fxy = gxyx + gyyy + gyzz
+  fxz = gxzx + gyzy + gzzz
+
+  Gamxa =       gupxx * Gamxxx + gupyy * Gamxyy + gupzz * Gamxzz + &
+          TWO*( gupxy * Gamxxy + gupxz * Gamxxz + gupyz * Gamxyz )
+  Gamya =       gupxx * Gamyxx + gupyy * Gamyyy + gupzz * Gamyzz + &
+          TWO*( gupxy * Gamyxy + gupxz * Gamyxz + gupyz * Gamyyz )
+  Gamza =       gupxx * Gamzxx + gupyy * Gamzyy + gupzz * Gamzzz + &
+          TWO*( gupxy * Gamzxy + gupxz * Gamzxz + gupyz * Gamzyz )
+
+  call fderivs(ex,Gamx,Gamxx,Gamxy,Gamxz,X,Y,Z,ANTI,SYM ,SYM ,Symmetry,Lev)
+  call fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev)
+  call fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev)
+
+  Gamx_rhs =               Gamx_rhs +  F2o3 *  Gamxa * div_beta        - &
+                     Gamxa * betaxx - Gamya * betaxy - Gamza * betaxz  + &
+             F1o3 * (gupxx * fxx    + gupxy * fxy    + gupxz * fxz    ) + &
+                     gupxx * gxxx   + gupyy * gyyx   + gupzz * gzzx    + &
+              TWO * (gupxy * gxyx   + gupxz * gxzx   + gupyz * gyzx  )
+
+  Gamy_rhs =               Gamy_rhs +  F2o3 *  Gamya * div_beta        - &
+                     Gamxa * betayx - Gamya * betayy - Gamza * betayz  + &
+             F1o3 * (gupxy * fxx    + gupyy * fxy    + gupyz * fxz    ) + &
+                     gupxx * gxxy   + gupyy * gyyy   + gupzz * gzzy    + &
+              TWO * (gupxy * gxyy   + gupxz * gxzy   + gupyz * gyzy  )
+
+  Gamz_rhs =               Gamz_rhs +  F2o3 *  Gamza * div_beta        - &
+                     Gamxa * betazx - Gamya * betazy - Gamza * betazz  + &
+             F1o3 * (gupxz * fxx    + gupyz * fxy    + gupzz * fxz    ) + &
+                     gupxx * gxxz   + gupyy * gyyz   + gupzz * gzzz    + &
+              TWO * (gupxy * gxyz   + gupxz * gxzz   + gupyz * gyzz  )    !rhs for Gam^i

 !first kind of connection stored in gij,k
  gxxx = gxx * Gamxxx + gxy * Gamyxx + gxz * Gamzxx
@@ -655,190 +601,192 @@
            Gamxyz * gxzz + Gamyyz * gyzz + Gamzyz * gzzz  + &
            Gamxzz * gxzy + Gamyzz * gyzy + Gamzzz * gzzy  + &
            Gamxyz * gzzx + Gamyyz * gzzy + Gamzyz * gzzz )
-!covariant second derivative of chi respect to tilted metric
-  call fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
-
-  do k=1,ex(3)
-  do j=1,ex(2)
-  do i=1,ex(1)
-    fxx(i,j,k) = fxx(i,j,k) - Gamxxx(i,j,k) * chix(i,j,k) - Gamyxx(i,j,k) * chiy(i,j,k) - Gamzxx(i,j,k) * chiz(i,j,k)
-    fxy(i,j,k) = fxy(i,j,k) - Gamxxy(i,j,k) * chix(i,j,k) - Gamyxy(i,j,k) * chiy(i,j,k) - Gamzxy(i,j,k) * chiz(i,j,k)
-    fxz(i,j,k) = fxz(i,j,k) - Gamxxz(i,j,k) * chix(i,j,k) - Gamyxz(i,j,k) * chiy(i,j,k) - Gamzxz(i,j,k) * chiz(i,j,k)
-    fyy(i,j,k) = fyy(i,j,k) - Gamxyy(i,j,k) * chix(i,j,k) - Gamyyy(i,j,k) * chiy(i,j,k) - Gamzyy(i,j,k) * chiz(i,j,k)
-    fyz(i,j,k) = fyz(i,j,k) - Gamxyz(i,j,k) * chix(i,j,k) - Gamyyz(i,j,k) * chiy(i,j,k) - Gamzyz(i,j,k) * chiz(i,j,k)
-    fzz(i,j,k) = fzz(i,j,k) - Gamxzz(i,j,k) * chix(i,j,k) - Gamyzz(i,j,k) * chiy(i,j,k) - Gamzzz(i,j,k) * chiz(i,j,k)
-
-    chin_loc = chin1(i,j,k)
-    f_loc = gupxx(i,j,k) * (fxx(i,j,k) - F3o2/chin_loc * chix(i,j,k) * chix(i,j,k)) + &
-            gupyy(i,j,k) * (fyy(i,j,k) - F3o2/chin_loc * chiy(i,j,k) * chiy(i,j,k)) + &
-            gupzz(i,j,k) * (fzz(i,j,k) - F3o2/chin_loc * chiz(i,j,k) * chiz(i,j,k)) + &
-            TWO * gupxy(i,j,k) * (fxy(i,j,k) - F3o2/chin_loc * chix(i,j,k) * chiy(i,j,k)) + &
-            TWO * gupxz(i,j,k) * (fxz(i,j,k) - F3o2/chin_loc * chix(i,j,k) * chiz(i,j,k)) + &
-            TWO * gupyz(i,j,k) * (fyz(i,j,k) - F3o2/chin_loc * chiy(i,j,k) * chiz(i,j,k))
-    f(i,j,k) = f_loc
-
-    Rxx(i,j,k) = Rxx(i,j,k) + (fxx(i,j,k) - chix(i,j,k)*chix(i,j,k)/chin_loc/TWO + gxx(i,j,k) * f_loc)/chin_loc/TWO
-    Ryy(i,j,k) = Ryy(i,j,k) + (fyy(i,j,k) - chiy(i,j,k)*chiy(i,j,k)/chin_loc/TWO + gyy(i,j,k) * f_loc)/chin_loc/TWO
-    Rzz(i,j,k) = Rzz(i,j,k) + (fzz(i,j,k) - chiz(i,j,k)*chiz(i,j,k)/chin_loc/TWO + gzz(i,j,k) * f_loc)/chin_loc/TWO
-    Rxy(i,j,k) = Rxy(i,j,k) + (fxy(i,j,k) - chix(i,j,k)*chiy(i,j,k)/chin_loc/TWO + gxy(i,j,k) * f_loc)/chin_loc/TWO
-    Rxz(i,j,k) = Rxz(i,j,k) + (fxz(i,j,k) - chix(i,j,k)*chiz(i,j,k)/chin_loc/TWO + gxz(i,j,k) * f_loc)/chin_loc/TWO
-    Ryz(i,j,k) = Ryz(i,j,k) + (fyz(i,j,k) - chiy(i,j,k)*chiz(i,j,k)/chin_loc/TWO + gyz(i,j,k) * f_loc)/chin_loc/TWO
-  enddo
-  enddo
-  enddo
-
-! covariant second derivatives of the lapse respect to physical metric
-  call fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z, &
-                SYM,SYM,SYM,symmetry,Lev)
-
-  do k=1,ex(3)
-  do j=1,ex(2)
-  do i=1,ex(1)
-    chin_loc = chin1(i,j,k)
-    gxxx(i,j,k) = (gupxx(i,j,k) * chix(i,j,k) + gupxy(i,j,k) * chiy(i,j,k) + gupxz(i,j,k) * chiz(i,j,k)) / chin_loc
-    gxxy(i,j,k) = (gupxy(i,j,k) * chix(i,j,k) + gupyy(i,j,k) * chiy(i,j,k) + gupyz(i,j,k) * chiz(i,j,k)) / chin_loc
-    gxxz(i,j,k) = (gupxz(i,j,k) * chix(i,j,k) + gupyz(i,j,k) * chiy(i,j,k) + gupzz(i,j,k) * chiz(i,j,k)) / chin_loc
-
-    Gamxxx(i,j,k) = Gamxxx(i,j,k) - ( (chix(i,j,k) + chix(i,j,k))/chin_loc - gxx(i,j,k) * gxxx(i,j,k) )*HALF
-    Gamyxx(i,j,k) = Gamyxx(i,j,k) - (                                   - gxx(i,j,k) * gxxy(i,j,k) )*HALF
-    Gamzxx(i,j,k) = Gamzxx(i,j,k) - (                                   - gxx(i,j,k) * gxxz(i,j,k) )*HALF
-    Gamxyy(i,j,k) = Gamxyy(i,j,k) - (                                   - gyy(i,j,k) * gxxx(i,j,k) )*HALF
-    Gamyyy(i,j,k) = Gamyyy(i,j,k) - ( (chiy(i,j,k) + chiy(i,j,k))/chin_loc - gyy(i,j,k) * gxxy(i,j,k) )*HALF
-    Gamzyy(i,j,k) = Gamzyy(i,j,k) - (                                   - gyy(i,j,k) * gxxz(i,j,k) )*HALF
-    Gamxzz(i,j,k) = Gamxzz(i,j,k) - (                                   - gzz(i,j,k) * gxxx(i,j,k) )*HALF
-    Gamyzz(i,j,k) = Gamyzz(i,j,k) - (                                   - gzz(i,j,k) * gxxy(i,j,k) )*HALF
-    Gamzzz(i,j,k) = Gamzzz(i,j,k) - ( (chiz(i,j,k) + chiz(i,j,k))/chin_loc - gzz(i,j,k) * gxxz(i,j,k) )*HALF
-    Gamxxy(i,j,k) = Gamxxy(i,j,k) - ( chiy(i,j,k) /chin_loc - gxy(i,j,k) * gxxx(i,j,k) )*HALF
-    Gamyxy(i,j,k) = Gamyxy(i,j,k) - ( chix(i,j,k) /chin_loc - gxy(i,j,k) * gxxy(i,j,k) )*HALF
-    Gamzxy(i,j,k) = Gamzxy(i,j,k) - (                     - gxy(i,j,k) * gxxz(i,j,k) )*HALF
-    Gamxxz(i,j,k) = Gamxxz(i,j,k) - ( chiz(i,j,k) /chin_loc - gxz(i,j,k) * gxxx(i,j,k) )*HALF
-    Gamyxz(i,j,k) = Gamyxz(i,j,k) - (                     - gxz(i,j,k) * gxxy(i,j,k) )*HALF
-    Gamzxz(i,j,k) = Gamzxz(i,j,k) - ( chix(i,j,k) /chin_loc - gxz(i,j,k) * gxxz(i,j,k) )*HALF
-    Gamxyz(i,j,k) = Gamxyz(i,j,k) - (                     - gyz(i,j,k) * gxxx(i,j,k) )*HALF
-    Gamyyz(i,j,k) = Gamyyz(i,j,k) - ( chiz(i,j,k) /chin_loc - gyz(i,j,k) * gxxy(i,j,k) )*HALF
-    Gamzyz(i,j,k) = Gamzyz(i,j,k) - ( chiy(i,j,k) /chin_loc - gyz(i,j,k) * gxxz(i,j,k) )*HALF
-
-    fxx(i,j,k) = fxx(i,j,k) - Gamxxx(i,j,k)*Lapx(i,j,k) - Gamyxx(i,j,k)*Lapy(i,j,k) - Gamzxx(i,j,k)*Lapz(i,j,k)
-    fyy(i,j,k) = fyy(i,j,k) - Gamxyy(i,j,k)*Lapx(i,j,k) - Gamyyy(i,j,k)*Lapy(i,j,k) - Gamzyy(i,j,k)*Lapz(i,j,k)
-    fzz(i,j,k) = fzz(i,j,k) - Gamxzz(i,j,k)*Lapx(i,j,k) - Gamyzz(i,j,k)*Lapy(i,j,k) - Gamzzz(i,j,k)*Lapz(i,j,k)
-    fxy(i,j,k) = fxy(i,j,k) - Gamxxy(i,j,k)*Lapx(i,j,k) - Gamyxy(i,j,k)*Lapy(i,j,k) - Gamzxy(i,j,k)*Lapz(i,j,k)
-    fxz(i,j,k) = fxz(i,j,k) - Gamxxz(i,j,k)*Lapx(i,j,k) - Gamyxz(i,j,k)*Lapy(i,j,k) - Gamzxz(i,j,k)*Lapz(i,j,k)
-    fyz(i,j,k) = fyz(i,j,k) - Gamxyz(i,j,k)*Lapx(i,j,k) - Gamyyz(i,j,k)*Lapy(i,j,k) - Gamzyz(i,j,k)*Lapz(i,j,k)
-
-    trK_rhs(i,j,k) = gupxx(i,j,k) * fxx(i,j,k) + gupyy(i,j,k) * fyy(i,j,k) + gupzz(i,j,k) * fzz(i,j,k) + &
-                     TWO * (gupxy(i,j,k) * fxy(i,j,k) + gupxz(i,j,k) * fxz(i,j,k) + gupyz(i,j,k) * fyz(i,j,k))
-  enddo
-  enddo
-  enddo
-  do k=1,ex(3)
-  do j=1,ex(2)
-  do i=1,ex(1)
-    divb_loc = div_beta(i,j,k)
-    chin_loc = chin1(i,j,k)
-
-    S_loc = chin_loc * ( gupxx(i,j,k) * Sxx(i,j,k) + gupyy(i,j,k) * Syy(i,j,k) + gupzz(i,j,k) * Szz(i,j,k) + &
-           TWO * (gupxy(i,j,k) * Sxy(i,j,k) + gupxz(i,j,k) * Sxz(i,j,k) + gupyz(i,j,k) * Syz(i,j,k)) )
-    S(i,j,k) = S_loc
-
-    f_loc = F2o3 * trK(i,j,k) * trK(i,j,k) - ( &
-            gupxx(i,j,k) * ( gupxx(i,j,k) * Axx(i,j,k) * Axx(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + &
-                             gupzz(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + &
-                             TWO * (gupxy(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + gupxz(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + &
-                                    gupyz(i,j,k) * Axy(i,j,k) * Axz(i,j,k)) ) + &
-            gupyy(i,j,k) * ( gupxx(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayy(i,j,k) + &
-                             gupzz(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + &
-                             TWO * (gupxy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + gupxz(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + &
-                                    gupyz(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k)) ) + &
-            gupzz(i,j,k) * ( gupxx(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + &
-                             gupzz(i,j,k) * Azz(i,j,k) * Azz(i,j,k) + &
-                             TWO * (gupxy(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + gupxz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + &
-                                    gupyz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k)) ) + &
-            TWO * ( gupxy(i,j,k) * ( gupxx(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + &
-                                     gupzz(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + &
-                                     gupxy(i,j,k) * (Axx(i,j,k) * Ayy(i,j,k) + Axy(i,j,k) * Axy(i,j,k)) + &
-                                     gupxz(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Axy(i,j,k)) + &
-                                     gupyz(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Ayy(i,j,k)) ) + &
-                    gupxz(i,j,k) * ( gupxx(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + &
-                                     gupzz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + &
-                                     gupxy(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axy(i,j,k) * Axz(i,j,k)) + &
-                                     gupxz(i,j,k) * (Axx(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Axz(i,j,k)) + &
-                                     gupyz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Ayz(i,j,k)) ) + &
-                    gupyz(i,j,k) * ( gupxx(i,j,k) * Axy(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k) + &
-                                     gupzz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k) + &
-                                     gupxy(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Ayy(i,j,k) * Axz(i,j,k)) + &
-                                     gupxz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Axz(i,j,k)) + &
-                                     gupyz(i,j,k) * (Ayy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Ayz(i,j,k)) ) ) ) - &
-            F16 * PI * rho(i,j,k) + EIGHT * PI * S_loc
-
-    f_loc = -F1o3 * ( gupxx(i,j,k) * fxx(i,j,k) + gupyy(i,j,k) * fyy(i,j,k) + gupzz(i,j,k) * fzz(i,j,k) + &
-            TWO * (gupxy(i,j,k) * fxy(i,j,k) + gupxz(i,j,k) * fxz(i,j,k) + gupyz(i,j,k) * fyz(i,j,k)) + &
-            alpn1(i,j,k)/chin_loc * f_loc )
-    f(i,j,k) = f_loc
-
-    l_fxx = alpn1(i,j,k) * (Rxx(i,j,k) - EIGHT * PI * Sxx(i,j,k)) - fxx(i,j,k)
-    l_fxy = alpn1(i,j,k) * (Rxy(i,j,k) - EIGHT * PI * Sxy(i,j,k)) - fxy(i,j,k)
-    l_fxz = alpn1(i,j,k) * (Rxz(i,j,k) - EIGHT * PI * Sxz(i,j,k)) - fxz(i,j,k)
-    l_fyy = alpn1(i,j,k) * (Ryy(i,j,k) - EIGHT * PI * Syy(i,j,k)) - fyy(i,j,k)
-    l_fyz = alpn1(i,j,k) * (Ryz(i,j,k) - EIGHT * PI * Syz(i,j,k)) - fyz(i,j,k)
-    l_fzz = alpn1(i,j,k) * (Rzz(i,j,k) - EIGHT * PI * Szz(i,j,k)) - fzz(i,j,k)
-
-    Axx_rhs(i,j,k) = l_fxx - gxx(i,j,k) * f_loc
-    Ayy_rhs(i,j,k) = l_fyy - gyy(i,j,k) * f_loc
-    Azz_rhs(i,j,k) = l_fzz - gzz(i,j,k) * f_loc
-    Axy_rhs(i,j,k) = l_fxy - gxy(i,j,k) * f_loc
-    Axz_rhs(i,j,k) = l_fxz - gxz(i,j,k) * f_loc
-    Ayz_rhs(i,j,k) = l_fyz - gyz(i,j,k) * f_loc
-
-    fxx(i,j,k) = gupxx(i,j,k) * Axx(i,j,k) * Axx(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + &
-                 gupzz(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + TWO * (gupxy(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + &
-                 gupxz(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + gupyz(i,j,k) * Axy(i,j,k) * Axz(i,j,k))
-    fyy(i,j,k) = gupxx(i,j,k) * Axy(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayy(i,j,k) + &
-                 gupzz(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + TWO * (gupxy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + &
-                 gupxz(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + gupyz(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k))
-    fzz(i,j,k) = gupxx(i,j,k) * Axz(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayz(i,j,k) * Ayz(i,j,k) + &
-                 gupzz(i,j,k) * Azz(i,j,k) * Azz(i,j,k) + TWO * (gupxy(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + &
-                 gupxz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + gupyz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k))
-    fxy(i,j,k) = gupxx(i,j,k) * Axx(i,j,k) * Axy(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayy(i,j,k) + &
-                 gupzz(i,j,k) * Axz(i,j,k) * Ayz(i,j,k) + gupxy(i,j,k) * (Axx(i,j,k) * Ayy(i,j,k) + Axy(i,j,k) * Axy(i,j,k)) + &
-                 gupxz(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Axy(i,j,k)) + &
-                 gupyz(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Axz(i,j,k) * Ayy(i,j,k))
-    fxz(i,j,k) = gupxx(i,j,k) * Axx(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Axy(i,j,k) * Ayz(i,j,k) + &
-                 gupzz(i,j,k) * Axz(i,j,k) * Azz(i,j,k) + gupxy(i,j,k) * (Axx(i,j,k) * Ayz(i,j,k) + Axy(i,j,k) * Axz(i,j,k)) + &
-                 gupxz(i,j,k) * (Axx(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Axz(i,j,k)) + &
-                 gupyz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Axz(i,j,k) * Ayz(i,j,k))
-    fyz(i,j,k) = gupxx(i,j,k) * Axy(i,j,k) * Axz(i,j,k) + gupyy(i,j,k) * Ayy(i,j,k) * Ayz(i,j,k) + &
-                 gupzz(i,j,k) * Ayz(i,j,k) * Azz(i,j,k) + gupxy(i,j,k) * (Axy(i,j,k) * Ayz(i,j,k) + Ayy(i,j,k) * Axz(i,j,k)) + &
-                 gupxz(i,j,k) * (Axy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Axz(i,j,k)) + &
-                 gupyz(i,j,k) * (Ayy(i,j,k) * Azz(i,j,k) + Ayz(i,j,k) * Ayz(i,j,k))
-
-    trK_rhs(i,j,k) = chin_loc * trK_rhs(i,j,k)
-
-    Axx_rhs(i,j,k) = chin_loc * Axx_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Axx(i,j,k) - TWO * fxx(i,j,k)) + &
-                     TWO * (Axx(i,j,k) * betaxx(i,j,k) + Axy(i,j,k) * betayx(i,j,k) + Axz(i,j,k) * betazx(i,j,k)) - &
-                     F2o3 * Axx(i,j,k) * divb_loc
-    Ayy_rhs(i,j,k) = chin_loc * Ayy_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Ayy(i,j,k) - TWO * fyy(i,j,k)) + &
-                     TWO * (Axy(i,j,k) * betaxy(i,j,k) + Ayy(i,j,k) * betayy(i,j,k) + Ayz(i,j,k) * betazy(i,j,k)) - &
-                     F2o3 * Ayy(i,j,k) * divb_loc
-    Azz_rhs(i,j,k) = chin_loc * Azz_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Azz(i,j,k) - TWO * fzz(i,j,k)) + &
-                     TWO * (Axz(i,j,k) * betaxz(i,j,k) + Ayz(i,j,k) * betayz(i,j,k) + Azz(i,j,k) * betazz(i,j,k)) - &
-                     F2o3 * Azz(i,j,k) * divb_loc
-    Axy_rhs(i,j,k) = chin_loc * Axy_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Axy(i,j,k) - TWO * fxy(i,j,k)) + &
-                     Axx(i,j,k) * betaxy(i,j,k) + Axz(i,j,k) * betazy(i,j,k) + Ayy(i,j,k) * betayx(i,j,k) + &
-                     Ayz(i,j,k) * betazx(i,j,k) + F1o3 * Axy(i,j,k) * divb_loc - Axy(i,j,k) * betazz(i,j,k)
-    Ayz_rhs(i,j,k) = chin_loc * Ayz_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Ayz(i,j,k) - TWO * fyz(i,j,k)) + &
-                     Axy(i,j,k) * betaxz(i,j,k) + Ayy(i,j,k) * betayz(i,j,k) + Axz(i,j,k) * betaxy(i,j,k) + &
-                     Azz(i,j,k) * betazy(i,j,k) + F1o3 * Ayz(i,j,k) * divb_loc - Ayz(i,j,k) * betaxx(i,j,k)
-    Axz_rhs(i,j,k) = chin_loc * Axz_rhs(i,j,k) + alpn1(i,j,k) * (trK(i,j,k) * Axz(i,j,k) - TWO * fxz(i,j,k)) + &
-                     Axx(i,j,k) * betaxz(i,j,k) + Axy(i,j,k) * betayz(i,j,k) + Ayz(i,j,k) * betayx(i,j,k) + &
-                     Azz(i,j,k) * betazx(i,j,k) + F1o3 * Axz(i,j,k) * divb_loc - Axz(i,j,k) * betayy(i,j,k)
-
-    trK_rhs(i,j,k) = - trK_rhs(i,j,k) + alpn1(i,j,k) * ( F1o3 * trK(i,j,k) * trK(i,j,k) + &
-                    gupxx(i,j,k) * fxx(i,j,k) + gupyy(i,j,k) * fyy(i,j,k) + gupzz(i,j,k) * fzz(i,j,k) + &
-                    TWO * (gupxy(i,j,k) * fxy(i,j,k) + gupxz(i,j,k) * fxz(i,j,k) + gupyz(i,j,k) * fyz(i,j,k)) + &
-                    FOUR * PI * (rho(i,j,k) + S_loc) )
-  enddo
-  enddo
-  enddo
+!covariant second derivative of chi with respect to the tilde metric
+  call fdderivs(ex,chi,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev)
+
+  fxx = fxx - Gamxxx * chix - Gamyxx * chiy - Gamzxx * chiz
+  fxy = fxy - Gamxxy * chix - Gamyxy * chiy - Gamzxy * chiz
+  fxz = fxz - Gamxxz * chix - Gamyxz * chiy - Gamzxz * chiz
+  fyy = fyy - Gamxyy * chix - Gamyyy * chiy - Gamzyy * chiz
+  fyz = fyz - Gamxyz * chix - Gamyyz * chiy - Gamzyz * chiz
+  fzz = fzz - Gamxzz * chix - Gamyzz * chiy - Gamzzz * chiz
+! Store D^l D_l chi - 3/(2*chi) D^l chi D_l chi in f
+
+  f =        gupxx * ( fxx - F3o2/chin1 * chix * chix ) + &
+             gupyy * ( fyy - F3o2/chin1 * chiy * chiy ) + &
+             gupzz * ( fzz - F3o2/chin1 * chiz * chiz ) + &
+       TWO * gupxy * ( fxy - F3o2/chin1 * chix * chiy ) + &
+       TWO * gupxz * ( fxz - F3o2/chin1 * chix * chiz ) + &
+       TWO * gupyz * ( fyz - F3o2/chin1 * chiy * chiz ) 
+! Add chi part to Ricci tensor:
+
+  Rxx = Rxx + (fxx - chix*chix/chin1/TWO + gxx * f)/chin1/TWO
+  Ryy = Ryy + (fyy - chiy*chiy/chin1/TWO + gyy * f)/chin1/TWO
+  Rzz = Rzz + (fzz - chiz*chiz/chin1/TWO + gzz * f)/chin1/TWO
+  Rxy = Rxy + (fxy - chix*chiy/chin1/TWO + gxy * f)/chin1/TWO
+  Rxz = Rxz + (fxz - chix*chiz/chin1/TWO + gxz * f)/chin1/TWO
+  Ryz = Ryz + (fyz - chiy*chiz/chin1/TWO + gyz * f)/chin1/TWO
+
+! covariant second derivatives of the lapse with respect to the physical metric
+  call fdderivs(ex,Lap,fxx,fxy,fxz,fyy,fyz,fzz,X,Y,Z, &
+                SYM,SYM,SYM,symmetry,Lev)
+
+  gxxx = (gupxx * chix + gupxy * chiy + gupxz * chiz)/chin1
+  gxxy = (gupxy * chix + gupyy * chiy + gupyz * chiz)/chin1
+  gxxz = (gupxz * chix + gupyz * chiy + gupzz * chiz)/chin1
+! now get the physical connection of the second kind
+  Gamxxx = Gamxxx - ( (chix + chix)/chin1 - gxx * gxxx )*HALF
+  Gamyxx = Gamyxx - (                     - gxx * gxxy )*HALF
+  Gamzxx = Gamzxx - (                     - gxx * gxxz )*HALF
+  Gamxyy = Gamxyy - (                     - gyy * gxxx )*HALF
+  Gamyyy = Gamyyy - ( (chiy + chiy)/chin1 - gyy * gxxy )*HALF
+  Gamzyy = Gamzyy - (                     - gyy * gxxz )*HALF
+  Gamxzz = Gamxzz - (                     - gzz * gxxx )*HALF
+  Gamyzz = Gamyzz - (                     - gzz * gxxy )*HALF
+  Gamzzz = Gamzzz - ( (chiz + chiz)/chin1 - gzz * gxxz )*HALF
+  Gamxxy = Gamxxy - (  chiy        /chin1 - gxy * gxxx )*HALF
+  Gamyxy = Gamyxy - (         chix /chin1 - gxy * gxxy )*HALF
+  Gamzxy = Gamzxy - (                     - gxy * gxxz )*HALF
+  Gamxxz = Gamxxz - (  chiz        /chin1 - gxz * gxxx )*HALF
+  Gamyxz = Gamyxz - (                     - gxz * gxxy )*HALF
+  Gamzxz = Gamzxz - (         chix /chin1 - gxz * gxxz )*HALF
+  Gamxyz = Gamxyz - (                     - gyz * gxxx )*HALF
+  Gamyyz = Gamyyz - (  chiz        /chin1 - gyz * gxxy )*HALF
+  Gamzyz = Gamzyz - (         chiy /chin1 - gyz * gxxz )*HALF
+
+  fxx = fxx - Gamxxx*Lapx - Gamyxx*Lapy - Gamzxx*Lapz
+  fyy = fyy - Gamxyy*Lapx - Gamyyy*Lapy - Gamzyy*Lapz
+  fzz = fzz - Gamxzz*Lapx - Gamyzz*Lapy - Gamzzz*Lapz
+  fxy = fxy - Gamxxy*Lapx - Gamyxy*Lapy - Gamzxy*Lapz
+  fxz = fxz - Gamxxz*Lapx - Gamyxz*Lapy - Gamzxz*Lapz
+  fyz = fyz - Gamxyz*Lapx - Gamyyz*Lapy - Gamzyz*Lapz
+
+! store D^i D_i Lap in trK_rhs up to a factor of chi
+  trK_rhs =    gupxx * fxx + gupyy * fyy + gupzz * fzz + &
+        TWO* ( gupxy * fxy + gupxz * fxz + gupyz * fyz )
+#if 1        
+!! follow bam code
+  S =  chin1 * ( gupxx * Sxx + gupyy * Syy + gupzz * Szz + &
+     TWO * ( gupxy * Sxy + gupxz * Sxz + gupyz * Syz ) )
+  f = F2o3 * trK * trK -(&
+       gupxx * ( &
+       gupxx * Axx * Axx + gupyy * Axy * Axy + gupzz * Axz * Axz + &
+       TWO * (gupxy * Axx * Axy + gupxz * Axx * Axz + gupyz * Axy * Axz) ) + &
+       gupyy * ( &
+       gupxx * Axy * Axy + gupyy * Ayy * Ayy + gupzz * Ayz * Ayz + &
+       TWO * (gupxy * Axy * Ayy + gupxz * Axy * Ayz + gupyz * Ayy * Ayz) ) + &
+       gupzz * ( &
+       gupxx * Axz * Axz + gupyy * Ayz * Ayz + gupzz * Azz * Azz + &
+       TWO * (gupxy * Axz * Ayz + gupxz * Axz * Azz + gupyz * Ayz * Azz) ) + &
+       TWO * ( &
+       gupxy * ( &
+       gupxx * Axx * Axy + gupyy * Axy * Ayy + gupzz * Axz * Ayz + &
+       gupxy * (Axx * Ayy + Axy * Axy) + &
+       gupxz * (Axx * Ayz + Axz * Axy) + &
+       gupyz * (Axy * Ayz + Axz * Ayy) ) + &
+       gupxz * ( &
+       gupxx * Axx * Axz + gupyy * Axy * Ayz + gupzz * Axz * Azz + &
+       gupxy * (Axx * Ayz + Axy * Axz) + &
+       gupxz * (Axx * Azz + Axz * Axz) + &
+       gupyz * (Axy * Azz + Axz * Ayz) ) + &
+       gupyz * ( &
+       gupxx * Axy * Axz + gupyy * Ayy * Ayz + gupzz * Ayz * Azz + &
+       gupxy * (Axy * Ayz + Ayy * Axz) + &
+       gupxz * (Axy * Azz + Ayz * Axz) + &
+       gupyz * (Ayy * Azz + Ayz * Ayz) ) )) -1.6d1*PI*rho + EIGHT * PI * S
+  f = - F1o3 *(  gupxx * fxx + gupyy * fyy + gupzz * fzz + &
+        TWO* ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) + alpn1/chin1*f)
+  
+  fxx = alpn1 * (Rxx - EIGHT * PI * Sxx) - fxx
+  fxy = alpn1 * (Rxy - EIGHT * PI * Sxy) - fxy
+  fxz = alpn1 * (Rxz - EIGHT * PI * Sxz) - fxz
+  fyy = alpn1 * (Ryy - EIGHT * PI * Syy) - fyy
+  fyz = alpn1 * (Ryz - EIGHT * PI * Syz) - fyz
+  fzz = alpn1 * (Rzz - EIGHT * PI * Szz) - fzz
+#else        
+! Add lapse and S_ij parts to Ricci tensor:
+
+  fxx = alpn1 * (Rxx - EIGHT * PI * Sxx) - fxx
+  fxy = alpn1 * (Rxy - EIGHT * PI * Sxy) - fxy
+  fxz = alpn1 * (Rxz - EIGHT * PI * Sxz) - fxz
+  fyy = alpn1 * (Ryy - EIGHT * PI * Syy) - fyy
+  fyz = alpn1 * (Ryz - EIGHT * PI * Syz) - fyz
+  fzz = alpn1 * (Rzz - EIGHT * PI * Szz) - fzz
+
+! Compute trace-free part (note: chi^-1 and chi cancel!):
+
+  f = F1o3 *(  gupxx * fxx + gupyy * fyy + gupzz * fzz + &
+        TWO* ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) )
+#endif
+
+  Axx_rhs = fxx - gxx * f
+  Ayy_rhs = fyy - gyy * f
+  Azz_rhs = fzz - gzz * f
+  Axy_rhs = fxy - gxy * f
+  Axz_rhs = fxz - gxz * f
+  Ayz_rhs = fyz - gyz * f
+
+! Now: store A_il A^l_j into fij:
+
+  fxx =       gupxx * Axx * Axx + gupyy * Axy * Axy + gupzz * Axz * Axz + &
+       TWO * (gupxy * Axx * Axy + gupxz * Axx * Axz + gupyz * Axy * Axz)
+  fyy =       gupxx * Axy * Axy + gupyy * Ayy * Ayy + gupzz * Ayz * Ayz + &
+       TWO * (gupxy * Axy * Ayy + gupxz * Axy * Ayz + gupyz * Ayy * Ayz)
+  fzz =       gupxx * Axz * Axz + gupyy * Ayz * Ayz + gupzz * Azz * Azz + &
+       TWO * (gupxy * Axz * Ayz + gupxz * Axz * Azz + gupyz * Ayz * Azz)
+  fxy =       gupxx * Axx * Axy + gupyy * Axy * Ayy + gupzz * Axz * Ayz + &
+              gupxy *(Axx * Ayy + Axy * Axy)                            + &
+              gupxz *(Axx * Ayz + Axz * Axy)                            + &
+              gupyz *(Axy * Ayz + Axz * Ayy)
+  fxz =       gupxx * Axx * Axz + gupyy * Axy * Ayz + gupzz * Axz * Azz + &
+              gupxy *(Axx * Ayz + Axy * Axz)                            + &
+              gupxz *(Axx * Azz + Axz * Axz)                            + &
+              gupyz *(Axy * Azz + Axz * Ayz)
+  fyz =       gupxx * Axy * Axz + gupyy * Ayy * Ayz + gupzz * Ayz * Azz + &
+              gupxy *(Axy * Ayz + Ayy * Axz)                            + &
+              gupxz *(Axy * Azz + Ayz * Axz)                            + &
+              gupyz *(Ayy * Azz + Ayz * Ayz)
+
+  f = chin1
+! store D^i D_i Lap in trK_rhs
+  trK_rhs = f*trK_rhs
+          
+  Axx_rhs =           f * Axx_rhs+ alpn1 * (trK * Axx - TWO * fxx)  + &
+           TWO * (  Axx * betaxx +   Axy * betayx +   Axz * betazx )- &
+             F2o3 * Axx * div_beta
+
+  Ayy_rhs =           f * Ayy_rhs+ alpn1 * (trK * Ayy - TWO * fyy)  + &
+           TWO * (  Axy * betaxy +   Ayy * betayy +   Ayz * betazy )- &
+             F2o3 * Ayy * div_beta
+
+  Azz_rhs =           f * Azz_rhs+ alpn1 * (trK * Azz - TWO * fzz)  + &
+           TWO * (  Axz * betaxz +   Ayz * betayz +   Azz * betazz )- &
+             F2o3 * Azz * div_beta
+
+  Axy_rhs =           f * Axy_rhs+ alpn1 *( trK * Axy  - TWO * fxy )+ &
+                    Axx * betaxy                  +   Axz * betazy  + &
+                                     Ayy * betayx +   Ayz * betazx  + &
+             F1o3 * Axy * div_beta                -   Axy * betazz
+
+  Ayz_rhs =           f * Ayz_rhs+ alpn1 *( trK * Ayz  - TWO * fyz )+ &
+                    Axy * betaxz +   Ayy * betayz                   + &
+                    Axz * betaxy                  +   Azz * betazy  + &
+             F1o3 * Ayz * div_beta                -   Ayz * betaxx
+ 
+  Axz_rhs =           f * Axz_rhs+ alpn1 *( trK * Axz  - TWO * fxz )+ &
+                    Axx * betaxz +   Axy * betayz                   + &
+                                     Ayz * betayx +   Azz * betazx  + &
+             F1o3 * Axz * div_beta                -   Axz * betayy      !rhs for Aij
+
+! Compute trace of S_ij
+
+  S =  f * ( gupxx * Sxx + gupyy * Syy + gupzz * Szz + &
+     TWO * ( gupxy * Sxy + gupxz * Sxz + gupyz * Syz ) )
+
+  trK_rhs = - trK_rhs + alpn1 *( F1o3 * trK * trK         + &
+                gupxx * fxx + gupyy * fyy + gupzz * fzz   + &
+        TWO * ( gupxy * fxy + gupxz * fxz + gupyz * fyz ) + &
+       FOUR * PI * ( rho + S ))                                !rhs for trK
  
 !!!! gauge variable part

@@ -1000,15 +948,15 @@
 !!!!!!!!!advection term + Kreiss-Oliger dissipation (merged for cache efficiency)
 ! lopsided_kodis shares the symmetry_bd buffer between advection and
 ! dissipation, eliminating redundant full-grid copies. For metric variables
-! gxx/gyy/gzz (=dxx/dyy/dzz+1): stencil coefficients sum to zero,
-! so the constant offset has no effect on dissipation.
-
-  call lopsided_kodis(ex,X,Y,Z,dxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
-  call lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
-  call lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps)
-  call lopsided_kodis(ex,X,Y,Z,dyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps)
-  call lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps)
-  call lopsided_kodis(ex,X,Y,Z,dzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+! gxx/gyy/gzz (=dxx/dyy/dzz+1): kodis stencil coefficients sum to zero,
+! so the constant offset has no effect on dissipation.
+
+  call lopsided_kodis(ex,X,Y,Z,gxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
+  call lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps)
+  call lopsided_kodis(ex,X,Y,Z,gyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps)
+  call lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps)
+  call lopsided_kodis(ex,X,Y,Z,gzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps)

  call lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps)
  call lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps)
--- a/AMSS_NCKU_source/bssn_rhs_c.C
+++ b/AMSS_NCKU_source/bssn_rhs_c.C
@@ -1022,16 +1022,9 @@ int f_compute_rhs_bssn(int *ex, double &T,
                        + gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] );

            #if (GAUGE == 2)
-            {
-                const double chi_sqrt = sqrt(chin1[i]);
-                const double damping = ONE - chi_sqrt;
-                reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping);
-            }
+            reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - sqrt(chin1[i])), 2.0 );
            #else
-            {
-                const double damping = ONE - chin1[i];
-                reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping);
-            }
+            reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - chin1[i]), 2.0 );
            #endif

            dtSfx_rhs[i] = Gamx_rhs[i] - reta[i] * dtSfx[i];
@@ -1047,16 +1040,9 @@ int f_compute_rhs_bssn(int *ex, double &T,
                        + gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] );

            #if (GAUGE == 4)
-            {
-                const double chi_sqrt = sqrt(chin1[i]);
-                const double damping = ONE - chi_sqrt;
-                reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping);
-            }
+            reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - sqrt(chin1[i])), 2.0 );
            #else
-            {
-                const double damping = ONE - chin1[i];
-                reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / (damping * damping);
-            }
+            reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - chin1[i]), 2.0 );
            #endif

            betax_rhs[i] = FF * Gamx[i] - reta[i] * betax[i];
@@ -1153,59 +1139,59 @@ int f_compute_rhs_bssn(int *ex, double &T,
            fderivs(ex,Ayy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0);
            fderivs(ex,Ayz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,0);
            fderivs(ex,Azz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0);
-            // 7ms //
-            for (int i=0;i<all;i+=1) {
-                gxxx[i] = gxxx[i] - (  Gamxxx[i] * Axx[i] + Gamyxx[i] * Axy[i] + Gamzxx[i] * Axz[i]
-                                      + Gamxxx[i] * Axx[i] + Gamyxx[i] * Axy[i] + Gamzxx[i] * Axz[i]) - chix[i]*Axx[i]/chin1[i];
-                gxyx[i] = gxyx[i] - (  Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i]
-                                      + Gamxxx[i] * Axy[i] + Gamyxx[i] * Ayy[i] + Gamzxx[i] * Ayz[i]) - chix[i]*Axy[i]/chin1[i];
-                gxzx[i] = gxzx[i] - (  Gamxxz[i] * Axx[i] + Gamyxz[i] * Axy[i] + Gamzxz[i] * Axz[i]
-                                      + Gamxxx[i] * Axz[i] + Gamyxx[i] * Ayz[i] + Gamzxx[i] * Azz[i]) - chix[i]*Axz[i]/chin1[i];
-                gyyx[i] = gyyx[i] - (  Gamxxy[i] * Axy[i] + Gamyxy[i] * Ayy[i] + Gamzxy[i] * Ayz[i]
-                                      + Gamxxy[i] * Axy[i] + Gamyxy[i] * Ayy[i] + Gamzxy[i] * Ayz[i]) - chix[i]*Ayy[i]/chin1[i];
-                gyzx[i] = gyzx[i] - (  Gamxxz[i] * Axy[i] + Gamyxz[i] * Ayy[i] + Gamzxz[i] * Ayz[i]
-                                      + Gamxxy[i] * Axz[i] + Gamyxy[i] * Ayz[i] + Gamzxy[i] * Azz[i]) - chix[i]*Ayz[i]/chin1[i];
-                gzzx[i] = gzzx[i] - (  Gamxxz[i] * Axz[i] + Gamyxz[i] * Ayz[i] + Gamzxz[i] * Azz[i]
-                                      + Gamxxz[i] * Axz[i] + Gamyxz[i] * Ayz[i] + Gamzxz[i] * Azz[i]) - chix[i]*Azz[i]/chin1[i];
-                gxxy[i] = gxxy[i] - (  Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i]
-                                      + Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i]) - chiy[i]*Axx[i]/chin1[i];
-                gxyy[i] = gxyy[i] - (  Gamxyy[i] * Axx[i] + Gamyyy[i] * Axy[i] + Gamzyy[i] * Axz[i]
-                                      + Gamxxy[i] * Axy[i] + Gamyxy[i] * Ayy[i] + Gamzxy[i] * Ayz[i]) - chiy[i]*Axy[i]/chin1[i];
-                gxzy[i] = gxzy[i] - (  Gamxyz[i] * Axx[i] + Gamyyz[i] * Axy[i] + Gamzyz[i] * Axz[i]
-                                      + Gamxxy[i] * Axz[i] + Gamyxy[i] * Ayz[i] + Gamzxy[i] * Azz[i]) - chiy[i]*Axz[i]/chin1[i];
-                gyyy[i] = gyyy[i] - (  Gamxyy[i] * Axy[i] + Gamyyy[i] * Ayy[i] + Gamzyy[i] * Ayz[i]
-                                      + Gamxyy[i] * Axy[i] + Gamyyy[i] * Ayy[i] + Gamzyy[i] * Ayz[i]) - chiy[i]*Ayy[i]/chin1[i];
-                gyzy[i] = gyzy[i] - (  Gamxyz[i] * Axy[i] + Gamyyz[i] * Ayy[i] + Gamzyz[i] * Ayz[i]
-                                      + Gamxyy[i] * Axz[i] + Gamyyy[i] * Ayz[i] + Gamzyy[i] * Azz[i]) - chiy[i]*Ayz[i]/chin1[i];
-                gzzy[i] = gzzy[i] - (  Gamxyz[i] * Axz[i] + Gamyyz[i] * Ayz[i] + Gamzyz[i] * Azz[i]
-                                      + Gamxyz[i] * Axz[i] + Gamyyz[i] * Ayz[i] + Gamzyz[i] * Azz[i]) - chiy[i]*Azz[i]/chin1[i];
-                gxxz[i] = gxxz[i] - (  Gamxxz[i] * Axx[i] + Gamyxz[i] * Axy[i] + Gamzxz[i] * Axz[i]
-                                      + Gamxxz[i] * Axx[i] + Gamyxz[i] * Axy[i] + Gamzxz[i] * Axz[i]) - chiz[i]*Axx[i]/chin1[i];
-                gxyz[i] = gxyz[i] - (  Gamxyz[i] * Axx[i] + Gamyyz[i] * Axy[i] + Gamzyz[i] * Axz[i]
-                                      + Gamxxz[i] * Axy[i] + Gamyxz[i] * Ayy[i] + Gamzxz[i] * Ayz[i]) - chiz[i]*Axy[i]/chin1[i];
-                gxzz[i] = gxzz[i] - (  Gamxzz[i] * Axx[i] + Gamyzz[i] * Axy[i] + Gamzzz[i] * Axz[i]
-                                      + Gamxxz[i] * Axz[i] + Gamyxz[i] * Ayz[i] + Gamzxz[i] * Azz[i]) - chiz[i]*Axz[i]/chin1[i];
-                gyyz[i] = gyyz[i] - (  Gamxyz[i] * Axy[i] + Gamyyz[i] * Ayy[i] + Gamzyz[i] * Ayz[i]
-                                      + Gamxyz[i] * Axy[i] + Gamyyz[i] * Ayy[i] + Gamzyz[i] * Ayz[i]) - chiz[i]*Ayy[i]/chin1[i];
-                gyzz[i] = gyzz[i] - (  Gamxzz[i] * Axy[i] + Gamyzz[i] * Ayy[i] + Gamzzz[i] * Ayz[i]
-                                      + Gamxyz[i] * Axz[i] + Gamyyz[i] * Ayz[i] + Gamzyz[i] * Azz[i]) - chiz[i]*Ayz[i]/chin1[i];
-                gzzz[i] = gzzz[i] - (  Gamxzz[i] * Axz[i] + Gamyzz[i] * Ayz[i] + Gamzzz[i] * Azz[i]
-                                      + Gamxzz[i] * Axz[i] + Gamyzz[i] * Ayz[i] + Gamzzz[i] * Azz[i]) - chiz[i]*Azz[i]/chin1[i];
+        }
+        // 7ms //
+        for (int i=0;i<all;i+=1) {
+            gxxx[i] = gxxx[i] - (  Gamxxx[i] * Axx[i] + Gamyxx[i] * Axy[i] + Gamzxx[i] * Axz[i]
+                                  + Gamxxx[i] * Axx[i] + Gamyxx[i] * Axy[i] + Gamzxx[i] * Axz[i]) - chix[i]*Axx[i]/chin1[i];
+            gxyx[i] = gxyx[i] - (  Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i]
+                                  + Gamxxx[i] * Axy[i] + Gamyxx[i] * Ayy[i] + Gamzxx[i] * Ayz[i]) - chix[i]*Axy[i]/chin1[i];
+            gxzx[i] = gxzx[i] - (  Gamxxz[i] * Axx[i] + Gamyxz[i] * Axy[i] + Gamzxz[i] * Axz[i]
+                                  + Gamxxx[i] * Axz[i] + Gamyxx[i] * Ayz[i] + Gamzxx[i] * Azz[i]) - chix[i]*Axz[i]/chin1[i];
+            gyyx[i] = gyyx[i] - (  Gamxxy[i] * Axy[i] + Gamyxy[i] * Ayy[i] + Gamzxy[i] * Ayz[i]
+                                  + Gamxxy[i] * Axy[i] + Gamyxy[i] * Ayy[i] + Gamzxy[i] * Ayz[i]) - chix[i]*Ayy[i]/chin1[i];
+            gyzx[i] = gyzx[i] - (  Gamxxz[i] * Axy[i] + Gamyxz[i] * Ayy[i] + Gamzxz[i] * Ayz[i]
+                                  + Gamxxy[i] * Axz[i] + Gamyxy[i] * Ayz[i] + Gamzxy[i] * Azz[i]) - chix[i]*Ayz[i]/chin1[i];
+            gzzx[i] = gzzx[i] - (  Gamxxz[i] * Axz[i] + Gamyxz[i] * Ayz[i] + Gamzxz[i] * Azz[i]
+                                  + Gamxxz[i] * Axz[i] + Gamyxz[i] * Ayz[i] + Gamzxz[i] * Azz[i]) - chix[i]*Azz[i]/chin1[i];
+            gxxy[i] = gxxy[i] - (  Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i]
+                                  + Gamxxy[i] * Axx[i] + Gamyxy[i] * Axy[i] + Gamzxy[i] * Axz[i]) - chiy[i]*Axx[i]/chin1[i];
+            gxyy[i] = gxyy[i] - (  Gamxyy[i] * Axx[i] + Gamyyy[i] * Axy[i] + Gamzyy[i] * Axz[i]
+                                  + Gamxxy[i] * Axy[i] + Gamyxy[i] * Ayy[i] + Gamzxy[i] * Ayz[i]) - chiy[i]*Axy[i]/chin1[i];
+            gxzy[i] = gxzy[i] - (  Gamxyz[i] * Axx[i] + Gamyyz[i] * Axy[i] + Gamzyz[i] * Axz[i]
+                                  + Gamxxy[i] * Axz[i] + Gamyxy[i] * Ayz[i] + Gamzxy[i] * Azz[i]) - chiy[i]*Axz[i]/chin1[i];
+            gyyy[i] = gyyy[i] - (  Gamxyy[i] * Axy[i] + Gamyyy[i] * Ayy[i] + Gamzyy[i] * Ayz[i]
+                                  + Gamxyy[i] * Axy[i] + Gamyyy[i] * Ayy[i] + Gamzyy[i] * Ayz[i]) - chiy[i]*Ayy[i]/chin1[i];
+            gyzy[i] = gyzy[i] - (  Gamxyz[i] * Axy[i] + Gamyyz[i] * Ayy[i] + Gamzyz[i] * Ayz[i]
+                                  + Gamxyy[i] * Axz[i] + Gamyyy[i] * Ayz[i] + Gamzyy[i] * Azz[i]) - chiy[i]*Ayz[i]/chin1[i];
+            gzzy[i] = gzzy[i] - (  Gamxyz[i] * Axz[i] + Gamyyz[i] * Ayz[i] + Gamzyz[i] * Azz[i]
+                                  + Gamxyz[i] * Axz[i] + Gamyyz[i] * Ayz[i] + Gamzyz[i] * Azz[i]) - chiy[i]*Azz[i]/chin1[i];
+            gxxz[i] = gxxz[i] - (  Gamxxz[i] * Axx[i] + Gamyxz[i] * Axy[i] + Gamzxz[i] * Axz[i]
+                                  + Gamxxz[i] * Axx[i] + Gamyxz[i] * Axy[i] + Gamzxz[i] * Axz[i]) - chiz[i]*Axx[i]/chin1[i];
+            gxyz[i] = gxyz[i] - (  Gamxyz[i] * Axx[i] + Gamyyz[i] * Axy[i] + Gamzyz[i] * Axz[i]
+                                  + Gamxxz[i] * Axy[i] + Gamyxz[i] * Ayy[i] + Gamzxz[i] * Ayz[i]) - chiz[i]*Axy[i]/chin1[i];
+            gxzz[i] = gxzz[i] - (  Gamxzz[i] * Axx[i] + Gamyzz[i] * Axy[i] + Gamzzz[i] * Axz[i]
+                                  + Gamxxz[i] * Axz[i] + Gamyxz[i] * Ayz[i] + Gamzxz[i] * Azz[i]) - chiz[i]*Axz[i]/chin1[i];
+            gyyz[i] = gyyz[i] - (  Gamxyz[i] * Axy[i] + Gamyyz[i] * Ayy[i] + Gamzyz[i] * Ayz[i]
+                                  + Gamxyz[i] * Axy[i] + Gamyyz[i] * Ayy[i] + Gamzyz[i] * Ayz[i]) - chiz[i]*Ayy[i]/chin1[i];
+            gyzz[i] = gyzz[i] - (  Gamxzz[i] * Axy[i] + Gamyzz[i] * Ayy[i] + Gamzzz[i] * Ayz[i]
+                                  + Gamxyz[i] * Axz[i] + Gamyyz[i] * Ayz[i] + Gamzyz[i] * Azz[i]) - chiz[i]*Ayz[i]/chin1[i];
+            gzzz[i] = gzzz[i] - (  Gamxzz[i] * Axz[i] + Gamyzz[i] * Ayz[i] + Gamzzz[i] * Azz[i]
+                                  + Gamxzz[i] * Axz[i] + Gamyzz[i] * Ayz[i] + Gamzzz[i] * Azz[i]) - chiz[i]*Azz[i]/chin1[i];

-                movx_Res[i] = gupxx[i]*gxxx[i] + gupyy[i]*gxyy[i] + gupzz[i]*gxzz[i]
-                            + gupxy[i]*gxyx[i] + gupxz[i]*gxzx[i] + gupyz[i]*gxzy[i]
-                            + gupxy[i]*gxxy[i] + gupxz[i]*gxxz[i] + gupyz[i]*gxyz[i];
-                movy_Res[i] = gupxx[i]*gxyx[i] + gupyy[i]*gyyy[i] + gupzz[i]*gyzz[i]
-                            + gupxy[i]*gyyx[i] + gupxz[i]*gyzx[i] + gupyz[i]*gyzy[i]
-                            + gupxy[i]*gxyy[i] + gupxz[i]*gxyz[i] + gupyz[i]*gyyz[i];
-                movz_Res[i] = gupxx[i]*gxzx[i] + gupyy[i]*gyzy[i] + gupzz[i]*gzzz[i]
-                            + gupxy[i]*gyzx[i] + gupxz[i]*gzzx[i] + gupyz[i]*gzzy[i]
-                            + gupxy[i]*gxzy[i] + gupxz[i]*gxzz[i] + gupyz[i]*gyzz[i];
+            movx_Res[i] = gupxx[i]*gxxx[i] + gupyy[i]*gxyy[i] + gupzz[i]*gxzz[i]
+                        + gupxy[i]*gxyx[i] + gupxz[i]*gxzx[i] + gupyz[i]*gxzy[i]
+                        + gupxy[i]*gxxy[i] + gupxz[i]*gxxz[i] + gupyz[i]*gxyz[i];
+            movy_Res[i] = gupxx[i]*gxyx[i] + gupyy[i]*gyyy[i] + gupzz[i]*gyzz[i]
+                        + gupxy[i]*gyyx[i] + gupxz[i]*gyzx[i] + gupyz[i]*gyzy[i]
+                        + gupxy[i]*gxyy[i] + gupxz[i]*gxyz[i] + gupyz[i]*gyyz[i];
+            movz_Res[i] = gupxx[i]*gxzx[i] + gupyy[i]*gyzy[i] + gupzz[i]*gzzz[i]
+                        + gupxy[i]*gyzx[i] + gupxz[i]*gzzx[i] + gupyz[i]*gzzy[i]
+                        + gupxy[i]*gxzy[i] + gupxz[i]*gxzz[i] + gupyz[i]*gyzz[i];

-                movx_Res[i] = movx_Res[i] - F2o3*Kx[i] - F8*PI*Sx[i];
-                movy_Res[i] = movy_Res[i] - F2o3*Ky[i] - F8*PI*Sy[i];
-                movz_Res[i] = movz_Res[i] - F2o3*Kz[i] - F8*PI*Sz[i];
-            }
+            movx_Res[i] = movx_Res[i] - F2o3*Kx[i] - F8*PI*Sx[i];
+            movy_Res[i] = movy_Res[i] - F2o3*Ky[i] - F8*PI*Sy[i];
+            movz_Res[i] = movz_Res[i] - F2o3*Kz[i] - F8*PI*Sz[i];
        }


--- a/AMSS_NCKU_source/cgh.C
+++ b/AMSS_NCKU_source/cgh.C
@@ -23,14 +23,10 @@ using namespace std;
 #include <mpi.h>

 #include "macrodef.h"
-#include "misc.h"
-#include "cgh.h"
-#include "Parallel.h"
-#include "parameters.h"
-#ifdef USE_GPU
-#include "bssn_gpu.h"
-#include "bssn_cuda_ops.h"
-#endif
+#include "misc.h"
+#include "cgh.h"
+#include "Parallel.h"
+#include "parameters.h"

 //================================================================================================

@@ -885,20 +881,13 @@ void cgh::recompose_cgh(int nprocs, bool *lev_flag,
      tmPat = construct_patchlist(lev, Symmetry);
      // tmPat construction completes
      Parallel::distribute(tmPat, nprocs, ingfs, fngfs, false);
-      //    checkPatchList(tmPat,true);
-      bool CC = (lev > trfls);
-      Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
-
-#ifdef USE_GPU
-      bssn_gpu_clear_cached_device_buffers();
-      bssn_gpu_release_pinned_host_buffers();
-      bssn_cuda_release_rk4_caches();
-      bssn_cuda_release_interp_caches();
-      patch_release_interp_plan_cache();
-#endif
-      Parallel::KillBlocks(PatL[lev]);
-      PatL[lev]->destroyList();
-      PatL[lev] = tmPat;
+      //    checkPatchList(tmPat,true);
+      bool CC = (lev > trfls);
+      Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
+
+      Parallel::KillBlocks(PatL[lev]);
+      PatL[lev]->destroyList();
+      PatL[lev] = tmPat;
 #if (RPB == 1)
      Parallel::destroypsuList_bam(bdsul[lev]);
      Parallel::destroypsuList_bam(rsul[lev]);
@@ -921,20 +910,13 @@ void cgh::recompose_cgh(int nprocs, bool *lev_flag,
      tmPat = construct_patchlist(lev, Symmetry);
      // tmPat construction completes
      Parallel::distribute(tmPat, end_rank[lev] - start_rank[lev] + 1, ingfs, fngfs, false, start_rank[lev], end_rank[lev]);
-      //    checkPatchList(tmPat,true);
-      bool CC = (lev > trfls);
-      Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
-
-#ifdef USE_GPU
-      bssn_gpu_clear_cached_device_buffers();
-      bssn_gpu_release_pinned_host_buffers();
-      bssn_cuda_release_rk4_caches();
-      bssn_cuda_release_interp_caches();
-      patch_release_interp_plan_cache();
-#endif
-      Parallel::KillBlocks(PatL[lev]);
-      PatL[lev]->destroyList();
-      PatL[lev] = tmPat;
+      //    checkPatchList(tmPat,true);
+      bool CC = (lev > trfls);
+      Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
+
+      Parallel::KillBlocks(PatL[lev]);
+      PatL[lev]->destroyList();
+      PatL[lev] = tmPat;
 #if (RPB == 1)
 #error "not support yet"
 #endif
@@ -1536,20 +1518,13 @@ void cgh::recompose_cgh_Onelevel(int nprocs, int lev,
  tmPat = construct_patchlist(lev, Symmetry);
  // tmPat construction completes
  Parallel::distribute(tmPat, nprocs, ingfs, fngfs, false);
-  //    checkPatchList(tmPat,true);
-  bool CC = (lev > trfls);
-  Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
-
-#ifdef USE_GPU
-  bssn_gpu_clear_cached_device_buffers();
-  bssn_gpu_release_pinned_host_buffers();
-  bssn_cuda_release_rk4_caches();
-  bssn_cuda_release_interp_caches();
-  patch_release_interp_plan_cache();
-#endif
-  Parallel::KillBlocks(PatL[lev]);
-  PatL[lev]->destroyList();
-  PatL[lev] = tmPat;
+  //    checkPatchList(tmPat,true);
+  bool CC = (lev > trfls);
+  Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
+
+  Parallel::KillBlocks(PatL[lev]);
+  PatL[lev]->destroyList();
+  PatL[lev] = tmPat;
 }
 #elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
 #warning "recompose_cgh_Onelevel is not implimented yet"
@@ -1565,21 +1540,14 @@ void cgh::recompose_cgh_Onelevel(int nprocs, int lev,
  // tmPat construction completes
  Parallel::distribute(tmPat, end_rank[lev] - start_rank[lev] + 1, ingfs, fngfs, false, start_rank[lev], end_rank[lev]);
  misc::tillherecheck(Commlev[lev], start_rank[lev], "after distribute");
-  //    checkPatchList(tmPat,true);
-  bool CC = (lev > trfls);
-  Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
-  misc::tillherecheck(Commlev[lev], start_rank[lev], "after fill_level_data");
-
-#ifdef USE_GPU
-  bssn_gpu_clear_cached_device_buffers();
-  bssn_gpu_release_pinned_host_buffers();
-  bssn_cuda_release_rk4_caches();
-  bssn_cuda_release_interp_caches();
-  patch_release_interp_plan_cache();
-#endif
-  Parallel::KillBlocks(PatL[lev]);
-  PatL[lev]->destroyList();
-  PatL[lev] = tmPat;
+  //    checkPatchList(tmPat,true);
+  bool CC = (lev > trfls);
+  Parallel::fill_level_data(tmPat, PatL[lev], PatL[lev - 1], OldList, StateList, FutureList, tmList, Symmetry, BB, CC);
+  misc::tillherecheck(Commlev[lev], start_rank[lev], "after fill_level_data");
+
+  Parallel::KillBlocks(PatL[lev]);
+  PatL[lev]->destroyList();
+  PatL[lev] = tmPat;
 }


--- a/AMSS_NCKU_source/diff_newwb.f90
+++ b/AMSS_NCKU_source/diff_newwb.f90
@@ -33,7 +33,7 @@
  real*8 :: dX,dY,dZ
  real*8,dimension(0:ex(1),0:ex(2),0:ex(3))   :: fh
  real*8, dimension(3) :: SoA
-  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
+  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
  real*8 :: d2dx,d2dy,d2dz
  integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
  real*8,  parameter :: ZEO=0.d0,ONE=1.d0, F60=6.d1
@@ -137,7 +137,7 @@
  real*8 :: dX
  real*8,dimension(0:ex(1),0:ex(2),0:ex(3))   :: fh
  real*8, dimension(3) :: SoA
-  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
+  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
  real*8 :: d2dx
  integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
  real*8,  parameter :: ZEO=0.d0,ONE=1.d0, F60=6.d1
@@ -1512,9 +1512,8 @@
  real*8 :: dX,dY,dZ
  real*8,dimension(-1:ex(1),-1:ex(2),-1:ex(3))   :: fh
  real*8, dimension(3) :: SoA
-  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
-  integer :: i_core_min,i_core_max,j_core_min,j_core_max,k_core_min,k_core_max
-  real*8  :: Sdxdx,Sdydy,Sdzdz,Fdxdx,Fdydy,Fdzdz
+  integer :: imin,jmin,kmin,imax,jmax,kmax,i,j,k
+  real*8  :: Sdxdx,Sdydy,Sdzdz,Fdxdx,Fdydy,Fdzdz
  real*8  :: Sdxdy,Sdxdz,Sdydz,Fdxdy,Fdxdz,Fdydz
  integer, parameter :: NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2
  real*8, parameter :: ZEO=0.d0, ONE=1.d0, TWO=2.d0, F1o4=2.5d-1, F9=9.d0,  F45=4.5d1
@@ -1561,55 +1560,17 @@

  fxx = ZEO
  fyy = ZEO
-  fzz = ZEO
-  fxy = ZEO
-  fxz = ZEO
-  fyz = ZEO
-
-  i_core_min = max(1, imin+2)
-  i_core_max = min(ex(1), imax-2)
-  j_core_min = max(1, jmin+2)
-  j_core_max = min(ex(2), jmax-2)
-  k_core_min = max(1, kmin+2)
-  k_core_max = min(ex(3), kmax-2)
-
-  if(i_core_min <= i_core_max .and. j_core_min <= j_core_max .and. k_core_min <= k_core_max)then
-   do k=k_core_min,k_core_max
-   do j=j_core_min,j_core_max
-   do i=i_core_min,i_core_max
-! interior points always use 4th-order stencils without branch checks
-      fxx(i,j,k) = Fdxdx*(-fh(i-2,j,k)+F16*fh(i-1,j,k)-F30*fh(i,j,k) &
-                          -fh(i+2,j,k)+F16*fh(i+1,j,k)              )
-      fyy(i,j,k) = Fdydy*(-fh(i,j-2,k)+F16*fh(i,j-1,k)-F30*fh(i,j,k) &
-                          -fh(i,j+2,k)+F16*fh(i,j+1,k)              )
-      fzz(i,j,k) = Fdzdz*(-fh(i,j,k-2)+F16*fh(i,j,k-1)-F30*fh(i,j,k) &
-                          -fh(i,j,k+2)+F16*fh(i,j,k+1)              )
-      fxy(i,j,k) = Fdxdy*(     (fh(i-2,j-2,k)-F8*fh(i-1,j-2,k)+F8*fh(i+1,j-2,k)-fh(i+2,j-2,k))  &
-                          -F8 *(fh(i-2,j-1,k)-F8*fh(i-1,j-1,k)+F8*fh(i+1,j-1,k)-fh(i+2,j-1,k))  &
-                          +F8 *(fh(i-2,j+1,k)-F8*fh(i-1,j+1,k)+F8*fh(i+1,j+1,k)-fh(i+2,j+1,k))  &
-                          -    (fh(i-2,j+2,k)-F8*fh(i-1,j+2,k)+F8*fh(i+1,j+2,k)-fh(i+2,j+2,k)))
-      fxz(i,j,k) = Fdxdz*(     (fh(i-2,j,k-2)-F8*fh(i-1,j,k-2)+F8*fh(i+1,j,k-2)-fh(i+2,j,k-2))  &
-                          -F8 *(fh(i-2,j,k-1)-F8*fh(i-1,j,k-1)+F8*fh(i+1,j,k-1)-fh(i+2,j,k-1))  &
-                          +F8 *(fh(i-2,j,k+1)-F8*fh(i-1,j,k+1)+F8*fh(i+1,j,k+1)-fh(i+2,j,k+1))  &
-                          -    (fh(i-2,j,k+2)-F8*fh(i-1,j,k+2)+F8*fh(i+1,j,k+2)-fh(i+2,j,k+2)))
-      fyz(i,j,k) = Fdydz*(     (fh(i,j-2,k-2)-F8*fh(i,j-1,k-2)+F8*fh(i,j+1,k-2)-fh(i,j+2,k-2))  &
-                          -F8 *(fh(i,j-2,k-1)-F8*fh(i,j-1,k-1)+F8*fh(i,j+1,k-1)-fh(i,j+2,k-1))  &
-                          +F8 *(fh(i,j-2,k+1)-F8*fh(i,j-1,k+1)+F8*fh(i,j+1,k+1)-fh(i,j+2,k+1))  &
-                          -    (fh(i,j-2,k+2)-F8*fh(i,j-1,k+2)+F8*fh(i,j+1,k+2)-fh(i,j+2,k+2)))
-   enddo
-   enddo
-   enddo
-  endif
-
-  do k=1,ex(3)
-  do j=1,ex(2)
-  do i=1,ex(1)
-      if(i>=i_core_min .and. i<=i_core_max .and. &
-         j>=j_core_min .and. j<=j_core_max .and. &
-         k>=k_core_min .and. k<=k_core_max) cycle
-!~~~~~~ fxx
-        if(i+2 <= imax .and. i-2 >= imin)then
-!
+  fzz = ZEO
+  fxy = ZEO
+  fxz = ZEO
+  fyz = ZEO
+
+  do k=1,ex(3)
+  do j=1,ex(2)
+  do i=1,ex(1)
+!~~~~~~ fxx
+        if(i+2 <= imax .and. i-2 >= imin)then
+!
 !               - f(i-2) + 16 f(i-1) - 30 f(i) + 16 f(i+1) - f(i+2)
 !  fxx(i) = ----------------------------------------------------------
 !                                  12 dx^2 
--- a/AMSS_NCKU_source/fdderivs_c.C
+++ b/AMSS_NCKU_source/fdderivs_c.C
@@ -141,26 +141,12 @@ void fdderivs(const int ex[3],
    const int j4_hi = ex2 - 3;
    const int k4_hi = ex3 - 3;

-    /*
-     * Strategy A:
-     * Avoid redundant work in overlap of 2nd/4th-order regions.
-     * Only compute 2nd-order on shell points that are NOT overwritten by
-     * the 4th-order pass.
-     */
-    const int has4 = (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi);
-
    if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) {
        for (int k0 = k2_lo; k0 <= k2_hi; ++k0) {
            const int kF = k0 + 1;
            for (int j0 = j2_lo; j0 <= j2_hi; ++j0) {
                const int jF = j0 + 1;
                for (int i0 = i2_lo; i0 <= i2_hi; ++i0) {
-                    if (has4 &&
-                        i0 >= i4_lo && i0 <= i4_hi &&
-                        j0 >= j4_lo && j0 <= j4_hi &&
-                        k0 >= k4_lo && k0 <= k4_hi) {
-                        continue;
-                    }
                    const int iF = i0 + 1;
                    const size_t p = idx_ex(i0, j0, k0, ex);

@@ -207,7 +193,7 @@ void fdderivs(const int ex[3],
        }
    }

-    if (has4) {
+    if (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi) {
        for (int k0 = k4_lo; k0 <= k4_hi; ++k0) {
            const int kF = k0 + 1;
            for (int j0 = j4_lo; j0 <= j4_hi; ++j0) {
--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -30,8 +30,8 @@ CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
 f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
 endif
-
-.SUFFIXES: .o .f90 .C .for .cu
+
+.SUFFIXES: .o .f90 .C .for .cu

 .f90.o:
 	$(f90) $(f90appflags) -c $< -o $@
@@ -64,8 +64,8 @@ lopsided_c.o: lopsided_c.C
 lopsided_kodis_c.o: lopsided_kodis_c.C
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@

-#interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
-#	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
+interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
+	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@

 ## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
 TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
@@ -105,12 +105,13 @@ C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
           Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
 	   NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o
 	   
-C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
-           cgh.o bssn_class.o surface_integral.o ShellPatch.o\
-	   bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
-	   bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
-           Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
-	   NullShellPatch2_Evo.o bssn_cuda_step.o writefile_f.o
+C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
+           cgh.o surface_integral.o ShellPatch.o\
+	   bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
+	   bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
+           Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
+	   NullShellPatch2_Evo.o \
+	   bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o

 F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
 	   prolongrestrict_cell.o prolongrestrict_vertex.o\
@@ -142,7 +143,7 @@ initial_guess.o Newton.o Jacobian.o ilucg.o IntPnts0.o IntPnts.o

 TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o 

-CUDAFILES = bssn_gpu.o bssn_cuda_ops.o
+CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o

 # file dependences
 $(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -9,7 +9,6 @@ filein  = -I/usr/include/ -I${MKLROOT}/include
 ## Using sequential MKL (OpenMP disabled for better single-threaded performance)
 ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
 LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
-CUDA_LDLIBS = -L/usr/local/cuda-12.9/targets/x86_64-linux/lib -lcudart

 ## Memory allocator switch
 ##   1 (default) : link Intel oneTBB allocator (libtbbmalloc)
@@ -25,8 +24,6 @@ ifeq ($(USE_TBBMALLOC),1)
 LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
 endif

-LDLIBS := $(CUDA_LDLIBS) $(LDLIBS)
-
 ## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
 ##   opt        : (default) maximum performance with PGO profile-guided optimization
 ##   instrument : PGO Phase 1 instrumentation to collect fresh profile data
--- a/AMSS_NCKU_source/prolongrestrict_cell.f90
+++ b/AMSS_NCKU_source/prolongrestrict_cell.f90
@@ -1956,13 +1956,11 @@

  real*8,dimension(3) :: CD,FD
  real*8 :: tmp_yz(extc(1), 6)      ! 存储整条 X 线上 6 个 Y 轴偏置的 Z 向插值结果
-  real*8 :: tmp_xyz_line(-2:extc(1))   ! 包含 X 向 6 点模板访问所需下界
+  real*8 :: tmp_xyz_line(extc(1))   ! Y-pass result for one whole X line
   real*8 :: v1, v2, v3, v4, v5, v6
-  integer :: ic, jc, kc, ix_offset,ix,iy,iz,jc_min,jc_max,ic_min,ic_max,kc_min,kc_max
-  integer :: i_lo, i_hi, j_lo, j_hi, k_lo, k_hi
-  logical :: need_full_symmetry
+  integer :: ic, jc, kc, ix_offset,ix,iy,iz,jc_min,jc_max
   real*8 :: res_line
-  real*8 :: tmp_z_slab(-2:extc(1), -2:extc(2))  ! 包含 Y/X 向模板访问所需下界
+  real*8 :: tmp_z_slab(extc(1), extc(2))  ! Z-pass slab; declared outside the k loop
  if(wei.ne.3)then
     write(*,*)"prolongrestrict.f90::prolong3: this routine only surport 3 dimension"
     write(*,*)"dim = ",wei
@@ -2065,41 +2063,24 @@
     endif
  enddo

-  ic_min = minval(cix(imino:imaxo))
-  ic_max = maxval(cix(imino:imaxo))
-  jc_min = minval(ciy(jmino:jmaxo))
-  jc_max = maxval(ciy(jmino:jmaxo))
-  kc_min = minval(ciz(kmino:kmaxo))
-  kc_max = maxval(ciz(kmino:kmaxo))
-
-  maxcx = ic_max
-  maxcy = jc_max
-  maxcz = kc_max
+  maxcx = maxval(cix(imino:imaxo))
+  maxcy = maxval(ciy(jmino:jmaxo))
+  maxcz = maxval(ciz(kmino:kmaxo))
  if(maxcx+3 > extc(1) .or. maxcy+3 > extc(2) .or. maxcz+3 > extc(3))then
     write(*,*)"error in prolong"
     return
  endif

-  i_lo = ic_min - 2
-  i_hi = ic_max + 3
-  j_lo = jc_min - 2
-  j_hi = jc_max + 3
-  k_lo = kc_min - 2
-  k_hi = kc_max + 3
-  need_full_symmetry = (i_lo < 1) .or. (j_lo < 1) .or. (k_lo < 1)
-  if(need_full_symmetry)then
-     call symmetry_bd(3,extc,func,funcc,SoA)
-  else
-     funcc(i_lo:i_hi,j_lo:j_hi,k_lo:k_hi) = func(i_lo:i_hi,j_lo:j_hi,k_lo:k_hi)
-  endif
-
+  call symmetry_bd(3,extc,func,funcc,SoA)
     ! 对每个 k（pz, kc 固定）预计算 Z 向插值的 2D 切片
+jc_min = minval(ciy(jmino:jmaxo))
+jc_max = maxval(ciy(jmino:jmaxo))

 do k = kmino, kmaxo
    pz = piz(k); kc = ciz(k)
    ! --- Pass 1: Z 方向，只算一次 ---
-    do iy = jc_min-2, jc_max+3   ! 仅需的 iy 范围（对应 jc-2:jc+3）
-        do ii = ic_min-2, ic_max+3  ! 仅需的 ii 范围（对应 cix-2:cix+3）
+    do iy = jc_min-3, jc_max+3   ! iy rows to fill; NOTE(review): Pass 2 reads jc-2:jc+3; confirm jc_min-3 is within tmp_z_slab
+        do ii = imini-3, imaxi+3  ! ii columns to fill; NOTE(review): confirm imini-3 >= 1, the lower bound of tmp_z_slab
            tmp_z_slab(ii, iy) = sum(WC(:,pz) * funcc(ii, iy, kc-2:kc+3))
        end do
    end do
@@ -2107,7 +2088,7 @@ do k = kmino, kmaxo
    do j = jmino, jmaxo
        py = piy(j); jc = ciy(j)
        ! --- Pass 2: Y 方向 ---
-        do ii = ic_min-2, ic_max+3
+        do ii = imini-3, imaxi+3
            tmp_xyz_line(ii) = sum(WC(:,py) * tmp_z_slab(ii, jc-2:jc+3))
        end do
        ! --- Pass 3: X 方向 ---
@@ -2370,12 +2351,9 @@ end do

  real*8,dimension(3) :: CD,FD

-  real*8 :: tmp_xz_plane(-1:extf(1), 6)
-  real*8 :: tmp_x_line(-1:extf(1))
+  real*8 :: tmp_xz_plane(extf(1), 6) 
+  real*8 :: tmp_x_line(extf(1))
  integer :: fi, fj, fk, ii, jj, kk
-  integer :: fi_min, fi_max, ii_lo, ii_hi
-  integer :: fj_min, fj_max, fk_min, fk_max, jj_lo, jj_hi, kk_lo, kk_hi
-  logical :: need_full_symmetry

  if(wei.ne.3)then
     write(*,*)"prolongrestrict.f90::restrict3: this routine only surport 3 dimension"
@@ -2455,34 +2433,7 @@ end do
          stop
  endif

-  ! 仅计算 X 向最终写回所需的窗口：
-  ! func(i,j,k) 只访问 tmp_x_line(fi-2:fi+3)
-  fi_min = 2*(imino + lbc(1) - 1) - 1 - lbf(1) + 1
-  fi_max = 2*(imaxo + lbc(1) - 1) - 1 - lbf(1) + 1
-  fj_min = 2*(jmino + lbc(2) - 1) - 1 - lbf(2) + 1
-  fj_max = 2*(jmaxo + lbc(2) - 1) - 1 - lbf(2) + 1
-  fk_min = 2*(kmino + lbc(3) - 1) - 1 - lbf(3) + 1
-  fk_max = 2*(kmaxo + lbc(3) - 1) - 1 - lbf(3) + 1
-  ii_lo = fi_min - 2
-  ii_hi = fi_max + 3
-  jj_lo = fj_min - 2
-  jj_hi = fj_max + 3
-  kk_lo = fk_min - 2
-  kk_hi = fk_max + 3
-  if(ii_lo < -1 .or. ii_hi > extf(1) .or. &
-     jj_lo < -1 .or. jj_hi > extf(2) .or. &
-     kk_lo < -1 .or. kk_hi > extf(3))then
-      write(*,*)"restrict3: invalid stencil window"
-      write(*,*)"ii=",ii_lo,ii_hi," jj=",jj_lo,jj_hi," kk=",kk_lo,kk_hi
-      write(*,*)"extf=",extf
-      stop
-  endif
-  need_full_symmetry = (ii_lo < 1) .or. (jj_lo < 1) .or. (kk_lo < 1)
-  if(need_full_symmetry)then
-      call symmetry_bd(2,extf,funf,funff,SoA)
-  else
-      funff(ii_lo:ii_hi,jj_lo:jj_hi,kk_lo:kk_hi) = funf(ii_lo:ii_hi,jj_lo:jj_hi,kk_lo:kk_hi)
-  endif
+  call symmetry_bd(2,extf,funf,funff,SoA)

 !~~~~~~> restriction start...
 do k = kmino, kmaxo
@@ -2494,7 +2445,7 @@ do k = kmino, kmaxo
        ! 优化点 1: 显式展开 Z 方向计算，减少循环开销
        ! 确保 ii 循环是最内层且连续访问
        !DIR$ VECTOR ALWAYS
-        do ii = ii_lo, ii_hi
+        do ii = 1, extf(1)
            ! 预计算当前 j 对应的 6 行在 Z 方向的压缩结果
            ! 这里直接硬编码 jj 的偏移，彻底消除一层循环
            tmp_xz_plane(ii, 1) = C1*(funff(ii,fj-2,fk-2)+funff(ii,fj-2,fk+3)) + &
@@ -2519,7 +2470,7 @@ do k = kmino, kmaxo

        ! 优化点 2: 同样向量化 Y 方向压缩
        !DIR$ VECTOR ALWAYS
-        do ii = ii_lo, ii_hi
+        do ii = 1, extf(1)
            tmp_x_line(ii) = C1*(tmp_xz_plane(ii, 1) + tmp_xz_plane(ii, 6)) + &
                            C2*(tmp_xz_plane(ii, 2) + tmp_xz_plane(ii, 5)) + &
                            C3*(tmp_xz_plane(ii, 3) + tmp_xz_plane(ii, 4))
--- a/AMSS_NCKU_source/surface_integral.C
+++ b/AMSS_NCKU_source/surface_integral.C
@@ -180,64 +180,19 @@ surface_integral::surface_integral(int iSymmetry) : Symmetry(iSymmetry)
 //|============================================================================
 //| Destructor
 //|============================================================================
-surface_integral::~surface_integral()
-{
-  release_cached_buffers();
-  delete[] nx_g;
-  delete[] ny_g;
-  delete[] nz_g;
-  delete[] arcostheta;
-#ifdef GaussInt
-  delete[] wtcostheta;
-#endif
-}
-
-void surface_integral::get_surface_points(double rex, double **pox)
-{
-  SpherePointCache &cache = sphere_point_cache[rex];
-  if (!cache.pox[0])
-  {
-    for (int i = 0; i < 3; ++i)
-      cache.pox[i] = new double[n_tot];
-    for (int n = 0; n < n_tot; ++n)
-    {
-      cache.pox[0][n] = rex * nx_g[n];
-      cache.pox[1][n] = rex * ny_g[n];
-      cache.pox[2][n] = rex * nz_g[n];
-    }
-  }
-
-  pox[0] = cache.pox[0];
-  pox[1] = cache.pox[1];
-  pox[2] = cache.pox[2];
-}
-
-double *surface_integral::get_shellf_buffer(int num_var)
-{
-  double *&buffer = shellf_cache[num_var];
-  if (!buffer)
-    buffer = new double[n_tot * num_var];
-  return buffer;
-}
-
-void surface_integral::release_cached_buffers()
-{
-  for (map<double, SpherePointCache>::iterator it = sphere_point_cache.begin(); it != sphere_point_cache.end(); ++it)
-  {
-    delete[] it->second.pox[0];
-    delete[] it->second.pox[1];
-    delete[] it->second.pox[2];
-    it->second.pox[0] = it->second.pox[1] = it->second.pox[2] = 0;
-  }
-  sphere_point_cache.clear();
-
-  for (map<int, double *>::iterator it = shellf_cache.begin(); it != shellf_cache.end(); ++it)
-    delete[] it->second;
-  shellf_cache.clear();
-}
-//|----------------------------------------------------------------
-//  spin weighted spinw component of psi4, general routine
-//  l takes from spinw to maxl; m takes from -l to l
+surface_integral::~surface_integral()
+{
+  delete[] nx_g;
+  delete[] ny_g;
+  delete[] nz_g;
+  delete[] arcostheta;
+#ifdef GaussInt
+  delete[] wtcostheta;
+#endif
+}
+//|----------------------------------------------------------------
+//  spin weighted spinw component of psi4, general routine
+//  l takes from spinw to maxl; m takes from -l to l
 //|----------------------------------------------------------------
 void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,
                                 int spinw, int maxl, int NN, double *RP, double *IP,
@@ -254,9 +209,16 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
  MyList<var> *DG_List = new MyList<var>(Rpsi4);
  DG_List->insert(Ipsi4);

-  int n;
-  double *pox[3];
-  get_surface_points(rex, pox);
+  int n;
+  double *pox[3];
+  for (int i = 0; i < 3; i++)
+    pox[i] = new double[n_tot];
+  for (n = 0; n < n_tot; n++)
+  {
+    pox[0][n] = rex * nx_g[n];
+    pox[1][n] = rex * ny_g[n];
+    pox[2][n] = rex * nz_g[n];
+  }

  int mp, Lp, Nmin, Nmax;
  mp = n_tot / cpusize;
@@ -272,7 +234,8 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
    Nmax = Nmin + mp - 1;
  }

-  double *shellf = get_shellf_buffer(InList);
+  double *shellf;
+  shellf = new double[n_tot * InList];

  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);

@@ -412,10 +375,14 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *

  //|------= Free memory.

-  delete[] RP_out;
-  delete[] IP_out;
-  DG_List->clearList();
-}
+  delete[] pox[0];
+  delete[] pox[1];
+  delete[] pox[2];
+  delete[] shellf;
+  delete[] RP_out;
+  delete[] IP_out;
+  DG_List->clearList();
+}
 void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,
                                 int spinw, int maxl, int NN, double *RP, double *IP,
                                 monitor *Monitor, MPI_Comm Comm_here) // NN is the length of RP and IP
@@ -435,11 +402,19 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
  MyList<var> *DG_List = new MyList<var>(Rpsi4);
  DG_List->insert(Ipsi4);

-  int n;
-  double *pox[3];
-  get_surface_points(rex, pox);
-
-  double *shellf = get_shellf_buffer(InList);
+  int n;
+  double *pox[3];
+  for (int i = 0; i < 3; i++)
+    pox[i] = new double[n_tot];
+  for (n = 0; n < n_tot; n++)
+  {
+    pox[0][n] = rex * nx_g[n];
+    pox[1][n] = rex * ny_g[n];
+    pox[2][n] = rex * nz_g[n];
+  }
+
+  double *shellf;
+  shellf = new double[n_tot * InList];

  //    misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Interp_Points");

@@ -602,10 +577,14 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *

  //|------= Free memory.

-  delete[] RP_out;
-  delete[] IP_out;
-  DG_List->clearList();
-}
+  delete[] pox[0];
+  delete[] pox[1];
+  delete[] pox[2];
+  delete[] shellf;
+  delete[] RP_out;
+  delete[] IP_out;
+  DG_List->clearList();
+}
 //|----------------------------------------------------------------
 //  for shell patch
 //|----------------------------------------------------------------
@@ -618,11 +597,19 @@ void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH, var *Rpsi4
  MyList<var> *DG_List = new MyList<var>(Rpsi4);
  DG_List->insert(Ipsi4);

-  int n;
-  double *pox[3];
-  get_surface_points(rex, pox);
+  int n;
+  double *pox[3];
+  for (int i = 0; i < 3; i++)
+    pox[i] = new double[n_tot];
+  for (n = 0; n < n_tot; n++)
+  {
+    pox[0][n] = rex * nx_g[n];
+    pox[1][n] = rex * ny_g[n];
+    pox[2][n] = rex * nz_g[n];
+  }

-  double *shellf = get_shellf_buffer(InList);
+  double *shellf;
+  shellf = new double[n_tot * InList];

  GH->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);

@@ -2583,8 +2570,12 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
  Rout[5] = sy;
  Rout[6] = sz;

-  DG_List->clearList();
-}
+  delete[] pox[0];
+  delete[] pox[1];
+  delete[] pox[2];
+  delete[] shellf;
+  DG_List->clearList();
+}
 void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var *trK,
                                     var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
                                     var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
@@ -2646,11 +2637,19 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
  DG_List->insert(Ayz);
  DG_List->insert(Azz);

-  int n;
-  double *pox[3];
-  get_surface_points(rex, pox);
-
-  double *shellf = get_shellf_buffer(InList);
+  int n;
+  double *pox[3];
+  for (int i = 0; i < 3; i++)
+    pox[i] = new double[n_tot];
+  for (n = 0; n < n_tot; n++)
+  {
+    pox[0][n] = rex * nx_g[n];
+    pox[1][n] = rex * ny_g[n];
+    pox[2][n] = rex * nz_g[n];
+  }
+
+  double *shellf;
+  shellf = new double[n_tot * InList];

  // we have assumed there is only one box on this level,
  // so we do not need loop boxes
@@ -2840,8 +2839,12 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
  Rout[5] = sy;
  Rout[6] = sz;

-  DG_List->clearList();
-}
+  delete[] pox[0];
+  delete[] pox[1];
+  delete[] pox[2];
+  delete[] shellf;
+  DG_List->clearList();
+}
 //|----------------------------------------------------------------
 //  for shell patch
 //|----------------------------------------------------------------
--- a/AMSS_NCKU_source/surface_integral.h
+++ b/AMSS_NCKU_source/surface_integral.h
@@ -20,41 +20,25 @@ using namespace std;
 #include "cgh.h"
 #include "ShellPatch.h"
 #include "NullShellPatch.h"
-#include "NullShellPatch2.h"
-#include "var.h"
-#include "monitor.h"
-#include <map>
+#include "NullShellPatch2.h"
+#include "var.h"
+#include "monitor.h"

 class surface_integral
 {

-private:
-	struct SpherePointCache
-	{
-		double *pox[3];
-		SpherePointCache()
-		{
-			pox[0] = pox[1] = pox[2] = 0;
-		}
-	};
-
-	int Symmetry, factor;
-	int N_theta, N_phi; // Number of points in Theta & Phi directions
-	double dphi, dcostheta;
-	double *arcostheta, *wtcostheta;
-	int n_tot; // size of arrays
-
-	double *nx_g, *ny_g, *nz_g; // global list of unit normals
-	int myrank, cpusize;
-	map<double, SpherePointCache> sphere_point_cache;
-	map<int, double *> shellf_cache;
-
-	void get_surface_points(double rex, double **pox);
-	double *get_shellf_buffer(int num_var);
-	void release_cached_buffers();
-
-public:
-	surface_integral(int iSymmetry);
+private:
+	int Symmetry, factor;
+	int N_theta, N_phi; // Number of points in Theta & Phi directions
+	double dphi, dcostheta;
+	double *arcostheta, *wtcostheta;
+	int n_tot; // size of arrays
+
+	double *nx_g, *ny_g, *nz_g; // global list of unit normals
+	int myrank, cpusize;
+
+public:
+	surface_integral(int iSymmetry);
 	~surface_integral();

 	void surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -9,7 +9,6 @@


 import AMSS_NCKU_Input as input_data
-import os
 import subprocess
 import time

@@ -58,48 +57,6 @@ BUILD_JOBS = 64
 ##################################################################


-##################################################################
-
-def prepare_gpu_runtime_env():
-    """
-    Create a user-private CUDA MPS environment for GPU runs.
-
-    On shared machines another user's daemon may already occupy the default
-    /tmp/nvidia-mps pipe directory, which makes plain cudaSetDevice/cudaMalloc
-    fail with cudaErrorMpsConnectionFailed.  Binding AMSS-NCKU to a private
-    pipe directory avoids cross-user interference.
-    """
-    env = os.environ.copy()
-
-    pipe_dir = env.get("CUDA_MPS_PIPE_DIRECTORY", f"/tmp/amss-ncku-mps-{os.getuid()}")
-    log_dir  = env.get("CUDA_MPS_LOG_DIRECTORY",  f"/tmp/amss-ncku-mps-log-{os.getuid()}")
-
-    os.makedirs(pipe_dir, exist_ok=True)
-    os.makedirs(log_dir, exist_ok=True)
-
-    env["CUDA_MPS_PIPE_DIRECTORY"] = pipe_dir
-    env["CUDA_MPS_LOG_DIRECTORY"] = log_dir
-
-    control_socket = os.path.join(pipe_dir, "control")
-    if not os.path.exists(control_socket):
-        start = subprocess.run(
-            ["nvidia-cuda-mps-control", "-d"],
-            env=env,
-            stdout=subprocess.DEVNULL,
-            stderr=subprocess.DEVNULL,
-        )
-        if start.returncode != 0:
-            print(f" Warning: failed to start private CUDA MPS daemon in {pipe_dir}")
-        else:
-            print(f" Using private CUDA MPS pipe directory: {pipe_dir}")
-    else:
-        print(f" Using existing private CUDA MPS pipe directory: {pipe_dir}")
-
-    return env
-
-##################################################################
-
-

 ##################################################################

@@ -189,29 +146,16 @@ def run_ABE():

    ## Define the command to run; cast other values to strings as needed
    
-    run_env = None
-
    if (input_data.GPU_Calculation == "no"):
        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
        #mpi_command         = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
        mpi_command_outfile = "ABE_out.log"
    elif (input_data.GPU_Calculation == "yes"):
-        run_env = prepare_gpu_runtime_env()
-        if int(input_data.MPI_processes) == 1:
-            mpi_command = "./ABEGPU"
-        else:
-            mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
+        mpi_command         = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
        mpi_command_outfile = "ABEGPU_out.log"
 
    ## Execute the MPI command and stream output
-    mpi_process = subprocess.Popen(
-        mpi_command,
-        shell=True,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        text=True,
-        env=run_env,
-    )
+    mpi_process = subprocess.Popen(mpi_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)

    ## Write ABE run output to file while printing to stdout
    with open(mpi_command_outfile, 'w') as file0:
Author	SHA1	Message	Date
jaunatisblue	12e1f63d50	prolong3: 减少Z-pass 冗余计算	2026-03-02 21:20:49 +08:00
jaunatisblue	47f91ff46f	prolong3：提升cache命中率	2026-03-02 10:31:46 +08:00
jaunatisblue	672b7ebee2	修改prolong	2026-03-02 02:01:07 +08:00
jaunatisblue	63bf180159	对prolong3做访存优化	2026-03-02 01:16:10 +08:00