Cache repeated interpolation plans

2026-04-09 15:21:01 +08:00
parent 06fa643365
commit 42e851d19a
4 changed files with 428 additions and 370 deletions
--- a/AMSS_NCKU_source/MPatch.C
+++ b/AMSS_NCKU_source/MPatch.C
@@ -2,13 +2,14 @@
 #include <iostream>
 #include <iomanip>
 #include <fstream>
-#include <cstdlib>
-#include <cstdio>
-#include <string>
-#include <cmath>
-#include <new>
-#include <vector>
-using namespace std;
+#include <cstdlib>
+#include <cstdio>
+#include <string>
+#include <cmath>
+#include <new>
+#include <map>
+#include <vector>
+using namespace std;

 #include "misc.h"
 #include "MPatch.h"
@@ -40,6 +41,44 @@ struct InterpVarDesc
  double soa[dim];
 };

+struct InterpPlanKey
+{
+  const Patch *patch;
+  const double *x;
+  const double *y;
+  const double *z;
+  int NN;
+  int Symmetry;
+  int myrank;
+};
+
+struct InterpPlanKeyLess
+{
+  bool operator()(const InterpPlanKey &lhs, const InterpPlanKey &rhs) const
+  {
+    if (lhs.patch != rhs.patch) return lhs.patch < rhs.patch;
+    if (lhs.x != rhs.x) return lhs.x < rhs.x;
+    if (lhs.y != rhs.y) return lhs.y < rhs.y;
+    if (lhs.z != rhs.z) return lhs.z < rhs.z;
+    if (lhs.NN != rhs.NN) return lhs.NN < rhs.NN;
+    if (lhs.Symmetry != rhs.Symmetry) return lhs.Symmetry < rhs.Symmetry;
+    return lhs.myrank < rhs.myrank;
+  }
+};
+
+struct CachedInterpPlan
+{
+  int nblocks;
+  vector<int> owner_rank;
+  vector<int> owner_block;
+  vector<vector<int> > block_points;
+  vector<vector<double> > block_px;
+  vector<vector<double> > block_py;
+  vector<vector<double> > block_pz;
+
+  CachedInterpPlan() : nblocks(0) {}
+};
+
 struct InterpBlockView
 {
  Block *bp;
@@ -229,10 +268,124 @@ bool should_try_cuda_interp(int ordn, int num_points, int num_var)
  return num_points * num_var >= 256;
 }

+CachedInterpPlan &get_cached_interp_plan(Patch *patch,
+                                         int NN, double **XX,
+                                         int Symmetry, int myrank,
+                                         const double *DH,
+                                         const BlockBinIndex &block_index,
+                                         bool report_bounds_here,
+                                         bool allow_missing_points)
+{
+  static map<InterpPlanKey, CachedInterpPlan, InterpPlanKeyLess> cache;
+
+  InterpPlanKey key;
+  key.patch = patch;
+  key.x = XX[0];
+  key.y = XX[1];
+  key.z = XX[2];
+  key.NN = NN;
+  key.Symmetry = Symmetry;
+  key.myrank = myrank;
+
+  map<InterpPlanKey, CachedInterpPlan, InterpPlanKeyLess>::iterator it = cache.find(key);
+  if (it != cache.end() && it->second.nblocks == static_cast<int>(block_index.views.size()))
+    return it->second;
+
+  CachedInterpPlan &plan = cache[key];
+  plan = CachedInterpPlan();
+  plan.nblocks = static_cast<int>(block_index.views.size());
+  plan.owner_rank.assign(NN, -1);
+  plan.owner_block.assign(NN, -1);
+  plan.block_points.resize(plan.nblocks);
+  plan.block_px.resize(plan.nblocks);
+  plan.block_py.resize(plan.nblocks);
+  plan.block_pz.resize(plan.nblocks);
+
+  for (int j = 0; j < NN; ++j)
+  {
+    double pox[dim];
+    for (int i = 0; i < dim; ++i)
+    {
+      pox[i] = XX[i][j];
+      if (report_bounds_here &&
+          (XX[i][j] < patch->bbox[i] + patch->lli[i] * DH[i] ||
+           XX[i][j] > patch->bbox[dim + i] - patch->uui[i] * DH[i]))
+      {
+        cout << "Patch::Interp_Points: point (";
+        for (int k = 0; k < dim; ++k)
+        {
+          cout << XX[k][j];
+          if (k < dim - 1)
+            cout << ",";
+          else
+            cout << ") is out of current Patch." << endl;
+        }
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+    }
+
+    const int block_i = find_block_index_for_point(block_index, pox, DH);
+    if (block_i >= 0)
+    {
+      Block *BP = block_index.views[block_i].bp;
+      plan.owner_rank[j] = BP->rank;
+      plan.owner_block[j] = block_i;
+      if (BP->rank == myrank)
+      {
+        plan.block_points[block_i].push_back(j);
+        plan.block_px[block_i].push_back(XX[0][j]);
+        plan.block_py[block_i].push_back(XX[1][j]);
+        plan.block_pz[block_i].push_back(XX[2][j]);
+      }
+    }
+  }
+
+  if (!allow_missing_points && report_bounds_here)
+  {
+    for (int j = 0; j < NN; ++j)
+    {
+      if (plan.owner_rank[j] >= 0)
+        continue;
+      cout << "ERROR: Patch::Interp_Points fails to find point (";
+      for (int d = 0; d < dim; ++d)
+      {
+        cout << XX[d][j];
+        if (d < dim - 1)
+          cout << ",";
+        else
+          cout << ")";
+      }
+      cout << " on Patch (";
+      for (int d = 0; d < dim; ++d)
+      {
+        cout << patch->bbox[d] << "+" << patch->lli[d] * DH[d];
+        if (d < dim - 1)
+          cout << ",";
+        else
+          cout << ")--";
+      }
+      cout << "(";
+      for (int d = 0; d < dim; ++d)
+      {
+        cout << patch->bbox[dim + d] << "-" << patch->uui[d] * DH[d];
+        if (d < dim - 1)
+          cout << ",";
+        else
+          cout << ")" << endl;
+      }
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+  }
+
+  return plan;
+}
+
 bool run_cuda_interp_for_block(Block *BP,
                               const vector<InterpVarDesc> &vars,
                               const vector<int> &point_ids,
-                               double **XX,
+                               const vector<double> &px,
+                               const vector<double> &py,
+                               const vector<double> &pz,
                               double *Shellf,
                               int num_var,
                               int ordn,
@@ -251,15 +404,6 @@ bool run_cuda_interp_for_block(Block *BP,
  }

  const int npts = static_cast<int>(point_ids.size());
-  vector<double> px(npts), py(npts), pz(npts);
-  for (int p = 0; p < npts; ++p)
-  {
-    const int j = point_ids[p];
-    px[p] = XX[0][j];
-    py[p] = XX[1][j];
-    pz[p] = XX[2][j];
-  }
-
  vector<double> out(static_cast<size_t>(npts) * static_cast<size_t>(num_var));
  if (bssn_cuda_interp_points_batch(BP->shape,
                                    BP->X[0], BP->X[1], BP->X[2],
@@ -286,7 +430,9 @@ bool run_cuda_interp_for_block(Block *BP,
 void run_cpu_interp_for_block(Block *BP,
                              const vector<InterpVarDesc> &vars,
                              const vector<int> &point_ids,
-                              double **XX,
+                              const vector<double> &px,
+                              const vector<double> &py,
+                              const vector<double> &pz,
                              double *Shellf,
                              int num_var,
                              int ordn,
@@ -295,9 +441,9 @@ void run_cpu_interp_for_block(Block *BP,
  for (size_t p = 0; p < point_ids.size(); ++p)
  {
    const int j = point_ids[p];
-    double x = XX[0][j];
-    double y = XX[1][j];
-    double z = XX[2][j];
+    double x = px[p];
+    double y = py[p];
+    double z = pz[p];
    int ordn_local = ordn;
    int symmetry_local = Symmetry;
    for (int v = 0; v < num_var; ++v)
@@ -310,33 +456,34 @@ void run_cpu_interp_for_block(Block *BP,
 }

 void interpolate_owned_points(MyList<var> *VarList,
-                              int NN, double **XX,
                              double *Shellf, int Symmetry,
-                              int myrank, int ordn,
+                              int ordn,
                              const BlockBinIndex &block_index,
-                              const int *owner_rank,
-                              const int *owner_block)
+                              const CachedInterpPlan &plan)
 {
  vector<InterpVarDesc> vars;
  collect_interp_vars(VarList, vars);
  const int num_var = static_cast<int>(vars.size());
-  vector<vector<int>> block_points(block_index.views.size());

-  for (int j = 0; j < NN; ++j)
+  for (size_t bi = 0; bi < plan.block_points.size(); ++bi)
  {
-    if (owner_rank[j] == myrank && owner_block[j] >= 0)
-      block_points[owner_block[j]].push_back(j);
-  }
-
-  for (size_t bi = 0; bi < block_points.size(); ++bi)
-  {
-    if (block_points[bi].empty())
+    if (plan.block_points[bi].empty())
      continue;

    Block *BP = block_index.views[bi].bp;
-    bool done = run_cuda_interp_for_block(BP, vars, block_points[bi], XX, Shellf, num_var, ordn, Symmetry);
+    bool done = run_cuda_interp_for_block(BP, vars,
+                                          plan.block_points[bi],
+                                          plan.block_px[bi],
+                                          plan.block_py[bi],
+                                          plan.block_pz[bi],
+                                          Shellf, num_var, ordn, Symmetry);
    if (!done)
-      run_cpu_interp_for_block(BP, vars, block_points[bi], XX, Shellf, num_var, ordn, Symmetry);
+      run_cpu_interp_for_block(BP, vars,
+                               plan.block_points[bi],
+                               plan.block_px[bi],
+                               plan.block_py[bi],
+                               plan.block_pz[bi],
+                               Shellf, num_var, ordn, Symmetry);
  }
 }
 } // namespace
@@ -684,55 +831,15 @@ void Patch::Interp_Points(MyList<var> *VarList,

  memset(Shellf, 0, sizeof(double) * NN * num_var);

-  // owner_rank[j] records which MPI rank owns point j
-  // All ranks traverse the same block list so they all agree on ownership
-  int *owner_rank;
-  owner_rank = new int[NN];
-  int *owner_block;
-  owner_block = new int[NN];
-  for (int j = 0; j < NN; j++)
-  {
-    owner_rank[j] = -1;
-    owner_block[j] = -1;
-  }
-
-  double DH[dim];
-  for (int i = 0; i < dim; i++)
-    DH[i] = getdX(i);
-  BlockBinIndex block_index;
-  build_block_bin_index(this, DH, block_index);
-
-  for (int j = 0; j < NN; j++) // run along points
-  {
-    double pox[dim];
-    for (int i = 0; i < dim; i++)
-    {
-      pox[i] = XX[i][j];
-      if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
-      {
-        cout << "Patch::Interp_Points: point (";
-        for (int k = 0; k < dim; k++)
-        {
-          cout << XX[k][j];
-          if (k < dim - 1)
-            cout << ",";
-          else
-            cout << ") is out of current Patch." << endl;
-        }
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-    }
-
-    const int block_i = find_block_index_for_point(block_index, pox, DH);
-    if (block_i >= 0)
-    {
-      Block *BP = block_index.views[block_i].bp;
-      owner_rank[j] = BP->rank;
-      owner_block[j] = block_i;
-    }
-  }
+  double DH[dim];
+  for (int i = 0; i < dim; i++)
+    DH[i] = getdX(i);
+  BlockBinIndex block_index;
+  build_block_bin_index(this, DH, block_index);
+  CachedInterpPlan &plan = get_cached_interp_plan(this, NN, XX, Symmetry, myrank, DH, block_index, myrank == 0, false);
+  const int *owner_rank = plan.owner_rank.data();

-  interpolate_owned_points(VarList, NN, XX, Shellf, Symmetry, myrank, ordn, block_index, owner_rank, owner_block);
+  interpolate_owned_points(VarList, Shellf, Symmetry, ordn, block_index, plan);

  // Replace MPI_Allreduce with per-owner MPI_Bcast:
  // Group consecutive points by owner rank and broadcast each group.
@@ -788,8 +895,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
    }
  }

-  delete[] owner_rank;
-  delete[] owner_block;
 }
 void Patch::Interp_Points(MyList<var> *VarList,
                          int NN, double **XX,
@@ -818,98 +923,22 @@ void Patch::Interp_Points(MyList<var> *VarList,

  memset(Shellf, 0, sizeof(double) * NN * num_var);

-  // owner_rank[j] records which MPI rank owns point j
-  int *owner_rank;
-  owner_rank = new int[NN];
-  int *owner_block;
-  owner_block = new int[NN];
-  for (int j = 0; j < NN; j++)
-  {
-    owner_rank[j] = -1;
-    owner_block[j] = -1;
-  }
-
-  double DH[dim];
-  for (int i = 0; i < dim; i++)
-    DH[i] = getdX(i);
-  BlockBinIndex block_index;
-  build_block_bin_index(this, DH, block_index);
-
-  // --- Interpolation phase (identical to original) ---
-  for (int j = 0; j < NN; j++)
-  {
-    double pox[dim];
-    for (int i = 0; i < dim; i++)
-    {
-      pox[i] = XX[i][j];
-      if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
-      {
-        cout << "Patch::Interp_Points: point (";
-        for (int k = 0; k < dim; k++)
-        {
-          cout << XX[k][j];
-          if (k < dim - 1)
-            cout << ",";
-          else
-            cout << ") is out of current Patch." << endl;
-        }
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-    }
-
-    const int block_i = find_block_index_for_point(block_index, pox, DH);
-    if (block_i >= 0)
-    {
-      Block *BP = block_index.views[block_i].bp;
-      owner_rank[j] = BP->rank;
-      owner_block[j] = block_i;
-    }
-  }
+  double DH[dim];
+  for (int i = 0; i < dim; i++)
+    DH[i] = getdX(i);
+  BlockBinIndex block_index;
+  build_block_bin_index(this, DH, block_index);
+  CachedInterpPlan &plan = get_cached_interp_plan(this, NN, XX, Symmetry, myrank, DH, block_index, myrank == 0, false);
+  const int *owner_rank = plan.owner_rank.data();

-  interpolate_owned_points(VarList, NN, XX, Shellf, Symmetry, myrank, ordn, block_index, owner_rank, owner_block);
+  interpolate_owned_points(VarList, Shellf, Symmetry, ordn, block_index, plan);

 #ifdef INTERP_LB_PROFILE
  double t_interp_end = MPI_Wtime();
  double t_interp_local = t_interp_end - t_interp_start;
 #endif

-  // --- Error check for unfound points ---
-  for (int j = 0; j < NN; j++)
-  {
-    if (owner_rank[j] < 0 && myrank == 0)
-    {
-      cout << "ERROR: Patch::Interp_Points fails to find point (";
-      for (int d = 0; d < dim; d++)
-      {
-        cout << XX[d][j];
-        if (d < dim - 1)
-          cout << ",";
-        else
-          cout << ")";
-      }
-      cout << " on Patch (";
-      for (int d = 0; d < dim; d++)
-      {
-        cout << bbox[d] << "+" << lli[d] * DH[d];
-        if (d < dim - 1)
-          cout << ",";
-        else
-          cout << ")--";
-      }
-      cout << "(";
-      for (int d = 0; d < dim; d++)
-      {
-        cout << bbox[dim + d] << "-" << uui[d] * DH[d];
-        if (d < dim - 1)
-          cout << ",";
-        else
-          cout << ")" << endl;
-      }
-      MPI_Abort(MPI_COMM_WORLD, 1);
-    }
-  }
-
-  // --- Targeted point-to-point communication phase ---
+  // --- Targeted point-to-point communication phase ---
  // Compute consumer_rank[j] using the same deterministic formula as surface_integral
  int *consumer_rank = new int[NN];
  {
@@ -1028,8 +1057,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
  delete[] send_count;
  delete[] recv_count;
  delete[] consumer_rank;
-  delete[] owner_rank;
-  delete[] owner_block;

 #ifdef INTERP_LB_PROFILE
  {
@@ -1077,59 +1104,20 @@ void Patch::Interp_Points(MyList<var> *VarList,

  memset(Shellf, 0, sizeof(double) * NN * num_var);

-  // owner_rank[j] stores the global rank that owns point j
-  int *owner_rank;
-  owner_rank = new int[NN];
-  int *owner_block;
-  owner_block = new int[NN];
-  for (int j = 0; j < NN; j++)
-  {
-    owner_rank[j] = -1;
-    owner_block[j] = -1;
-  }
+  // Build global-to-local rank translation for Comm_here
+  MPI_Group world_group, local_group;
+  MPI_Comm_group(MPI_COMM_WORLD, &world_group);
+  MPI_Comm_group(Comm_here, &local_group);

-  // Build global-to-local rank translation for Comm_here
-  MPI_Group world_group, local_group;
-  MPI_Comm_group(MPI_COMM_WORLD, &world_group);
-  MPI_Comm_group(Comm_here, &local_group);
-
-  double DH[dim];
-  for (int i = 0; i < dim; i++)
-    DH[i] = getdX(i);
-  BlockBinIndex block_index;
-  build_block_bin_index(this, DH, block_index);
-
-  for (int j = 0; j < NN; j++) // run along points
-  {
-    double pox[dim];
-    for (int i = 0; i < dim; i++)
-    {
-      pox[i] = XX[i][j];
-      if (lmyrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
-      {
-        cout << "Patch::Interp_Points: point (";
-        for (int k = 0; k < dim; k++)
-        {
-          cout << XX[k][j];
-          if (k < dim - 1)
-            cout << ",";
-          else
-            cout << ") is out of current Patch." << endl;
-        }
-        MPI_Abort(MPI_COMM_WORLD, 1);
-      }
-    }
-
-    const int block_i = find_block_index_for_point(block_index, pox, DH);
-    if (block_i >= 0)
-    {
-      Block *BP = block_index.views[block_i].bp;
-      owner_rank[j] = BP->rank;
-      owner_block[j] = block_i;
-    }
-  }
+  double DH[dim];
+  for (int i = 0; i < dim; i++)
+    DH[i] = getdX(i);
+  BlockBinIndex block_index;
+  build_block_bin_index(this, DH, block_index);
+  CachedInterpPlan &plan = get_cached_interp_plan(this, NN, XX, Symmetry, myrank, DH, block_index, lmyrank == 0, true);
+  const int *owner_rank = plan.owner_rank.data();

-  interpolate_owned_points(VarList, NN, XX, Shellf, Symmetry, myrank, ordn, block_index, owner_rank, owner_block);
+  interpolate_owned_points(VarList, Shellf, Symmetry, ordn, block_index, plan);

  // Collect unique global owner ranks and translate to local ranks in Comm_here
  // Then broadcast each owner's points via MPI_Bcast on Comm_here
@@ -1159,8 +1147,6 @@ void Patch::Interp_Points(MyList<var> *VarList,

  MPI_Group_free(&world_group);
  MPI_Group_free(&local_group);
-  delete[] owner_rank;
-  delete[] owner_block;
 }
 void Patch::checkBlock()
 {
--- a/AMSS_NCKU_source/bssn_cuda_ops.cu
+++ b/AMSS_NCKU_source/bssn_cuda_ops.cu
@@ -953,14 +953,36 @@ int bssn_cuda_interp_points_batch(const int *ex,

  struct InterpBatchCache
  {
+    struct StencilCacheEntry
+    {
+      const double *X;
+      const double *Y;
+      const double *Z;
+      const double *px;
+      const double *py;
+      const double *pz;
+      int nx;
+      int ny;
+      int nz;
+      int num_points;
+      int ordn;
+      int symmetry;
+      CachedBuffer weights;
+      CachedIntBuffer indices;
+      CachedIntBuffer reflect;
+
+      StencilCacheEntry()
+          : X(nullptr), Y(nullptr), Z(nullptr),
+            px(nullptr), py(nullptr), pz(nullptr),
+            nx(0), ny(0), nz(0), num_points(0), ordn(0), symmetry(0) {}
+    };
+
    CachedBuffer out;
    CachedBuffer soa;
    CachedBuffer field_ptrs;
-    CachedBuffer weights;
-    CachedIntBuffer indices;
-    CachedIntBuffer reflect;
    CachedIntBuffer error_flag;
    std::vector<CachedBuffer> host_field_copies;
+    std::vector<StencilCacheEntry> stencil_entries;
  };
  static thread_local InterpBatchCache cache;

@@ -978,45 +1000,82 @@ int bssn_cuda_interp_points_batch(const int *ex,
  const size_t indices_bytes = point_stencil_ints * sizeof(int);

  bool ok = true;
-  std::vector<double> host_weights(point_stencil_doubles);
-  std::vector<int> host_indices(point_stencil_ints);
-  std::vector<int> host_reflect(point_stencil_ints);
-  const double dx = X[1] - X[0];
-  const double dy = Y[1] - Y[0];
-  const double dz = Z[1] - Z[0];
-  const int allow_reflect_x = (symmetry == 2 && std::fabs(X[0]) < dx);
-  const int allow_reflect_y = (symmetry == 2 && std::fabs(Y[0]) < dy);
-  const int allow_reflect_z = (symmetry != 0 && std::fabs(Z[0]) < dz);
-  for (int p = 0; p < num_points; ++p)
+  InterpBatchCache::StencilCacheEntry *stencil_cache = nullptr;
+  for (size_t i = 0; i < cache.stencil_entries.size(); ++i)
  {
-    int start_x = 0, start_y = 0, start_z = 0;
-    double cx = 0.0, cy = 0.0, cz = 0.0;
-    const bool ok_x = compute_interp_window_host(px[p], X, nx, ordn, allow_reflect_x, &start_x, &cx);
-    const bool ok_y = compute_interp_window_host(py[p], Y, ny, ordn, allow_reflect_y, &start_y, &cy);
-    const bool ok_z = compute_interp_window_host(pz[p], Z, nz, ordn, allow_reflect_z, &start_z, &cz);
-    if (!ok_x || !ok_y || !ok_z)
-      return 1;
-
-    lagrange_weights_ord6_host(cx, host_weights.data() + p * 18);
-    lagrange_weights_ord6_host(cy, host_weights.data() + p * 18 + 6);
-    lagrange_weights_ord6_host(cz, host_weights.data() + p * 18 + 12);
-
-    for (int i = 0; i < 6; ++i)
+    InterpBatchCache::StencilCacheEntry &entry = cache.stencil_entries[i];
+    if (entry.X == X && entry.Y == Y && entry.Z == Z &&
+        entry.px == px && entry.py == py && entry.pz == pz &&
+        entry.nx == nx && entry.ny == ny && entry.nz == nz &&
+        entry.num_points == num_points && entry.ordn == ordn &&
+        entry.symmetry == symmetry)
    {
-      if (!map_interp_index_host(start_x + i, nx, &host_indices[p * 18 + i], &host_reflect[p * 18 + i]))
-        return 1;
-      if (!map_interp_index_host(start_y + i, ny, &host_indices[p * 18 + 6 + i], &host_reflect[p * 18 + 6 + i]))
-        return 1;
-      if (!map_interp_index_host(start_z + i, nz, &host_indices[p * 18 + 12 + i], &host_reflect[p * 18 + 12 + i]))
-        return 1;
+      stencil_cache = &entry;
+      break;
    }
  }

+  if (!stencil_cache)
+  {
+    cache.stencil_entries.push_back(InterpBatchCache::StencilCacheEntry());
+    stencil_cache = &cache.stencil_entries.back();
+    stencil_cache->X = X;
+    stencil_cache->Y = Y;
+    stencil_cache->Z = Z;
+    stencil_cache->px = px;
+    stencil_cache->py = py;
+    stencil_cache->pz = pz;
+    stencil_cache->nx = nx;
+    stencil_cache->ny = ny;
+    stencil_cache->nz = nz;
+    stencil_cache->num_points = num_points;
+    stencil_cache->ordn = ordn;
+    stencil_cache->symmetry = symmetry;
+
+    std::vector<double> host_weights(point_stencil_doubles);
+    std::vector<int> host_indices(point_stencil_ints);
+    std::vector<int> host_reflect(point_stencil_ints);
+    const double dx = X[1] - X[0];
+    const double dy = Y[1] - Y[0];
+    const double dz = Z[1] - Z[0];
+    const int allow_reflect_x = (symmetry == 2 && std::fabs(X[0]) < dx);
+    const int allow_reflect_y = (symmetry == 2 && std::fabs(Y[0]) < dy);
+    const int allow_reflect_z = (symmetry != 0 && std::fabs(Z[0]) < dz);
+    for (int p = 0; p < num_points; ++p)
+    {
+      int start_x = 0, start_y = 0, start_z = 0;
+      double cx = 0.0, cy = 0.0, cz = 0.0;
+      const bool ok_x = compute_interp_window_host(px[p], X, nx, ordn, allow_reflect_x, &start_x, &cx);
+      const bool ok_y = compute_interp_window_host(py[p], Y, ny, ordn, allow_reflect_y, &start_y, &cy);
+      const bool ok_z = compute_interp_window_host(pz[p], Z, nz, ordn, allow_reflect_z, &start_z, &cz);
+      if (!ok_x || !ok_y || !ok_z)
+        return 1;
+
+      lagrange_weights_ord6_host(cx, host_weights.data() + p * 18);
+      lagrange_weights_ord6_host(cy, host_weights.data() + p * 18 + 6);
+      lagrange_weights_ord6_host(cz, host_weights.data() + p * 18 + 12);
+
+      for (int i = 0; i < 6; ++i)
+      {
+        if (!map_interp_index_host(start_x + i, nx, &host_indices[p * 18 + i], &host_reflect[p * 18 + i]))
+          return 1;
+        if (!map_interp_index_host(start_y + i, ny, &host_indices[p * 18 + 6 + i], &host_reflect[p * 18 + 6 + i]))
+          return 1;
+        if (!map_interp_index_host(start_z + i, nz, &host_indices[p * 18 + 12 + i], &host_reflect[p * 18 + 12 + i]))
+          return 1;
+      }
+    }
+
+    ok = ok &&
+         copy_to_device(stencil_cache->weights, host_weights.data(), weights_bytes) &&
+         copy_to_device(stencil_cache->indices, host_indices.data(), indices_bytes) &&
+         copy_to_device(stencil_cache->reflect, host_reflect.data(), indices_bytes);
+    if (!ok)
+      return 1;
+  }
+
  ok = ok &&
       copy_to_device(cache.soa, soa_flat, soa_bytes) &&
-       copy_to_device(cache.weights, host_weights.data(), weights_bytes) &&
-       copy_to_device(cache.indices, host_indices.data(), indices_bytes) &&
-       copy_to_device(cache.reflect, host_reflect.data(), indices_bytes) &&
       ensure_capacity(cache.out, out_bytes) &&
       ensure_capacity(cache.field_ptrs, ptr_bytes) &&
       ensure_capacity(cache.error_flag, sizeof(int));
@@ -1063,9 +1122,9 @@ int bssn_cuda_interp_points_batch(const int *ex,
  int ny_local = ny;
  const double *dsoa = cache.soa.ptr;
  const double *const *dfields = reinterpret_cast<const double *const *>(cache.field_ptrs.ptr);
-  const double *dweights = cache.weights.ptr;
-  const int *dindices = cache.indices.ptr;
-  const int *dreflect = cache.reflect.ptr;
+  const double *dweights = stencil_cache->weights.ptr;
+  const int *dindices = stencil_cache->indices.ptr;
+  const int *dreflect = stencil_cache->reflect.ptr;
  double *dout = cache.out.ptr;
  int *derror = cache.error_flag.ptr;

--- a/AMSS_NCKU_source/surface_integral.C
+++ b/AMSS_NCKU_source/surface_integral.C
@@ -180,19 +180,64 @@ surface_integral::surface_integral(int iSymmetry) : Symmetry(iSymmetry)
 //|============================================================================
 //| Destructor
 //|============================================================================
-surface_integral::~surface_integral()
-{
-  delete[] nx_g;
-  delete[] ny_g;
-  delete[] nz_g;
-  delete[] arcostheta;
-#ifdef GaussInt
-  delete[] wtcostheta;
-#endif
-}
-//|----------------------------------------------------------------
-//  spin weighted spinw component of psi4, general routine
-//  l takes from spinw to maxl; m takes from -l to l
+surface_integral::~surface_integral()
+{
+  release_cached_buffers();
+  delete[] nx_g;
+  delete[] ny_g;
+  delete[] nz_g;
+  delete[] arcostheta;
+#ifdef GaussInt
+  delete[] wtcostheta;
+#endif
+}
+
+void surface_integral::get_surface_points(double rex, double **pox)
+{
+  SpherePointCache &cache = sphere_point_cache[rex];
+  if (!cache.pox[0])
+  {
+    for (int i = 0; i < 3; ++i)
+      cache.pox[i] = new double[n_tot];
+    for (int n = 0; n < n_tot; ++n)
+    {
+      cache.pox[0][n] = rex * nx_g[n];
+      cache.pox[1][n] = rex * ny_g[n];
+      cache.pox[2][n] = rex * nz_g[n];
+    }
+  }
+
+  pox[0] = cache.pox[0];
+  pox[1] = cache.pox[1];
+  pox[2] = cache.pox[2];
+}
+
+double *surface_integral::get_shellf_buffer(int num_var)
+{
+  double *&buffer = shellf_cache[num_var];
+  if (!buffer)
+    buffer = new double[n_tot * num_var];
+  return buffer;
+}
+
+void surface_integral::release_cached_buffers()
+{
+  for (map<double, SpherePointCache>::iterator it = sphere_point_cache.begin(); it != sphere_point_cache.end(); ++it)
+  {
+    delete[] it->second.pox[0];
+    delete[] it->second.pox[1];
+    delete[] it->second.pox[2];
+    it->second.pox[0] = it->second.pox[1] = it->second.pox[2] = 0;
+  }
+  sphere_point_cache.clear();
+
+  for (map<int, double *>::iterator it = shellf_cache.begin(); it != shellf_cache.end(); ++it)
+    delete[] it->second;
+  shellf_cache.clear();
+}
+//|----------------------------------------------------------------
+//  spin weighted spinw component of psi4, general routine
+//  l takes from spinw to maxl; m takes from -l to l
 //|----------------------------------------------------------------
 void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,
                                 int spinw, int maxl, int NN, double *RP, double *IP,
@@ -209,16 +254,9 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
  MyList<var> *DG_List = new MyList<var>(Rpsi4);
  DG_List->insert(Ipsi4);

-  int n;
-  double *pox[3];
-  for (int i = 0; i < 3; i++)
-    pox[i] = new double[n_tot];
-  for (n = 0; n < n_tot; n++)
-  {
-    pox[0][n] = rex * nx_g[n];
-    pox[1][n] = rex * ny_g[n];
-    pox[2][n] = rex * nz_g[n];
-  }
+  int n;
+  double *pox[3];
+  get_surface_points(rex, pox);

  int mp, Lp, Nmin, Nmax;
  mp = n_tot / cpusize;
@@ -234,8 +272,7 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
    Nmax = Nmin + mp - 1;
  }

-  double *shellf;
-  shellf = new double[n_tot * InList];
+  double *shellf = get_shellf_buffer(InList);

  GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);

@@ -375,14 +412,10 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *

  //|------= Free memory.

-  delete[] pox[0];
-  delete[] pox[1];
-  delete[] pox[2];
-  delete[] shellf;
-  delete[] RP_out;
-  delete[] IP_out;
-  DG_List->clearList();
-}
+  delete[] RP_out;
+  delete[] IP_out;
+  DG_List->clearList();
+}
 void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,
                                 int spinw, int maxl, int NN, double *RP, double *IP,
                                 monitor *Monitor, MPI_Comm Comm_here) // NN is the length of RP and IP
@@ -402,19 +435,11 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *
  MyList<var> *DG_List = new MyList<var>(Rpsi4);
  DG_List->insert(Ipsi4);

-  int n;
-  double *pox[3];
-  for (int i = 0; i < 3; i++)
-    pox[i] = new double[n_tot];
-  for (n = 0; n < n_tot; n++)
-  {
-    pox[0][n] = rex * nx_g[n];
-    pox[1][n] = rex * ny_g[n];
-    pox[2][n] = rex * nz_g[n];
-  }
-
-  double *shellf;
-  shellf = new double[n_tot * InList];
+  int n;
+  double *pox[3];
+  get_surface_points(rex, pox);
+
+  double *shellf = get_shellf_buffer(InList);

  //    misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Interp_Points");

@@ -577,14 +602,10 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *

  //|------= Free memory.

-  delete[] pox[0];
-  delete[] pox[1];
-  delete[] pox[2];
-  delete[] shellf;
-  delete[] RP_out;
-  delete[] IP_out;
-  DG_List->clearList();
-}
+  delete[] RP_out;
+  delete[] IP_out;
+  DG_List->clearList();
+}
 //|----------------------------------------------------------------
 //  for shell patch
 //|----------------------------------------------------------------
@@ -597,19 +618,11 @@ void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH, var *Rpsi4
  MyList<var> *DG_List = new MyList<var>(Rpsi4);
  DG_List->insert(Ipsi4);

-  int n;
-  double *pox[3];
-  for (int i = 0; i < 3; i++)
-    pox[i] = new double[n_tot];
-  for (n = 0; n < n_tot; n++)
-  {
-    pox[0][n] = rex * nx_g[n];
-    pox[1][n] = rex * ny_g[n];
-    pox[2][n] = rex * nz_g[n];
-  }
+  int n;
+  double *pox[3];
+  get_surface_points(rex, pox);

-  double *shellf;
-  shellf = new double[n_tot * InList];
+  double *shellf = get_shellf_buffer(InList);

  GH->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry);

@@ -2570,12 +2583,8 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
  Rout[5] = sy;
  Rout[6] = sz;

-  delete[] pox[0];
-  delete[] pox[1];
-  delete[] pox[2];
-  delete[] shellf;
-  DG_List->clearList();
-}
+  DG_List->clearList();
+}
 void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var *trK,
                                     var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz,
                                     var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz,
@@ -2637,19 +2646,11 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
  DG_List->insert(Ayz);
  DG_List->insert(Azz);

-  int n;
-  double *pox[3];
-  for (int i = 0; i < 3; i++)
-    pox[i] = new double[n_tot];
-  for (n = 0; n < n_tot; n++)
-  {
-    pox[0][n] = rex * nx_g[n];
-    pox[1][n] = rex * ny_g[n];
-    pox[2][n] = rex * nz_g[n];
-  }
-
-  double *shellf;
-  shellf = new double[n_tot * InList];
+  int n;
+  double *pox[3];
+  get_surface_points(rex, pox);
+
+  double *shellf = get_shellf_buffer(InList);

  // we have assumed there is only one box on this level,
  // so we do not need loop boxes
@@ -2839,12 +2840,8 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var
  Rout[5] = sy;
  Rout[6] = sz;

-  delete[] pox[0];
-  delete[] pox[1];
-  delete[] pox[2];
-  delete[] shellf;
-  DG_List->clearList();
-}
+  DG_List->clearList();
+}
 //|----------------------------------------------------------------
 //  for shell patch
 //|----------------------------------------------------------------
--- a/AMSS_NCKU_source/surface_integral.h
+++ b/AMSS_NCKU_source/surface_integral.h
@@ -20,25 +20,41 @@ using namespace std;
 #include "cgh.h"
 #include "ShellPatch.h"
 #include "NullShellPatch.h"
-#include "NullShellPatch2.h"
-#include "var.h"
-#include "monitor.h"
+#include "NullShellPatch2.h"
+#include "var.h"
+#include "monitor.h"
+#include <map>

 class surface_integral
 {

-private:
-	int Symmetry, factor;
-	int N_theta, N_phi; // Number of points in Theta & Phi directions
-	double dphi, dcostheta;
-	double *arcostheta, *wtcostheta;
-	int n_tot; // size of arrays
-
-	double *nx_g, *ny_g, *nz_g; // global list of unit normals
-	int myrank, cpusize;
-
-public:
-	surface_integral(int iSymmetry);
+private:
+	struct SpherePointCache
+	{
+		double *pox[3];
+		SpherePointCache()
+		{
+			pox[0] = pox[1] = pox[2] = 0;
+		}
+	};
+
+	int Symmetry, factor;
+	int N_theta, N_phi; // Number of points in Theta & Phi directions
+	double dphi, dcostheta;
+	double *arcostheta, *wtcostheta;
+	int n_tot; // size of arrays
+
+	double *nx_g, *ny_g, *nz_g; // global list of unit normals
+	int myrank, cpusize;
+	map<double, SpherePointCache> sphere_point_cache;
+	map<int, double *> shellf_cache;
+
+	void get_surface_points(double rex, double **pox);
+	double *get_shellf_buffer(int num_var);
+	void release_cached_buffers();
+
+public:
+	surface_integral(int iSymmetry);
 	~surface_integral();

 	void surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,