diff --git a/AMSS_NCKU_source/ShellPatch.C b/AMSS_NCKU_source/ShellPatch.C
index 2f092d5..a2f5843 100644
--- a/AMSS_NCKU_source/ShellPatch.C
+++ b/AMSS_NCKU_source/ShellPatch.C
@@ -5,24 +5,971 @@
 #include <cstdlib>
 #include <cstdio>
 #include <string>
-#include <cmath>
-#include <new>
-using namespace std;
+#include <cmath>
+#include <new>
+#include <map>
+#include <vector>
+#include <thread>
+#include <algorithm>
+using namespace std;
 
 #include "ShellPatch.h"
 #include "Parallel.h"
 #include "fmisc.h"
 #include "misc.h"
 #include "shellfunctions.h"
-#include "parameters.h"
-
-#define PI M_PI
+#include "parameters.h"
+
+#define PI M_PI
+
+#if USE_CUDA_BSSN
+extern "C" int bssn_cuda_shell_pack_host_fields(int npoints,
+                                                int nvars,
+                                                int nblocks,
+                                                int ordn,
+                                                double **block_var_fields,
+                                                int *block_shapes,
+                                                int *point_block,
+                                                int *point_dimh,
+                                                int *point_dumyd,
+                                                int *point_sind,
+                                                double *point_coef,
+                                                double *out);
+extern "C" void bssn_cuda_shell_pack_cache_begin();
+extern "C" void bssn_cuda_shell_pack_cache_end();
+#endif
 
 // x   x   x   x   x   o   *
 //             *   o   x   x   x   x   x
 // each side contribute an overlap points
 // so we need half of that
-#define overghost ((ghost_width + 1) / 2 + ghost_width)
+#define overghost ((ghost_width + 1) / 2 + ghost_width)
+
+namespace {
+
+bool shell_fast_interp_enabled()
+{
+  static int enabled = -1;
+  if (enabled < 0)
+  {
+    const char *env = getenv("AMSS_SHELL_FAST_INTERP");
+    enabled = (!env || atoi(env) != 0) ? 1 : 0;
+  }
+  return enabled != 0;
+}
+
+bool shell_parallel_interp_enabled()
+{
+  static int enabled = -1;
+  if (enabled < 0)
+  {
+    const char *env = getenv("AMSS_SHELL_PARALLEL_INTERP");
+    enabled = (!env || atoi(env) != 0) ? 1 : 0;
+  }
+  return enabled != 0;
+}
+
+bool shell_cuda_interp_enabled()
+{
+#if USE_CUDA_BSSN
+  static int enabled = -1;
+  if (enabled < 0)
+  {
+    const char *env = getenv("AMSS_SHELL_CUDA_INTERP");
+    enabled = (env && atoi(env) != 0) ? 1 : 0;
+  }
+  return enabled != 0;
+#else
+  return false;
+#endif
+}
+
+bool shell_interp_stats_enabled()
+{
+  static int enabled = -1;
+  if (enabled < 0)
+  {
+    const char *env = getenv("AMSS_SHELL_INTERP_STATS");
+    enabled = (env && atoi(env) != 0) ? 1 : 0;
+  }
+  return enabled != 0;
+}
+
+bool shell_transfer_timing_enabled()
+{
+  static int enabled = -1;
+  if (enabled < 0)
+  {
+    const char *env = getenv("AMSS_SHELL_TRANSFER_TIMING");
+    enabled = (env && atoi(env) != 0) ? 1 : 0;
+  }
+  return enabled != 0;
+}
+
+int shell_interp_threads()
+{
+  static int threads = -1;
+  if (threads < 0)
+  {
+    const char *env = getenv("AMSS_SHELL_INTERP_THREADS");
+    int requested = env ? atoi(env) : 0;
+    if (requested <= 0)
+    {
+      unsigned int hw = std::thread::hardware_concurrency();
+      requested = hw > 0 ? (int)hw : 8;
+      requested = std::min(requested, 32);
+    }
+    threads = std::max(1, requested);
+  }
+  return threads;
+}
+
+int shell_var_count(MyList<var> *VarList)
+{
+  int count = 0;
+  while (VarList)
+  {
+    ++count;
+    VarList = VarList->next;
+  }
+  return count;
+}
+
+struct ShellInterpStats
+{
+  long long pack_dim3 = 0;
+  long long pack_dim2 = 0;
+  long long pack_dim1 = 0;
+  long long fast_dim3 = 0;
+  long long fast_dim2 = 0;
+  long long fast_dim1 = 0;
+  long long fallback_dim3 = 0;
+  long long fallback_dim2 = 0;
+  long long fallback_dim1 = 0;
+};
+
+ShellInterpStats shell_interp_stats;
+
+void shell_note_pack(int dimh, bool fast_done)
+{
+  if (!shell_interp_stats_enabled())
+    return;
+  if (dimh == 3)
+  {
+    shell_interp_stats.pack_dim3++;
+    fast_done ? shell_interp_stats.fast_dim3++ : shell_interp_stats.fallback_dim3++;
+  }
+  else if (dimh == 2)
+  {
+    shell_interp_stats.pack_dim2++;
+    fast_done ? shell_interp_stats.fast_dim2++ : shell_interp_stats.fallback_dim2++;
+  }
+  else if (dimh == 1)
+  {
+    shell_interp_stats.pack_dim1++;
+    fast_done ? shell_interp_stats.fast_dim1++ : shell_interp_stats.fallback_dim1++;
+  }
+}
+
+void shell_print_interp_stats(const char *label)
+{
+  if (!shell_interp_stats_enabled())
+    return;
+
+  int rank = 0;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  cout << "[AMSS-SHELL-INTERP-STATS] rank=" << rank << " " << label
+       << " pack3=" << shell_interp_stats.pack_dim3
+       << " fast3=" << shell_interp_stats.fast_dim3
+       << " fallback3=" << shell_interp_stats.fallback_dim3
+       << " pack2=" << shell_interp_stats.pack_dim2
+       << " fast2=" << shell_interp_stats.fast_dim2
+       << " fallback2=" << shell_interp_stats.fallback_dim2
+       << " pack1=" << shell_interp_stats.pack_dim1
+       << " fast1=" << shell_interp_stats.fast_dim1
+       << " fallback1=" << shell_interp_stats.fallback_dim1
+       << endl;
+}
+
+inline int shell_idx3(const int *shape, int i, int j, int k)
+{
+  return i + shape[0] * (j + shape[1] * k);
+}
+
+bool shell_reflect_index(int idx, int n, int ordn, int &mapped, bool &reflected)
+{
+  reflected = false;
+  if (idx >= 0 && idx < n)
+  {
+    mapped = idx;
+    return true;
+  }
+  if (idx < 0)
+  {
+    if (idx < -ordn)
+      return false;
+    mapped = -idx;
+    reflected = true;
+    return mapped >= 0 && mapped < n;
+  }
+  if (idx <= n + ordn - 1)
+  {
+    mapped = 2 * n - 2 - idx;
+    reflected = true;
+    return mapped >= 0 && mapped < n;
+  }
+  return false;
+}
+
+void shell_symmetry_soa3(const var *vp, int sst, double soa[3])
+{
+  if (sst == 2 || sst == 3)
+  {
+    soa[0] = vp->SoA[1];
+    soa[1] = vp->SoA[2];
+    soa[2] = 0.0;
+  }
+  else if (sst == 4 || sst == 5)
+  {
+    soa[0] = vp->SoA[0];
+    soa[1] = vp->SoA[2];
+    soa[2] = 0.0;
+  }
+  else
+  {
+    soa[0] = vp->SoA[0];
+    soa[1] = vp->SoA[1];
+    soa[2] = (sst == -1) ? vp->SoA[2] : 0.0;
+  }
+}
+
+double shell_symmetry_soa1(const var *vp, int sst, int dumyd)
+{
+  if (dumyd == 1)
+  {
+    if (sst == 2 || sst == 3)
+      return vp->SoA[1];
+    return vp->SoA[0];
+  }
+  if (dumyd == 0)
+  {
+    if (sst == 0 || sst == 1)
+      return vp->SoA[1];
+    return vp->SoA[2];
+  }
+  return 1.0;
+}
+
+bool shell_interp3d_fast(const ShellPatch::pointstru *pt, const var *vp, double &out, int ordn)
+{
+  const int *shape = pt->Bg->shape;
+  const int *s = pt->sind;
+  if (!s || !pt->coef)
+    return false;
+  if (s[0] < 0 || s[1] < 0 || s[2] < 0 ||
+      s[0] + ordn > shape[0] ||
+      s[1] + ordn > shape[1] ||
+      s[2] + ordn > shape[2])
+    return false;
+
+  const double *f = pt->Bg->fgfs[vp->sgfn];
+  const double *cx = pt->coef;
+  const double *cy = pt->coef + ordn;
+  const double *cz = pt->coef + 2 * ordn;
+  double sum = 0.0;
+  for (int k = 0; k < ordn; ++k)
+    for (int j = 0; j < ordn; ++j)
+      for (int i = 0; i < ordn; ++i)
+        sum += cx[i] * cy[j] * cz[k] *
+               f[shell_idx3(shape, s[0] + i, s[1] + j, s[2] + k)];
+  out = sum;
+  return true;
+}
+
+bool shell_interp2d_fast(const ShellPatch::pointstru *pt, const var *vp, double &out, int ordn)
+{
+  const int *shape = pt->Bg->shape;
+  const int *s = pt->sind;
+  if (!s || !pt->coef)
+    return false;
+  const int k0 = s[2] - 1; // Match global_interpind2d's Fortran fixed-index convention.
+  if (s[0] < 0 || s[1] < 0 ||
+      s[0] + ordn > shape[0] ||
+      s[1] + ordn > shape[1] ||
+      k0 < 0 || k0 >= shape[2])
+    return false;
+
+  const double *f = pt->Bg->fgfs[vp->sgfn];
+  const double *cx = pt->coef;
+  const double *cy = pt->coef + ordn;
+  double sum = 0.0;
+  for (int j = 0; j < ordn; ++j)
+    for (int i = 0; i < ordn; ++i)
+      sum += cx[i] * cy[j] * f[shell_idx3(shape, s[0] + i, s[1] + j, k0)];
+  out = sum;
+  return true;
+}
+
+bool shell_interp1d_fast(const ShellPatch::pointstru *pt, const var *vp, double &out, int ordn)
+{
+  const int *shape = pt->Bg->shape;
+  const int *s = pt->sind;
+  if (!s || !pt->coef)
+    return false;
+
+  const double *f = pt->Bg->fgfs[vp->sgfn];
+  double sum = 0.0;
+  if (pt->dumyd == 1)
+  {
+    if (s[0] < 0 || s[0] + ordn > shape[0] ||
+        s[1] < 0 || s[1] >= shape[1] ||
+        s[2] < 0 || s[2] >= shape[2])
+      return false;
+    for (int i = 0; i < ordn; ++i)
+      sum += pt->coef[i] * f[shell_idx3(shape, s[0] + i, s[1], s[2])];
+  }
+  else if (pt->dumyd == 0)
+  {
+    if (s[0] < 0 || s[0] + ordn > shape[1] ||
+        s[1] < 0 || s[1] >= shape[0] ||
+        s[2] < 0 || s[2] >= shape[2])
+      return false;
+    for (int j = 0; j < ordn; ++j)
+      sum += pt->coef[j] * f[shell_idx3(shape, s[1], s[0] + j, s[2])];
+  }
+  else
+    return false;
+
+  out = sum;
+  return true;
+}
+
+bool shell_interp_fast_possible(const ShellPatch::pointstru *pt, int ordn)
+{
+  const int DIMh = (pt->dumyd == -1) ? dim : 1;
+  const int *shape = pt->Bg->shape;
+  const int *s = pt->sind;
+  if (!s || !pt->coef)
+    return false;
+
+  if (DIMh == 3)
+  {
+    if (s[0] < -ordn || s[0] + ordn - 1 > shape[0] + ordn - 1 ||
+        s[1] < -ordn || s[1] + ordn - 1 > shape[1] + ordn - 1 ||
+        s[2] < -ordn || s[2] + ordn - 1 > shape[2] + ordn - 1)
+      return false;
+    if (pt->ssst != -1 && (s[2] < 0 || s[2] + ordn > shape[2]))
+      return false;
+    return true;
+  }
+
+  if (DIMh == 1)
+  {
+    if (pt->dumyd == 1)
+      return s[0] >= -ordn && s[0] + ordn - 1 <= shape[0] + ordn - 1 &&
+             s[1] >= 0 && s[1] < shape[1] &&
+             s[2] >= 0 && s[2] < shape[2];
+    if (pt->dumyd == 0)
+      return s[0] >= -ordn && s[0] + ordn - 1 <= shape[1] + ordn - 1 &&
+             s[1] >= 0 && s[1] < shape[0] &&
+             s[2] >= 0 && s[2] < shape[2];
+  }
+
+  return false;
+}
+
+bool shell_pointcopy_index(const ShellPatch::pointstru *pt, int &idx);
+
+bool shell_pointcopy_fast(const ShellPatch::pointstru *pt, const var *vp, double value)
+{
+  int idx = -1;
+  if (!shell_pointcopy_index(pt, idx))
+    return false;
+  pt->Bg->fgfs[vp->sgfn][idx] = value;
+  return true;
+}
+
+bool shell_pointcopy_index(const ShellPatch::pointstru *pt, int &idx)
+{
+  Block *bg = pt->Bg;
+  const int *shape = bg->shape;
+  if (shape[0] <= 1 || shape[1] <= 1 || shape[2] <= 1)
+    return false;
+  double h[3];
+  for (int d = 0; d < dim; ++d)
+#ifdef Vertex
+    h[d] = (bg->bbox[dim + d] - bg->bbox[d]) / (shape[d] - 1);
+#else
+    h[d] = (bg->bbox[dim + d] - bg->bbox[d]) / shape[d];
+#endif
+  int i = int((pt->lpox[0] - bg->bbox[0]) / h[0] + 0.4);
+  int j = int((pt->lpox[1] - bg->bbox[1]) / h[1] + 0.4);
+  int k = int((pt->lpox[2] - bg->bbox[2]) / h[2] + 0.4);
+  if (i < 0 || i >= shape[0] || j < 0 || j >= shape[1] || k < 0 || k >= shape[2])
+    return false;
+  idx = shell_idx3(shape, i, j, k);
+  return true;
+}
+
+bool shell_pointcopy_possible(const ShellPatch::pointstru *pt)
+{
+  int idx = -1;
+  return shell_pointcopy_index(pt, idx);
+}
+
+bool shell_pack3d_fast_all(double *data, int out_base, ShellPatch::pointstru *src,
+                           const std::vector<var *> &vars, int ordn)
+{
+  const int *shape = src->Bg->shape;
+  const int *s = src->sind;
+  if (!s || !src->coef)
+    return false;
+  if (s[0] < -ordn || s[0] + ordn - 1 > shape[0] + ordn - 1 ||
+      s[1] < -ordn || s[1] + ordn - 1 > shape[1] + ordn - 1 ||
+      s[2] < -ordn || s[2] + ordn - 1 > shape[2] + ordn - 1)
+    return false;
+  if (src->ssst != -1 && (s[2] < 0 || s[2] + ordn > shape[2]))
+    return false;
+
+  const int nst = ordn * ordn * ordn;
+  if (ordn <= 0 || nst > 1728)
+    return false;
+  double weights[1728];
+  int offsets[1728];
+  unsigned char reflect_mask[1728];
+  const double *cx = src->coef;
+  const double *cy = src->coef + ordn;
+  const double *cz = src->coef + 2 * ordn;
+  int q = 0;
+  for (int k = 0; k < ordn; ++k)
+  {
+    int mk = 0;
+    bool rk = false;
+    if (!shell_reflect_index(s[2] + k, shape[2], ordn, mk, rk))
+      return false;
+    const double wz = cz[k];
+    const int zk = mk * shape[0] * shape[1];
+    for (int j = 0; j < ordn; ++j)
+    {
+      int mj = 0;
+      bool rj = false;
+      if (!shell_reflect_index(s[1] + j, shape[1], ordn, mj, rj))
+        return false;
+      const double wyz = cy[j] * wz;
+      const int yj = mj * shape[0];
+      for (int i = 0; i < ordn; ++i)
+      {
+        int mi = 0;
+        bool ri = false;
+        if (!shell_reflect_index(s[0] + i, shape[0], ordn, mi, ri))
+          return false;
+        weights[q] = cx[i] * wyz;
+        offsets[q] = mi + yj + zk;
+        reflect_mask[q] = (ri ? 1 : 0) | (rj ? 2 : 0) | (rk ? 4 : 0);
+        ++q;
+      }
+    }
+  }
+
+  for (size_t vi = 0; vi < vars.size(); ++vi)
+  {
+    const double *f = src->Bg->fgfs[vars[vi]->sgfn];
+    double soa[3];
+    shell_symmetry_soa3(vars[vi], src->ssst, soa);
+    double sum = 0.0;
+    for (int p = 0; p < nst; ++p)
+    {
+      double fac = 1.0;
+      if (reflect_mask[p] & 1) fac *= soa[0];
+      if (reflect_mask[p] & 2) fac *= soa[1];
+      if (reflect_mask[p] & 4) fac *= soa[2];
+      sum += weights[p] * fac * f[offsets[p]];
+    }
+    data[out_base + (int)vi] = sum;
+    shell_note_pack(3, true);
+  }
+  return true;
+}
+
+bool shell_pack2d_fast_all(double *data, int out_base, ShellPatch::pointstru *src,
+                           const std::vector<var *> &vars, int ordn)
+{
+  const int *shape = src->Bg->shape;
+  const int *s = src->sind;
+  if (!s || !src->coef)
+    return false;
+  const int k0 = s[2] - 1;
+  if (s[0] < 0 || s[1] < 0 ||
+      s[0] + ordn > shape[0] ||
+      s[1] + ordn > shape[1] ||
+      k0 < 0 || k0 >= shape[2])
+    return false;
+
+  const int nst = ordn * ordn;
+  if (ordn <= 0 || nst > 144)
+    return false;
+  double weights[144];
+  int offsets[144];
+  const double *cx = src->coef;
+  const double *cy = src->coef + ordn;
+  int q = 0;
+  const int zk = k0 * shape[0] * shape[1];
+  for (int j = 0; j < ordn; ++j)
+  {
+    const int yj = (s[1] + j) * shape[0];
+    for (int i = 0; i < ordn; ++i)
+    {
+      weights[q] = cx[i] * cy[j];
+      offsets[q] = s[0] + i + yj + zk;
+      ++q;
+    }
+  }
+
+  for (size_t vi = 0; vi < vars.size(); ++vi)
+  {
+    const double *f = src->Bg->fgfs[vars[vi]->sgfn];
+    double sum = 0.0;
+    for (int p = 0; p < nst; ++p)
+      sum += weights[p] * f[offsets[p]];
+    data[out_base + (int)vi] = sum;
+    shell_note_pack(2, true);
+  }
+  return true;
+}
+
+bool shell_pack1d_fast_all(double *data, int out_base, ShellPatch::pointstru *src,
+                           const std::vector<var *> &vars, int ordn)
+{
+  const int *shape = src->Bg->shape;
+  const int *s = src->sind;
+  if (!s || !src->coef)
+    return false;
+  if (ordn <= 0 || ordn > 12)
+    return false;
+
+  int offsets[12];
+  unsigned char reflected[12];
+  if (src->dumyd == 1)
+  {
+    if (s[0] < -ordn || s[0] + ordn - 1 > shape[0] + ordn - 1 ||
+        s[1] < 0 || s[1] >= shape[1] ||
+        s[2] < 0 || s[2] >= shape[2])
+      return false;
+    const int base = s[1] * shape[0] + s[2] * shape[0] * shape[1];
+    for (int i = 0; i < ordn; ++i)
+    {
+      int mi = 0;
+      bool ri = false;
+      if (!shell_reflect_index(s[0] + i, shape[0], ordn, mi, ri))
+        return false;
+      offsets[i] = mi + base;
+      reflected[i] = ri ? 1 : 0;
+    }
+  }
+  else if (src->dumyd == 0)
+  {
+    if (s[0] < -ordn || s[0] + ordn - 1 > shape[1] + ordn - 1 ||
+        s[1] < 0 || s[1] >= shape[0] ||
+        s[2] < 0 || s[2] >= shape[2])
+      return false;
+    const int base = s[1] + s[2] * shape[0] * shape[1];
+    for (int j = 0; j < ordn; ++j)
+    {
+      int mj = 0;
+      bool rj = false;
+      if (!shell_reflect_index(s[0] + j, shape[1], ordn, mj, rj))
+        return false;
+      offsets[j] = base + mj * shape[0];
+      reflected[j] = rj ? 1 : 0;
+    }
+  }
+  else
+    return false;
+
+  for (size_t vi = 0; vi < vars.size(); ++vi)
+  {
+    const double *f = src->Bg->fgfs[vars[vi]->sgfn];
+    const double soa = shell_symmetry_soa1(vars[vi], src->ssst, src->dumyd);
+    double sum = 0.0;
+    for (int p = 0; p < ordn; ++p)
+      sum += src->coef[p] * (reflected[p] ? soa : 1.0) * f[offsets[p]];
+    data[out_base + (int)vi] = sum;
+    shell_note_pack(1, true);
+  }
+  return true;
+}
+
+struct ShellPointPair
+{
+  ShellPatch::pointstru *src;
+  ShellPatch::pointstru *dst;
+};
+
+struct ShellActiveCache
+{
+  MyList<ShellPatch::pointstru> *src = 0;
+  MyList<ShellPatch::pointstru> *dst = 0;
+  MyList<var> *var_src = 0;
+  MyList<var> *var_dst = 0;
+  int rank_in = -1;
+  int dir = -1;
+  int myrank = -1;
+  bool valid = false;
+  int ordn = -1;
+  std::vector<var *> src_vars;
+  std::vector<var *> dst_vars;
+  std::vector<ShellPointPair> active_points;
+  std::vector<char> fast_points;
+  std::vector<int> dst_indices;
+};
+
+std::vector<ShellActiveCache> shell_active_caches;
+
+void shell_prepare_interp_coeffs(ShellPatch::pointstru *pt, int ordn);
+
+bool shell_active_cache_matches(MyList<ShellPatch::pointstru> *src,
+                                MyList<ShellPatch::pointstru> *dst,
+                                int rank_in,
+                                int dir,
+                                MyList<var> *var_src,
+                                MyList<var> *var_dst,
+                                int myrank,
+                                int ordn,
+                                const ShellActiveCache &cache)
+{
+  return cache.valid &&
+         cache.src == src &&
+         cache.dst == dst &&
+         cache.rank_in == rank_in &&
+         cache.dir == dir &&
+         cache.var_src == var_src &&
+         cache.var_dst == var_dst &&
+         cache.myrank == myrank &&
+         cache.ordn == ordn;
+}
+
+ShellActiveCache &shell_get_active_cache(MyList<ShellPatch::pointstru> *src,
+                                         MyList<ShellPatch::pointstru> *dst,
+                                         int rank_in,
+                                         int dir,
+                                         MyList<var> *VarLists,
+                                         MyList<var> *VarListd,
+                                         int myrank,
+                                         int ordn)
+{
+  for (size_t c = 0; c < shell_active_caches.size(); ++c)
+    if (shell_active_cache_matches(src, dst, rank_in, dir, VarLists, VarListd, myrank, ordn, shell_active_caches[c]))
+      return shell_active_caches[c];
+
+  shell_active_caches.push_back(ShellActiveCache());
+  ShellActiveCache &cache = shell_active_caches.back();
+  cache.src = src;
+  cache.dst = dst;
+  cache.var_src = VarLists;
+  cache.var_dst = VarListd;
+  cache.rank_in = rank_in;
+  cache.dir = dir;
+  cache.myrank = myrank;
+  cache.ordn = ordn;
+  cache.valid = true;
+
+  for (MyList<var> *varls = VarLists, *varld = VarListd;
+       varls && varld;
+       varls = varls->next, varld = varld->next)
+  {
+    cache.src_vars.push_back(varls->data);
+    cache.dst_vars.push_back(varld->data);
+  }
+
+  MyList<ShellPatch::pointstru> *src_scan = src;
+  MyList<ShellPatch::pointstru> *dst_scan = dst;
+  while (src_scan && dst_scan)
+  {
+    if ((dir == PACK && dst_scan->data->Bg->rank == rank_in && src_scan->data->Bg->rank == myrank) ||
+        (dir == UNPACK && src_scan->data->Bg->rank == rank_in && dst_scan->data->Bg->rank == myrank))
+    {
+      ShellPointPair pair;
+      pair.src = src_scan->data;
+      pair.dst = dst_scan->data;
+      cache.active_points.push_back(pair);
+    }
+    src_scan = src_scan->next;
+    dst_scan = dst_scan->next;
+  }
+
+  cache.fast_points.assign(cache.active_points.size(), 0);
+  cache.dst_indices.assign(cache.active_points.size(), -1);
+  if (dir == PACK && shell_fast_interp_enabled())
+  {
+    for (size_t p = 0; p < cache.active_points.size(); ++p)
+    {
+      shell_prepare_interp_coeffs(cache.active_points[p].src, ordn);
+      cache.fast_points[p] = shell_interp_fast_possible(cache.active_points[p].src, ordn) ? 1 : 0;
+    }
+  }
+  else if (dir == UNPACK)
+  {
+    for (size_t p = 0; p < cache.active_points.size(); ++p)
+    {
+      int idx = -1;
+      if (shell_pointcopy_index(cache.active_points[p].dst, idx))
+      {
+        cache.fast_points[p] = 1;
+        cache.dst_indices[p] = idx;
+      }
+    }
+  }
+
+  return cache;
+}
+
+bool shell_pack_fast_only(double *data, int out_base, ShellPatch::pointstru *src,
+                          const std::vector<var *> &vars, int ordn)
+{
+  const int DIMh = (src->dumyd == -1) ? dim : 1;
+  if (DIMh == 3)
+    return shell_pack3d_fast_all(data, out_base, src, vars, ordn);
+  if (DIMh == 2)
+    return shell_pack2d_fast_all(data, out_base, src, vars, ordn);
+  if (DIMh == 1)
+    return shell_pack1d_fast_all(data, out_base, src, vars, ordn);
+  return false;
+}
+
+bool shell_pack_cuda_fast_points(double *data,
+                                 const std::vector<ShellPointPair> &active_points,
+                                 const std::vector<var *> &vars,
+                                 const std::vector<char> &fast_points,
+                                 int ordn)
+{
+#if USE_CUDA_BSSN
+  if (!shell_cuda_interp_enabled() || active_points.empty() || vars.empty())
+    return false;
+
+  std::vector<int> point_to_active;
+  point_to_active.reserve(active_points.size());
+  std::map<Block *, int> block_index;
+  std::vector<Block *> blocks;
+
+  for (size_t p = 0; p < active_points.size(); ++p)
+  {
+    if (!fast_points[p])
+      continue;
+    ShellPatch::pointstru *src = active_points[p].src;
+    const int DIMh = (src->dumyd == -1) ? dim : 1;
+    if (DIMh != 3 && DIMh != 1)
+      continue;
+    point_to_active.push_back((int)p);
+    if (block_index.find(src->Bg) == block_index.end())
+    {
+      int next = (int)blocks.size();
+      block_index[src->Bg] = next;
+      blocks.push_back(src->Bg);
+    }
+  }
+
+  const int npoints = (int)point_to_active.size();
+  const int nvars = (int)vars.size();
+  const int nblocks = (int)blocks.size();
+  if (npoints <= 0 || nvars <= 0 || nblocks <= 0)
+    return false;
+
+  std::vector<double *> block_var_fields((size_t)nblocks * nvars, 0);
+  std::vector<int> block_shapes((size_t)nblocks * 3, 0);
+  for (int b = 0; b < nblocks; ++b)
+  {
+    Block *bg = blocks[b];
+    block_shapes[3 * b + 0] = bg->shape[0];
+    block_shapes[3 * b + 1] = bg->shape[1];
+    block_shapes[3 * b + 2] = bg->shape[2];
+    for (int v = 0; v < nvars; ++v)
+      block_var_fields[(size_t)b * nvars + v] = bg->fgfs[vars[v]->sgfn];
+  }
+
+  std::vector<int> point_block(npoints, 0);
+  std::vector<int> point_dimh(npoints, 0);
+  std::vector<int> point_dumyd(npoints, -1);
+  std::vector<int> point_sind((size_t)npoints * 3, 0);
+  std::vector<double> point_coef((size_t)npoints * 3 * ordn, 0.0);
+  std::vector<double> out((size_t)npoints * nvars, 0.0);
+
+  for (int q = 0; q < npoints; ++q)
+  {
+    const int p = point_to_active[q];
+    ShellPatch::pointstru *src = active_points[p].src;
+    const int DIMh = (src->dumyd == -1) ? dim : 1;
+    point_block[q] = block_index[src->Bg];
+    point_dimh[q] = DIMh;
+    point_dumyd[q] = src->dumyd;
+    point_sind[(size_t)q * 3 + 0] = src->sind[0];
+    point_sind[(size_t)q * 3 + 1] = src->sind[1];
+    point_sind[(size_t)q * 3 + 2] = src->sind[2];
+    const int coef_count = (DIMh == 3) ? 3 * ordn : ordn;
+    for (int c = 0; c < coef_count; ++c)
+      point_coef[(size_t)q * 3 * ordn + c] = src->coef[c];
+  }
+
+  int rc = bssn_cuda_shell_pack_host_fields(npoints, nvars, nblocks, ordn,
+                                            block_var_fields.data(),
+                                            block_shapes.data(),
+                                            point_block.data(),
+                                            point_dimh.data(),
+                                            point_dumyd.data(),
+                                            point_sind.data(),
+                                            point_coef.data(),
+                                            out.data());
+  if (rc != 0)
+    return false;
+
+  for (int q = 0; q < npoints; ++q)
+  {
+    const int p = point_to_active[q];
+    const int base = p * nvars;
+    for (int v = 0; v < nvars; ++v)
+    {
+      data[base + v] = out[(size_t)q * nvars + v];
+      shell_note_pack(point_dimh[q], true);
+    }
+  }
+  return true;
+#else
+  (void)data;
+  (void)active_points;
+  (void)vars;
+  (void)fast_points;
+  (void)ordn;
+  return false;
+#endif
+}
+
+void shell_prepare_interp_coeffs(ShellPatch::pointstru *pt, int ordn)
+{
+  if (pt->coef)
+    return;
+
+  const int DIMh = (pt->dumyd == -1) ? dim : 1;
+  pt->coef = new double[ordn * DIMh];
+  pt->sind = new int[dim];
+  if (DIMh == 3)
+  {
+    for (int i = 0; i < DIMh; i++)
+    {
+      double dd = pt->Bg->getdX(i);
+      pt->sind[i] = int((pt->lpox[i] - pt->Bg->X[i][0]) / dd) - ordn / 2 + 1;
+      double h1, h2;
+      for (int j = 0; j < ordn; j++)
+      {
+        h1 = pt->Bg->X[i][0] + (pt->sind[i] + j) * dd;
+        pt->coef[i * ordn + j] = 1;
+        for (int k = 0; k < j; k++)
+        {
+          h2 = pt->Bg->X[i][0] + (pt->sind[i] + k) * dd;
+          pt->coef[i * ordn + j] *= (pt->lpox[i] - h2) / (h1 - h2);
+        }
+        for (int k = j + 1; k < ordn; k++)
+        {
+          h2 = pt->Bg->X[i][0] + (pt->sind[i] + k) * dd;
+          pt->coef[i * ordn + j] *= (pt->lpox[i] - h2) / (h1 - h2);
+        }
+      }
+    }
+  }
+  else
+  {
+    int actd = 1 - pt->dumyd;
+    double dd = pt->Bg->getdX(actd);
+    pt->sind[0] = int((pt->lpox[actd] - pt->Bg->X[actd][0]) / dd) - ordn / 2 + 1;
+    double h1, h2;
+    for (int j = 0; j < ordn; j++)
+    {
+      h1 = pt->Bg->X[actd][0] + (pt->sind[0] + j) * dd;
+      pt->coef[j] = 1;
+      for (int k = 0; k < j; k++)
+      {
+        h2 = pt->Bg->X[actd][0] + (pt->sind[0] + k) * dd;
+        pt->coef[j] *= (pt->lpox[actd] - h2) / (h1 - h2);
+      }
+      for (int k = j + 1; k < ordn; k++)
+      {
+        h2 = pt->Bg->X[actd][0] + (pt->sind[0] + k) * dd;
+        pt->coef[j] *= (pt->lpox[actd] - h2) / (h1 - h2);
+      }
+    }
+    pt->sind[2] = int((pt->lpox[2] - pt->Bg->X[2][0]) / pt->Bg->getdX(2) + 0.001);
+    if (!feq(pt->Bg->X[2][pt->sind[2]], pt->lpox[2], pt->Bg->getdX(2) / 2000))
+      cout << "error in ShellPatch::interdata_packer point = " << pt->lpox[2] << " != grid " << pt->Bg->X[2][pt->sind[2]] << endl;
+    pt->sind[1] = int((pt->lpox[pt->dumyd] - pt->Bg->X[pt->dumyd][0]) /
+                         pt->Bg->getdX(pt->dumyd) +
+                     0.001);
+    if (!feq(pt->Bg->X[pt->dumyd][pt->sind[1]], pt->lpox[pt->dumyd], pt->Bg->getdX(pt->dumyd) / 2000))
+      cout << "error in ShellPatch::interdata_packer for dumy dimension point = "
+           << pt->lpox[pt->dumyd] << " != grid " << pt->Bg->X[pt->dumyd][pt->sind[1]] << endl;
+  }
+}
+
+void shell_pack_one(double *data, int out_base, ShellPatch::pointstru *src,
+                    const std::vector<var *> &vars, int ordn, int Symmetry)
+{
+  shell_prepare_interp_coeffs(src, ordn);
+  const int DIMh = (src->dumyd == -1) ? dim : 1;
+  for (size_t vi = 0; vi < vars.size(); ++vi)
+  {
+    double &out = data[out_base + (int)vi];
+    bool fast_done = false;
+    if (shell_fast_interp_enabled())
+    {
+      if (DIMh == 3)
+        fast_done = shell_interp3d_fast(src, vars[vi], out, ordn);
+      else if (DIMh == 2)
+        fast_done = shell_interp2d_fast(src, vars[vi], out, ordn);
+      else if (DIMh == 1)
+        fast_done = shell_interp1d_fast(src, vars[vi], out, ordn);
+    }
+    if (fast_done)
+    {
+      shell_note_pack(DIMh, true);
+      continue;
+    }
+    shell_note_pack(DIMh, false);
+    switch (DIMh)
+    {
+    case 3:
+      f_global_interpind(src->Bg->shape, src->Bg->X[0], src->Bg->X[1], src->Bg->X[2],
+                         src->Bg->fgfs[vars[vi]->sgfn], out,
+                         src->lpox[0], src->lpox[1], src->lpox[2], ordn, vars[vi]->SoA, Symmetry,
+                         src->sind, src->coef, src->ssst);
+      break;
+    case 2:
+      f_global_interpind2d(src->Bg->shape, src->Bg->X[0], src->Bg->X[1], src->Bg->X[2],
+                           src->Bg->fgfs[vars[vi]->sgfn], out,
+                           src->lpox[0], src->lpox[1], src->lpox[2], ordn, vars[vi]->SoA, Symmetry,
+                           src->sind, src->coef, src->ssst);
+      break;
+    case 1:
+      f_global_interpind1d(src->Bg->shape, src->Bg->X[0], src->Bg->X[1], src->Bg->X[2],
+                           src->Bg->fgfs[vars[vi]->sgfn], out,
+                           src->lpox[0], src->lpox[1], src->lpox[2], ordn, vars[vi]->SoA, Symmetry,
+                           src->sind, src->coef, src->ssst, src->dumyd);
+      break;
+    default:
+      cout << "ShellPatch::interdata_packer: not recognized DIM = " << DIMh << endl;
+      MPI_Abort(MPI_COMM_WORLD, 1);
+    }
+  }
+}
+
+void shell_unpack_one(double *data, int out_base, ShellPatch::pointstru *dst,
+                      const std::vector<var *> &vars)
+{
+  int DIM = dim;
+  for (size_t vi = 0; vi < vars.size(); ++vi)
+    f_pointcopy(DIM, dst->Bg->bbox, dst->Bg->bbox + dim, dst->Bg->shape,
+                dst->Bg->fgfs[vars[vi]->sgfn],
+                dst->lpox[0], dst->lpox[1], dst->lpox[2],
+                data[out_base + (int)vi]);
+}
+
+}
 
 ss_patch::ss_patch(int ingfsi, int fngfsi, int *shapei, double *bboxi, int myranki) : ingfs(ingfsi), fngfs(fngfsi), myrank(myranki), blb(0), ble(0)
 {
@@ -2544,17 +3491,32 @@ double *ShellPatch::Collect_Data(ss_patch *PP, var *VP)
   return databuffer;
 }
 
-void ShellPatch::intertransfer(MyList<pointstru> **src, MyList<pointstru> **dst,
-                               MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
-                               int Symmetry)
-{
+void ShellPatch::intertransfer(MyList<pointstru> **src, MyList<pointstru> **dst,
+                               MyList<var> *VarList1 /* source */, MyList<var> *VarList2 /*target */,
+                               int Symmetry)
+{
   int myrank, cpusize;
   MPI_Comm_size(MPI_COMM_WORLD, &cpusize);
   MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
 
-  int node;
-
-  MPI_Request *reqs;
+  int node;
+  const bool transfer_timing = shell_transfer_timing_enabled();
+  double t_query = 0.0;
+  double t_pack = 0.0;
+  double t_post = 0.0;
+  double t_wait = 0.0;
+  double t_unpack = 0.0;
+  long long send_values = 0;
+  long long recv_values = 0;
+  double tt = 0.0;
+
+#if USE_CUDA_BSSN
+  const bool use_shell_cuda_pack = shell_cuda_interp_enabled();
+  if (use_shell_cuda_pack)
+    bssn_cuda_shell_pack_cache_begin();
+#endif
+
+  MPI_Request *reqs;
   MPI_Status *stats;
   reqs = new MPI_Request[2 * cpusize];
   stats = new MPI_Status[2 * cpusize];
@@ -2568,55 +3530,109 @@ void ShellPatch::intertransfer(MyList<pointstru> **src, MyList<pointstru> **dst,
   for (node = 0; node < cpusize; node++)
   {
     send_data[node] = rec_data[node] = 0;
-    if (node == myrank)
-    {
-      if (length = interdata_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry))
-      {
-        rec_data[node] = new double[length];
-        if (!rec_data[node])
-        {
-          cout << "out of memory when new in short transfer, place 1" << endl;
-          MPI_Abort(MPI_COMM_WORLD, 1);
-        }
-        interdata_packer(rec_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
-      }
-    }
-    else
-    {
-      // send from this cpu to cpu#node
-      if (length = interdata_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry))
-      {
-        send_data[node] = new double[length];
-        if (!send_data[node])
-        {
-          cout << "out of memory when new in short transfer, place 2" << endl;
-          MPI_Abort(MPI_COMM_WORLD, 1);
-        }
-        interdata_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
-        MPI_Isend((void *)send_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
-      }
-      // receive from cpu#node to this cpu
-      if (length = interdata_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry))
-      {
-        rec_data[node] = new double[length];
-        if (!rec_data[node])
-        {
-          cout << "out of memory when new in short transfer, place 3" << endl;
-          MPI_Abort(MPI_COMM_WORLD, 1);
-        }
-        MPI_Irecv((void *)rec_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
-      }
-    }
-  }
-  // wait for all requests to complete
-  MPI_Waitall(req_no, reqs, stats);
-
-  for (node = 0; node < cpusize; node++)
-    if (rec_data[node])
-      interdata_packer(rec_data[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
-
-  for (node = 0; node < cpusize; node++)
-  {
+    if (node == myrank)
+    {
+      if (transfer_timing) tt = MPI_Wtime();
+      if (length = interdata_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry))
+      {
+        if (transfer_timing) t_query += MPI_Wtime() - tt;
+        rec_data[node] = new double[length];
+        if (!rec_data[node])
+        {
+          cout << "out of memory when new in short transfer, place 1" << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        recv_values += length;
+        if (transfer_timing) tt = MPI_Wtime();
+        interdata_packer(rec_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
+        if (transfer_timing) t_pack += MPI_Wtime() - tt;
+      }
+      else if (transfer_timing)
+        t_query += MPI_Wtime() - tt;
+    }
+    else
+    {
+      // send from this cpu to cpu#node
+      if (transfer_timing) tt = MPI_Wtime();
+      if (length = interdata_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry))
+      {
+        if (transfer_timing) t_query += MPI_Wtime() - tt;
+        send_data[node] = new double[length];
+        if (!send_data[node])
+        {
+          cout << "out of memory when new in short transfer, place 2" << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        send_values += length;
+        if (transfer_timing) tt = MPI_Wtime();
+        interdata_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry);
+        if (transfer_timing) t_pack += MPI_Wtime() - tt;
+        if (transfer_timing) tt = MPI_Wtime();
+        MPI_Isend((void *)send_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
+        if (transfer_timing) t_post += MPI_Wtime() - tt;
+      }
+      else if (transfer_timing)
+        t_query += MPI_Wtime() - tt;
+      // receive from cpu#node to this cpu
+      if (transfer_timing) tt = MPI_Wtime();
+      if (length = interdata_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry))
+      {
+        if (transfer_timing) t_query += MPI_Wtime() - tt;
+        rec_data[node] = new double[length];
+        if (!rec_data[node])
+        {
+          cout << "out of memory when new in short transfer, place 3" << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        recv_values += length;
+        if (transfer_timing) tt = MPI_Wtime();
+        MPI_Irecv((void *)rec_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++);
+        if (transfer_timing) t_post += MPI_Wtime() - tt;
+      }
+      else if (transfer_timing)
+        t_query += MPI_Wtime() - tt;
+    }
+  }
+  // wait for all requests to complete
+  if (transfer_timing) tt = MPI_Wtime();
+  MPI_Waitall(req_no, reqs, stats);
+  if (transfer_timing) t_wait += MPI_Wtime() - tt;
+
+#if USE_CUDA_BSSN
+  if (use_shell_cuda_pack)
+    bssn_cuda_shell_pack_cache_end();
+#endif
+
+  for (node = 0; node < cpusize; node++)
+    if (rec_data[node])
+    {
+      if (transfer_timing) tt = MPI_Wtime();
+      interdata_packer(rec_data[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry);
+      if (transfer_timing) t_unpack += MPI_Wtime() - tt;
+    }
+
+  if (transfer_timing)
+  {
+    double local[5] = {t_query, t_pack, t_post, t_wait, t_unpack};
+    double maxv[5] = {};
+    long long counts[2] = {send_values, recv_values};
+    long long max_counts[2] = {};
+    MPI_Reduce(local, maxv, 5, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+    MPI_Reduce(counts, max_counts, 2, MPI_LONG_LONG, MPI_MAX, 0, MPI_COMM_WORLD);
+    if (myrank == 0)
+      cout << "[AMSS-SHELL-TRANSFER] vars=" << shell_var_count(VarList1)
+           << " query=" << maxv[0]
+           << " pack=" << maxv[1]
+           << " post=" << maxv[2]
+           << " wait=" << maxv[3]
+           << " unpack=" << maxv[4]
+           << " send_values=" << max_counts[0]
+           << " recv_values=" << max_counts[1]
+           << endl;
+  }
+
+  for (node = 0; node < cpusize; node++)
+  {
     if (send_data[node])
       delete[] send_data[node];
     if (rec_data[node])
@@ -2660,14 +3676,77 @@ int ShellPatch::interdata_packer(double *data, MyList<pointstru> *src, MyList<po
     varld = varld->next;
   }
 
-  if (varls || varld)
-  {
-    cout << "error in short data packer, var lists does not match." << endl;
-    MPI_Abort(MPI_COMM_WORLD, 1);
-  }
-
-  while (src && dst)
-  {
+  if (varls || varld)
+  {
+    cout << "error in short data packer, var lists does not match." << endl;
+    MPI_Abort(MPI_COMM_WORLD, 1);
+  }
+
+  if (!data || shell_parallel_interp_enabled() || shell_cuda_interp_enabled())
+  {
+    ShellActiveCache &active_cache = shell_get_active_cache(src, dst, rank_in, dir, VarLists, VarListd, myrank, ordn);
+    const std::vector<var *> &src_vars = active_cache.src_vars;
+    const std::vector<var *> &dst_vars = active_cache.dst_vars;
+    const std::vector<ShellPointPair> &active_points = active_cache.active_points;
+    const std::vector<char> &fast_points = active_cache.fast_points;
+    const std::vector<int> &dst_indices = active_cache.dst_indices;
+
+    if (!data)
+      return (int)(active_points.size() * src_vars.size());
+
+    if (active_points.size() * src_vars.size() >= 2048)
+    {
+      const int nthreads = std::min(shell_interp_threads(), (int)active_points.size());
+      const int nvar = (int)src_vars.size();
+
+      bool cuda_pack_done = false;
+      if (dir == PACK)
+        cuda_pack_done = shell_pack_cuda_fast_points(data, active_points, src_vars, fast_points, ordn);
+
+      std::vector<std::thread> workers;
+      workers.reserve(nthreads);
+      if (!cuda_pack_done)
+      {
+        for (int tid = 0; tid < nthreads; ++tid)
+        {
+          const int begin = (int)((long long)active_points.size() * tid / nthreads);
+          const int end = (int)((long long)active_points.size() * (tid + 1) / nthreads);
+          workers.push_back(std::thread([&, begin, end]() {
+            for (int p = begin; p < end; ++p)
+            {
+              const int base = p * nvar;
+              if (!fast_points[p])
+                continue;
+              if (dir == PACK)
+                shell_pack_fast_only(data, base, active_points[p].src, src_vars, ordn);
+              else
+              {
+                for (int vi = 0; vi < nvar; ++vi)
+                  active_points[p].dst->Bg->fgfs[dst_vars[vi]->sgfn][dst_indices[p]] = data[base + vi];
+              }
+            }
+          }));
+        }
+        for (size_t t = 0; t < workers.size(); ++t)
+          workers[t].join();
+      }
+
+      for (size_t p = 0; p < active_points.size(); ++p)
+      {
+        if (fast_points[p])
+          continue;
+        const int base = (int)p * nvar;
+        if (dir == PACK)
+          shell_pack_one(data, base, active_points[p].src, src_vars, ordn, Symmetry);
+        else
+          shell_unpack_one(data, base, active_points[p].dst, dst_vars);
+      }
+      return (int)(active_points.size() * src_vars.size());
+    }
+  }
+
+  while (src && dst)
+  {
     if ((dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) ||
         (dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank))
     {
@@ -2746,11 +3825,23 @@ int ShellPatch::interdata_packer(double *data, MyList<pointstru> *src, MyList<po
                        << src->data->lpox[src->data->dumyd] << " != grid " << src->data->Bg->X[src->data->dumyd][src->data->sind[1]] << endl;
               }
             }
-            // interpolate
-            switch (DIMh)
-            {
-            case 3:
-              f_global_interpind(src->data->Bg->shape, src->data->Bg->X[0], src->data->Bg->X[1], src->data->Bg->X[2],
+            // interpolate
+            bool fast_done = false;
+            if (shell_fast_interp_enabled())
+            {
+              if (DIMh == 3)
+                fast_done = shell_interp3d_fast(src->data, varls->data, data[size_out], ordn);
+              else if (DIMh == 2)
+                fast_done = shell_interp2d_fast(src->data, varls->data, data[size_out], ordn);
+              else if (DIMh == 1)
+                fast_done = shell_interp1d_fast(src->data, varls->data, data[size_out], ordn);
+            }
+            shell_note_pack(DIMh, fast_done);
+            if (!fast_done)
+            switch (DIMh)
+            {
+            case 3:
+              f_global_interpind(src->data->Bg->shape, src->data->Bg->X[0], src->data->Bg->X[1], src->data->Bg->X[2],
                                  src->data->Bg->fgfs[varls->data->sgfn], data[size_out],
                                  src->data->lpox[0], src->data->lpox[1], src->data->lpox[2], ordn, varls->data->SoA, Symmetry,
                                  src->data->sind, src->data->coef, src->data->ssst);
@@ -2771,11 +3862,13 @@ int ShellPatch::interdata_packer(double *data, MyList<pointstru> *src, MyList<po
               cout << "ShellPatch::interdata_packer: not recognized DIM = " << DIMh << endl;
               MPI_Abort(MPI_COMM_WORLD, 1);
             }
-          }
-          if (dir == UNPACK) // from target data to corresponding grid
-            f_pointcopy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn],
-                        dst->data->lpox[0], dst->data->lpox[1], dst->data->lpox[2], data[size_out]);
-        }
+          }
+          if (dir == UNPACK) // from target data to corresponding grid
+          {
+            f_pointcopy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn],
+                        dst->data->lpox[0], dst->data->lpox[1], dst->data->lpox[2], data[size_out]);
+          }
+        }
         size_out += 1;
         varls = varls->next;
         varld = varld->next;
@@ -2795,17 +3888,19 @@ void ShellPatch::Synch(MyList<var> *VarList, int Symmetry)
     Pp->data->Sync(VarList, Symmetry);
     Pp = Pp->next;
   }
-
-  intertransfer(ss_src, ss_dst, VarList, VarList, Symmetry);
-}
-
-void ShellPatch::CS_Inter(MyList<var> *VarList, int Symmetry)
-{
+
+  intertransfer(ss_src, ss_dst, VarList, VarList, Symmetry);
+  shell_print_interp_stats("after Synch");
+}
+
+void ShellPatch::CS_Inter(MyList<var> *VarList, int Symmetry)
+{
   // fill shell first
-  intertransfer(csats_src, csats_dst, VarList, VarList, Symmetry);
-  // fill box then
-  intertransfer(csatc_src, csatc_dst, VarList, VarList, Symmetry);
-}
+  intertransfer(csats_src, csats_dst, VarList, VarList, Symmetry);
+  // fill box then
+  intertransfer(csatc_src, csatc_dst, VarList, VarList, Symmetry);
+  shell_print_interp_stats("after CS_Inter");
+}
 
 void ShellPatch::check_pointstrul(MyList<pointstru> *pp, bool first_only)
 {