From d9287ea530df4e98d9354b4485e859b96c757823 Mon Sep 17 00:00:00 2001
From: CGH0S7 <776459475@qq.com>
Date: Sun, 12 Apr 2026 12:13:47 +0800
Subject: [PATCH] Fix GPU RK4 boundary and sync correctness

---
 AMSS_NCKU_source/MPatch.C         | 30 +++++++++++--
 AMSS_NCKU_source/bssn_cuda_ops.cu | 70 +++++++++++++++++++++++++++----
 AMSS_NCKU_source/bssn_cuda_ops.h  |  2 +
 AMSS_NCKU_source/bssn_cuda_step.C | 62 ++++++++++++++++++---------
 4 files changed, 134 insertions(+), 30 deletions(-)
diff --git a/AMSS_NCKU_source/MPatch.C b/AMSS_NCKU_source/MPatch.C
index 29680d8..7f7242a 100644
--- a/AMSS_NCKU_source/MPatch.C
+++ b/AMSS_NCKU_source/MPatch.C
@@ -83,6 +83,9 @@ struct CachedInterpPlanEntry
 {
   bool valid;
   InterpPlanKey key;
+  vector<double> xvals;
+  vector<double> yvals;
+  vector<double> zvals;
   CachedInterpPlan plan;
 
   CachedInterpPlanEntry() : valid(false) {}
@@ -286,14 +289,28 @@ CachedInterpPlanEntry &interp_plan_cache_entry()
 bool same_interp_plan_key(const InterpPlanKey &lhs, const InterpPlanKey &rhs)
 {
   return lhs.patch == rhs.patch &&
-         lhs.x == rhs.x &&
-         lhs.y == rhs.y &&
-         lhs.z == rhs.z &&
          lhs.NN == rhs.NN &&
          lhs.Symmetry == rhs.Symmetry &&
          lhs.myrank == rhs.myrank;
 }
 
+bool same_interp_plan_points(const CachedInterpPlanEntry &cache, int NN, double **XX)
+{
+  if (static_cast<int>(cache.xvals.size()) != NN ||
+      static_cast<int>(cache.yvals.size()) != NN ||
+      static_cast<int>(cache.zvals.size()) != NN)
+    return false;
+
+  for (int j = 0; j < NN; ++j)
+  {
+    if (cache.xvals[j] != XX[0][j] ||
+        cache.yvals[j] != XX[1][j] ||
+        cache.zvals[j] != XX[2][j])
+      return false;
+  }
+  return true;
+}
+
 CachedInterpPlan &get_cached_interp_plan(Patch *patch,
                                          int NN, double **XX,
                                          int Symmetry, int myrank,
@@ -314,11 +331,15 @@ CachedInterpPlan &get_cached_interp_plan(Patch *patch,
   CachedInterpPlanEntry &cache = interp_plan_cache_entry();
   if (cache.valid &&
       same_interp_plan_key(cache.key, key) &&
+      same_interp_plan_points(cache, NN, XX) &&
       cache.plan.nblocks == static_cast<int>(block_index.views.size()))
     return cache.plan;
 
   cache.valid = true;
   cache.key = key;
+  cache.xvals.assign(XX[0], XX[0] + NN);
+  cache.yvals.assign(XX[1], XX[1] + NN);
+  cache.zvals.assign(XX[2], XX[2] + NN);
   cache.plan = CachedInterpPlan();
   CachedInterpPlan &plan = cache.plan;
   plan.nblocks = static_cast<int>(block_index.views.size());
@@ -412,6 +433,9 @@ void release_interp_plan_cache_internal()
 {
   CachedInterpPlanEntry &cache = interp_plan_cache_entry();
   cache.valid = false;
+  cache.xvals.clear();
+  cache.yvals.clear();
+  cache.zvals.clear();
   cache.plan = CachedInterpPlan();
 }
 
diff --git a/AMSS_NCKU_source/bssn_cuda_ops.cu b/AMSS_NCKU_source/bssn_cuda_ops.cu
index fa4e9cf..f61ddf4 100644
--- a/AMSS_NCKU_source/bssn_cuda_ops.cu
+++ b/AMSS_NCKU_source/bssn_cuda_ops.cu
@@ -1,5 +1,7 @@
 #include "bssn_cuda_ops.h"
 #include "bssn_gpu.h"
+#include "rungekutta4_rout.h"
+#include "sommerfeld_rout.h"
 
 #include <cmath>
 #include <cstdio>
@@ -262,6 +264,23 @@ inline bool copy_to_device_preferring_device(CachedBuffer &dst, const double *sr
   return true;
 }
 
+inline bool sync_host_from_mapped_device(double *host_ptr, int count, const char *label)
+{
+  const double *device_ptr = bssn_gpu_find_device_buffer(host_ptr);
+  if (!device_ptr)
+    return true;
+
+  bssn_gpu_prepare_host_buffer(host_ptr, count);
+  const size_t bytes = static_cast<size_t>(count) * sizeof(double);
+  cudaError_t err = cudaMemcpy(host_ptr, device_ptr, bytes, cudaMemcpyDeviceToHost);
+  if (err != cudaSuccess)
+  {
+    report_cuda_error(label, err);
+    return false;
+  }
+  return true;
+}
+
 inline bool copy_region_to_padded_stage(CachedBuffer &dst,
                                         const double *src,
                                         const int src_shape[3],
@@ -1003,6 +1022,8 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
                                double xmin, double ymin, double zmin,
                                double xmax, double ymax, double zmax,
                                const double *state0,
+                               const double *phi_field,
+                               const double *lap_field,
                                const double *boundary_src,
                                double *stage_data,
                                double *rhs_accum,
@@ -1145,7 +1166,7 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
     ok = launch_kernel(grid, block, (const void *)sommerfeld_bam_kernel, args);
   }
 
-  if (ok)
+  if (ok && lev == 0)
   {
     double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr, *d_rhs = cache.rhs.ptr;
     void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage};
@@ -1154,12 +1175,47 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
 
   if (ok && lev > 0)
   {
-      double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr;
-      void *args[] = {&nx, &ny, &nz,
-                      &has_xmin, &has_ymin, &has_zmin,
-                      &has_xmax, &has_ymax, &has_zmax,
-                      &d_state0, &d_stage};
-    ok = launch_kernel(grid, block, (const void *)copy_physical_boundary_kernel, args);
+    double *host_state0 = const_cast<double *>(state0);
+    double *host_phi = const_cast<double *>(phi_field);
+    double *host_lap = const_cast<double *>(lap_field);
+    double *host_rhs = rhs_accum;
+
+    ok = sync_host_from_mapped_device(host_state0, n, "cudaMemcpy(D2H) state0") &&
+         sync_host_from_mapped_device(host_phi, n, "cudaMemcpy(D2H) phi_field") &&
+         sync_host_from_mapped_device(host_lap, n, "cudaMemcpy(D2H) lap_field") &&
+         sync_host_from_mapped_device(host_rhs, n, "cudaMemcpy(D2H) rhs_accum");
+    if (ok)
+    {
+      bssn_gpu_prepare_host_buffer(stage_data, n);
+      cudaError_t err = cudaMemcpy(stage_data, stage_ptr, bytes, cudaMemcpyDeviceToHost);
+      if (err != cudaSuccess)
+      {
+        report_cuda_error("cudaMemcpy(D2H) stage_data", err);
+        ok = false;
+      }
+    }
+
+    if (ok)
+    {
+      int rk_stage_host = rk_stage;
+      int cor = 1;
+      f_rungekutta4_rout(ex, dT, host_state0, stage_data, host_rhs, rk_stage_host);
+      f_sommerfeld_rout(ex,
+                        const_cast<double *>(X), const_cast<double *>(Y), const_cast<double *>(Z),
+                        xmin, ymin, zmin, xmax, ymax, zmax,
+                        dT,
+                        host_phi, host_lap,
+                        host_state0, stage_data,
+                        const_cast<double *>(SoA),
+                        symmetry, cor);
+
+      cudaError_t err = cudaMemcpy(stage_ptr, stage_data, bytes, cudaMemcpyHostToDevice);
+      if (err != cudaSuccess)
+      {
+        report_cuda_error("cudaMemcpy(H2D) stage_data sommerfeld", err);
+        ok = false;
+      }
+    }
   }
 
   if (ok)
diff --git a/AMSS_NCKU_source/bssn_cuda_ops.h b/AMSS_NCKU_source/bssn_cuda_ops.h
index ac2dfab..5d86f18 100644
--- a/AMSS_NCKU_source/bssn_cuda_ops.h
+++ b/AMSS_NCKU_source/bssn_cuda_ops.h
@@ -12,6 +12,8 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
                                double xmin, double ymin, double zmin,
                                double xmax, double ymax, double zmax,
                                const double *state0,
+                               const double *phi_field,
+                               const double *lap_field,
                                const double *boundary_src,
                                double *stage_data,
                                double *rhs_accum,
diff --git a/AMSS_NCKU_source/bssn_cuda_step.C b/AMSS_NCKU_source/bssn_cuda_step.C
index c5ea4c6..6ee1fec 100644
--- a/AMSS_NCKU_source/bssn_cuda_step.C
+++ b/AMSS_NCKU_source/bssn_cuda_step.C
@@ -2,14 +2,15 @@
 
 #ifdef USE_GPU
 
+#include <algorithm>
 #include <cmath>
+#include <cstdlib>
 #include <vector>
 
 #include "bssn_class.h"
 #include "bssn_cuda_ops.h"
 #include "bssn_gpu.h"
 #include "bssn_macro.h"
-#include "rungekutta4_rout.h"
 
 void bssn_class::Step_MainPath_GPU(int lev, int YN)
 {
@@ -56,11 +57,6 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
 
   const bool BB = fgt(PhysTime, StartTime, dT_lev / 2);
   (void)BB;
-#if (MAPBH == 0)
-  const bool need_host_stage_sync = (BH_num > 0 && lev == GH->levels - 1);
-#else
-  const bool need_host_stage_sync = false;
-#endif
   double ndeps = (lev < GH->movls) ? numepsb : numepss;
   double TRK4 = PhysTime;
   int iter_count = 0;
@@ -83,6 +79,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
                                          patch->bbox[0], patch->bbox[1], patch->bbox[2],
                                          patch->bbox[3], patch->bbox[4], patch->bbox[5],
                                          cg->fgfs[varl0->data->sgfn],
+                                         cg->fgfs[phi0->sgfn],
+                                         cg->fgfs[Lap0->sgfn],
                                          cg->fgfs[varlb->data->sgfn],
                                          cg->fgfs[varls->data->sgfn],
                                          cg->fgfs[varlr->data->sgfn],
@@ -124,6 +122,28 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
         }
       };
 
+  auto stage_download_patch_list =
+      [&](MyList<var> *var_list) {
+        MyList<Patch> *patch_it = GH->PatL[lev];
+        while (patch_it)
+        {
+          MyList<Block> *block_it = patch_it->data->blb;
+          while (block_it)
+          {
+            Block *cg = block_it->data;
+            if (myrank == cg->rank)
+              stage_download_var_list(cg, var_list);
+
+            if (block_it == patch_it->data->ble)
+              break;
+            block_it = block_it->next;
+          }
+          if (ERROR)
+            break;
+          patch_it = patch_it->next;
+        }
+      };
+
   auto ensure_stage_device_var_list =
       [&](Block *cg, MyList<var> *var_list) {
         const int n = cg->shape[0] * cg->shape[1] * cg->shape[2];
@@ -336,8 +356,6 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
                << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
           ERROR = 1;
         }
-        if (!ERROR && !sync_cache_pre[lev].valid)
-          stage_download_var_list(cg, SynchList_pre);
       }
       if (BP == Pp->data->ble)
         break;
@@ -346,8 +364,12 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
     Pp = Pp->next;
   }
 
-  if (!ERROR && sync_cache_pre[lev].valid && !can_pack_sync_from_device(SynchList_pre, sync_cache_pre[lev]))
-    refresh_stage_host_before_sync(SynchList_pre, sync_cache_pre[lev]);
+  if (!ERROR)
+  {
+    stage_download_patch_list(SynchList_pre);
+    if (!ERROR)
+      bssn_gpu_clear_cached_device_buffers();
+  }
 
   MPI_Request err_req_pre;
   {
@@ -357,8 +379,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
 
   Parallel::AsyncSyncState async_pre;
   Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
-  Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry, need_host_stage_sync);
-  if (!ERROR && need_host_stage_sync)
+  Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry, true);
+  if (!ERROR)
     refresh_stage_device_after_sync(SynchList_pre, sync_cache_pre[lev]);
 
   MPI_Wait(&err_req_pre, MPI_STATUS_IGNORE);
@@ -427,8 +449,6 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
                  << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
             ERROR = 1;
           }
-          if (!ERROR && (!sync_cache_cor[lev].valid || iter_count == 3))
-            stage_download_var_list(cg, SynchList_cor);
         }
 
         if (BP == Pp->data->ble)
@@ -438,9 +458,12 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
       Pp = Pp->next;
     }
 
-    if (!ERROR && sync_cache_cor[lev].valid && iter_count < 3 &&
-        !can_pack_sync_from_device(SynchList_cor, sync_cache_cor[lev]))
-      refresh_stage_host_before_sync(SynchList_cor, sync_cache_cor[lev]);
+    if (!ERROR)
+    {
+      stage_download_patch_list(SynchList_cor);
+      if (!ERROR)
+        bssn_gpu_clear_cached_device_buffers();
+    }
 
     MPI_Request err_req_cor;
     {
@@ -450,9 +473,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
 
     Parallel::AsyncSyncState async_cor;
     Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
-    const bool unpack_cor_to_host = (iter_count == 3) || need_host_stage_sync;
-    Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry, unpack_cor_to_host);
-    if (!ERROR && iter_count < 3 && unpack_cor_to_host)
+    Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry, true);
+    if (!ERROR && iter_count < 3)
       refresh_stage_device_after_sync(SynchList_cor, sync_cache_cor[lev]);
 
     MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);