Fix GPU RK4 boundary and sync correctness

2026-04-12 12:13:47 +08:00
parent b78874ef21
commit d9287ea530
4 changed files with 134 additions and 30 deletions
--- a/AMSS_NCKU_source/MPatch.C
+++ b/AMSS_NCKU_source/MPatch.C
@@ -83,6 +83,9 @@ struct CachedInterpPlanEntry
 {
  bool valid;
  InterpPlanKey key;
+  vector<double> xvals;
+  vector<double> yvals;
+  vector<double> zvals;
  CachedInterpPlan plan;

  CachedInterpPlanEntry() : valid(false) {}
@@ -286,14 +289,28 @@ CachedInterpPlanEntry &interp_plan_cache_entry()
 bool same_interp_plan_key(const InterpPlanKey &lhs, const InterpPlanKey &rhs)
 {
  return lhs.patch == rhs.patch &&
-         lhs.x == rhs.x &&
-         lhs.y == rhs.y &&
-         lhs.z == rhs.z &&
         lhs.NN == rhs.NN &&
         lhs.Symmetry == rhs.Symmetry &&
         lhs.myrank == rhs.myrank;
 }

+bool same_interp_plan_points(const CachedInterpPlanEntry &cache, int NN, double **XX)
+{
+  if (static_cast<int>(cache.xvals.size()) != NN ||
+      static_cast<int>(cache.yvals.size()) != NN ||
+      static_cast<int>(cache.zvals.size()) != NN)
+    return false;
+
+  for (int j = 0; j < NN; ++j)
+  {
+    if (cache.xvals[j] != XX[0][j] ||
+        cache.yvals[j] != XX[1][j] ||
+        cache.zvals[j] != XX[2][j])
+      return false;
+  }
+  return true;
+}
+
 CachedInterpPlan &get_cached_interp_plan(Patch *patch,
                                         int NN, double **XX,
                                         int Symmetry, int myrank,
@@ -314,11 +331,15 @@ CachedInterpPlan &get_cached_interp_plan(Patch *patch,
  CachedInterpPlanEntry &cache = interp_plan_cache_entry();
  if (cache.valid &&
      same_interp_plan_key(cache.key, key) &&
+      same_interp_plan_points(cache, NN, XX) &&
      cache.plan.nblocks == static_cast<int>(block_index.views.size()))
    return cache.plan;

  cache.valid = true;
  cache.key = key;
+  cache.xvals.assign(XX[0], XX[0] + NN);
+  cache.yvals.assign(XX[1], XX[1] + NN);
+  cache.zvals.assign(XX[2], XX[2] + NN);
  cache.plan = CachedInterpPlan();
  CachedInterpPlan &plan = cache.plan;
  plan.nblocks = static_cast<int>(block_index.views.size());
@@ -412,6 +433,9 @@ void release_interp_plan_cache_internal()
 {
  CachedInterpPlanEntry &cache = interp_plan_cache_entry();
  cache.valid = false;
+  cache.xvals.clear();
+  cache.yvals.clear();
+  cache.zvals.clear();
  cache.plan = CachedInterpPlan();
 }

--- a/AMSS_NCKU_source/bssn_cuda_ops.cu
+++ b/AMSS_NCKU_source/bssn_cuda_ops.cu
@@ -1,5 +1,7 @@
 #include "bssn_cuda_ops.h"
 #include "bssn_gpu.h"
+#include "rungekutta4_rout.h"
+#include "sommerfeld_rout.h"

 #include <cmath>
 #include <cstdio>
@@ -262,6 +264,23 @@ inline bool copy_to_device_preferring_device(CachedBuffer &dst, const double *sr
  return true;
 }

+inline bool sync_host_from_mapped_device(double *host_ptr, int count, const char *label)
+{
+  const double *device_ptr = bssn_gpu_find_device_buffer(host_ptr);
+  if (!device_ptr)
+    return true;
+
+  bssn_gpu_prepare_host_buffer(host_ptr, count);
+  const size_t bytes = static_cast<size_t>(count) * sizeof(double);
+  cudaError_t err = cudaMemcpy(host_ptr, device_ptr, bytes, cudaMemcpyDeviceToHost);
+  if (err != cudaSuccess)
+  {
+    report_cuda_error(label, err);
+    return false;
+  }
+  return true;
+}
+
 inline bool copy_region_to_padded_stage(CachedBuffer &dst,
                                        const double *src,
                                        const int src_shape[3],
@@ -1003,6 +1022,8 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
                               double xmin, double ymin, double zmin,
                               double xmax, double ymax, double zmax,
                               const double *state0,
+                               const double *phi_field,
+                               const double *lap_field,
                               const double *boundary_src,
                               double *stage_data,
                               double *rhs_accum,
@@ -1145,7 +1166,7 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
    ok = launch_kernel(grid, block, (const void *)sommerfeld_bam_kernel, args);
  }

-  if (ok)
+  if (ok && lev == 0)
  {
    double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr, *d_rhs = cache.rhs.ptr;
    void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage};
@@ -1154,12 +1175,47 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,

  if (ok && lev > 0)
  {
-      double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr;
-      void *args[] = {&nx, &ny, &nz,
-                      &has_xmin, &has_ymin, &has_zmin,
-                      &has_xmax, &has_ymax, &has_zmax,
-                      &d_state0, &d_stage};
-    ok = launch_kernel(grid, block, (const void *)copy_physical_boundary_kernel, args);
+    double *host_state0 = const_cast<double *>(state0);
+    double *host_phi = const_cast<double *>(phi_field);
+    double *host_lap = const_cast<double *>(lap_field);
+    double *host_rhs = rhs_accum;
+
+    ok = sync_host_from_mapped_device(host_state0, n, "cudaMemcpy(D2H) state0") &&
+         sync_host_from_mapped_device(host_phi, n, "cudaMemcpy(D2H) phi_field") &&
+         sync_host_from_mapped_device(host_lap, n, "cudaMemcpy(D2H) lap_field") &&
+         sync_host_from_mapped_device(host_rhs, n, "cudaMemcpy(D2H) rhs_accum");
+    if (ok)
+    {
+      bssn_gpu_prepare_host_buffer(stage_data, n);
+      cudaError_t err = cudaMemcpy(stage_data, stage_ptr, bytes, cudaMemcpyDeviceToHost);
+      if (err != cudaSuccess)
+      {
+        report_cuda_error("cudaMemcpy(D2H) stage_data", err);
+        ok = false;
+      }
+    }
+
+    if (ok)
+    {
+      int rk_stage_host = rk_stage;
+      int cor = 1;
+      f_rungekutta4_rout(ex, dT, host_state0, stage_data, host_rhs, rk_stage_host);
+      f_sommerfeld_rout(ex,
+                        const_cast<double *>(X), const_cast<double *>(Y), const_cast<double *>(Z),
+                        xmin, ymin, zmin, xmax, ymax, zmax,
+                        dT,
+                        host_phi, host_lap,
+                        host_state0, stage_data,
+                        const_cast<double *>(SoA),
+                        symmetry, cor);
+
+      cudaError_t err = cudaMemcpy(stage_ptr, stage_data, bytes, cudaMemcpyHostToDevice);
+      if (err != cudaSuccess)
+      {
+        report_cuda_error("cudaMemcpy(H2D) stage_data sommerfeld", err);
+        ok = false;
+      }
+    }
  }

  if (ok)
--- a/AMSS_NCKU_source/bssn_cuda_ops.h
+++ b/AMSS_NCKU_source/bssn_cuda_ops.h
@@ -12,6 +12,8 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
                               double xmin, double ymin, double zmin,
                               double xmax, double ymax, double zmax,
                               const double *state0,
+                               const double *phi_field,
+                               const double *lap_field,
                               const double *boundary_src,
                               double *stage_data,
                               double *rhs_accum,
--- a/AMSS_NCKU_source/bssn_cuda_step.C
+++ b/AMSS_NCKU_source/bssn_cuda_step.C
@@ -2,14 +2,15 @@

 #ifdef USE_GPU

+#include <algorithm>
 #include <cmath>
+#include <cstdlib>
 #include <vector>

 #include "bssn_class.h"
 #include "bssn_cuda_ops.h"
 #include "bssn_gpu.h"
 #include "bssn_macro.h"
-#include "rungekutta4_rout.h"

 void bssn_class::Step_MainPath_GPU(int lev, int YN)
 {
@@ -56,11 +57,6 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)

  const bool BB = fgt(PhysTime, StartTime, dT_lev / 2);
  (void)BB;
-#if (MAPBH == 0)
-  const bool need_host_stage_sync = (BH_num > 0 && lev == GH->levels - 1);
-#else
-  const bool need_host_stage_sync = false;
-#endif
  double ndeps = (lev < GH->movls) ? numepsb : numepss;
  double TRK4 = PhysTime;
  int iter_count = 0;
@@ -83,6 +79,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
                                         patch->bbox[0], patch->bbox[1], patch->bbox[2],
                                         patch->bbox[3], patch->bbox[4], patch->bbox[5],
                                         cg->fgfs[varl0->data->sgfn],
+                                         cg->fgfs[phi0->sgfn],
+                                         cg->fgfs[Lap0->sgfn],
                                         cg->fgfs[varlb->data->sgfn],
                                         cg->fgfs[varls->data->sgfn],
                                         cg->fgfs[varlr->data->sgfn],
@@ -124,6 +122,28 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
        }
      };

+  auto stage_download_patch_list =
+      [&](MyList<var> *var_list) {
+        MyList<Patch> *patch_it = GH->PatL[lev];
+        while (patch_it)
+        {
+          MyList<Block> *block_it = patch_it->data->blb;
+          while (block_it)
+          {
+            Block *cg = block_it->data;
+            if (myrank == cg->rank)
+              stage_download_var_list(cg, var_list);
+
+            if (block_it == patch_it->data->ble)
+              break;
+            block_it = block_it->next;
+          }
+          if (ERROR)
+            break;
+          patch_it = patch_it->next;
+        }
+      };
+
  auto ensure_stage_device_var_list =
      [&](Block *cg, MyList<var> *var_list) {
        const int n = cg->shape[0] * cg->shape[1] * cg->shape[2];
@@ -336,8 +356,6 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
               << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
          ERROR = 1;
        }
-        if (!ERROR && !sync_cache_pre[lev].valid)
-          stage_download_var_list(cg, SynchList_pre);
      }
      if (BP == Pp->data->ble)
        break;
@@ -346,8 +364,12 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
    Pp = Pp->next;
  }

-  if (!ERROR && sync_cache_pre[lev].valid && !can_pack_sync_from_device(SynchList_pre, sync_cache_pre[lev]))
-    refresh_stage_host_before_sync(SynchList_pre, sync_cache_pre[lev]);
+  if (!ERROR)
+  {
+    stage_download_patch_list(SynchList_pre);
+    if (!ERROR)
+      bssn_gpu_clear_cached_device_buffers();
+  }

  MPI_Request err_req_pre;
  {
@@ -357,8 +379,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)

  Parallel::AsyncSyncState async_pre;
  Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
-  Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry, need_host_stage_sync);
-  if (!ERROR && need_host_stage_sync)
+  Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry, true);
+  if (!ERROR)
    refresh_stage_device_after_sync(SynchList_pre, sync_cache_pre[lev]);

  MPI_Wait(&err_req_pre, MPI_STATUS_IGNORE);
@@ -427,8 +449,6 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
                 << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
            ERROR = 1;
          }
-          if (!ERROR && (!sync_cache_cor[lev].valid || iter_count == 3))
-            stage_download_var_list(cg, SynchList_cor);
        }

        if (BP == Pp->data->ble)
@@ -438,9 +458,12 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
      Pp = Pp->next;
    }

-    if (!ERROR && sync_cache_cor[lev].valid && iter_count < 3 &&
-        !can_pack_sync_from_device(SynchList_cor, sync_cache_cor[lev]))
-      refresh_stage_host_before_sync(SynchList_cor, sync_cache_cor[lev]);
+    if (!ERROR)
+    {
+      stage_download_patch_list(SynchList_cor);
+      if (!ERROR)
+        bssn_gpu_clear_cached_device_buffers();
+    }

    MPI_Request err_req_cor;
    {
@@ -450,9 +473,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)

    Parallel::AsyncSyncState async_cor;
    Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
-    const bool unpack_cor_to_host = (iter_count == 3) || need_host_stage_sync;
-    Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry, unpack_cor_to_host);
-    if (!ERROR && iter_count < 3 && unpack_cor_to_host)
+    Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry, true);
+    if (!ERROR && iter_count < 3)
      refresh_stage_device_after_sync(SynchList_cor, sync_cache_cor[lev]);

    MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);