Add optional CUDA surface interpolation

2026-04-30 19:21:19 +08:00
parent 6835608f92
commit be9033f449
3 changed files with 270 additions and 6 deletions
--- a/AMSS_NCKU_source/MPatch.C
+++ b/AMSS_NCKU_source/MPatch.C
@@ -11,12 +11,15 @@
 using namespace std;

 #include "misc.h"
-#include "MPatch.h"
-#include "Parallel.h"
-#include "fmisc.h"
-#ifdef INTERP_LB_PROFILE
-#include "interp_lb_profile.h"
-#endif
+#include "MPatch.h"
+#include "Parallel.h"
+#include "fmisc.h"
+#if USE_CUDA_BSSN
+#include "bssn_rhs_cuda.h"
+#endif
+#ifdef INTERP_LB_PROFILE
+#include "interp_lb_profile.h"
+#endif

 namespace
 {
@@ -195,6 +198,17 @@ bool interp_fast_enabled()
  return enabled != 0;
 }

+bool interp_gpu_enabled()
+{
+  static int enabled = -1;
+  if (enabled < 0)
+  {
+    const char *env = getenv("AMSS_INTERP_GPU");
+    enabled = (env && atoi(env) != 0) ? 1 : 0;
+  }
+  return enabled != 0;
+}
+
 bool interp_fast_compare_enabled()
 {
  static int enabled = -1;
@@ -1113,6 +1127,101 @@ void Patch::Interp_Points(MyList<var> *VarList,
  }

  // --- Interpolation phase (identical to original) ---
+#if USE_CUDA_BSSN
+  const bool use_gpu_interp = interp_gpu_enabled() && use_surface_cache && num_var == 2 &&
+                              VarList && VarList->next && !VarList->next->next;
+#else
+  const bool use_gpu_interp = false;
+#endif
+  if (use_gpu_interp)
+  {
+#if USE_CUDA_BSSN
+    vector<vector<int> > local_points(block_index.views.size());
+    for (int j = 0; j < NN; j++)
+    {
+      for (int i = 0; i < dim; i++)
+      {
+        if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
+        {
+          cout << "Patch::Interp_Points: point (";
+          for (int k = 0; k < dim; k++)
+          {
+            cout << XX[k][j];
+            if (k < dim - 1)
+              cout << ",";
+            else
+              cout << ") is out of current Patch." << endl;
+          }
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+      }
+
+      CachedInterpPoint &cp = surface_cache->points[j];
+      Block *BP = cp.bp;
+      owner_rank[j] = cp.owner_rank;
+      if (BP && myrank == BP->rank)
+      {
+        for (size_t bi = 0; bi < block_index.views.size(); bi++)
+        {
+          if (block_index.views[bi].bp == BP)
+          {
+            local_points[bi].push_back(j);
+            break;
+          }
+        }
+      }
+    }
+
+    var *v0 = VarList->data;
+    var *v1 = VarList->next->data;
+    double soa6[6] = {
+        v0->SoA[0], v0->SoA[1], v0->SoA[2],
+        v1->SoA[0], v1->SoA[1], v1->SoA[2]};
+
+    for (size_t bi = 0; bi < local_points.size(); bi++)
+    {
+      const int count = int(local_points[bi].size());
+      if (count <= 0)
+        continue;
+
+      Block *BP = block_index.views[bi].bp;
+      vector<double> px(count), py(count), pz(count), out(2 * count);
+      for (int q = 0; q < count; q++)
+      {
+        const int j = local_points[bi][q];
+        px[q] = XX[0][j];
+        py[q] = XX[1][j];
+        pz[q] = XX[2][j];
+      }
+
+      const double dx = BP->X[0][1] - BP->X[0][0];
+      const double dy = BP->X[1][1] - BP->X[1][0];
+      const double dz = BP->X[2][1] - BP->X[2][0];
+      const int ok = bssn_cuda_interp_host_two_fields(
+          BP, BP->shape,
+          BP->fgfs[v0->sgfn], BP->fgfs[v1->sgfn],
+          BP->X[0][0], BP->X[1][0], BP->X[2][0],
+          dx, dy, dz,
+          &px[0], &py[0], &pz[0], count,
+          ordn, Symmetry, soa6, &out[0]);
+      if (ok != 0)
+      {
+        if (myrank == 0)
+          cout << "Patch::Interp_Points: CUDA two-field interpolation failed" << endl;
+        MPI_Abort(MPI_COMM_WORLD, 1);
+      }
+
+      for (int q = 0; q < count; q++)
+      {
+        const int j = local_points[bi][q];
+        Shellf[j * num_var] = out[2 * q];
+        Shellf[j * num_var + 1] = out[2 * q + 1];
+      }
+    }
+#endif
+  }
+  else
+  {
  for (int j = 0; j < NN; j++)
  {
    double pox[dim];
@@ -1183,6 +1292,7 @@ void Patch::Interp_Points(MyList<var> *VarList,
      }
    }
  }
+  }

 #ifdef INTERP_LB_PROFILE
  double t_interp_end = MPI_Wtime();
--- a/AMSS_NCKU_source/bssn_rhs_cuda.cu
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu
@@ -5765,6 +5765,65 @@ __global__ void kern_interp_state_point3(const double * __restrict__ mem,
    out[f] = value;
 }

+__global__ void kern_interp_host_two_fields(const double * __restrict__ field0,
+                                            const double * __restrict__ field1,
+                                            const double * __restrict__ px,
+                                            const double * __restrict__ py,
+                                            const double * __restrict__ pz,
+                                            double * __restrict__ out,
+                                            int nx,
+                                            int ny,
+                                            int nz,
+                                            int all,
+                                            double x0,
+                                            double y0,
+                                            double z0,
+                                            double dx,
+                                            double dy,
+                                            double dz,
+                                            int npoints,
+                                            int ordn,
+                                            int symmetry,
+                                            double soa00, double soa01, double soa02,
+                                            double soa10, double soa11, double soa12)
+{
+    const int p = blockIdx.x * blockDim.x + threadIdx.x;
+    if (p >= npoints || ordn <= 0 || ordn > 8)
+        return;
+
+    int ib, jb, kb;
+    double tx, ty, tz;
+    interp_axis_window(px[p], x0, dx, nx, ordn, symmetry, 0, ib, tx);
+    interp_axis_window(py[p], y0, dy, ny, ordn, symmetry, 1, jb, ty);
+    interp_axis_window(pz[p], z0, dz, nz, ordn, symmetry, 2, kb, tz);
+
+    double wx[8], wy[8], wz[8];
+    for (int i = 0; i < ordn; ++i) {
+        wx[i] = interp_lagrange_weight(i, tx, ordn);
+        wy[i] = interp_lagrange_weight(i, ty, ordn);
+        wz[i] = interp_lagrange_weight(i, tz, ordn);
+    }
+
+    double v0 = 0.0;
+    double v1 = 0.0;
+    const double soa0[3] = {soa00, soa01, soa02};
+    const double soa1[3] = {soa10, soa11, soa12};
+    for (int k = 0; k < ordn; ++k) {
+        for (int j = 0; j < ordn; ++j) {
+            const double wyz = wy[j] * wz[k];
+            for (int i = 0; i < ordn; ++i) {
+                const double coeff = wx[i] * wyz;
+                v0 += coeff * load_interp_value(field0, nx, ny, nz, all, 0,
+                                                ib + i, jb + j, kb + k, soa0);
+                v1 += coeff * load_interp_value(field1, nx, ny, nz, all, 0,
+                                                ib + i, jb + j, kb + k, soa1);
+            }
+        }
+    }
+    out[2 * p] = v0;
+    out[2 * p + 1] = v1;
+}
+
 __global__ void kern_pack_state_region_batch(const double * __restrict__ src_mem,
                                             double * __restrict__ dst,
                                             int nx, int ny,
@@ -7076,6 +7135,82 @@ int bssn_cuda_interp_state_point3(void *block_tag,
    return 0;
 }

+extern "C"
+int bssn_cuda_interp_host_two_fields(void *block_tag,
+                                     int *ex,
+                                     double *field0,
+                                     double *field1,
+                                     double x0,
+                                     double y0,
+                                     double z0,
+                                     double dx,
+                                     double dy,
+                                     double dz,
+                                     const double *px,
+                                     const double *py,
+                                     const double *pz,
+                                     int npoints,
+                                     int ordn,
+                                     int symmetry,
+                                     const double *soa6,
+                                     double *out_interleaved)
+{
+    (void)block_tag;
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    if (!ex || !field0 || !field1 || !px || !py || !pz || !soa6 ||
+        !out_interleaved || npoints <= 0)
+        return 1;
+    if (ex[0] <= 0 || ex[1] <= 0 || ex[2] <= 0 ||
+        ordn <= 0 || ordn > 8 ||
+        ex[0] < ordn || ex[1] < ordn || ex[2] < ordn)
+        return 1;
+
+    const int all = ex[0] * ex[1] * ex[2];
+    const size_t field_bytes = (size_t)all * sizeof(double);
+    const size_t point_bytes = (size_t)npoints * sizeof(double);
+    const size_t out_bytes = (size_t)2 * npoints * sizeof(double);
+
+    double *d_field0 = nullptr;
+    double *d_field1 = nullptr;
+    double *d_px = nullptr;
+    double *d_py = nullptr;
+    double *d_pz = nullptr;
+    double *d_out = nullptr;
+    CUDA_CHECK(cudaMalloc(&d_field0, field_bytes));
+    CUDA_CHECK(cudaMalloc(&d_field1, field_bytes));
+    CUDA_CHECK(cudaMalloc(&d_px, point_bytes));
+    CUDA_CHECK(cudaMalloc(&d_py, point_bytes));
+    CUDA_CHECK(cudaMalloc(&d_pz, point_bytes));
+    CUDA_CHECK(cudaMalloc(&d_out, out_bytes));
+
+    CUDA_CHECK(cudaMemcpy(d_field0, field0, field_bytes, cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(d_field1, field1, field_bytes, cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(d_px, px, point_bytes, cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(d_py, py, point_bytes, cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(d_pz, pz, point_bytes, cudaMemcpyHostToDevice));
+
+    const int threads = 256;
+    const int blocks = (npoints + threads - 1) / threads;
+    kern_interp_host_two_fields<<<blocks, threads>>>(
+        d_field0, d_field1, d_px, d_py, d_pz, d_out,
+        ex[0], ex[1], ex[2], all,
+        x0, y0, z0, dx, dy, dz,
+        npoints, ordn, symmetry,
+        soa6[0], soa6[1], soa6[2],
+        soa6[3], soa6[4], soa6[5]);
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaMemcpy(out_interleaved, d_out, out_bytes, cudaMemcpyDeviceToHost));
+
+    cudaFree(d_out);
+    cudaFree(d_pz);
+    cudaFree(d_py);
+    cudaFree(d_px);
+    cudaFree(d_field1);
+    cudaFree(d_field0);
+    return 0;
+}
+
 extern "C"
 int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
                                                   int state_index,
--- a/AMSS_NCKU_source/bssn_rhs_cuda.h
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.h
@@ -102,6 +102,25 @@ int bssn_cuda_interp_state_point3(void *block_tag,
                                  const double *soa3,
                                  double *out3);

+int bssn_cuda_interp_host_two_fields(void *block_tag,
+                                     int *ex,
+                                     double *field0,
+                                     double *field1,
+                                     double x0,
+                                     double y0,
+                                     double z0,
+                                     double dx,
+                                     double dy,
+                                     double dz,
+                                     const double *px,
+                                     const double *py,
+                                     const double *pz,
+                                     int npoints,
+                                     int ordn,
+                                     int symmetry,
+                                     const double *soa6,
+                                     double *out_interleaved);
+
 int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
                                                   int state_index,
                                                   double *host_buffer,