Add CUDA AMR restrict diagnostics

2026-04-30 12:20:44 +08:00
parent 8486532920
commit a6483d013d
1 changed files with 168 additions and 2 deletions
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -6,6 +6,7 @@
 #include "parameters.h"
 #include <cstdlib>
 #include <cstdio>
+#include <cmath>
 #include <vector>

 #ifndef USE_CUDA_Z4C
@@ -219,6 +220,12 @@ int fortran_idint(double x)

 bool cuda_amr_restrict_device_enabled();
 bool cuda_amr_prolong_device_enabled();
+bool cuda_amr_restrict_compare_enabled();
+bool cuda_amr_restrict_batch_enabled();
+bool cuda_device_segment_batch_enabled();
+bool cuda_download_resident_subset_to_host(Block *block,
+                                           MyList<var> *vars,
+                                           int state_count);

 bool cuda_cell_gw3_restrict_params(const Parallel::gridseg *src,
                                   const Parallel::gridseg *dst,
@@ -479,6 +486,61 @@ bool cuda_amr_prolong_device_enabled()
  return enabled != 0;
 }

+bool cuda_amr_restrict_compare_enabled()
+{
+  static int enabled = -1;
+  if (enabled < 0)
+  {
+    const char *env = getenv("AMSS_CUDA_AMR_RESTRICT_COMPARE");
+    enabled = (env && atoi(env) != 0) ? 1 : 0;
+  }
+  return enabled != 0;
+}
+
+bool cuda_amr_restrict_batch_enabled()
+{
+  static int enabled = -1;
+  if (enabled < 0)
+  {
+    const char *env = getenv("AMSS_CUDA_AMR_RESTRICT_BATCH");
+    enabled = (env && atoi(env) != 0) ? 1 : 0;
+  }
+  return enabled != 0;
+}
+
+bool cuda_device_segment_batch_enabled()
+{
+  static int enabled = -1;
+  if (enabled < 0)
+  {
+    const char *env = getenv("AMSS_CUDA_DEVICE_SEGMENT_BATCH");
+    enabled = (!env || atoi(env) != 0) ? 1 : 0;
+  }
+  return enabled != 0;
+}
+
+double cuda_amr_restrict_compare_tol()
+{
+  static double tol = -1.0;
+  if (tol < 0.0)
+  {
+    const char *env = getenv("AMSS_CUDA_AMR_RESTRICT_COMPARE_TOL");
+    tol = (env && atof(env) > 0.0) ? atof(env) : 1.0e-9;
+  }
+  return tol;
+}
+
+int cuda_amr_restrict_compare_limit()
+{
+  static int limit = -1;
+  if (limit < 0)
+  {
+    const char *env = getenv("AMSS_CUDA_AMR_RESTRICT_COMPARE_LIMIT");
+    limit = (env && atoi(env) > 0) ? atoi(env) : 8;
+  }
+  return limit;
+}
+
 bool cuda_mpi_diag_enabled()
 {
  static int enabled = -1;
@@ -543,7 +605,8 @@ bool cuda_direct_pack_segment_to_device(double *buffer,
                                        const Parallel::gridseg *dst,
                                        int state_count,
                                        int type,
-                                        MyList<var> *VarLists)
+                                        MyList<var> *VarLists,
+                                        int Symmetry)
 {
 #if USE_CUDA_BSSN
  if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
@@ -584,6 +647,103 @@ bool cuda_direct_pack_segment_to_device(double *buffer,
 	                   src->Bg, state_count, buffer, src->Bg->shape,
 	                   dst->shape[0], dst->shape[1], dst->shape[2],
                   first_fine[0], first_fine[1], first_fine[2]) == 0;
+    if (ok && cuda_amr_restrict_compare_enabled())
+    {
+      const int region_all = dst->shape[0] * dst->shape[1] * dst->shape[2];
+      const int total = state_count * region_all;
+      double *cpu = new double[total];
+      double *gpu = new double[total];
+      if (!cuda_download_resident_subset_to_host(src->Bg, VarLists, state_count))
+      {
+        delete[] cpu;
+        delete[] gpu;
+        return false;
+      }
+      int DIM = dim;
+      MyList<var> *v = VarLists;
+      for (int s = 0; s < state_count && v; ++s, v = v->next)
+      {
+        f_restrict3(DIM,
+                    const_cast<double *>(dst->llb),
+                    const_cast<double *>(dst->uub),
+                    const_cast<int *>(dst->shape),
+                    cpu + (size_t)s * region_all,
+                    src->Bg->bbox,
+                    src->Bg->bbox + dim,
+                    src->Bg->shape,
+                    src->Bg->fgfs[v->data->sgfn],
+                    const_cast<double *>(dst->llb),
+                    const_cast<double *>(dst->uub),
+                    v->data->SoA,
+                    Symmetry);
+      }
+      cudaError_t cerr = cudaMemcpy(gpu, buffer, (size_t)total * sizeof(double), cudaMemcpyDeviceToHost);
+      if (cerr != cudaSuccess)
+      {
+        fprintf(stderr, "Parallel: restrict compare cudaMemcpy failed, err=%d\n", (int)cerr);
+        delete[] cpu;
+        delete[] gpu;
+        return false;
+      }
+      double max_abs = 0.0;
+      double max_rel = 0.0;
+      int max_idx = -1;
+      for (int i = 0; i < total; ++i)
+      {
+        const double diff = fabs(cpu[i] - gpu[i]);
+        const double den = fmax(fabs(cpu[i]), fabs(gpu[i]));
+        const double rel = den > 0.0 ? diff / den : diff;
+        if (diff > max_abs)
+        {
+          max_abs = diff;
+          max_rel = rel;
+          max_idx = i;
+        }
+      }
+      static int report_count = 0;
+      const double tol = cuda_amr_restrict_compare_tol();
+      int rank = 0;
+      MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+      if (max_abs > tol || max_rel > tol)
+      {
+        const int state = max_idx / region_all;
+        const int local = max_idx - state * region_all;
+        const int ii = local % dst->shape[0];
+        const int jj = (local / dst->shape[0]) % dst->shape[1];
+        const int kk = local / (dst->shape[0] * dst->shape[1]);
+        if (report_count < cuda_amr_restrict_compare_limit())
+        {
+          fprintf(stderr,
+                  "[AMSS-CUDA-RESTRICT-CMP][rank %d] mismatch state=%d point=(%d,%d,%d) "
+                  "shape=(%d,%d,%d) first_fine=(%d,%d,%d) max_abs=%.17e max_rel=%.17e "
+                  "cpu=%.17e gpu=%.17e src_lev=%d dst_lev=%d\n",
+                  rank, state, ii, jj, kk,
+                  dst->shape[0], dst->shape[1], dst->shape[2],
+                  first_fine[0], first_fine[1], first_fine[2],
+                  max_abs, max_rel, cpu[max_idx], gpu[max_idx],
+                  src->Bg->lev, dst->Bg->lev);
+          fflush(stderr);
+          report_count++;
+        }
+        delete[] cpu;
+        delete[] gpu;
+        return false;
+      }
+      else if (report_count < cuda_amr_restrict_compare_limit())
+      {
+        fprintf(stderr,
+                "[AMSS-CUDA-RESTRICT-CMP][rank %d] ok shape=(%d,%d,%d) "
+                "first_fine=(%d,%d,%d) max_abs=%.17e max_rel=%.17e\n",
+                rank,
+                dst->shape[0], dst->shape[1], dst->shape[2],
+                first_fine[0], first_fine[1], first_fine[2],
+                max_abs, max_rel);
+        fflush(stderr);
+        report_count++;
+      }
+      delete[] cpu;
+      delete[] gpu;
+    }
  }
  else if (type == 3)
  {
@@ -762,6 +922,10 @@ int cuda_data_packer_device_batched(double *data,
  (void)Symmetry;
  if (!data || (dir != PACK && dir != UNPACK) || !src || !dst)
    return -1;
+  if (!cuda_device_segment_batch_enabled())
+    return -1;
+  if (cuda_amr_restrict_compare_enabled())
+    return -1;

  int myrank;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
@@ -790,6 +954,8 @@ int cuda_data_packer_device_batched(double *data,
        type = 2;
      else
        type = 3;
+      if (dir == PACK && type == 2 && !cuda_amr_restrict_batch_enabled())
+        return -1;

      Block *block = (dir == PACK) ? src->data->Bg : dst->data->Bg;
      if ((dir == PACK && !cuda_can_direct_pack(src->data, dst->data, type)) ||
@@ -4819,7 +4985,7 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
              cuda_can_direct_pack(src->data, dst->data, type))
          {
            if (s_cuda_aware_pack_active) {
-              handled_by_cuda = cuda_direct_pack_segment_to_device(data + size_out, src->data, dst->data, state_count, type, VarLists);
+              handled_by_cuda = cuda_direct_pack_segment_to_device(data + size_out, src->data, dst->data, state_count, type, VarLists, Symmetry);
            } else {
              handled_by_cuda = cuda_direct_pack_segment(data + size_out, src->data, dst->data, state_count, VarLists);
            }