Checkpoint Z4C CUDA resident sync progress

2026-05-02 10:53:52 +08:00
parent ba61702fc0
commit 52beb4d153
4 changed files with 1075 additions and 134 deletions
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -424,14 +424,7 @@ bool cuda_can_direct_unpack(const Parallel::gridseg *dst, int type, MyList<var>
    return false;
  if (z4c_cuda_has_resident_state(dst->Bg) == 0)
    return false;
-  if (type != 1 && VarListd)
+  (void)VarListd;
  {
    double *view_ptrs[Z4C_CUDA_STATE_COUNT];
    if (!cuda_build_z4c_host_views(dst->Bg, VarListd, Z4C_CUDA_STATE_COUNT, view_ptrs))
      return false;
    if (z4c_cuda_resident_state_matches(dst->Bg, view_ptrs) == 0)
      return false;
  }
  return true;
 #elif USE_CUDA_BSSN
  return bssn_cuda_has_resident_state(dst->Bg) != 0;
@@ -460,9 +453,16 @@ bool cuda_direct_pack_segment(double *buffer,
  const int j0 = cuda_seg_begin(dst, src->Bg, 1);
  const int k0 = cuda_seg_begin(dst, src->Bg, 2);
 #if USE_CUDA_Z4C && (ABEtype == 2)
-  const bool ok = z4c_cuda_pack_state_batch_to_host_buffer(src->Bg, state_count, buffer, src->Bg->shape,
+  double *views[Z4C_CUDA_STATE_COUNT];
-                                                           i0, j0, k0,
+  const bool have_views = cuda_build_z4c_host_views(src->Bg, VarLists, state_count, views);
-                                                           dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
+  const bool ok = have_views
                      ? z4c_cuda_pack_state_batch_to_host_buffer_for_host_views(
                            src->Bg, views, state_count, buffer, src->Bg->shape,
                            i0, j0, k0,
                            dst->shape[0], dst->shape[1], dst->shape[2]) == 0
                      : z4c_cuda_pack_state_batch_to_host_buffer(src->Bg, state_count, buffer, src->Bg->shape,
                                                                 i0, j0, k0,
                                                                 dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
 #else
  double *views[BSSN_CUDA_STATE_COUNT];
  const bool have_views = cuda_build_bssn_host_views(src->Bg, VarLists, state_count, views);
@@ -500,9 +500,16 @@ bool cuda_direct_unpack_segment(double *buffer,
  const int j0 = cuda_seg_begin(dst, dst->Bg, 1);
  const int k0 = cuda_seg_begin(dst, dst->Bg, 2);
 #if USE_CUDA_Z4C && (ABEtype == 2)
-  const bool ok = z4c_cuda_unpack_state_batch_from_host_buffer(dst->Bg, state_count, buffer, dst->Bg->shape,
+  double *views[Z4C_CUDA_STATE_COUNT];
-                                                               i0, j0, k0,
+  const bool have_views = cuda_build_z4c_host_views(dst->Bg, VarListd, state_count, views);
-                                                               dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
+  const bool ok = have_views
                      ? z4c_cuda_unpack_state_batch_from_host_buffer_for_host_views(
                            dst->Bg, views, state_count, buffer, dst->Bg->shape,
                            i0, j0, k0,
                            dst->shape[0], dst->shape[1], dst->shape[2]) == 0
                      : z4c_cuda_unpack_state_batch_from_host_buffer(dst->Bg, state_count, buffer, dst->Bg->shape,
                                                                     i0, j0, k0,
                                                                     dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
 #else
  double *views[BSSN_CUDA_STATE_COUNT];
  const bool have_views = cuda_build_bssn_host_views(dst->Bg, VarListd, state_count, views);
@@ -703,40 +710,60 @@ bool cuda_direct_pack_segment_to_device(double *buffer,
  {
    const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
    bool ok = false;
    double *views[Z4C_CUDA_STATE_COUNT];
    double soa_flat[3 * Z4C_CUDA_STATE_COUNT];
    const bool have_views = cuda_build_z4c_host_views(src->Bg, VarLists, state_count, views);
    const bool have_soa = cuda_build_state_soa(VarLists, state_count, soa_flat);
    if (type == 1)
    {
      const int i0 = cuda_seg_begin(dst, src->Bg, 0);
      const int j0 = cuda_seg_begin(dst, src->Bg, 1);
      const int k0 = cuda_seg_begin(dst, src->Bg, 2);
-      ok = z4c_cuda_pack_state_batch_to_device_buffer(
+      ok = have_views
-               src->Bg, state_count, buffer, src->Bg->shape,
+               ? z4c_cuda_pack_state_batch_to_device_buffer_for_host_views(
-               i0, j0, k0,
+                     src->Bg, views, state_count, buffer, src->Bg->shape,
-               dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
+                     i0, j0, k0,
                     dst->shape[0], dst->shape[1], dst->shape[2]) == 0
               : z4c_cuda_pack_state_batch_to_device_buffer(
                     src->Bg, state_count, buffer, src->Bg->shape,
                     i0, j0, k0,
                     dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
    }
    else if (type == 2)
    {
      int first_fine[3];
      if (!cuda_cell_gw3_restrict_params(src, dst, first_fine))
        return false;
-      ok = z4c_cuda_restrict_state_batch_to_device_buffer(
+      ok = have_views
-               src->Bg, state_count, buffer, src->Bg->shape,
+               ? z4c_cuda_restrict_state_batch_to_device_buffer_for_host_views(
-               dst->shape[0], dst->shape[1], dst->shape[2],
+                     src->Bg, views, state_count, buffer, src->Bg->shape,
-               first_fine[0], first_fine[1], first_fine[2],
+                     dst->shape[0], dst->shape[1], dst->shape[2],
-               have_soa ? soa_flat : 0) == 0;
+                     first_fine[0], first_fine[1], first_fine[2],
                     have_soa ? soa_flat : 0) == 0
               : z4c_cuda_restrict_state_batch_to_device_buffer(
                     src->Bg, state_count, buffer, src->Bg->shape,
                     dst->shape[0], dst->shape[1], dst->shape[2],
                     first_fine[0], first_fine[1], first_fine[2],
                     have_soa ? soa_flat : 0) == 0;
    }
    else if (type == 3)
    {
      int first_fine_ii[3], coarse_lb[3];
      if (!cuda_cell_gw3_prolong_params(src, dst, first_fine_ii, coarse_lb))
        return false;
-      ok = z4c_cuda_prolong_state_batch_to_device_buffer(
+      ok = have_views
-               src->Bg, state_count, buffer, src->Bg->shape,
+               ? z4c_cuda_prolong_state_batch_to_device_buffer_for_host_views(
-               dst->shape[0], dst->shape[1], dst->shape[2],
+                     src->Bg, views, state_count, buffer, src->Bg->shape,
-               first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
+                     dst->shape[0], dst->shape[1], dst->shape[2],
-               coarse_lb[0], coarse_lb[1], coarse_lb[2],
+                     first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
-               have_soa ? soa_flat : 0) == 0;
+                     coarse_lb[0], coarse_lb[1], coarse_lb[2],
                     have_soa ? soa_flat : 0) == 0
               : z4c_cuda_prolong_state_batch_to_device_buffer(
                     src->Bg, state_count, buffer, src->Bg->shape,
                     dst->shape[0], dst->shape[1], dst->shape[2],
                     first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
                     coarse_lb[0], coarse_lb[1], coarse_lb[2],
                     have_soa ? soa_flat : 0) == 0;
    }
    if (sync_profile_enabled())
      sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0;
@@ -919,10 +946,17 @@ bool cuda_direct_unpack_segment_from_device(double *buffer,
    const int i0 = cuda_seg_begin(dst, dst->Bg, 0);
    const int j0 = cuda_seg_begin(dst, dst->Bg, 1);
    const int k0 = cuda_seg_begin(dst, dst->Bg, 2);
-    const bool ok = z4c_cuda_unpack_state_batch_from_device_buffer(
+    double *views[Z4C_CUDA_STATE_COUNT];
-        dst->Bg, state_count, buffer, dst->Bg->shape,
+    const bool have_views = cuda_build_z4c_host_views(dst->Bg, VarListd, state_count, views);
-        i0, j0, k0,
+    const bool ok = have_views
-        dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
+                        ? z4c_cuda_unpack_state_batch_from_device_buffer_for_host_views(
                              dst->Bg, views, state_count, buffer, dst->Bg->shape,
                              i0, j0, k0,
                              dst->shape[0], dst->shape[1], dst->shape[2]) == 0
                        : z4c_cuda_unpack_state_batch_from_device_buffer(
                              dst->Bg, state_count, buffer, dst->Bg->shape,
                              i0, j0, k0,
                              dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
    if (sync_profile_enabled())
      sync_profile_stats().direct_unpack_sec += MPI_Wtime() - t0;
    return ok;
@@ -1074,23 +1108,39 @@ bool cuda_flush_device_segment_batch(Block *block,
 #if USE_CUDA_Z4C && (ABEtype == 2)
  if (state_count == Z4C_CUDA_STATE_COUNT)
  {
    double *views[Z4C_CUDA_STATE_COUNT];
    double soa_flat[3 * Z4C_CUDA_STATE_COUNT];
    const bool have_views = cuda_build_z4c_host_views(block, vars, state_count, views);
    const bool have_soa = cuda_build_state_soa(vars, state_count, soa_flat);
    if (dir == PACK)
    {
      if (type == 2)
-        return z4c_cuda_restrict_state_segments_to_device_buffer(
+        return have_views
-                   block, state_count, data, block->shape, segment_count,
+                   ? z4c_cuda_restrict_state_segments_to_device_buffer_for_host_views(
-                   meta.data(), have_soa ? soa_flat : 0) == 0;
+                         block, views, state_count, data, block->shape, segment_count,
                         meta.data(), have_soa ? soa_flat : 0) == 0
                   : z4c_cuda_restrict_state_segments_to_device_buffer(
                         block, state_count, data, block->shape, segment_count,
                         meta.data(), have_soa ? soa_flat : 0) == 0;
      if (type == 3)
-        return z4c_cuda_prolong_state_segments_to_device_buffer(
+        return have_views
-                   block, state_count, data, block->shape, segment_count,
+                   ? z4c_cuda_prolong_state_segments_to_device_buffer_for_host_views(
-                   meta.data(), have_soa ? soa_flat : 0) == 0;
+                         block, views, state_count, data, block->shape, segment_count,
-      return z4c_cuda_pack_state_segments_to_device_buffer(
+                         meta.data(), have_soa ? soa_flat : 0) == 0
-                 block, state_count, data, block->shape, segment_count, meta.data()) == 0;
+                   : z4c_cuda_prolong_state_segments_to_device_buffer(
                         block, state_count, data, block->shape, segment_count,
                         meta.data(), have_soa ? soa_flat : 0) == 0;
      return have_views
                 ? z4c_cuda_pack_state_segments_to_device_buffer_for_host_views(
                       block, views, state_count, data, block->shape, segment_count, meta.data()) == 0
                 : z4c_cuda_pack_state_segments_to_device_buffer(
                       block, state_count, data, block->shape, segment_count, meta.data()) == 0;
    }
-    return z4c_cuda_unpack_state_segments_from_device_buffer(
+    return have_views
-               block, state_count, data, block->shape, segment_count, meta.data()) == 0;
+               ? z4c_cuda_unpack_state_segments_from_device_buffer_for_host_views(
                     block, views, state_count, data, block->shape, segment_count, meta.data()) == 0
               : z4c_cuda_unpack_state_segments_from_device_buffer(
                     block, state_count, data, block->shape, segment_count, meta.data()) == 0;
  }
 #endif
 #if USE_CUDA_BSSN
@@ -5294,7 +5344,7 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
 	                         dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
 	                         dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
 	            }
-	          }
+          }
          if (dir == UNPACK) // from target data to corresponding grid
          {
            f_copy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn],
--- a/AMSS_NCKU_source/Z4c_class.C
+++ b/AMSS_NCKU_source/Z4c_class.C
@@ -388,41 +388,57 @@ bool z4c_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
          if (z4c_cuda_has_resident_state(block) &&
              block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn)
          {
            const int sx = ordn;
            const int sy = ordn;
            const int sz = ordn;
            const int region_all = sx * sy * sz;
            const int i0 = z4c_cuda_interp_tile_start(block->X[0], block->shape[0], x, DH[0], ordn);
            const int j0 = z4c_cuda_interp_tile_start(block->X[1], block->shape[1], y, DH[1], ordn);
            const int k0 = z4c_cuda_interp_tile_start(block->X[2], block->shape[2], z, DH[2], ordn);
            double *packed_fields = new double[3 * region_all];
            var *vars[3] = {forx, fory, forz};
-            for (int f = 0; f < 3; f++)
+            static int use_device_bh_interp = -1;
            if (use_device_bh_interp < 0)
            {
-              if (z4c_cuda_pack_state_region_to_host_buffer(block,
+              const char *env = getenv("AMSS_CUDA_Z4C_BH_INTERP_DEVICE");
-                                                            k_z4c_cuda_bh_state_indices[f],
+              use_device_bh_interp = (env && atoi(env) != 0) ? 1 : 0;
-                                                            packed_fields + f * region_all,
+            }
-                                                            block->shape,
+            bool used_device_interp = false;
-                                                            i0, j0, k0,
+            if (use_device_bh_interp)
-                                                            sx, sy, sz) != 0)
+            {
              double soa3[9];
              for (int f = 0; f < 3; f++)
              {
-                delete[] packed_fields;
+                soa3[3 * f + 0] = vars[f]->SoA[0];
-                cout << "CUDA Z4C BH tile download failed" << endl;
+                soa3[3 * f + 1] = vars[f]->SoA[1];
                soa3[3 * f + 2] = vars[f]->SoA[2];
              }
              used_device_interp =
                  (z4c_cuda_interp_state_point3(block, block->shape,
                                                k_z4c_cuda_bh_state_indices[0],
                                                k_z4c_cuda_bh_state_indices[1],
                                                k_z4c_cuda_bh_state_indices[2],
                                                block->X[0][0], block->X[1][0], block->X[2][0],
                                                DH[0], DH[1], DH[2],
                                                x, y, z,
                                                interp_ordn, interp_sym,
                                                soa3, shellf) == 0);
            }
            if (!used_device_interp)
            {
              double *shift_views[3] = {
                  block->fgfs[forx->sgfn],
                  block->fgfs[fory->sgfn],
                  block->fgfs[forz->sgfn]};
              if (z4c_cuda_download_state_subset(block, block->shape, 3,
                                                 k_z4c_cuda_bh_state_indices,
                                                 shift_views) != 0)
              {
                cout << "CUDA Z4C BH shift download failed" << endl;
                MPI_Abort(MPI_COMM_WORLD, 1);
              }
-              int tile_shape[3] = {sx, sy, sz};
+              f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
-              f_global_interp(tile_shape,
+                              block->fgfs[forx->sgfn], shellf[0],
-                              block->X[0] + i0,
+                              x, y, z, interp_ordn, forx->SoA, interp_sym);
-                              block->X[1] + j0,
+              f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
-                              block->X[2] + k0,
+                              block->fgfs[fory->sgfn], shellf[1],
-                              packed_fields + f * region_all,
+                              x, y, z, interp_ordn, fory->SoA, interp_sym);
-                              shellf[f],
+              f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
-                              x, y, z,
+                              block->fgfs[forz->sgfn], shellf[2],
-                              interp_ordn,
+                              x, y, z, interp_ordn, forz->SoA, interp_sym);
                              vars[f]->SoA,
                              interp_sym);
            }
            delete[] packed_fields;
          }
          else
          {
--- a/AMSS_NCKU_source/z4c_rhs_cuda.cu
+++ b/AMSS_NCKU_source/z4c_rhs_cuda.cu
--- a/AMSS_NCKU_source/z4c_rhs_cuda.h
+++ b/AMSS_NCKU_source/z4c_rhs_cuda.h
@@ -53,6 +53,14 @@ int z4c_cuda_pack_state_batch_to_host_buffer(void *block_tag,
                                             int i0, int j0, int k0,
                                             int sx, int sy, int sz);
 int z4c_cuda_pack_state_batch_to_host_buffer_for_host_views(void *block_tag,
                                                            double **state_host_key,
                                                            int state_count,
                                                            double *host_buffer,
                                                            int *ex,
                                                            int i0, int j0, int k0,
                                                            int sx, int sy, int sz);
 int z4c_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
                                                 int state_count,
                                                 double *host_buffer,
@@ -60,6 +68,14 @@ int z4c_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
                                                 int i0, int j0, int k0,
                                                 int sx, int sy, int sz);
 int z4c_cuda_unpack_state_batch_from_host_buffer_for_host_views(void *block_tag,
                                                                double **state_host_key,
                                                                int state_count,
                                                                double *host_buffer,
                                                                int *ex,
                                                                int i0, int j0, int k0,
                                                                int sx, int sy, int sz);
 int z4c_cuda_pack_state_batch_to_device_buffer(void *block_tag,
                                               int state_count,
                                               double *device_buffer,
@@ -67,6 +83,14 @@ int z4c_cuda_pack_state_batch_to_device_buffer(void *block_tag,
                                               int i0, int j0, int k0,
                                               int sx, int sy, int sz);
 int z4c_cuda_pack_state_batch_to_device_buffer_for_host_views(void *block_tag,
                                                              double **state_host_key,
                                                              int state_count,
                                                              double *device_buffer,
                                                              int *ex,
                                                              int i0, int j0, int k0,
                                                              int sx, int sy, int sz);
 int z4c_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
                                                   int state_count,
                                                   double *device_buffer,
@@ -74,6 +98,14 @@ int z4c_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
                                                   int i0, int j0, int k0,
                                                   int sx, int sy, int sz);
 int z4c_cuda_unpack_state_batch_from_device_buffer_for_host_views(void *block_tag,
                                                                  double **state_host_key,
                                                                  int state_count,
                                                                  double *device_buffer,
                                                                  int *ex,
                                                                  int i0, int j0, int k0,
                                                                  int sx, int sy, int sz);
 int z4c_cuda_pack_state_segments_to_device_buffer(void *block_tag,
                                                  int state_count,
                                                  double *device_buffer,
@@ -81,6 +113,14 @@ int z4c_cuda_pack_state_segments_to_device_buffer(void *block_tag,
                                                  int segment_count,
                                                  const int *segment_meta);
 int z4c_cuda_pack_state_segments_to_device_buffer_for_host_views(void *block_tag,
                                                                 double **state_host_key,
                                                                 int state_count,
                                                                 double *device_buffer,
                                                                 int *ex,
                                                                 int segment_count,
                                                                 const int *segment_meta);
 int z4c_cuda_unpack_state_segments_from_device_buffer(void *block_tag,
                                                      int state_count,
                                                      double *device_buffer,
@@ -88,6 +128,14 @@ int z4c_cuda_unpack_state_segments_from_device_buffer(void *block_tag,
                                                      int segment_count,
                                                      const int *segment_meta);
 int z4c_cuda_unpack_state_segments_from_device_buffer_for_host_views(void *block_tag,
                                                                     double **state_host_key,
                                                                     int state_count,
                                                                     double *device_buffer,
                                                                     int *ex,
                                                                     int segment_count,
                                                                     const int *segment_meta);
 int z4c_cuda_restrict_state_segments_to_device_buffer(void *block_tag,
                                                      int state_count,
                                                      double *device_buffer,
@@ -96,6 +144,15 @@ int z4c_cuda_restrict_state_segments_to_device_buffer(void *block_tag,
                                                      const int *segment_meta,
                                                      const double *state_soa);
 int z4c_cuda_restrict_state_segments_to_device_buffer_for_host_views(void *block_tag,
                                                                     double **state_host_key,
                                                                     int state_count,
                                                                     double *device_buffer,
                                                                     int *ex,
                                                                     int segment_count,
                                                                     const int *segment_meta,
                                                                     const double *state_soa);
 int z4c_cuda_prolong_state_segments_to_device_buffer(void *block_tag,
                                                     int state_count,
                                                     double *device_buffer,
@@ -104,6 +161,15 @@ int z4c_cuda_prolong_state_segments_to_device_buffer(void *block_tag,
                                                     const int *segment_meta,
                                                     const double *state_soa);
 int z4c_cuda_prolong_state_segments_to_device_buffer_for_host_views(void *block_tag,
                                                                    double **state_host_key,
                                                                    int state_count,
                                                                    double *device_buffer,
                                                                    int *ex,
                                                                    int segment_count,
                                                                    const int *segment_meta,
                                                                    const double *state_soa);
 int z4c_cuda_restrict_state_batch_to_device_buffer(void *block_tag,
                                                   int state_count,
                                                   double *device_buffer,
@@ -112,6 +178,15 @@ int z4c_cuda_restrict_state_batch_to_device_buffer(void *block_tag,
                                                   int fi0, int fj0, int fk0,
                                                   const double *state_soa);
 int z4c_cuda_restrict_state_batch_to_device_buffer_for_host_views(void *block_tag,
                                                                  double **state_host_key,
                                                                  int state_count,
                                                                  double *device_buffer,
                                                                  int *ex,
                                                                  int sx, int sy, int sz,
                                                                  int fi0, int fj0, int fk0,
                                                                  const double *state_soa);
 int z4c_cuda_prolong_state_batch_to_device_buffer(void *block_tag,
                                                  int state_count,
                                                  double *device_buffer,
@@ -121,6 +196,16 @@ int z4c_cuda_prolong_state_batch_to_device_buffer(void *block_tag,
                                                  int lbc_i, int lbc_j, int lbc_k,
                                                  const double *state_soa);
 int z4c_cuda_prolong_state_batch_to_device_buffer_for_host_views(void *block_tag,
                                                                 double **state_host_key,
                                                                 int state_count,
                                                                 double *device_buffer,
                                                                 int *ex,
                                                                 int sx, int sy, int sz,
                                                                 int ii0, int jj0, int kk0,
                                                                 int lbc_i, int lbc_j, int lbc_k,
                                                                 const double *state_soa);
 int z4c_cuda_download_state_subset(void *block_tag,
                                   int *ex,
                                   int subset_count,
@@ -138,6 +223,25 @@ int z4c_cuda_compute_constraints_resident(void *block_tag,
                                          int Symmetry, double eps, int co,
                                          double **constraint_host_out);
 int z4c_cuda_interp_state_point3(void *block_tag,
                                 int *ex,
                                 int state0,
                                 int state1,
                                 int state2,
                                 double x0,
                                 double y0,
                                 double z0,
                                 double dx,
                                 double dy,
                                 double dz,
                                 double px,
                                 double py,
                                 double pz,
                                 int ordn,
                                 int symmetry,
                                 const double *soa3,
                                 double *out3);
 int z4c_cuda_download_constraint_outputs(int *ex,
                                         double **constraint_host_out);