From c4d8d41b25c1dc36833e5d56dd86d440083e3cc9 Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Thu, 7 May 2026 19:49:09 +0800 Subject: [PATCH] Cover Z4C CUDA AMR restrict prolong --- AMSS_NCKU_source/Parallel.C | 105 ++++++++++---- AMSS_NCKU_source/Z4c_class.C | 2 +- AMSS_NCKU_source/bssn_rhs_cuda.cu | 6 +- AMSS_NCKU_source/z4c_rhs_cuda.cu | 221 ++++++++++++++++++++++++++++++ AMSS_NCKU_source/z4c_rhs_cuda.h | 17 +++ makefile_and_run.py | 2 +- 6 files changed, 321 insertions(+), 32 deletions(-) diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index 8c16733..1a94d3c 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -174,6 +174,27 @@ int cuda_state_var_count(MyList *src_vars, MyList *dst_vars) return (src_vars || dst_vars) ? -1 : count; } +#if USE_CUDA_BSSN || USE_CUDA_Z4C +bool cuda_build_state_soa(MyList *vars, + int state_count, + double *soa_flat) +{ + if (!vars || !soa_flat || state_count <= 0) + return false; + MyList *v = vars; + for (int i = 0; i < state_count; ++i) + { + if (!v) + return false; + soa_flat[3 * i + 0] = v->data->SoA[0]; + soa_flat[3 * i + 1] = v->data->SoA[1]; + soa_flat[3 * i + 2] = v->data->SoA[2]; + v = v->next; + } + return v == 0; +} +#endif + #if USE_CUDA_BSSN bool cuda_build_bssn_host_views(Block *block, MyList *vars, @@ -201,17 +222,7 @@ bool cuda_build_bssn_soa(MyList *vars, if (!vars || !soa_flat || state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT) return false; - MyList *v = vars; - for (int i = 0; i < state_count; ++i) - { - if (!v) - return false; - soa_flat[3 * i + 0] = v->data->SoA[0]; - soa_flat[3 * i + 1] = v->data->SoA[1]; - soa_flat[3 * i + 2] = v->data->SoA[2]; - v = v->next; - } - return v == 0; + return cuda_build_state_soa(vars, state_count, soa_flat); } #endif @@ -234,7 +245,7 @@ bool cuda_cell_gw3_restrict_params(const Parallel::gridseg *src, const Parallel::gridseg *dst, int first_fine[3]) { -#if USE_CUDA_BSSN && defined(Cell) && ((ghost_width == 3) || (ghost_width == 4)) +#if (USE_CUDA_BSSN || (USE_CUDA_Z4C && (ABEtype == 2))) && defined(Cell) && ((ghost_width == 3) || (ghost_width == 4)) #if ghost_width == 4 const int stencil_hi = 4; #else @@ -280,7 +291,7 @@ bool cuda_cell_gw3_prolong_params(const Parallel::gridseg *src, int first_fine_ii[3], int coarse_lb[3]) { -#if USE_CUDA_BSSN && defined(Cell) && ((ghost_width == 3) || (ghost_width == 4)) +#if (USE_CUDA_BSSN || (USE_CUDA_Z4C && (ABEtype == 2))) && defined(Cell) && ((ghost_width == 3) || (ghost_width == 4)) #if ghost_width == 4 const int stencil_hi = 4; #else @@ -355,9 +366,24 @@ bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg if (!src || !dst || !src->Bg) return false; #if USE_CUDA_Z4C && (ABEtype == 2) - if (type != 1) + if (z4c_cuda_has_resident_state(src->Bg) == 0) return false; - return z4c_cuda_has_resident_state(src->Bg) != 0; + if (type == 1) + return true; + int a[3], b[3]; + if (type == 2) + { + if (!cuda_amr_restrict_device_enabled()) + return false; + return cuda_cell_gw3_restrict_params(src, dst, a); + } + if (type == 3) + { + if (!cuda_amr_prolong_device_enabled()) + return false; + return cuda_cell_gw3_prolong_params(src, dst, a, b); + } + return false; #elif USE_CUDA_BSSN if (bssn_cuda_has_resident_state(src->Bg) == 0) return false; @@ -388,8 +414,6 @@ bool cuda_can_direct_unpack(const Parallel::gridseg *dst, int type) if (type < 1 || type > 3 || !dst || !dst->Bg) return false; #if USE_CUDA_Z4C && (ABEtype == 2) - if (type != 1) - return false; return z4c_cuda_has_resident_state(dst->Bg) != 0; #elif USE_CUDA_BSSN return bssn_cuda_has_resident_state(dst->Bg) != 0; @@ -431,7 +455,7 @@ bool cuda_direct_pack_segment(double *buffer, if (type == 2 || type == 3) { -#if USE_CUDA_BSSN +#if USE_CUDA_BSSN || (USE_CUDA_Z4C && (ABEtype == 2)) if (!cuda_amr_host_staged_enabled()) return false; const int region_all = dst->shape[0] * dst->shape[1] * dst->shape[2]; @@ -788,16 +812,43 @@ bool cuda_direct_pack_segment_to_device(double *buffer, #if USE_CUDA_Z4C && (ABEtype == 2) if (state_count == Z4C_CUDA_STATE_COUNT) { - if (type != 1) - return false; const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; - const int i0 = cuda_seg_begin(dst, src->Bg, 0); - const int j0 = cuda_seg_begin(dst, src->Bg, 1); - const int k0 = cuda_seg_begin(dst, src->Bg, 2); - const bool ok = z4c_cuda_pack_state_batch_to_device_buffer( - src->Bg, state_count, buffer, src->Bg->shape, - i0, j0, k0, - dst->shape[0], dst->shape[1], dst->shape[2]) == 0; + bool ok = false; + double soa_flat[3 * Z4C_CUDA_STATE_COUNT]; + const bool have_soa = cuda_build_state_soa(VarLists, state_count, soa_flat); + if (type == 1) + { + const int i0 = cuda_seg_begin(dst, src->Bg, 0); + const int j0 = cuda_seg_begin(dst, src->Bg, 1); + const int k0 = cuda_seg_begin(dst, src->Bg, 2); + ok = z4c_cuda_pack_state_batch_to_device_buffer( + src->Bg, state_count, buffer, src->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0; + } + else if (type == 2) + { + int first_fine[3]; + if (!cuda_cell_gw3_restrict_params(src, dst, first_fine)) + return false; + ok = z4c_cuda_restrict_state_batch_to_device_buffer( + src->Bg, state_count, buffer, src->Bg->shape, + dst->shape[0], dst->shape[1], dst->shape[2], + first_fine[0], first_fine[1], first_fine[2], + have_soa ? soa_flat : 0) == 0; + } + else if (type == 3) + { + int first_fine_ii[3], coarse_lb[3]; + if (!cuda_cell_gw3_prolong_params(src, dst, first_fine_ii, coarse_lb)) + return false; + ok = z4c_cuda_prolong_state_batch_to_device_buffer( + src->Bg, state_count, buffer, src->Bg->shape, + dst->shape[0], dst->shape[1], dst->shape[2], + first_fine_ii[0], first_fine_ii[1], first_fine_ii[2], + coarse_lb[0], coarse_lb[1], coarse_lb[2], + have_soa ? soa_flat : 0) == 0; + } if (sync_profile_enabled()) sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0; return ok; diff --git a/AMSS_NCKU_source/Z4c_class.C b/AMSS_NCKU_source/Z4c_class.C index b306cc4..79ed62b 100644 --- a/AMSS_NCKU_source/Z4c_class.C +++ b/AMSS_NCKU_source/Z4c_class.C @@ -717,7 +717,7 @@ void Z4c_class::Step(int lev, int YN) } } - z4c_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, true); + z4c_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, false); #if (RPS == 0) RestrictProlong(lev, YN, BB); diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index 0c3f95e..ed63fae 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -7678,15 +7678,15 @@ __device__ __forceinline__ double load_comm_state_cell_sym(const double * __rest { double s = 1.0; if (x < 0) { - x = -x - 1; + x = -x; s *= d_comm_state_soa[3 * state_index + 0]; } if (y < 0) { - y = -y - 1; + y = -y; s *= d_comm_state_soa[3 * state_index + 1]; } if (z < 0) { - z = -z - 1; + z = -z; s *= d_comm_state_soa[3 * state_index + 2]; } const int src = x + y * nx + z * nx * ny; diff --git a/AMSS_NCKU_source/z4c_rhs_cuda.cu b/AMSS_NCKU_source/z4c_rhs_cuda.cu index 01b4dd8..36de461 100644 --- a/AMSS_NCKU_source/z4c_rhs_cuda.cu +++ b/AMSS_NCKU_source/z4c_rhs_cuda.cu @@ -422,6 +422,7 @@ static const int k_lk_rhs_slots[BSSN_LK_FIELD_COUNT] = { }; __constant__ int d_subset_state_indices[BSSN_STATE_COUNT]; +__constant__ double d_comm_state_soa[3 * BSSN_STATE_COUNT]; static const int k_lk_soa_signs[3 * BSSN_LK_FIELD_COUNT] = { 1, 1, 1, @@ -729,6 +730,21 @@ static void upload_grid_params_if_needed(const GridParams &gp) } } +static void upload_comm_state_soa(const double *state_soa, int state_count) +{ + double soa[3 * BSSN_STATE_COUNT]; + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + soa[3 * i + 0] = 1.0; + soa[3 * i + 1] = 1.0; + soa[3 * i + 2] = 1.0; + } + if (state_soa) { + const int n = (state_count < BSSN_STATE_COUNT) ? state_count : BSSN_STATE_COUNT; + std::memcpy(soa, state_soa, (size_t)3 * n * sizeof(double)); + } + CUDA_CHECK(cudaMemcpyToSymbol(d_comm_state_soa, soa, sizeof(soa))); +} + /* ================================================================== */ /* A. Symmetry boundary kernels (ord=2 and ord=3) */ /* ================================================================== */ @@ -5182,6 +5198,157 @@ __global__ void kern_unpack_state_region_batch(double * __restrict__ dst_mem, } } +__device__ __forceinline__ double load_comm_state_cell_sym(const double * __restrict__ src_mem, + int state_index, + int x, int y, int z, + int nx, int ny, + int all) +{ + double s = 1.0; + if (x < 0) { + x = -x; + s *= d_comm_state_soa[3 * state_index + 0]; + } + if (y < 0) { + y = -y; + s *= d_comm_state_soa[3 * state_index + 1]; + } + if (z < 0) { + z = -z; + s *= d_comm_state_soa[3 * state_index + 2]; + } + const int src = x + y * nx + z * nx * ny; + return s * src_mem[(size_t)state_index * all + src]; +} + +__global__ void kern_restrict_state_region_batch(const double * __restrict__ src_mem, + double * __restrict__ dst, + int nx, int ny, + int sx, int sy, int sz, + int fi0, int fj0, int fk0, + int region_all, + int state_count, + int all) +{ + const int state_index = blockIdx.y; + if (state_index >= state_count) return; +#if ghost_width == 4 + const double c1 = -5.0 / 2048.0; + const double c2 = 49.0 / 2048.0; + const double c3 = -245.0 / 2048.0; + const double c4 = 1225.0 / 2048.0; + const int offs[8] = {-3, -2, -1, 0, 1, 2, 3, 4}; + const double w[8] = {c1, c2, c3, c4, c4, c3, c2, c1}; + const int nst = 8; +#else + const double c1 = 3.0 / 256.0; + const double c2 = -25.0 / 256.0; + const double c3 = 75.0 / 128.0; + const int offs[6] = {-2, -1, 0, 1, 2, 3}; + const double w[6] = {c1, c2, c3, c3, c2, c1}; + const int nst = 6; +#endif + + for (int local = blockIdx.x * blockDim.x + threadIdx.x; + local < region_all; + local += blockDim.x * gridDim.x) + { + const int ii = local % sx; + const int jj = (local / sx) % sy; + const int kk = local / (sx * sy); + const int fc_i = fi0 + 2 * ii; + const int fc_j = fj0 + 2 * jj; + const int fc_k = fk0 + 2 * kk; + double sum = 0.0; + for (int oz = 0; oz < nst; ++oz) { + const int z = fc_k + offs[oz]; + const double wz = w[oz]; + for (int oy = 0; oy < nst; ++oy) { + const int y = fc_j + offs[oy]; + const double wyz = wz * w[oy]; + for (int ox = 0; ox < nst; ++ox) { + const int x = fc_i + offs[ox]; + sum += wyz * w[ox] * + load_comm_state_cell_sym(src_mem, state_index, x, y, z, nx, ny, all); + } + } + } + dst[(size_t)state_index * region_all + local] = sum; + } +} + +__global__ void kern_prolong_state_region_batch(const double * __restrict__ src_mem, + double * __restrict__ dst, + int nx, int ny, + int sx, int sy, int sz, + int ii0, int jj0, int kk0, + int lbc_i, int lbc_j, int lbc_k, + int region_all, + int state_count, + int all) +{ + const int state_index = blockIdx.y; + if (state_index >= state_count) return; +#if ghost_width == 4 + const double c1 = -495.0 / 262144.0; + const double c2 = 5005.0 / 262144.0; + const double c3 = -27027.0 / 262144.0; + const double c4 = 225225.0 / 262144.0; + const double c5 = 75075.0 / 262144.0; + const double c6 = -19305.0 / 262144.0; + const double c7 = 4095.0 / 262144.0; + const double c8 = -429.0 / 262144.0; + const int offs[8] = {-3, -2, -1, 0, 1, 2, 3, 4}; + const double wl[8] = {c1, c2, c3, c4, c5, c6, c7, c8}; + const double wr[8] = {c8, c7, c6, c5, c4, c3, c2, c1}; + const int nst = 8; +#else + const double c1 = 77.0 / 8192.0; + const double c2 = -693.0 / 8192.0; + const double c3 = 3465.0 / 4096.0; + const double c4 = 1155.0 / 4096.0; + const double c5 = -495.0 / 8192.0; + const double c6 = 63.0 / 8192.0; + const int offs[6] = {-2, -1, 0, 1, 2, 3}; + const double wl[6] = {c1, c2, c3, c4, c5, c6}; + const double wr[6] = {c6, c5, c4, c3, c2, c1}; + const int nst = 6; +#endif + + for (int local = blockIdx.x * blockDim.x + threadIdx.x; + local < region_all; + local += blockDim.x * gridDim.x) + { + const int ii = local % sx; + const int jj = (local / sx) % sy; + const int kk = local / (sx * sy); + const int fine_i = ii0 + ii; + const int fine_j = jj0 + jj; + const int fine_k = kk0 + kk; + const int ci = fine_i / 2 - lbc_i; + const int cj = fine_j / 2 - lbc_j; + const int ck = fine_k / 2 - lbc_k; + const double *wx = ((fine_i / 2) * 2 == fine_i) ? wl : wr; + const double *wy = ((fine_j / 2) * 2 == fine_j) ? wl : wr; + const double *wz = ((fine_k / 2) * 2 == fine_k) ? wl : wr; + double sum = 0.0; + for (int oz = 0; oz < nst; ++oz) { + const int z = ck + offs[oz]; + const double wzv = wz[oz]; + for (int oy = 0; oy < nst; ++oy) { + const int y = cj + offs[oy]; + const double wyz = wzv * wy[oy]; + for (int ox = 0; ox < nst; ++ox) { + const int x = ci + offs[ox]; + sum += wyz * wx[ox] * + load_comm_state_cell_sym(src_mem, state_index, x, y, z, nx, ny, all); + } + } + } + dst[(size_t)state_index * region_all + local] = sum; + } +} + __global__ void kern_pack_state_subset(const double * __restrict__ src_mem, double * __restrict__ dst, int subset_count, @@ -7604,6 +7771,60 @@ extern "C" int z4c_cuda_unpack_state_batch_from_device_buffer(void *block_tag, return 0; } +extern "C" int z4c_cuda_restrict_state_batch_to_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int sx, int sy, int sz, + int fi0, int fj0, int fk0, + const double *state_soa) +{ + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1; + if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1; + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + const int region_all = sx * sy * sz; + upload_comm_state_soa(state_soa, state_count); + dim3 launch_grid((unsigned int)grid((size_t)region_all), + (unsigned int)state_count); + kern_restrict_state_region_batch<<>>( + ctx.d_state_curr_mem, device_buffer, + ex[0], ex[1], sx, sy, sz, + fi0, fj0, fk0, region_all, state_count, + ex[0] * ex[1] * ex[2]); + return 0; +} + +extern "C" int z4c_cuda_prolong_state_batch_to_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int sx, int sy, int sz, + int ii0, int jj0, int kk0, + int lbc_i, int lbc_j, int lbc_k, + const double *state_soa) +{ + using namespace z4c_cuda; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1; + if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1; + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + const int region_all = sx * sy * sz; + upload_comm_state_soa(state_soa, state_count); + dim3 launch_grid((unsigned int)grid((size_t)region_all), + (unsigned int)state_count); + kern_prolong_state_region_batch<<>>( + ctx.d_state_curr_mem, device_buffer, + ex[0], ex[1], sx, sy, sz, + ii0, jj0, kk0, lbc_i, lbc_j, lbc_k, + region_all, state_count, + ex[0] * ex[1] * ex[2]); + return 0; +} + extern "C" int z4c_cuda_download_state_subset(void *block_tag, int *ex, int subset_count, diff --git a/AMSS_NCKU_source/z4c_rhs_cuda.h b/AMSS_NCKU_source/z4c_rhs_cuda.h index 212965a..ecb6e90 100644 --- a/AMSS_NCKU_source/z4c_rhs_cuda.h +++ b/AMSS_NCKU_source/z4c_rhs_cuda.h @@ -74,6 +74,23 @@ int z4c_cuda_unpack_state_batch_from_device_buffer(void *block_tag, int i0, int j0, int k0, int sx, int sy, int sz); +int z4c_cuda_restrict_state_batch_to_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int sx, int sy, int sz, + int fi0, int fj0, int fk0, + const double *state_soa); + +int z4c_cuda_prolong_state_batch_to_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int sx, int sy, int sz, + int ii0, int jj0, int kk0, + int lbc_i, int lbc_j, int lbc_k, + const double *state_soa); + int z4c_cuda_download_state_subset(void *block_tag, int *ex, int subset_count, diff --git a/makefile_and_run.py b/makefile_and_run.py index d0f936a..b8971d8 100755 --- a/makefile_and_run.py +++ b/makefile_and_run.py @@ -160,7 +160,7 @@ def _gpu_runtime_env(): "AMSS_CUDA_DEVICE_SEGMENT_BATCH": "0", "AMSS_CUDA_UNCACHED_DEVICE_BUFFERS": "1", } - if getattr(input_data, "Equation_Class", "") in ("BSSN", "BSSN-EScalar"): + if getattr(input_data, "Equation_Class", "") in ("BSSN", "BSSN-EScalar", "Z4C"): defaults["AMSS_CUDA_AMR_RESTRICT_DEVICE"] = "1" if getattr(input_data, "Equation_Class", "") == "Z4C": defaults.update({