From b78874ef2113ac1766c9837d6f845b402ed587da Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Fri, 10 Apr 2026 23:37:36 +0800 Subject: [PATCH] Refine stable GPU AMR staging path --- AMSS_NCKU_source/bssn_cuda_ops.cu | 219 +++++++++++++++++++----------- 1 file changed, 139 insertions(+), 80 deletions(-) diff --git a/AMSS_NCKU_source/bssn_cuda_ops.cu b/AMSS_NCKU_source/bssn_cuda_ops.cu index 080ae6f..fa4e9cf 100644 --- a/AMSS_NCKU_source/bssn_cuda_ops.cu +++ b/AMSS_NCKU_source/bssn_cuda_ops.cu @@ -262,6 +262,65 @@ inline bool copy_to_device_preferring_device(CachedBuffer &dst, const double *sr return true; } +inline bool copy_region_to_padded_stage(CachedBuffer &dst, + const double *src, + const int src_shape[3], + int src_i0, int src_j0, int src_k0, + int region_nx, int region_ny, int region_nz, + int dst_sx, int dst_sy, int dst_sz, + int dst_x0, int dst_y0, int dst_z0, + const char *where) +{ + if (!src || !where || region_nx <= 0 || region_ny <= 0 || region_nz <= 0 || + dst_sx <= 0 || dst_sy <= 0 || dst_sz <= 0) + { + return false; + } + + const size_t dst_bytes = + static_cast(dst_sx) * static_cast(dst_sy) * static_cast(dst_sz) * sizeof(double); + if (!ensure_capacity(dst, dst_bytes)) + return false; + + const double *device_src = bssn_gpu_find_device_buffer(src); + const void *copy_src = device_src ? static_cast(device_src) : static_cast(src); + cudaMemcpyKind kind = device_src ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice; + + if (!device_src) + { + const int src_count = src_shape[0] * src_shape[1] * src_shape[2]; + bssn_gpu_prepare_host_buffer(src, src_count); + } + + cudaMemcpy3DParms parms = {}; + parms.srcPtr = make_cudaPitchedPtr(const_cast(copy_src), + static_cast(src_shape[0]) * sizeof(double), + static_cast(src_shape[0]), + static_cast(src_shape[1])); + parms.dstPtr = make_cudaPitchedPtr(dst.ptr, + static_cast(dst_sx) * sizeof(double), + static_cast(dst_sx), + static_cast(dst_sy)); + parms.srcPos = make_cudaPos(static_cast(src_i0) * sizeof(double), + static_cast(src_j0), + static_cast(src_k0)); + parms.dstPos = make_cudaPos(static_cast(dst_x0) * sizeof(double), + static_cast(dst_y0), + static_cast(dst_z0)); + parms.extent = make_cudaExtent(static_cast(region_nx) * sizeof(double), + static_cast(region_ny), + static_cast(region_nz)); + parms.kind = kind; + + cudaError_t err = cudaMemcpy3D(&parms); + if (err != cudaSuccess) + { + report_cuda_error(where, err); + return false; + } + return true; +} + inline int interp_idint_like_host(double x) { return static_cast(x); @@ -508,6 +567,30 @@ prolong3_cell_kernel_next_point: } } +__global__ void prolong3_fill_equatorial_reflect_kernel(double *funcc, + int sxc, int syc, + int nxc, int nyc, + double z_soa) +{ + const int n = nxc * nyc * 3; + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x) + { + const int plane = nxc * nyc; + const int zoff = idx / plane; + const int rem = idx - zoff * plane; + const int j = rem / nxc; + const int i = rem - j * nxc; + + const int li = i + 1; + const int lj = j + 1; + const int source_k = zoff + 1; + const int target_k = -zoff; + + funcc[(li + 2) + (lj + 2) * sxc + (target_k + 2) * sxc * syc] = + z_soa * funcc[(li + 2) + (lj + 2) * sxc + (source_k + 2) * sxc * syc]; + } +} + __global__ void restrict3_cell_kernel(const double *funf, double *func, int sxf, int syf, int nxf, int nyf, int nzf, @@ -1442,10 +1525,13 @@ int bssn_cuda_prolong3_pack(int wei, return 1; } - // Keep thin prolongation slabs on the established CPU path for now. - // The GPU kernel is only enabled for chunkier regions where the current - // device pack path is known to be safe. - if (extf[0] <= 2 * wei || extf[1] <= 2 * wei || extf[2] <= 2 * wei) + const bool thin_x = (extf[0] <= 2 * wei); + const bool thin_y = (extf[1] <= 2 * wei); + const bool thin_z = (extf[2] <= 2 * wei); + // Keep thin slabs on the CPU path until the dedicated slab kernel is + // fully validated. The first z-thin specialization still triggers illegal + // accesses on the common extf=(48,48,6) case. + if (thin_x || thin_y || thin_z) return 1; auto coarse_center_index = [](int fine_idx, int lbf_d, int lbc_d) -> int { @@ -1480,7 +1566,6 @@ int bssn_cuda_prolong3_pack(int wei, }; static thread_local ProlongCache cache; - int src_nxc = extc[0], src_nyc = extc[1], src_nzc = extc[2]; int nxf = extf[0], nyf = extf[1], nzf = extf[2]; const int stage_ix_lo = ic_min - 2; const int stage_ix_hi = ic_max + 3; @@ -1494,59 +1579,50 @@ int bssn_cuda_prolong3_pack(int wei, int sxc = nxc + 3; int syc = nyc + 3; int szc = nzc + 3; - int coarse_points = sxc * syc * szc; int fine_points = nxf * nyf * nzf; - const size_t coarse_bytes = static_cast(coarse_points) * sizeof(double); const size_t fine_bytes = static_cast(fine_points) * sizeof(double); - - std::vector funcc_host(static_cast(coarse_points), 0.0); - auto coarse_index = [sxc, syc](int fi, int fj, int fk) -> size_t { - return static_cast(fi + 2) + - static_cast(fj + 2) * static_cast(sxc) + - static_cast(fk + 2) * static_cast(sxc) * static_cast(syc); - }; - auto func_index = [src_nxc, src_nyc](int i, int j, int k) -> size_t { - return static_cast(i - 1) + - static_cast(j - 1) * static_cast(src_nxc) + - static_cast(k - 1) * static_cast(src_nxc) * static_cast(src_nyc); - }; - - for (int k = stage_iz_lo; k <= stage_iz_hi; ++k) + if (!copy_region_to_padded_stage(cache.coarse, + func, + extc, + stage_ix_lo - 1, + stage_iy_lo - 1, + stage_iz_lo - 1, + nxc, + nyc, + nzc, + sxc, + syc, + szc, + 3, + 3, + 3, + "cudaMemcpy3D prolong3 stage")) { - const int lk = k - stage_iz_lo + 1; - for (int j = stage_iy_lo; j <= stage_iy_hi; ++j) - { - const int lj = j - stage_iy_lo + 1; - for (int i = stage_ix_lo; i <= stage_ix_hi; ++i) - { - const int li = i - stage_ix_lo + 1; - funcc_host[coarse_index(li, lj, lk)] = func[func_index(i, j, k)]; - } - } + return 1; } + double *d_func = cache.coarse.ptr; + if (need_equatorial_z_reflect) { - for (int offset = 0; offset < 3; ++offset) + dim3 reflect_block(256); + dim3 reflect_grid(div_up(nxc * nyc * 3, static_cast(reflect_block.x))); + double z_soa = SoA[2]; + void *reflect_args[] = { + &d_func, + &sxc, + &syc, + &nxc, + &nyc, + &z_soa}; + if (!launch_kernel(reflect_grid, reflect_block, + (const void *)prolong3_fill_equatorial_reflect_kernel, + reflect_args)) { - const int target_k = -offset; - const int source_k = offset + 1; - for (int j = 1; j <= nyc; ++j) - { - for (int i = 1; i <= nxc; ++i) - { - funcc_host[coarse_index(i, j, target_k)] = - funcc_host[coarse_index(i, j, source_k)] * SoA[2]; - } - } + return 1; } } - double *d_func = nullptr; - if (!copy_to_device(cache.coarse, funcc_host.data(), coarse_bytes)) - return 1; - d_func = cache.coarse.ptr; - if (!ensure_capacity(cache.fine, fine_bytes)) { return 1; @@ -1742,9 +1818,6 @@ int bssn_cuda_restrict3_pack(int wei, int nxc = extc[0]; int nyc = extc[1]; int nzc = extc[2]; - int src_nxf = extf[0]; - int src_nyf = extf[1]; - int src_nzf = extf[2]; const int stage_fx_lo = ii_lo; const int stage_fx_hi = ii_hi; const int stage_fy_lo = jj_lo; @@ -1757,40 +1830,26 @@ int bssn_cuda_restrict3_pack(int wei, int sxf = nxf + 2; int syf = nyf + 2; int szf = nzf + 2; - int fine_points = sxf * syf * szf; int coarse_points = nxc * nyc * nzc; - const size_t fine_bytes = static_cast(fine_points) * sizeof(double); const size_t coarse_bytes = static_cast(coarse_points) * sizeof(double); - - std::vector funff_host(static_cast(fine_points), 0.0); - auto fine_index = [sxf, syf](int fi, int fj, int fk) -> size_t { - return static_cast(fi + 1) + - static_cast(fj + 1) * static_cast(sxf) + - static_cast(fk + 1) * static_cast(sxf) * static_cast(syf); - }; - auto src_index = [src_nxf, src_nyf](int i, int j, int k) -> size_t { - return static_cast(i - 1) + - static_cast(j - 1) * static_cast(src_nxf) + - static_cast(k - 1) * static_cast(src_nxf) * static_cast(src_nyf); - }; - - for (int k = stage_fz_lo; k <= stage_fz_hi; ++k) - { - const int lk = k - stage_fz_lo + 1; - for (int j = stage_fy_lo; j <= stage_fy_hi; ++j) - { - const int lj = j - stage_fy_lo + 1; - for (int i = stage_fx_lo; i <= stage_fx_hi; ++i) - { - const int li = i - stage_fx_lo + 1; - funff_host[fine_index(li, lj, lk)] = funf[src_index(i, j, k)]; - } - } - } - (void)need_equatorial_z_reflect; - if (!copy_to_device(cache.fine, funff_host.data(), fine_bytes)) + if (!copy_region_to_padded_stage(cache.fine, + funf, + extf, + stage_fx_lo - 1, + stage_fy_lo - 1, + stage_fz_lo - 1, + nxf, + nyf, + nzf, + sxf, + syf, + szf, + 2, + 2, + 2, + "cudaMemcpy3D restrict3 stage")) return 1; if (!ensure_capacity(cache.coarse, coarse_bytes)) return 1;