From 0076b3ca1828c5e37de8f670142a98800712366c Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Thu, 7 May 2026 19:22:37 +0800 Subject: [PATCH] Optimize 6th-order CUDA AMR stencils --- AMSS_NCKU_source/Parallel.C | 18 ++++-- AMSS_NCKU_source/bssn_rhs_cuda.cu | 98 ++++++++++++++++++++++++------- 2 files changed, 90 insertions(+), 26 deletions(-) diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index b7f2a51..8c16733 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -234,7 +234,12 @@ bool cuda_cell_gw3_restrict_params(const Parallel::gridseg *src, const Parallel::gridseg *dst, int first_fine[3]) { -#if USE_CUDA_BSSN && defined(Cell) && (ghost_width == 3) +#if USE_CUDA_BSSN && defined(Cell) && ((ghost_width == 3) || (ghost_width == 4)) +#if ghost_width == 4 + const int stencil_hi = 4; +#else + const int stencil_hi = 3; +#endif if (!src || !dst || !src->Bg || !dst->Bg) return false; for (int d = 0; d < dim; ++d) @@ -260,7 +265,7 @@ bool cuda_cell_gw3_restrict_params(const Parallel::gridseg *src, first_fine[d] = 2 * lbc - lbf - 1; if (first_fine[d] < 0) return false; - if (first_fine[d] + 2 * (dst->shape[d] - 1) + 3 >= src->Bg->shape[d]) + if (first_fine[d] + 2 * (dst->shape[d] - 1) + stencil_hi >= src->Bg->shape[d]) return false; } return true; @@ -275,7 +280,12 @@ bool cuda_cell_gw3_prolong_params(const Parallel::gridseg *src, int first_fine_ii[3], int coarse_lb[3]) { -#if USE_CUDA_BSSN && defined(Cell) && (ghost_width == 3) +#if USE_CUDA_BSSN && defined(Cell) && ((ghost_width == 3) || (ghost_width == 4)) +#if ghost_width == 4 + const int stencil_hi = 4; +#else + const int stencil_hi = 3; +#endif if (!src || !dst || !src->Bg || !dst->Bg) return false; for (int d = 0; d < dim; ++d) @@ -305,7 +315,7 @@ bool cuda_cell_gw3_prolong_params(const Parallel::gridseg *src, const int last_coarse = last_fine_ii / 2 - coarse_lb[d]; if (first_coarse < -1) return false; - if (last_coarse + 3 >= src->Bg->shape[d]) + if (last_coarse + stencil_hi >= src->Bg->shape[d]) return false; } return true; diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index 2ee8a8b..0c3f95e 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -7622,11 +7622,22 @@ __global__ void kern_restrict_state_region_batch(const double * __restrict__ src { const int state_index = blockIdx.y; if (state_index >= state_count) return; +#if ghost_width == 4 + const double c1 = -5.0 / 2048.0; + const double c2 = 49.0 / 2048.0; + const double c3 = -245.0 / 2048.0; + const double c4 = 1225.0 / 2048.0; + const int offs[8] = {-3, -2, -1, 0, 1, 2, 3, 4}; + const double w[8] = {c1, c2, c3, c4, c4, c3, c2, c1}; + const int nst = 8; +#else const double c1 = 3.0 / 256.0; const double c2 = -25.0 / 256.0; const double c3 = 75.0 / 128.0; const int offs[6] = {-2, -1, 0, 1, 2, 3}; const double w[6] = {c1, c2, c3, c3, c2, c1}; + const int nst = 6; +#endif for (int local = blockIdx.x * blockDim.x + threadIdx.x; local < region_all; @@ -7639,20 +7650,20 @@ __global__ void kern_restrict_state_region_batch(const double * __restrict__ src const int fc_j = fj0 + 2 * jj; const int fc_k = fk0 + 2 * kk; double sum = 0.0; - for (int oz = 0; oz < 6; ++oz) + for (int oz = 0; oz < nst; ++oz) { const int z = fc_k + offs[oz]; const double wz = w[oz]; - for (int oy = 0; oy < 6; ++oy) + for (int oy = 0; oy < nst; ++oy) { const int y = fc_j + offs[oy]; const double wyz = wz * w[oy]; - for (int ox = 0; ox < 6; ++ox) - { - const int x = fc_i + offs[ox]; - sum += wyz * w[ox] * - load_comm_state_cell_sym(src_mem, state_index, x, y, z, nx, ny, all); - } + for (int ox = 0; ox < nst; ++ox) + { + const int x = fc_i + offs[ox]; + sum += wyz * w[ox] * + load_comm_state_cell_sym(src_mem, state_index, x, y, z, nx, ny, all); + } } } dst[(size_t)state_index * region_all + local] = sum; @@ -7697,11 +7708,22 @@ __global__ void kern_restrict_state_segments_batch(const double * __restrict__ s const int offset = m[4]; const int fi0 = m[5], fj0 = m[6], fk0 = m[7]; if (state_index >= state_count) return; +#if ghost_width == 4 + const double c1 = -5.0 / 2048.0; + const double c2 = 49.0 / 2048.0; + const double c3 = -245.0 / 2048.0; + const double c4 = 1225.0 / 2048.0; + const int offs[8] = {-3, -2, -1, 0, 1, 2, 3, 4}; + const double w[8] = {c1, c2, c3, c4, c4, c3, c2, c1}; + const int nst = 8; +#else const double c1 = 3.0 / 256.0; const double c2 = -25.0 / 256.0; const double c3 = 75.0 / 128.0; const int offs[6] = {-2, -1, 0, 1, 2, 3}; const double w[6] = {c1, c2, c3, c3, c2, c1}; + const int nst = 6; +#endif for (int local = blockIdx.x * blockDim.x + threadIdx.x; local < region_all; @@ -7714,15 +7736,15 @@ __global__ void kern_restrict_state_segments_batch(const double * __restrict__ s const int fc_j = fj0 + 2 * jj; const int fc_k = fk0 + 2 * kk; double sum = 0.0; - for (int oz = 0; oz < 6; ++oz) + for (int oz = 0; oz < nst; ++oz) { const int z = fc_k + offs[oz]; const double wz = w[oz]; - for (int oy = 0; oy < 6; ++oy) + for (int oy = 0; oy < nst; ++oy) { const int y = fc_j + offs[oy]; const double wyz = wz * w[oy]; - for (int ox = 0; ox < 6; ++ox) + for (int ox = 0; ox < nst; ++ox) { const int x = fc_i + offs[ox]; sum += wyz * w[ox] * @@ -7746,6 +7768,20 @@ __global__ void kern_prolong_state_region_batch(const double * __restrict__ src_ { const int state_index = blockIdx.y; if (state_index >= state_count) return; +#if ghost_width == 4 + const double c1 = -495.0 / 262144.0; + const double c2 = 5005.0 / 262144.0; + const double c3 = -27027.0 / 262144.0; + const double c4 = 225225.0 / 262144.0; + const double c5 = 75075.0 / 262144.0; + const double c6 = -19305.0 / 262144.0; + const double c7 = 4095.0 / 262144.0; + const double c8 = -429.0 / 262144.0; + const int offs[8] = {-3, -2, -1, 0, 1, 2, 3, 4}; + const double wl[8] = {c1, c2, c3, c4, c5, c6, c7, c8}; + const double wr[8] = {c8, c7, c6, c5, c4, c3, c2, c1}; + const int nst = 8; +#else const double c1 = 77.0 / 8192.0; const double c2 = -693.0 / 8192.0; const double c3 = 3465.0 / 4096.0; @@ -7755,6 +7791,8 @@ __global__ void kern_prolong_state_region_batch(const double * __restrict__ src_ const int offs[6] = {-2, -1, 0, 1, 2, 3}; const double wl[6] = {c1, c2, c3, c4, c5, c6}; const double wr[6] = {c6, c5, c4, c3, c2, c1}; + const int nst = 6; +#endif for (int local = blockIdx.x * blockDim.x + threadIdx.x; local < region_all; @@ -7773,15 +7811,15 @@ __global__ void kern_prolong_state_region_batch(const double * __restrict__ src_ const double *wy = ((fine_j / 2) * 2 == fine_j) ? wl : wr; const double *wz = ((fine_k / 2) * 2 == fine_k) ? wl : wr; double sum = 0.0; - for (int oz = 0; oz < 6; ++oz) + for (int oz = 0; oz < nst; ++oz) { const int z = ck + offs[oz]; const double wzv = wz[oz]; - for (int oy = 0; oy < 6; ++oy) + for (int oy = 0; oy < nst; ++oy) { const int y = cj + offs[oy]; const double wyz = wzv * wy[oy]; - for (int ox = 0; ox < 6; ++ox) + for (int ox = 0; ox < nst; ++ox) { const int x = ci + offs[ox]; sum += wyz * wx[ox] * @@ -7809,6 +7847,20 @@ __global__ void kern_prolong_state_segments_batch(const double * __restrict__ sr const int ii0 = m[5], jj0 = m[6], kk0 = m[7]; const int lbc_i = m[8], lbc_j = m[9], lbc_k = m[10]; if (state_index >= state_count) return; +#if ghost_width == 4 + const double c1 = -495.0 / 262144.0; + const double c2 = 5005.0 / 262144.0; + const double c3 = -27027.0 / 262144.0; + const double c4 = 225225.0 / 262144.0; + const double c5 = 75075.0 / 262144.0; + const double c6 = -19305.0 / 262144.0; + const double c7 = 4095.0 / 262144.0; + const double c8 = -429.0 / 262144.0; + const int offs[8] = {-3, -2, -1, 0, 1, 2, 3, 4}; + const double wl[8] = {c1, c2, c3, c4, c5, c6, c7, c8}; + const double wr[8] = {c8, c7, c6, c5, c4, c3, c2, c1}; + const int nst = 8; +#else const double c1 = 77.0 / 8192.0; const double c2 = -693.0 / 8192.0; const double c3 = 3465.0 / 4096.0; @@ -7818,6 +7870,8 @@ __global__ void kern_prolong_state_segments_batch(const double * __restrict__ sr const int offs[6] = {-2, -1, 0, 1, 2, 3}; const double wl[6] = {c1, c2, c3, c4, c5, c6}; const double wr[6] = {c6, c5, c4, c3, c2, c1}; + const int nst = 6; +#endif for (int local = blockIdx.x * blockDim.x + threadIdx.x; local < region_all; @@ -7836,20 +7890,20 @@ __global__ void kern_prolong_state_segments_batch(const double * __restrict__ sr const double *wy = ((fine_j / 2) * 2 == fine_j) ? wl : wr; const double *wz = ((fine_k / 2) * 2 == fine_k) ? wl : wr; double sum = 0.0; - for (int oz = 0; oz < 6; ++oz) + for (int oz = 0; oz < nst; ++oz) { const int z = ck + offs[oz]; const double wzv = wz[oz]; - for (int oy = 0; oy < 6; ++oy) + for (int oy = 0; oy < nst; ++oy) { const int y = cj + offs[oy]; const double wyz = wzv * wy[oy]; - for (int ox = 0; ox < 6; ++ox) - { - const int x = ci + offs[ox]; - sum += wyz * wx[ox] * - load_comm_state_cell_sym(src_mem, state_index, x, y, z, nx, ny, all); - } + for (int ox = 0; ox < nst; ++ox) + { + const int x = ci + offs[ox]; + sum += wyz * wx[ox] * + load_comm_state_cell_sym(src_mem, state_index, x, y, z, nx, ny, all); + } } } dst[(size_t)offset + (size_t)state_index * region_all + local] = sum;