From 84865329203adef6e2118af42441423bc953fe76 Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Thu, 30 Apr 2026 11:39:15 +0800 Subject: [PATCH] Add resident BSSN GPU point interpolation --- AMSS_NCKU_source/bssn_class.C | 77 +++++++----- AMSS_NCKU_source/bssn_rhs_cuda.cu | 200 ++++++++++++++++++++++++++++++ AMSS_NCKU_source/bssn_rhs_cuda.h | 19 +++ 3 files changed, 268 insertions(+), 28 deletions(-) diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index 96d1fef..fdd6fa7 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -439,38 +439,56 @@ bool bssn_cuda_interp_bh_point_resident(MyList *PatL, if (bssn_cuda_has_resident_state(block) && block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn) { - const int sx = ordn; - const int sy = ordn; - const int sz = ordn; - const int region_all = sx * sy * sz; - const int i0 = bssn_cuda_interp_tile_start(block->X[0], block->shape[0], x, DH[0], ordn); - const int j0 = bssn_cuda_interp_tile_start(block->X[1], block->shape[1], y, DH[1], ordn); - const int k0 = bssn_cuda_interp_tile_start(block->X[2], block->shape[2], z, DH[2], ordn); - double packed_fields[3 * region_all]; var *vars[3] = {forx, fory, forz}; + double soa3[9]; for (int f = 0; f < 3; f++) { - if (bssn_cuda_pack_state_region_to_host_buffer(block, - k_bssn_cuda_bh_state_indices[f], - packed_fields + f * region_all, - block->shape, - i0, j0, k0, - sx, sy, sz) != 0) + soa3[3 * f + 0] = vars[f]->SoA[0]; + soa3[3 * f + 1] = vars[f]->SoA[1]; + soa3[3 * f + 2] = vars[f]->SoA[2]; + } + if (bssn_cuda_interp_state_point3(block, block->shape, + k_bssn_cuda_bh_state_indices[0], + k_bssn_cuda_bh_state_indices[1], + k_bssn_cuda_bh_state_indices[2], + block->X[0][0], block->X[1][0], block->X[2][0], + DH[0], DH[1], DH[2], + x, y, z, + interp_ordn, interp_sym, + soa3, shellf) != 0) + { + const int sx = ordn; + const int sy = ordn; + const int sz = ordn; + const int region_all = sx * sy * sz; + const int i0 = bssn_cuda_interp_tile_start(block->X[0], block->shape[0], x, DH[0], ordn); + const int j0 = bssn_cuda_interp_tile_start(block->X[1], block->shape[1], y, DH[1], ordn); + const int k0 = bssn_cuda_interp_tile_start(block->X[2], block->shape[2], z, DH[2], ordn); + double packed_fields[3 * region_all]; + for (int f = 0; f < 3; f++) { - cout << "CUDA BH tile download failed" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); + if (bssn_cuda_pack_state_region_to_host_buffer(block, + k_bssn_cuda_bh_state_indices[f], + packed_fields + f * region_all, + block->shape, + i0, j0, k0, + sx, sy, sz) != 0) + { + cout << "CUDA BH tile download failed" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + int tile_shape[3] = {sx, sy, sz}; + f_global_interp(tile_shape, + block->X[0] + i0, + block->X[1] + j0, + block->X[2] + k0, + packed_fields + f * region_all, + shellf[f], + x, y, z, + interp_ordn, + vars[f]->SoA, + interp_sym); } - int tile_shape[3] = {sx, sy, sz}; - f_global_interp(tile_shape, - block->X[0] + i0, - block->X[1] + j0, - block->X[2] + k0, - packed_fields + f * region_all, - shellf[f], - x, y, z, - interp_ordn, - vars[f]->SoA, - interp_sym); } } else @@ -3692,13 +3710,16 @@ void bssn_class::Step(int lev, int YN) } } + STEP_TIMER_ADD(TB_BH_PREDICTOR, timer_bh_predictor); + // data analysis part // Warning NOTE: the variables1 are used as temp storege room if (lev == a_lev) { + STEP_TIMER_DECL(timer_analysis_surface); AnalysisStuff(lev, dT_lev); + STEP_TIMER_ADD(TB_ANALYSIS_SURFACE, timer_analysis_surface); } - STEP_TIMER_ADD(TB_BH_PREDICTOR, timer_bh_predictor); #endif #ifdef With_AHF diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index 29b4929..0135ad0 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -5618,6 +5618,153 @@ __global__ void kern_prepare_inter_time_level(const double * __restrict__ src1, } } +__device__ double interp_lagrange_weight(int idx, double x, int ordn) +{ + double w = 1.0; + const double xi = (double)idx; + for (int j = 0; j < ordn; ++j) { + if (j == idx) continue; + w *= (x - (double)j) / (xi - (double)j); + } + return w; +} + +__device__ void interp_axis_window(double p, + double x0, + double dx, + int n, + int ordn, + int symmetry, + int axis, + int &base, + double &local_x) +{ + int cx_i = (int)((p - x0) / dx + 0.4) + 1; + int cx_b = cx_i - ordn / 2 + 1; + int cx_t = cx_b + ordn - 1; + int cmin = 1; + if (symmetry == 2 && axis < 2 && fabs(x0) < dx) + cmin = -ordn / 2 + 1; + if (symmetry != 0 && axis == 2 && fabs(x0) < dx) + cmin = -ordn / 2 + 1; + + if (cx_b < cmin) { + cx_b = cmin; + cx_t = cx_b + ordn - 1; + } + if (cx_t > n) { + cx_t = n; + cx_b = cx_t + 1 - ordn; + } + + base = cx_b; + if (cx_b > 0) { + const double xb = x0 + (double)(cx_b - 1) * dx; + local_x = (p - xb) / dx; + } else { + const int reflected = 1 - cx_b; + const double xb = x0 + (double)(reflected - 1) * dx; + local_x = (p + xb) / dx; + } +} + +__device__ double load_interp_value(const double * __restrict__ mem, + int nx, + int ny, + int nz, + int all, + int state, + int fi, + int fj, + int fk, + const double * __restrict__ soa) +{ + double sign = 1.0; + int ii = fi; + int jj = fj; + int kk = fk; + if (ii <= 0) { + ii = 1 - ii; + sign *= soa[0]; + } + if (jj <= 0) { + jj = 1 - jj; + sign *= soa[1]; + } + if (kk <= 0) { + kk = 1 - kk; + sign *= soa[2]; + } + if (ii < 1 || ii > nx || jj < 1 || jj > ny || kk < 1 || kk > nz) + return 0.0; + const int idx = (ii - 1) + (jj - 1) * nx + (kk - 1) * nx * ny; + return sign * mem[(size_t)state * (size_t)all + (size_t)idx]; +} + +__global__ void kern_interp_state_point3(const double * __restrict__ mem, + double * __restrict__ out, + int nx, + int ny, + int nz, + int all, + int state0, + int state1, + int state2, + double x0, + double y0, + double z0, + double dx, + double dy, + double dz, + double px, + double py, + double pz, + int ordn, + int symmetry, + double soa00, double soa01, double soa02, + double soa10, double soa11, double soa12, + double soa20, double soa21, double soa22) +{ + const int f = threadIdx.x; + if (f >= 3 || ordn <= 0 || ordn > 8) + return; + + const int states[3] = {state0, state1, state2}; + const double soa_all[9] = { + soa00, soa01, soa02, + soa10, soa11, soa12, + soa20, soa21, soa22 + }; + const double *soa = soa_all + 3 * f; + + int ib, jb, kb; + double tx, ty, tz; + interp_axis_window(px, x0, dx, nx, ordn, symmetry, 0, ib, tx); + interp_axis_window(py, y0, dy, ny, ordn, symmetry, 1, jb, ty); + interp_axis_window(pz, z0, dz, nz, ordn, symmetry, 2, kb, tz); + + double wx[8], wy[8], wz[8]; + for (int i = 0; i < ordn; ++i) { + wx[i] = interp_lagrange_weight(i, tx, ordn); + wy[i] = interp_lagrange_weight(i, ty, ordn); + wz[i] = interp_lagrange_weight(i, tz, ordn); + } + + double value = 0.0; + for (int k = 0; k < ordn; ++k) { + for (int j = 0; j < ordn; ++j) { + for (int i = 0; i < ordn; ++i) { + const double coeff = wx[i] * wy[j] * wz[k]; + value += coeff * load_interp_value(mem, nx, ny, nz, all, + states[f], + ib + i, jb + j, kb + k, + soa); + } + } + } + out[f] = value; +} + __global__ void kern_pack_state_region_batch(const double * __restrict__ src_mem, double * __restrict__ dst, int nx, int ny, @@ -6876,6 +7023,59 @@ int bssn_cuda_pack_state_region_to_host_buffer(void *block_tag, return 0; } +extern "C" +int bssn_cuda_interp_state_point3(void *block_tag, + int *ex, + int state0, + int state1, + int state2, + double x0, + double y0, + double z0, + double dx, + double dy, + double dz, + double px, + double py, + double pz, + int ordn, + int symmetry, + const double *soa3, + double *out3) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + if (!block_tag || !ex || !out3 || !soa3) + return 1; + if (state0 < 0 || state0 >= BSSN_STATE_COUNT || + state1 < 0 || state1 >= BSSN_STATE_COUNT || + state2 < 0 || state2 >= BSSN_STATE_COUNT) + return 1; + if (ex[0] <= 0 || ex[1] <= 0 || ex[2] <= 0 || + ordn <= 0 || ordn > 8 || + ex[0] < ordn || ex[1] < ordn || ex[2] < ordn) + return 1; + + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + const size_t all = (size_t)ex[0] * ex[1] * ex[2]; + const int bank = active_or_keyed_bank(ctx, nullptr, all, false); + if (bank < 0 || !ctx.resident_valid[bank]) + return 1; + + double *d_out = ensure_step_comm_buffer(ctx, 3); + kern_interp_state_point3<<<1, 3>>>( + ctx.d_resident_mem[bank], d_out, + ex[0], ex[1], ex[2], (int)all, + state0, state1, state2, + x0, y0, z0, dx, dy, dz, + px, py, pz, ordn, symmetry, + soa3[0], soa3[1], soa3[2], + soa3[3], soa3[4], soa3[5], + soa3[6], soa3[7], soa3[8]); + CUDA_CHECK(cudaMemcpy(out3, d_out, 3 * sizeof(double), cudaMemcpyDeviceToHost)); + return 0; +} + extern "C" int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag, int state_index, diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.h b/AMSS_NCKU_source/bssn_rhs_cuda.h index 3513dd7..c3a5195 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.h +++ b/AMSS_NCKU_source/bssn_rhs_cuda.h @@ -83,6 +83,25 @@ int bssn_cuda_pack_state_region_to_host_buffer(void *block_tag, int i0, int j0, int k0, int sx, int sy, int sz); +int bssn_cuda_interp_state_point3(void *block_tag, + int *ex, + int state0, + int state1, + int state2, + double x0, + double y0, + double z0, + double dx, + double dy, + double dz, + double px, + double py, + double pz, + int ordn, + int symmetry, + const double *soa3, + double *out3); + int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag, int state_index, double *host_buffer,