Add resident BSSN GPU point interpolation

This commit is contained in:
2026-04-30 11:39:15 +08:00
parent 18e9c9cc50
commit 8486532920
3 changed files with 268 additions and 28 deletions

View File

@@ -439,38 +439,56 @@ bool bssn_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
if (bssn_cuda_has_resident_state(block) &&
block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn)
{
const int sx = ordn;
const int sy = ordn;
const int sz = ordn;
const int region_all = sx * sy * sz;
const int i0 = bssn_cuda_interp_tile_start(block->X[0], block->shape[0], x, DH[0], ordn);
const int j0 = bssn_cuda_interp_tile_start(block->X[1], block->shape[1], y, DH[1], ordn);
const int k0 = bssn_cuda_interp_tile_start(block->X[2], block->shape[2], z, DH[2], ordn);
double packed_fields[3 * region_all];
var *vars[3] = {forx, fory, forz};
double soa3[9];
for (int f = 0; f < 3; f++)
{
if (bssn_cuda_pack_state_region_to_host_buffer(block,
k_bssn_cuda_bh_state_indices[f],
packed_fields + f * region_all,
block->shape,
i0, j0, k0,
sx, sy, sz) != 0)
soa3[3 * f + 0] = vars[f]->SoA[0];
soa3[3 * f + 1] = vars[f]->SoA[1];
soa3[3 * f + 2] = vars[f]->SoA[2];
}
if (bssn_cuda_interp_state_point3(block, block->shape,
k_bssn_cuda_bh_state_indices[0],
k_bssn_cuda_bh_state_indices[1],
k_bssn_cuda_bh_state_indices[2],
block->X[0][0], block->X[1][0], block->X[2][0],
DH[0], DH[1], DH[2],
x, y, z,
interp_ordn, interp_sym,
soa3, shellf) != 0)
{
const int sx = ordn;
const int sy = ordn;
const int sz = ordn;
const int region_all = sx * sy * sz;
const int i0 = bssn_cuda_interp_tile_start(block->X[0], block->shape[0], x, DH[0], ordn);
const int j0 = bssn_cuda_interp_tile_start(block->X[1], block->shape[1], y, DH[1], ordn);
const int k0 = bssn_cuda_interp_tile_start(block->X[2], block->shape[2], z, DH[2], ordn);
double packed_fields[3 * region_all];
for (int f = 0; f < 3; f++)
{
cout << "CUDA BH tile download failed" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
if (bssn_cuda_pack_state_region_to_host_buffer(block,
k_bssn_cuda_bh_state_indices[f],
packed_fields + f * region_all,
block->shape,
i0, j0, k0,
sx, sy, sz) != 0)
{
cout << "CUDA BH tile download failed" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
int tile_shape[3] = {sx, sy, sz};
f_global_interp(tile_shape,
block->X[0] + i0,
block->X[1] + j0,
block->X[2] + k0,
packed_fields + f * region_all,
shellf[f],
x, y, z,
interp_ordn,
vars[f]->SoA,
interp_sym);
}
int tile_shape[3] = {sx, sy, sz};
f_global_interp(tile_shape,
block->X[0] + i0,
block->X[1] + j0,
block->X[2] + k0,
packed_fields + f * region_all,
shellf[f],
x, y, z,
interp_ordn,
vars[f]->SoA,
interp_sym);
}
}
else
@@ -3692,13 +3710,16 @@ void bssn_class::Step(int lev, int YN)
}
}
STEP_TIMER_ADD(TB_BH_PREDICTOR, timer_bh_predictor);
// data analysis part
// Warning NOTE: the variables1 are used as temp storege room
if (lev == a_lev)
{
STEP_TIMER_DECL(timer_analysis_surface);
AnalysisStuff(lev, dT_lev);
STEP_TIMER_ADD(TB_ANALYSIS_SURFACE, timer_analysis_surface);
}
STEP_TIMER_ADD(TB_BH_PREDICTOR, timer_bh_predictor);
#endif
#ifdef With_AHF

View File

@@ -5618,6 +5618,153 @@ __global__ void kern_prepare_inter_time_level(const double * __restrict__ src1,
}
}
__device__ double interp_lagrange_weight(int idx, double x, int ordn)
{
double w = 1.0;
const double xi = (double)idx;
for (int j = 0; j < ordn; ++j) {
if (j == idx) continue;
w *= (x - (double)j) / (xi - (double)j);
}
return w;
}
__device__ void interp_axis_window(double p,
double x0,
double dx,
int n,
int ordn,
int symmetry,
int axis,
int &base,
double &local_x)
{
int cx_i = (int)((p - x0) / dx + 0.4) + 1;
int cx_b = cx_i - ordn / 2 + 1;
int cx_t = cx_b + ordn - 1;
int cmin = 1;
if (symmetry == 2 && axis < 2 && fabs(x0) < dx)
cmin = -ordn / 2 + 1;
if (symmetry != 0 && axis == 2 && fabs(x0) < dx)
cmin = -ordn / 2 + 1;
if (cx_b < cmin) {
cx_b = cmin;
cx_t = cx_b + ordn - 1;
}
if (cx_t > n) {
cx_t = n;
cx_b = cx_t + 1 - ordn;
}
base = cx_b;
if (cx_b > 0) {
const double xb = x0 + (double)(cx_b - 1) * dx;
local_x = (p - xb) / dx;
} else {
const int reflected = 1 - cx_b;
const double xb = x0 + (double)(reflected - 1) * dx;
local_x = (p + xb) / dx;
}
}
__device__ double load_interp_value(const double * __restrict__ mem,
int nx,
int ny,
int nz,
int all,
int state,
int fi,
int fj,
int fk,
const double * __restrict__ soa)
{
double sign = 1.0;
int ii = fi;
int jj = fj;
int kk = fk;
if (ii <= 0) {
ii = 1 - ii;
sign *= soa[0];
}
if (jj <= 0) {
jj = 1 - jj;
sign *= soa[1];
}
if (kk <= 0) {
kk = 1 - kk;
sign *= soa[2];
}
if (ii < 1 || ii > nx || jj < 1 || jj > ny || kk < 1 || kk > nz)
return 0.0;
const int idx = (ii - 1) + (jj - 1) * nx + (kk - 1) * nx * ny;
return sign * mem[(size_t)state * (size_t)all + (size_t)idx];
}
__global__ void kern_interp_state_point3(const double * __restrict__ mem,
double * __restrict__ out,
int nx,
int ny,
int nz,
int all,
int state0,
int state1,
int state2,
double x0,
double y0,
double z0,
double dx,
double dy,
double dz,
double px,
double py,
double pz,
int ordn,
int symmetry,
double soa00, double soa01, double soa02,
double soa10, double soa11, double soa12,
double soa20, double soa21, double soa22)
{
const int f = threadIdx.x;
if (f >= 3 || ordn <= 0 || ordn > 8)
return;
const int states[3] = {state0, state1, state2};
const double soa_all[9] = {
soa00, soa01, soa02,
soa10, soa11, soa12,
soa20, soa21, soa22
};
const double *soa = soa_all + 3 * f;
int ib, jb, kb;
double tx, ty, tz;
interp_axis_window(px, x0, dx, nx, ordn, symmetry, 0, ib, tx);
interp_axis_window(py, y0, dy, ny, ordn, symmetry, 1, jb, ty);
interp_axis_window(pz, z0, dz, nz, ordn, symmetry, 2, kb, tz);
double wx[8], wy[8], wz[8];
for (int i = 0; i < ordn; ++i) {
wx[i] = interp_lagrange_weight(i, tx, ordn);
wy[i] = interp_lagrange_weight(i, ty, ordn);
wz[i] = interp_lagrange_weight(i, tz, ordn);
}
double value = 0.0;
for (int k = 0; k < ordn; ++k) {
for (int j = 0; j < ordn; ++j) {
for (int i = 0; i < ordn; ++i) {
const double coeff = wx[i] * wy[j] * wz[k];
value += coeff * load_interp_value(mem, nx, ny, nz, all,
states[f],
ib + i, jb + j, kb + k,
soa);
}
}
}
out[f] = value;
}
__global__ void kern_pack_state_region_batch(const double * __restrict__ src_mem,
double * __restrict__ dst,
int nx, int ny,
@@ -6876,6 +7023,59 @@ int bssn_cuda_pack_state_region_to_host_buffer(void *block_tag,
return 0;
}
extern "C"
int bssn_cuda_interp_state_point3(void *block_tag,
int *ex,
int state0,
int state1,
int state2,
double x0,
double y0,
double z0,
double dx,
double dy,
double dz,
double px,
double py,
double pz,
int ordn,
int symmetry,
const double *soa3,
double *out3)
{
init_gpu_dispatch();
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
if (!block_tag || !ex || !out3 || !soa3)
return 1;
if (state0 < 0 || state0 >= BSSN_STATE_COUNT ||
state1 < 0 || state1 >= BSSN_STATE_COUNT ||
state2 < 0 || state2 >= BSSN_STATE_COUNT)
return 1;
if (ex[0] <= 0 || ex[1] <= 0 || ex[2] <= 0 ||
ordn <= 0 || ordn > 8 ||
ex[0] < ordn || ex[1] < ordn || ex[2] < ordn)
return 1;
StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
const int bank = active_or_keyed_bank(ctx, nullptr, all, false);
if (bank < 0 || !ctx.resident_valid[bank])
return 1;
double *d_out = ensure_step_comm_buffer(ctx, 3);
kern_interp_state_point3<<<1, 3>>>(
ctx.d_resident_mem[bank], d_out,
ex[0], ex[1], ex[2], (int)all,
state0, state1, state2,
x0, y0, z0, dx, dy, dz,
px, py, pz, ordn, symmetry,
soa3[0], soa3[1], soa3[2],
soa3[3], soa3[4], soa3[5],
soa3[6], soa3[7], soa3[8]);
CUDA_CHECK(cudaMemcpy(out3, d_out, 3 * sizeof(double), cudaMemcpyDeviceToHost));
return 0;
}
extern "C"
int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
int state_index,

View File

@@ -83,6 +83,25 @@ int bssn_cuda_pack_state_region_to_host_buffer(void *block_tag,
int i0, int j0, int k0,
int sx, int sy, int sz);
int bssn_cuda_interp_state_point3(void *block_tag,
int *ex,
int state0,
int state1,
int state2,
double x0,
double y0,
double z0,
double dx,
double dy,
double dz,
double px,
double py,
double pz,
int ordn,
int symmetry,
const double *soa3,
double *out3);
int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
int state_index,
double *host_buffer,