Add zero matter handling and interpolation for resident state in CUDA BSSN

This commit is contained in:
2026-04-15 00:25:53 +08:00
parent 9ab7e7c7f9
commit 8fe60ea703
3 changed files with 209 additions and 11 deletions

View File

@@ -112,6 +112,182 @@ bool bssn_cuda_sync_bh_fields(Block *cg, var *forx, var *fory, var *forz, bool u
return bssn_cuda_sync_subset(cg, 3, k_bssn_cuda_bh_state_indices, bh_fields, upload);
}
bool bssn_cuda_patch_contains_point(Patch *patch, const double *point)
{
if (!patch)
return false;
for (int d = 0; d < dim; d++)
{
const double h = patch->getdX(d);
const double lo = patch->bbox[d] + patch->lli[d] * h;
const double hi = patch->bbox[dim + d] - patch->uui[d] * h;
if (point[d] < lo || point[d] > hi)
return false;
}
return true;
}
bool bssn_cuda_point_in_block(Patch *patch, Block *block,
const double *point, const double *DH)
{
if (!patch || !block)
return false;
for (int d = 0; d < dim; d++)
{
double llb;
double uub;
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
llb = (feq(block->bbox[d], patch->bbox[d], DH[d] / 2))
? block->bbox[d] + patch->lli[d] * DH[d]
: block->bbox[d] + (ghost_width - 0.5) * DH[d];
uub = (feq(block->bbox[dim + d], patch->bbox[dim + d], DH[d] / 2))
? block->bbox[dim + d] - patch->uui[d] * DH[d]
: block->bbox[dim + d] - (ghost_width - 0.5) * DH[d];
#else
#ifdef Cell
llb = (feq(block->bbox[d], patch->bbox[d], DH[d] / 2))
? block->bbox[d] + patch->lli[d] * DH[d]
: block->bbox[d] + ghost_width * DH[d];
uub = (feq(block->bbox[dim + d], patch->bbox[dim + d], DH[d] / 2))
? block->bbox[dim + d] - patch->uui[d] * DH[d]
: block->bbox[dim + d] - ghost_width * DH[d];
#else
#error Not define Vertex nor Cell
#endif
#endif
if (point[d] - llb < -DH[d] / 2 || point[d] - uub > DH[d] / 2)
return false;
}
return true;
}
int bssn_cuda_interp_tile_start(const double *coords, int n, double x, double dx, int ordn)
{
if (!coords || n <= ordn)
return 0;
int cxi = int((x - coords[0]) / dx + 0.4) + 1;
int start = cxi - ordn / 2;
if (start < 0)
start = 0;
const int max_start = n - ordn;
if (start > max_start)
start = max_start;
return start;
}
bool bssn_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
int myrank,
const double *point,
var *forx, var *fory, var *forz,
int Symmetry,
double *shellf)
{
const int ordn = 2 * ghost_width;
int owner_rank = -1;
shellf[0] = shellf[1] = shellf[2] = 0.0;
MyList<Patch> *PL = PatL;
while (PL)
{
Patch *patch = PL->data;
if (!bssn_cuda_patch_contains_point(patch, point))
{
PL = PL->next;
continue;
}
double DH[dim];
for (int d = 0; d < dim; d++)
DH[d] = patch->getdX(d);
MyList<Block> *BP = patch->blb;
while (BP)
{
Block *block = BP->data;
if (bssn_cuda_point_in_block(patch, block, point, DH))
{
owner_rank = block->rank;
if (myrank == owner_rank)
{
int interp_ordn = ordn;
int interp_sym = Symmetry;
double x = point[0];
double y = point[1];
double z = point[2];
if (bssn_cuda_has_resident_state(block) &&
block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn)
{
const int sx = ordn;
const int sy = ordn;
const int sz = ordn;
const int region_all = sx * sy * sz;
const int i0 = bssn_cuda_interp_tile_start(block->X[0], block->shape[0], x, DH[0], ordn);
const int j0 = bssn_cuda_interp_tile_start(block->X[1], block->shape[1], y, DH[1], ordn);
const int k0 = bssn_cuda_interp_tile_start(block->X[2], block->shape[2], z, DH[2], ordn);
double packed_fields[3 * region_all];
var *vars[3] = {forx, fory, forz};
for (int f = 0; f < 3; f++)
{
if (bssn_cuda_pack_state_region_to_host_buffer(block,
k_bssn_cuda_bh_state_indices[f],
packed_fields + f * region_all,
block->shape,
i0, j0, k0,
sx, sy, sz) != 0)
{
cout << "CUDA BH tile download failed" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
int tile_shape[3] = {sx, sy, sz};
f_global_interp(tile_shape,
block->X[0] + i0,
block->X[1] + j0,
block->X[2] + k0,
packed_fields + f * region_all,
shellf[f],
x, y, z,
interp_ordn,
vars[f]->SoA,
interp_sym);
}
}
else
{
f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
block->fgfs[forx->sgfn], shellf[0],
x, y, z, interp_ordn, forx->SoA, interp_sym);
f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
block->fgfs[fory->sgfn], shellf[1],
x, y, z, interp_ordn, fory->SoA, interp_sym);
f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
block->fgfs[forz->sgfn], shellf[2],
x, y, z, interp_ordn, forz->SoA, interp_sym);
}
}
break;
}
if (BP == patch->ble)
break;
BP = BP->next;
}
if (owner_rank >= 0)
break;
PL = PL->next;
}
if (owner_rank < 0)
return false;
MPI_Bcast(shellf, 3, MPI_DOUBLE, owner_rank, MPI_COMM_WORLD);
return true;
}
void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int myrank, bool release_ctx)
{
MyList<Patch> *Pp = PatL;
@@ -3254,6 +3430,8 @@ void bssn_class::Step(int lev, int YN)
MPI_Abort(MPI_COMM_WORLD, 1);
}
int apply_bam_bc = 0;
// bssn_class does not evolve matter source fields; they remain zero-initialized.
int use_zero_matter = 1;
int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
int apply_enforce_ga = 0;
#if (AGM == 0)
@@ -3270,6 +3448,7 @@ void bssn_class::Step(int lev, int YN)
propspeed, soa_flat, Pp->data->bbox,
dT_lev, TRK4, iter_count, apply_bam_bc,
Symmetry, lev, ndeps, pre,
use_zero_matter,
keep_resident_state, apply_enforce_ga, chitiny))
{
cout << "CUDA predictor substep failed in domain: ("
@@ -3573,12 +3752,8 @@ void bssn_class::Step(int lev, int YN)
#if USE_CUDA_BSSN
const bool need_analysis_state_after_predictor =
(lev == a_lev) && (LastAnas + dT_lev >= AnasTime);
const bool need_bh_state_after_predictor =
(BH_num > 0) && (lev == GH->levels - 1);
if (use_cuda_resident_sync && need_analysis_state_after_predictor)
bssn_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false);
else if (use_cuda_resident_sync && need_bh_state_after_predictor)
bssn_cuda_sync_level_bh_fields(GH->PatL[lev], myrank, Sfx, Sfy, Sfz);
#endif
#ifdef WithShell
@@ -3687,6 +3862,8 @@ void bssn_class::Step(int lev, int YN)
MPI_Abort(MPI_COMM_WORLD, 1);
}
int apply_bam_bc = 0;
// bssn_class does not evolve matter source fields; they remain zero-initialized.
int use_zero_matter = 1;
int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
int apply_enforce_ga = 0;
#if (AGM == 0)
@@ -3705,6 +3882,7 @@ void bssn_class::Step(int lev, int YN)
propspeed, soa_flat, Pp->data->bbox,
dT_lev, TRK4, iter_count, apply_bam_bc,
Symmetry, lev, ndeps, cor,
use_zero_matter,
keep_resident_state, apply_enforce_ga, chitiny))
{
cout << "CUDA corrector substep failed in domain: ("
@@ -4055,11 +4233,6 @@ void bssn_class::Step(int lev, int YN)
}
#endif
#if USE_CUDA_BSSN
if (use_cuda_resident_sync && BH_num > 0 && lev == GH->levels - 1 && iter_count < 3)
bssn_cuda_sync_level_bh_fields(GH->PatL[lev], myrank, Sfx1, Sfy1, Sfz1);
#endif
// swap time level
if (iter_count < 3)
{
@@ -7145,6 +7318,17 @@ void bssn_class::compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, va
int lev = ilev;
#if USE_CUDA_BSSN
if (bssn_cuda_use_resident_sync(lev) &&
bssn_cuda_interp_bh_point_resident(GH->PatL[lev], myrank, BH_PS[n], forx, fory, forz, Symmetry, shellf))
{
BH_RHS[n][0] = -shellf[0];
BH_RHS[n][1] = -shellf[1];
BH_RHS[n][2] = -shellf[2];
continue;
}
#endif
#if (PSTR == 0)
while (!Parallel::PatList_Interp_Points(GH->PatL[lev], DG_List, 1, pox, shellf, Symmetry))
#elif (PSTR == 1 || PSTR == 2 || PSTR == 3)

View File

@@ -4665,6 +4665,13 @@ static void upload_matter_cache(StepContext &ctx,
ctx.matter_ready = true;
}
static void zero_matter_cache(StepContext &ctx, size_t all)
{
CUDA_CHECK(cudaMemset(ctx.d_matter_mem, 0,
(size_t)BSSN_MATTER_COUNT * all * sizeof(double)));
ctx.matter_ready = true;
}
static void bind_matter_slots(const StepContext &ctx)
{
for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
@@ -5630,6 +5637,7 @@ int bssn_cuda_rk4_substep(void *block_tag,
int &Lev,
double &eps,
int &co,
int &use_zero_matter,
int &keep_resident_state,
int &apply_enforce_ga,
double &chitiny)
@@ -5686,12 +5694,17 @@ int bssn_cuda_rk4_substep(void *block_tag,
t0 = profile ? cuda_profile_now_ms() : 0.0;
if (RK4 == 0) {
upload_matter_cache(ctx, matter_host, all);
if (use_zero_matter) {
if (!ctx.matter_ready) zero_matter_cache(ctx, all);
} else {
upload_matter_cache(ctx, matter_host, all);
}
CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi],
(size_t)BSSN_STATE_COUNT * bytes,
cudaMemcpyDeviceToDevice));
} else if (!ctx.matter_ready) {
upload_matter_cache(ctx, matter_host, all);
if (use_zero_matter) zero_matter_cache(ctx, all);
else upload_matter_cache(ctx, matter_host, all);
}
bind_matter_slots(ctx);
if (profile) {

View File

@@ -50,6 +50,7 @@ int bssn_cuda_rk4_substep(void *block_tag,
int &Lev,
double &eps,
int &co,
int &use_zero_matter,
int &keep_resident_state,
int &apply_enforce_ga,
double &chitiny);