Add zero matter handling and interpolation for resident state in CUDA BSSN
This commit is contained in:
@@ -112,6 +112,182 @@ bool bssn_cuda_sync_bh_fields(Block *cg, var *forx, var *fory, var *forz, bool u
|
||||
return bssn_cuda_sync_subset(cg, 3, k_bssn_cuda_bh_state_indices, bh_fields, upload);
|
||||
}
|
||||
|
||||
bool bssn_cuda_patch_contains_point(Patch *patch, const double *point)
|
||||
{
|
||||
if (!patch)
|
||||
return false;
|
||||
for (int d = 0; d < dim; d++)
|
||||
{
|
||||
const double h = patch->getdX(d);
|
||||
const double lo = patch->bbox[d] + patch->lli[d] * h;
|
||||
const double hi = patch->bbox[dim + d] - patch->uui[d] * h;
|
||||
if (point[d] < lo || point[d] > hi)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool bssn_cuda_point_in_block(Patch *patch, Block *block,
|
||||
const double *point, const double *DH)
|
||||
{
|
||||
if (!patch || !block)
|
||||
return false;
|
||||
for (int d = 0; d < dim; d++)
|
||||
{
|
||||
double llb;
|
||||
double uub;
|
||||
#ifdef Vertex
|
||||
#ifdef Cell
|
||||
#error Both Cell and Vertex are defined
|
||||
#endif
|
||||
llb = (feq(block->bbox[d], patch->bbox[d], DH[d] / 2))
|
||||
? block->bbox[d] + patch->lli[d] * DH[d]
|
||||
: block->bbox[d] + (ghost_width - 0.5) * DH[d];
|
||||
uub = (feq(block->bbox[dim + d], patch->bbox[dim + d], DH[d] / 2))
|
||||
? block->bbox[dim + d] - patch->uui[d] * DH[d]
|
||||
: block->bbox[dim + d] - (ghost_width - 0.5) * DH[d];
|
||||
#else
|
||||
#ifdef Cell
|
||||
llb = (feq(block->bbox[d], patch->bbox[d], DH[d] / 2))
|
||||
? block->bbox[d] + patch->lli[d] * DH[d]
|
||||
: block->bbox[d] + ghost_width * DH[d];
|
||||
uub = (feq(block->bbox[dim + d], patch->bbox[dim + d], DH[d] / 2))
|
||||
? block->bbox[dim + d] - patch->uui[d] * DH[d]
|
||||
: block->bbox[dim + d] - ghost_width * DH[d];
|
||||
#else
|
||||
#error Not define Vertex nor Cell
|
||||
#endif
|
||||
#endif
|
||||
if (point[d] - llb < -DH[d] / 2 || point[d] - uub > DH[d] / 2)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
int bssn_cuda_interp_tile_start(const double *coords, int n, double x, double dx, int ordn)
|
||||
{
|
||||
if (!coords || n <= ordn)
|
||||
return 0;
|
||||
int cxi = int((x - coords[0]) / dx + 0.4) + 1;
|
||||
int start = cxi - ordn / 2;
|
||||
if (start < 0)
|
||||
start = 0;
|
||||
const int max_start = n - ordn;
|
||||
if (start > max_start)
|
||||
start = max_start;
|
||||
return start;
|
||||
}
|
||||
|
||||
bool bssn_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
|
||||
int myrank,
|
||||
const double *point,
|
||||
var *forx, var *fory, var *forz,
|
||||
int Symmetry,
|
||||
double *shellf)
|
||||
{
|
||||
const int ordn = 2 * ghost_width;
|
||||
int owner_rank = -1;
|
||||
|
||||
shellf[0] = shellf[1] = shellf[2] = 0.0;
|
||||
|
||||
MyList<Patch> *PL = PatL;
|
||||
while (PL)
|
||||
{
|
||||
Patch *patch = PL->data;
|
||||
if (!bssn_cuda_patch_contains_point(patch, point))
|
||||
{
|
||||
PL = PL->next;
|
||||
continue;
|
||||
}
|
||||
|
||||
double DH[dim];
|
||||
for (int d = 0; d < dim; d++)
|
||||
DH[d] = patch->getdX(d);
|
||||
|
||||
MyList<Block> *BP = patch->blb;
|
||||
while (BP)
|
||||
{
|
||||
Block *block = BP->data;
|
||||
if (bssn_cuda_point_in_block(patch, block, point, DH))
|
||||
{
|
||||
owner_rank = block->rank;
|
||||
if (myrank == owner_rank)
|
||||
{
|
||||
int interp_ordn = ordn;
|
||||
int interp_sym = Symmetry;
|
||||
double x = point[0];
|
||||
double y = point[1];
|
||||
double z = point[2];
|
||||
|
||||
if (bssn_cuda_has_resident_state(block) &&
|
||||
block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn)
|
||||
{
|
||||
const int sx = ordn;
|
||||
const int sy = ordn;
|
||||
const int sz = ordn;
|
||||
const int region_all = sx * sy * sz;
|
||||
const int i0 = bssn_cuda_interp_tile_start(block->X[0], block->shape[0], x, DH[0], ordn);
|
||||
const int j0 = bssn_cuda_interp_tile_start(block->X[1], block->shape[1], y, DH[1], ordn);
|
||||
const int k0 = bssn_cuda_interp_tile_start(block->X[2], block->shape[2], z, DH[2], ordn);
|
||||
double packed_fields[3 * region_all];
|
||||
var *vars[3] = {forx, fory, forz};
|
||||
for (int f = 0; f < 3; f++)
|
||||
{
|
||||
if (bssn_cuda_pack_state_region_to_host_buffer(block,
|
||||
k_bssn_cuda_bh_state_indices[f],
|
||||
packed_fields + f * region_all,
|
||||
block->shape,
|
||||
i0, j0, k0,
|
||||
sx, sy, sz) != 0)
|
||||
{
|
||||
cout << "CUDA BH tile download failed" << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
int tile_shape[3] = {sx, sy, sz};
|
||||
f_global_interp(tile_shape,
|
||||
block->X[0] + i0,
|
||||
block->X[1] + j0,
|
||||
block->X[2] + k0,
|
||||
packed_fields + f * region_all,
|
||||
shellf[f],
|
||||
x, y, z,
|
||||
interp_ordn,
|
||||
vars[f]->SoA,
|
||||
interp_sym);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
|
||||
block->fgfs[forx->sgfn], shellf[0],
|
||||
x, y, z, interp_ordn, forx->SoA, interp_sym);
|
||||
f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
|
||||
block->fgfs[fory->sgfn], shellf[1],
|
||||
x, y, z, interp_ordn, fory->SoA, interp_sym);
|
||||
f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
|
||||
block->fgfs[forz->sgfn], shellf[2],
|
||||
x, y, z, interp_ordn, forz->SoA, interp_sym);
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (BP == patch->ble)
|
||||
break;
|
||||
BP = BP->next;
|
||||
}
|
||||
|
||||
if (owner_rank >= 0)
|
||||
break;
|
||||
PL = PL->next;
|
||||
}
|
||||
|
||||
if (owner_rank < 0)
|
||||
return false;
|
||||
|
||||
MPI_Bcast(shellf, 3, MPI_DOUBLE, owner_rank, MPI_COMM_WORLD);
|
||||
return true;
|
||||
}
|
||||
|
||||
void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int myrank, bool release_ctx)
|
||||
{
|
||||
MyList<Patch> *Pp = PatL;
|
||||
@@ -3254,6 +3430,8 @@ void bssn_class::Step(int lev, int YN)
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
int apply_bam_bc = 0;
|
||||
// bssn_class does not evolve matter source fields; they remain zero-initialized.
|
||||
int use_zero_matter = 1;
|
||||
int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
|
||||
int apply_enforce_ga = 0;
|
||||
#if (AGM == 0)
|
||||
@@ -3270,6 +3448,7 @@ void bssn_class::Step(int lev, int YN)
|
||||
propspeed, soa_flat, Pp->data->bbox,
|
||||
dT_lev, TRK4, iter_count, apply_bam_bc,
|
||||
Symmetry, lev, ndeps, pre,
|
||||
use_zero_matter,
|
||||
keep_resident_state, apply_enforce_ga, chitiny))
|
||||
{
|
||||
cout << "CUDA predictor substep failed in domain: ("
|
||||
@@ -3573,12 +3752,8 @@ void bssn_class::Step(int lev, int YN)
|
||||
#if USE_CUDA_BSSN
|
||||
const bool need_analysis_state_after_predictor =
|
||||
(lev == a_lev) && (LastAnas + dT_lev >= AnasTime);
|
||||
const bool need_bh_state_after_predictor =
|
||||
(BH_num > 0) && (lev == GH->levels - 1);
|
||||
if (use_cuda_resident_sync && need_analysis_state_after_predictor)
|
||||
bssn_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false);
|
||||
else if (use_cuda_resident_sync && need_bh_state_after_predictor)
|
||||
bssn_cuda_sync_level_bh_fields(GH->PatL[lev], myrank, Sfx, Sfy, Sfz);
|
||||
#endif
|
||||
|
||||
#ifdef WithShell
|
||||
@@ -3687,6 +3862,8 @@ void bssn_class::Step(int lev, int YN)
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
int apply_bam_bc = 0;
|
||||
// bssn_class does not evolve matter source fields; they remain zero-initialized.
|
||||
int use_zero_matter = 1;
|
||||
int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
|
||||
int apply_enforce_ga = 0;
|
||||
#if (AGM == 0)
|
||||
@@ -3705,6 +3882,7 @@ void bssn_class::Step(int lev, int YN)
|
||||
propspeed, soa_flat, Pp->data->bbox,
|
||||
dT_lev, TRK4, iter_count, apply_bam_bc,
|
||||
Symmetry, lev, ndeps, cor,
|
||||
use_zero_matter,
|
||||
keep_resident_state, apply_enforce_ga, chitiny))
|
||||
{
|
||||
cout << "CUDA corrector substep failed in domain: ("
|
||||
@@ -4055,11 +4233,6 @@ void bssn_class::Step(int lev, int YN)
|
||||
}
|
||||
#endif
|
||||
|
||||
#if USE_CUDA_BSSN
|
||||
if (use_cuda_resident_sync && BH_num > 0 && lev == GH->levels - 1 && iter_count < 3)
|
||||
bssn_cuda_sync_level_bh_fields(GH->PatL[lev], myrank, Sfx1, Sfy1, Sfz1);
|
||||
#endif
|
||||
|
||||
// swap time level
|
||||
if (iter_count < 3)
|
||||
{
|
||||
@@ -7145,6 +7318,17 @@ void bssn_class::compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, va
|
||||
|
||||
int lev = ilev;
|
||||
|
||||
#if USE_CUDA_BSSN
|
||||
if (bssn_cuda_use_resident_sync(lev) &&
|
||||
bssn_cuda_interp_bh_point_resident(GH->PatL[lev], myrank, BH_PS[n], forx, fory, forz, Symmetry, shellf))
|
||||
{
|
||||
BH_RHS[n][0] = -shellf[0];
|
||||
BH_RHS[n][1] = -shellf[1];
|
||||
BH_RHS[n][2] = -shellf[2];
|
||||
continue;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if (PSTR == 0)
|
||||
while (!Parallel::PatList_Interp_Points(GH->PatL[lev], DG_List, 1, pox, shellf, Symmetry))
|
||||
#elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
|
||||
|
||||
@@ -4665,6 +4665,13 @@ static void upload_matter_cache(StepContext &ctx,
|
||||
ctx.matter_ready = true;
|
||||
}
|
||||
|
||||
static void zero_matter_cache(StepContext &ctx, size_t all)
|
||||
{
|
||||
CUDA_CHECK(cudaMemset(ctx.d_matter_mem, 0,
|
||||
(size_t)BSSN_MATTER_COUNT * all * sizeof(double)));
|
||||
ctx.matter_ready = true;
|
||||
}
|
||||
|
||||
static void bind_matter_slots(const StepContext &ctx)
|
||||
{
|
||||
for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
|
||||
@@ -5630,6 +5637,7 @@ int bssn_cuda_rk4_substep(void *block_tag,
|
||||
int &Lev,
|
||||
double &eps,
|
||||
int &co,
|
||||
int &use_zero_matter,
|
||||
int &keep_resident_state,
|
||||
int &apply_enforce_ga,
|
||||
double &chitiny)
|
||||
@@ -5686,12 +5694,17 @@ int bssn_cuda_rk4_substep(void *block_tag,
|
||||
|
||||
t0 = profile ? cuda_profile_now_ms() : 0.0;
|
||||
if (RK4 == 0) {
|
||||
upload_matter_cache(ctx, matter_host, all);
|
||||
if (use_zero_matter) {
|
||||
if (!ctx.matter_ready) zero_matter_cache(ctx, all);
|
||||
} else {
|
||||
upload_matter_cache(ctx, matter_host, all);
|
||||
}
|
||||
CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi],
|
||||
(size_t)BSSN_STATE_COUNT * bytes,
|
||||
cudaMemcpyDeviceToDevice));
|
||||
} else if (!ctx.matter_ready) {
|
||||
upload_matter_cache(ctx, matter_host, all);
|
||||
if (use_zero_matter) zero_matter_cache(ctx, all);
|
||||
else upload_matter_cache(ctx, matter_host, all);
|
||||
}
|
||||
bind_matter_slots(ctx);
|
||||
if (profile) {
|
||||
|
||||
@@ -50,6 +50,7 @@ int bssn_cuda_rk4_substep(void *block_tag,
|
||||
int &Lev,
|
||||
double &eps,
|
||||
int &co,
|
||||
int &use_zero_matter,
|
||||
int &keep_resident_state,
|
||||
int &apply_enforce_ga,
|
||||
double &chitiny);
|
||||
|
||||
Reference in New Issue
Block a user