Stabilize EScalar CUDA sync defaults

This commit is contained in:
2026-05-03 00:24:50 +08:00
parent 74ba5feb86
commit 4430d04ee7
6 changed files with 243 additions and 20 deletions

View File

@@ -1324,6 +1324,12 @@ int cuda_data_packer_device_batched(double *data,
while (src && dst)
{
if (!src->data || !dst->data || !src->data->Bg || !dst->data->Bg)
{
src = src->next;
dst = dst->next;
continue;
}
const bool active =
(dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) ||
(dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank);
@@ -1433,12 +1439,16 @@ bool cuda_segments_device_eligible(MyList<Parallel::gridseg> *src,
bool has_work = false;
while (src && dst)
{
if (!src->data || !dst->data || !src->data->Bg || !dst->data->Bg)
{
src = src->next;
dst = dst->next;
continue;
}
if ((dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) ||
(dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank))
{
has_work = true;
if (!src->data || !dst->data || !src->data->Bg || !dst->data->Bg)
return false;
int type;
if (src->data->Bg->lev == dst->data->Bg->lev)
type = 1;

View File

@@ -129,6 +129,17 @@ MyList<var> *clone_var_list_prefix(MyList<var> *src, int count)
return dst;
}
bool escalar_gpu_rk_enabled()
{
static int enabled = -1;
if (enabled < 0)
{
const char *env = getenv("AMSS_ESCALAR_GPU_RK");
enabled = (env && atoi(env) != 0) ? 1 : 0;
}
return enabled != 0;
}
void clear_var_list(MyList<var> *&list)
{
if (list)
@@ -175,6 +186,7 @@ int run_bssn_escalar_cuda_substep(Block *cg,
int &co,
double &chitiny,
var *Sphi_in, var *Spi_in,
var *Sphi_out, var *Spi_out,
var *Sphi_rhs, var *Spi_rhs,
var *rho, var *Sx, var *Sy, var *Sz,
var *Sxx, var *Sxy, var *Sxz,
@@ -220,6 +232,26 @@ int run_bssn_escalar_cuda_substep(Block *cg,
apply_bam_bc = (lev == 0) ? 1 : 0;
#endif
#endif
if (escalar_gpu_rk_enabled())
{
double scalar_propspeed[2] = {
Sphi_in->propspeed, Spi_in->propspeed
};
double scalar_soa[6] = {
Sphi_in->SoA[0], Sphi_in->SoA[1], Sphi_in->SoA[2],
Spi_in->SoA[0], Spi_in->SoA[1], Spi_in->SoA[2]
};
if (bssn_cuda_escalar_finalize_scalar_fields(cg,
cg->shape, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[Sphi_out->sgfn],
cg->fgfs[Spi_out->sgfn],
scalar_propspeed,
scalar_soa,
patch->bbox,
dT_lev, iter_count, apply_bam_bc,
Symmetry, lev, ndeps, co))
return 1;
}
int use_zero_matter = 0;
int keep_resident_state = 1;
double **matter_precomputed = nullptr;
@@ -1003,7 +1035,7 @@ void bssnEScalar_class::Step(int lev, int YN)
(run_bssn_escalar_cuda_substep(cg, StateList, SynchList_pre, Pp->data,
dT_lev, TRK4, iter_count, Symmetry, lev,
ndeps, pre, chitiny,
Sphi0, Spi0, Sphi_rhs, Spi_rhs,
Sphi0, Spi0, Sphi, Spi, Sphi_rhs, Spi_rhs,
rho, Sx, Sy, Sz, Sxx, Sxy, Sxz, Syy, Syz, Szz) == 0))
? 0
: 1) ||
@@ -1058,9 +1090,34 @@ void bssnEScalar_class::Step(int lev, int YN)
#if USE_CUDA_BSSN
if (used_gpu_substep)
skip_bssn_cuda_prefix(varl0, varl, varlrhs);
#endif
const bool scalar_gpu_rk_done =
#if USE_CUDA_BSSN
used_gpu_substep && escalar_gpu_rk_enabled();
#else
false;
#endif
while (varl0)
{
if (scalar_gpu_rk_done)
{
#ifndef WithShell
if (lev > 0) // fix BD point
#endif
f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2],
Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2],
Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
dT_lev, cg->fgfs[phi0->sgfn],
cg->fgfs[Lap0->sgfn],
cg->fgfs[varl0->data->sgfn], cg->fgfs[varl->data->sgfn],
varl0->data->SoA,
Symmetry, cor);
varl0 = varl0->next;
varl = varl->next;
varlrhs = varlrhs->next;
continue;
}
#ifndef WithShell
if (lev == 0) // sommerfeld indeed
f_sommerfeld_routbam(cg->shape, cg->X[0], cg->X[1], cg->X[2],
@@ -1369,7 +1426,7 @@ void bssnEScalar_class::Step(int lev, int YN)
(run_bssn_escalar_cuda_substep(cg, SynchList_pre, SynchList_cor, Pp->data,
dT_lev, TRK4, iter_count, Symmetry, lev,
ndeps, cor, chitiny,
Sphi, Spi, Sphi_rhs, Spi_rhs,
Sphi, Spi, Sphi1, Spi1, Sphi_rhs, Spi_rhs,
rho, Sx, Sy, Sz, Sxx, Sxy, Sxz, Syy, Syz, Szz) == 0))
? 0
: 1) ||
@@ -1426,9 +1483,35 @@ void bssnEScalar_class::Step(int lev, int YN)
if (used_gpu_substep)
skip_bssn_cuda_prefix(varl0, varl, varl1, varlrhs);
#endif
const bool scalar_gpu_rk_done =
#if USE_CUDA_BSSN
used_gpu_substep && escalar_gpu_rk_enabled();
#else
false;
#endif
while (varl0)
{
if (scalar_gpu_rk_done)
{
#ifndef WithShell
if (lev > 0) // fix BD point
#endif
f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2],
Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2],
Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
dT_lev, cg->fgfs[phi0->sgfn],
cg->fgfs[Lap0->sgfn],
cg->fgfs[varl0->data->sgfn], cg->fgfs[varl1->data->sgfn],
varl0->data->SoA,
Symmetry, cor);
varl0 = varl0->next;
varl = varl->next;
varl1 = varl1->next;
varlrhs = varlrhs->next;
continue;
}
#ifndef WithShell
if (lev == 0) // sommerfeld indeed
f_sommerfeld_routbam(cg->shape, cg->X[0], cg->X[1], cg->X[2],

View File

@@ -393,7 +393,8 @@ static const int k_bssn_cuda_bh_state_indices[3] = {18, 19, 20};
bool fill_bssn_cuda_views(Block *cg, MyList<var> *vars,
double **host_views,
double *propspeeds = nullptr,
double *soa_flat = nullptr)
double *soa_flat = nullptr,
bool allow_trailing_vars = false)
{
int idx = 0;
while (vars && idx < BSSN_CUDA_STATE_COUNT)
@@ -410,7 +411,7 @@ bool fill_bssn_cuda_views(Block *cg, MyList<var> *vars,
vars = vars->next;
++idx;
}
return idx == BSSN_CUDA_STATE_COUNT && vars == 0;
return idx == BSSN_CUDA_STATE_COUNT && (allow_trailing_vars || vars == 0);
}
bool bssn_cuda_use_resident_sync(int lev)
@@ -687,7 +688,7 @@ void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int
if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
{
double *state_out[BSSN_CUDA_STATE_COUNT];
if (!fill_bssn_cuda_views(cg, vars, state_out))
if (!fill_bssn_cuda_views(cg, vars, state_out, nullptr, nullptr, true))
{
cout << "CUDA BSSN state list mismatch on resident state download" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
@@ -720,7 +721,7 @@ void bssn_cuda_download_level_state_if_present(MyList<Patch> *PatL, MyList<var>
if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
{
double *state_out[BSSN_CUDA_STATE_COUNT];
if (!fill_bssn_cuda_views(cg, vars, state_out))
if (!fill_bssn_cuda_views(cg, vars, state_out, nullptr, nullptr, true))
{
cout << "CUDA BSSN state list mismatch on resident state conditional download" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);

View File

@@ -203,7 +203,16 @@ static bool escalar_host_pin_enabled() {
static int enabled = -1;
if (enabled < 0) {
const char *env = getenv("AMSS_CUDA_PIN_ESCALAR_TRANSFERS");
enabled = (!env || atoi(env) != 0) ? 1 : 0;
enabled = (env && atoi(env) != 0) ? 1 : 0;
}
return enabled != 0;
}
static bool escalar_gpu_rk_enabled() {
static int enabled = -1;
if (enabled < 0) {
const char *env = getenv("AMSS_ESCALAR_GPU_RK");
enabled = (env && atoi(env) != 0) ? 1 : 0;
}
return enabled != 0;
}
@@ -588,6 +597,8 @@ static const int k_lk_soa_signs[3 * BSSN_LK_FIELD_COUNT] = {
struct StepContext {
double *d_state0_mem;
double *d_accum_mem;
double *d_escalar0_mem;
double *d_escalar_accum_mem;
double *d_state_curr_mem;
double *d_state_next_mem;
std::array<double *, BSSN_RESIDENT_BANK_COUNT> d_resident_mem;
@@ -596,6 +607,8 @@ struct StepContext {
double *h_comm_mem;
std::array<double *, BSSN_STATE_COUNT> d_state0;
std::array<double *, BSSN_STATE_COUNT> d_accum;
std::array<double *, 2> d_escalar0;
std::array<double *, 2> d_escalar_accum;
std::array<double *, BSSN_STATE_COUNT> d_state_curr;
std::array<double *, BSSN_STATE_COUNT> d_state_next;
std::array<std::array<double *, BSSN_STATE_COUNT>, BSSN_RESIDENT_BANK_COUNT> d_resident;
@@ -615,6 +628,7 @@ struct StepContext {
StepContext()
: d_state0_mem(nullptr), d_accum_mem(nullptr),
d_escalar0_mem(nullptr), d_escalar_accum_mem(nullptr),
d_state_curr_mem(nullptr), d_state_next_mem(nullptr),
d_resident_mem{},
d_matter_mem(nullptr), d_comm_mem(nullptr), h_comm_mem(nullptr),
@@ -625,6 +639,8 @@ struct StepContext {
d_resident_mem.fill(nullptr);
d_state0.fill(nullptr);
d_accum.fill(nullptr);
d_escalar0.fill(nullptr);
d_escalar_accum.fill(nullptr);
d_state_curr.fill(nullptr);
d_state_next.fill(nullptr);
for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
@@ -641,6 +657,8 @@ struct StepContext {
struct StepAllocation {
double *d_state0_mem;
double *d_accum_mem;
double *d_escalar0_mem;
double *d_escalar_accum_mem;
std::array<double *, BSSN_RESIDENT_BANK_COUNT> d_resident_mem;
double *d_matter_mem;
double *d_comm_mem;
@@ -661,6 +679,8 @@ static StepAllocation empty_step_allocation()
StepAllocation alloc = {};
alloc.d_state0_mem = nullptr;
alloc.d_accum_mem = nullptr;
alloc.d_escalar0_mem = nullptr;
alloc.d_escalar_accum_mem = nullptr;
alloc.d_resident_mem.fill(nullptr);
alloc.d_matter_mem = nullptr;
alloc.d_comm_mem = nullptr;
@@ -682,6 +702,8 @@ static StepAllocation detach_step_allocation(StepContext &ctx)
StepAllocation alloc = {};
alloc.d_state0_mem = ctx.d_state0_mem;
alloc.d_accum_mem = ctx.d_accum_mem;
alloc.d_escalar0_mem = ctx.d_escalar0_mem;
alloc.d_escalar_accum_mem = ctx.d_escalar_accum_mem;
alloc.d_resident_mem = ctx.d_resident_mem;
alloc.d_matter_mem = ctx.d_matter_mem;
alloc.d_comm_mem = ctx.d_comm_mem;
@@ -692,6 +714,8 @@ static StepAllocation detach_step_allocation(StepContext &ctx)
alloc.cap_h_comm = ctx.cap_h_comm;
ctx.d_state0_mem = nullptr;
ctx.d_accum_mem = nullptr;
ctx.d_escalar0_mem = nullptr;
ctx.d_escalar_accum_mem = nullptr;
ctx.d_state_curr_mem = nullptr;
ctx.d_state_next_mem = nullptr;
ctx.d_resident_mem.fill(nullptr);
@@ -708,6 +732,8 @@ static StepAllocation detach_step_allocation(StepContext &ctx)
ctx.resident_clock = 0;
ctx.d_state0.fill(nullptr);
ctx.d_accum.fill(nullptr);
ctx.d_escalar0.fill(nullptr);
ctx.d_escalar_accum.fill(nullptr);
ctx.d_state_curr.fill(nullptr);
ctx.d_state_next.fill(nullptr);
for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
@@ -725,6 +751,8 @@ static void attach_step_allocation(StepContext &ctx, const StepAllocation &alloc
{
ctx.d_state0_mem = alloc.d_state0_mem;
ctx.d_accum_mem = alloc.d_accum_mem;
ctx.d_escalar0_mem = alloc.d_escalar0_mem;
ctx.d_escalar_accum_mem = alloc.d_escalar_accum_mem;
ctx.d_resident_mem = alloc.d_resident_mem;
ctx.d_state_curr_mem = nullptr;
ctx.d_state_next_mem = nullptr;
@@ -849,6 +877,12 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all)
ctx.d_resident[b][i] = ctx.d_resident_mem[b] + (size_t)i * all;
}
}
if (ctx.d_escalar0_mem && ctx.d_escalar_accum_mem) {
for (int i = 0; i < 2; ++i) {
ctx.d_escalar0[i] = ctx.d_escalar0_mem + (size_t)i * all;
ctx.d_escalar_accum[i] = ctx.d_escalar_accum_mem + (size_t)i * all;
}
}
if (ctx.current_bank >= 0) {
ctx.d_state_curr_mem = ctx.d_resident_mem[ctx.current_bank];
ctx.d_state_curr = ctx.d_resident[ctx.current_bank];
@@ -859,6 +893,18 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all)
return ctx;
}
static void ensure_escalar_buffers(StepContext &ctx, size_t all)
{
if (!ctx.d_escalar0_mem)
CUDA_CHECK(cudaMalloc(&ctx.d_escalar0_mem, 2 * ctx.cap_all * sizeof(double)));
if (!ctx.d_escalar_accum_mem)
CUDA_CHECK(cudaMalloc(&ctx.d_escalar_accum_mem, 2 * ctx.cap_all * sizeof(double)));
for (int i = 0; i < 2; ++i) {
ctx.d_escalar0[i] = ctx.d_escalar0_mem + (size_t)i * all;
ctx.d_escalar_accum[i] = ctx.d_escalar_accum_mem + (size_t)i * all;
}
}
static void release_step_ctx(void *block_tag)
{
auto it = g_step_ctx.find(block_tag);
@@ -7113,14 +7159,78 @@ int bssn_cuda_compute_escalar_matter(void *block_tag,
ctx.d_matter[4], ctx.d_matter[5], ctx.d_matter[6],
ctx.d_matter[7], ctx.d_matter[8], ctx.d_matter[9],
a2);
CUDA_CHECK(cudaMemcpyAsync(Sphi_rhs_host, g_buf.slot[S_Gamxa], bytes, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpyAsync(Spi_rhs_host, g_buf.slot[S_Gamya], bytes, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaDeviceSynchronize());
if (!escalar_gpu_rk_enabled()) {
CUDA_CHECK(cudaMemcpyAsync(Sphi_rhs_host, g_buf.slot[S_Gamxa], bytes, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpyAsync(Spi_rhs_host, g_buf.slot[S_Gamya], bytes, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaDeviceSynchronize());
}
ctx.matter_ready = true;
(void)Lev;
return 0;
}
extern "C"
int bssn_cuda_escalar_finalize_scalar_fields(void *block_tag,
int *ex, double *X, double *Y, double *Z,
double *Sphi_out_host,
double *Spi_out_host,
const double *propspeed,
const double *soa_flat,
const double *bbox,
double &dT,
int &RK4,
int &apply_bam_bc,
int &Symmetry,
int &Lev,
double &eps,
int &precor)
{
if (!escalar_gpu_rk_enabled())
return 1;
if (RK4 < 0 || RK4 > 3)
return 1;
init_gpu_dispatch();
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
const size_t bytes = all * sizeof(double);
setup_grid_params(ex, X, Y, Z, Symmetry, eps, precor);
StepContext &ctx = ensure_step_ctx(block_tag, all);
ensure_escalar_buffers(ctx, all);
if (RK4 == 0) {
CUDA_CHECK(cudaMemcpyAsync(ctx.d_escalar0[0], g_buf.slot[S_S_arr],
bytes, cudaMemcpyDeviceToDevice));
CUDA_CHECK(cudaMemcpyAsync(ctx.d_escalar0[1], g_buf.slot[S_f_arr],
bytes, cudaMemcpyDeviceToDevice));
}
if (apply_bam_bc) {
gpu_sommerfeld_routbam(g_buf.slot[S_S_arr], g_buf.slot[S_Gamxa],
propspeed[0],
soa_flat[0], soa_flat[1], soa_flat[2],
X, Y, Z, bbox, Symmetry);
gpu_sommerfeld_routbam(g_buf.slot[S_f_arr], g_buf.slot[S_Gamya],
propspeed[1],
soa_flat[3], soa_flat[4], soa_flat[5],
X, Y, Z, bbox, Symmetry);
}
kern_rk4_finalize<<<grid(all), BLK>>>(ctx.d_escalar0[0], g_buf.slot[S_Gamxa],
ctx.d_escalar_accum[0], dT, RK4);
kern_rk4_finalize<<<grid(all), BLK>>>(ctx.d_escalar0[1], g_buf.slot[S_Gamya],
ctx.d_escalar_accum[1], dT, RK4);
try_pin_escalar_host_buffer(Sphi_out_host, bytes);
try_pin_escalar_host_buffer(Spi_out_host, bytes);
CUDA_CHECK(cudaMemcpyAsync(Sphi_out_host, g_buf.slot[S_Gamxa], bytes, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpyAsync(Spi_out_host, g_buf.slot[S_Gamya], bytes, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaDeviceSynchronize());
(void)Lev;
return 0;
}
extern "C"
int bssn_cuda_rk4_substep(void *block_tag,
int *ex, double *X, double *Y, double *Z,

View File

@@ -69,6 +69,21 @@ int bssn_cuda_compute_escalar_matter(void *block_tag,
int &co,
int &apply_enforce_ga);
int bssn_cuda_escalar_finalize_scalar_fields(void *block_tag,
int *ex, double *X, double *Y, double *Z,
double *Sphi_out_host,
double *Spi_out_host,
const double *propspeed,
const double *soa_flat,
const double *bbox,
double &dT,
int &RK4,
int &apply_bam_bc,
int &Symmetry,
int &Lev,
double &eps,
int &precor);
int bssn_cuda_copy_state_region_to_host(void *block_tag,
int state_index,
double *host_state,

View File

@@ -151,6 +151,8 @@ def _gpu_runtime_env():
"AMSS_CUDA_AMR_RESTRICT_DEVICE": "1",
"AMSS_CUDA_AMR_RESTRICT_BATCH": "0",
"AMSS_CUDA_DEVICE_SEGMENT_BATCH": "0",
"AMSS_CUDA_PIN_ESCALAR_TRANSFERS": "0",
"AMSS_ESCALAR_GPU_RK": "0",
}
if getattr(input_data, "Equation_Class", "") == "Z4C":
defaults["AMSS_CUDA_Z4C_KEEP_RESIDENT_AFTER_STEP"] = "0"
@@ -287,6 +289,8 @@ def run_ABE():
print(f" AMSS_CUDA_AMR_RESTRICT_DEVICE={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_DEVICE', '')}")
print(f" AMSS_CUDA_AMR_RESTRICT_BATCH={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_BATCH', '')}")
print(f" AMSS_CUDA_DEVICE_SEGMENT_BATCH={mpi_env.get('AMSS_CUDA_DEVICE_SEGMENT_BATCH', '')}")
print(f" AMSS_CUDA_PIN_ESCALAR_TRANSFERS={mpi_env.get('AMSS_CUDA_PIN_ESCALAR_TRANSFERS', '')}")
print(f" AMSS_ESCALAR_GPU_RK={mpi_env.get('AMSS_ESCALAR_GPU_RK', '')}")
if "CUDA_MPS_PIPE_DIRECTORY" in mpi_env:
print(f" CUDA_MPS_PIPE_DIRECTORY={mpi_env['CUDA_MPS_PIPE_DIRECTORY']}")