Optimize BSSN-EM 8th-order AMR transfers

This commit is contained in:
2026-05-07 21:38:16 +08:00
parent dcc83bafcb
commit 1064a68d16
2 changed files with 241 additions and 0 deletions

View File

@@ -1426,6 +1426,231 @@ int cuda_data_packer_device_batched(double *data,
}
return size_out;
}
// Make sure `block` has resident device state before a pack reads from it.
//
// Returns true when bssn_cuda_has_resident_state() reports the block as
// resident, either already or after an upload performed here. Returns false
// for a null block, for a state_count that does not fit the local view table,
// when the host views cannot be built, or when the upload fails.
//
// `uploaded` collects the blocks uploaded by this function. A block already
// listed there is not uploaded a second time; its current resident status is
// returned instead.
bool cuda_ensure_bssn_block_resident_for_pack(Block *block,
                                              MyList<var> *vars,
                                              int state_count,
                                              std::vector<Block *> &uploaded)
{
  if (!block)
    return false;
  if (bssn_cuda_has_resident_state(block) != 0)
    return true;

  // Do not retry a block that this call chain has already uploaded.
  for (size_t i = 0; i < uploaded.size(); ++i)
  {
    if (uploaded[i] == block)
      return bssn_cuda_has_resident_state(block) != 0;
  }

  // views[] below has exactly AMSS_BSSN_CUDA_MAX_STATE_COUNT slots; refuse a
  // count that could not fit instead of letting the helpers index past it.
  if (state_count < 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT)
    return false;

  double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
  if (!cuda_build_bssn_host_views(block, vars, state_count, views))
    return false;
  if (bssn_cuda_upload_resident_state_count(block, block->shape, views, state_count) != 0)
    return false;

  uploaded.push_back(block);
  return bssn_cuda_has_resident_state(block) != 0;
}
// Report on stderr why the host-staged batched path was not used.
//
// Output is produced only when the AMSS_CUDA_HOST_BATCH_DIAG environment
// variable parses (via atoi) to a non-zero value, and at most 32 messages are
// printed over the lifetime of the process. A null `reason` is printed as
// "unknown".
void cuda_host_batch_diag(const char *reason, int state_count, int type)
{
  static int emitted = 0; // number of messages printed so far

  const char *flag = getenv("AMSS_CUDA_HOST_BATCH_DIAG");
  const bool enabled = flag && atoi(flag) != 0;
  if (!enabled || emitted >= 32)
    return;

  int world_rank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

  const char *label = reason ? reason : "unknown";
  fprintf(stderr,
          "[AMSS-CUDA-HOST-BATCH][rank %d] disabled reason=%s state_count=%d type=%d\n",
          world_rank, label, state_count, type);
  fflush(stderr);

  ++emitted;
}
// Decide whether the src/dst segment lists can go through the batched device
// path and, if so, return the number of doubles the active segments occupy.
//
// A segment pair is "active" when, for PACK, the destination block belongs to
// `rank_in` and the source block to `myrank`; for UNPACK, the source block
// belongs to `rank_in` and the destination block to `myrank`.
//
// Segment type is derived from the block levels: 1 when equal, 2 when the
// source level is greater than the destination level, 3 otherwise.
//
// Returns:
//   -1  the batched path must not be used (a reason is reported through
//       cuda_host_batch_diag),
//    0  no active segment pair was found,
//   >0  state_count * shape[0] * shape[1] * shape[2] of the destination
//       segment, summed over all active pairs.
//
// Side effect: for PACK with type 2 or 3 the source block is made resident
// through cuda_ensure_bssn_block_resident_for_pack(), which may append to
// `uploaded`. This happens even if a later segment makes the call return -1.
int cuda_transfer_active_length_if_batched_eligible(MyList<Parallel::gridseg> *src,
                                                    MyList<Parallel::gridseg> *dst,
                                                    int rank_in,
                                                    int dir,
                                                    MyList<var> *VarLists,
                                                    int state_count,
                                                    int myrank,
                                                    std::vector<Block *> &uploaded)
{
  if (dir != PACK && dir != UNPACK)
  {
    cuda_host_batch_diag("bad_dir", state_count, -1);
    return -1;
  }
  if (!cuda_device_segment_batch_enabled())
  {
    cuda_host_batch_diag("segment_batch_off", state_count, -1);
    return -1;
  }
  if (!cuda_device_state_count_supported(state_count))
  {
    cuda_host_batch_diag("unsupported_state_count", state_count, -1);
    return -1;
  }
  if (cuda_amr_restrict_compare_enabled())
  {
    cuda_host_batch_diag("compare_enabled", state_count, -1);
    return -1;
  }

  // Largest value of the int return type, derived without extra headers
  // (all value bits of unsigned int except the top one).
  const long long int_max =
      static_cast<long long>(static_cast<unsigned int>(-1) >> 1);

  // Accumulate in a wider type so a large transfer cannot overflow int.
  long long total = 0;
  bool has_work = false;
  bool has_amr = false;

  while (src && dst)
  {
    // Validate the pointers BEFORE they are dereferenced by the rank test
    // below; the rank test itself needs data->Bg on both sides.
    if (!src->data || !dst->data || !src->data->Bg || !dst->data->Bg)
    {
      cuda_host_batch_diag("null_segment", state_count, -1);
      return -1;
    }

    const bool active =
        (dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) ||
        (dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank);
    if (active)
    {
      has_work = true;

      int type;
      if (src->data->Bg->lev == dst->data->Bg->lev)
        type = 1;
      else if (src->data->Bg->lev > dst->data->Bg->lev)
        type = 2;
      else
        type = 3;
      if (type == 2 || type == 3)
        has_amr = true;

      if (dir == PACK && type == 2 && !cuda_amr_restrict_batch_enabled())
      {
        cuda_host_batch_diag("restrict_batch_off", state_count, type);
        return -1;
      }

      if (dir == PACK)
      {
        // Cross-level packs read the source block on the device, so it has
        // to be resident there first.
        if ((type == 2 || type == 3) &&
            !cuda_ensure_bssn_block_resident_for_pack(src->data->Bg, VarLists,
                                                      state_count, uploaded))
        {
          cuda_host_batch_diag("resident_upload_failed", state_count, type);
          return -1;
        }
        if (!cuda_can_direct_pack(src->data, dst->data, type))
        {
          cuda_host_batch_diag("direct_pack_ineligible", state_count, type);
          return -1;
        }
      }
      else
      {
        if (!cuda_can_direct_unpack(dst->data, type))
        {
          cuda_host_batch_diag("direct_unpack_ineligible", state_count, type);
          return -1;
        }
      }

      total += static_cast<long long>(state_count) *
               dst->data->shape[0] * dst->data->shape[1] * dst->data->shape[2];
      if (total > int_max)
      {
        // The length is returned as int; fall back rather than wrap around.
        cuda_host_batch_diag("length_overflow", state_count, type);
        return -1;
      }
    }

    src = src->next;
    dst = dst->next;
  }

  if (!has_work)
    return 0;
  if (!has_amr)
  {
    // Only same-level segments were active; this path is not used for them.
    cuda_host_batch_diag("no_amr_segment", state_count, -1);
    return -1;
  }
  return static_cast<int>(total);
}
// Pack or unpack all eligible segments through one device staging buffer,
// moving the data to or from `host_data` with a single cudaMemcpy.
//
// PACK:   cuda_data_packer_device_batched() fills the staging buffer, which is
//         then copied device-to-host into `host_data`.
// UNPACK: `host_data` is copied host-to-device into the staging buffer, which
//         cuda_data_packer_device_batched() then consumes.
//
// Returns the number of doubles transferred, 0 when the eligibility check
// found no active segment, or -1 when this path is disabled, ineligible, or
// failed (a negative result means the transfer was not performed here).
//
// The staging buffer is a function-local static that is kept across calls and
// only reallocated when a larger transfer is requested.
int cuda_data_packer_host_staged_batched(double *host_data,
                                         MyList<Parallel::gridseg> *src,
                                         MyList<Parallel::gridseg> *dst,
                                         int rank_in,
                                         int dir,
                                         MyList<var> *VarLists,
                                         MyList<var> *VarListd,
                                         int Symmetry)
{
  if (!host_data || !cuda_amr_host_staged_enabled())
  {
    cuda_host_batch_diag(!host_data ? "null_host_data" : "host_staged_off", -1, -1);
    return -1;
  }

  int myrank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

  const int state_count = cuda_state_var_count(VarLists, VarListd);
  if (state_count < 0)
  {
    cuda_host_batch_diag("var_list_mismatch", state_count, -1);
    return -1;
  }

  std::vector<Block *> uploaded;
  const int total = cuda_transfer_active_length_if_batched_eligible(src, dst, rank_in,
                                                                    dir, VarLists,
                                                                    state_count, myrank,
                                                                    uploaded);
  // 0 means nothing to do, negative means not eligible; pass both through.
  if (total <= 0)
    return total;

  static double *stage_dev = 0; // cached staging buffer
  static int stage_cap = 0;     // capacity of stage_dev, in doubles
  if (total > stage_cap)
  {
    free_device_comm_buffer(stage_dev);
    // Forget the old buffer before reallocating so that a failed allocation
    // cannot leave a stale pointer or capacity behind for the next call.
    stage_dev = 0;
    stage_cap = 0;
    stage_dev = alloc_device_comm_buffer(total);
    if (!stage_dev)
    {
      cuda_host_batch_diag("stage_alloc_failed", state_count, -1);
      return -1;
    }
    stage_cap = total;
  }

  if (dir == UNPACK)
  {
    cudaError_t h2d = cudaMemcpy(stage_dev, host_data, (size_t)total * sizeof(double),
                                 cudaMemcpyHostToDevice);
    if (h2d != cudaSuccess)
    {
      fprintf(stderr, "Parallel: CUDA host-staged batched unpack cudaMemcpy failed, err=%d\n",
              (int)h2d);
      return -1;
    }
  }

  // NOTE(review): the timer starts after the host-to-device copy above, so the
  // unpack timing excludes its staging copy while the pack timing below
  // includes the device-to-host copy. Confirm whether that is intended.
  const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;

  const int packed = cuda_data_packer_device_batched(stage_dev, src, dst, rank_in, dir,
                                                     VarLists, VarListd, Symmetry);
  if (packed != total)
  {
    cuda_host_batch_diag("device_batched_failed", state_count, -1);
    return -1;
  }

  if (dir == PACK)
  {
    cudaError_t d2h = cudaMemcpy(host_data, stage_dev, (size_t)total * sizeof(double),
                                 cudaMemcpyDeviceToHost);
    if (d2h != cudaSuccess)
    {
      fprintf(stderr, "Parallel: CUDA host-staged batched pack cudaMemcpy failed, err=%d\n",
              (int)d2h);
      return -1;
    }
  }

  if (sync_profile_enabled())
  {
    const double dt = MPI_Wtime() - t0;
    if (dir == PACK)
      sync_profile_stats().direct_pack_sec += dt;
    else
      sync_profile_stats().direct_unpack_sec += dt;
  }
  return total;
}
#endif
bool cuda_segments_device_eligible(MyList<Parallel::gridseg> *src,
@@ -5347,6 +5572,16 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
MPI_Abort(MPI_COMM_WORLD, 1);
}
#if USE_CUDA_BSSN
if (data && (dir == PACK || dir == UNPACK))
{
const int batched = cuda_data_packer_host_staged_batched(data, src, dst, rank_in, dir,
VarLists, VarListd, Symmetry);
if (batched >= 0)
return batched;
}
#endif
int type; /* 1 copy, 2 restrict, 3 prolong */
if (src->data->Bg->lev == dst->data->Bg->lev)
type = 1;

View File

@@ -167,6 +167,12 @@ def _gpu_runtime_env():
"AMSS_INTERP_GPU": "0",
"AMSS_CUDA_AWARE_MPI": "0",
})
if finite_difference == "8th-order" and getattr(input_data, "Equation_Class", "") == "BSSN-EM":
defaults.update({
"AMSS_CUDA_AMR_RESTRICT_DEVICE": "1",
"AMSS_CUDA_AMR_RESTRICT_BATCH": "1",
"AMSS_CUDA_DEVICE_SEGMENT_BATCH": "1",
})
if getattr(input_data, "Equation_Class", "") in ("BSSN", "BSSN-EScalar", "Z4C"):
defaults["AMSS_CUDA_AMR_RESTRICT_DEVICE"] = "1"
if getattr(input_data, "Equation_Class", "") == "Z4C":