Optimize BSSN-EM 8th-order AMR transfers
This commit is contained in:
@@ -1426,6 +1426,231 @@ int cuda_data_packer_device_batched(double *data,
|
||||
}
|
||||
return size_out;
|
||||
}
|
||||
|
||||
bool cuda_ensure_bssn_block_resident_for_pack(Block *block,
|
||||
MyList<var> *vars,
|
||||
int state_count,
|
||||
std::vector<Block *> &uploaded)
|
||||
{
|
||||
if (!block)
|
||||
return false;
|
||||
if (bssn_cuda_has_resident_state(block) != 0)
|
||||
return true;
|
||||
for (size_t i = 0; i < uploaded.size(); ++i)
|
||||
{
|
||||
if (uploaded[i] == block)
|
||||
return bssn_cuda_has_resident_state(block) != 0;
|
||||
}
|
||||
|
||||
double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
|
||||
if (!cuda_build_bssn_host_views(block, vars, state_count, views))
|
||||
return false;
|
||||
if (bssn_cuda_upload_resident_state_count(block, block->shape, views, state_count) != 0)
|
||||
return false;
|
||||
uploaded.push_back(block);
|
||||
return bssn_cuda_has_resident_state(block) != 0;
|
||||
}
|
||||
|
||||
void cuda_host_batch_diag(const char *reason, int state_count, int type)
|
||||
{
|
||||
static int reported = 0;
|
||||
const char *env = getenv("AMSS_CUDA_HOST_BATCH_DIAG");
|
||||
if (!env || atoi(env) == 0 || reported >= 32)
|
||||
return;
|
||||
int rank = 0;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
||||
fprintf(stderr,
|
||||
"[AMSS-CUDA-HOST-BATCH][rank %d] disabled reason=%s state_count=%d type=%d\n",
|
||||
rank, reason ? reason : "unknown", state_count, type);
|
||||
fflush(stderr);
|
||||
reported++;
|
||||
}
|
||||
|
||||
int cuda_transfer_active_length_if_batched_eligible(MyList<Parallel::gridseg> *src,
|
||||
MyList<Parallel::gridseg> *dst,
|
||||
int rank_in,
|
||||
int dir,
|
||||
MyList<var> *VarLists,
|
||||
int state_count,
|
||||
int myrank,
|
||||
std::vector<Block *> &uploaded)
|
||||
{
|
||||
if (dir != PACK && dir != UNPACK)
|
||||
{
|
||||
cuda_host_batch_diag("bad_dir", state_count, -1);
|
||||
return -1;
|
||||
}
|
||||
if (!cuda_device_segment_batch_enabled())
|
||||
{
|
||||
cuda_host_batch_diag("segment_batch_off", state_count, -1);
|
||||
return -1;
|
||||
}
|
||||
if (!cuda_device_state_count_supported(state_count))
|
||||
{
|
||||
cuda_host_batch_diag("unsupported_state_count", state_count, -1);
|
||||
return -1;
|
||||
}
|
||||
if (cuda_amr_restrict_compare_enabled())
|
||||
{
|
||||
cuda_host_batch_diag("compare_enabled", state_count, -1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
int total = 0;
|
||||
bool has_work = false;
|
||||
bool has_amr = false;
|
||||
while (src && dst)
|
||||
{
|
||||
const bool active =
|
||||
(dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) ||
|
||||
(dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank);
|
||||
if (active)
|
||||
{
|
||||
has_work = true;
|
||||
if (!src->data || !dst->data || !src->data->Bg || !dst->data->Bg)
|
||||
{
|
||||
cuda_host_batch_diag("null_segment", state_count, -1);
|
||||
return -1;
|
||||
}
|
||||
int type;
|
||||
if (src->data->Bg->lev == dst->data->Bg->lev)
|
||||
type = 1;
|
||||
else if (src->data->Bg->lev > dst->data->Bg->lev)
|
||||
type = 2;
|
||||
else
|
||||
type = 3;
|
||||
|
||||
if (type == 2 || type == 3)
|
||||
has_amr = true;
|
||||
if (dir == PACK && type == 2 && !cuda_amr_restrict_batch_enabled())
|
||||
{
|
||||
cuda_host_batch_diag("restrict_batch_off", state_count, type);
|
||||
return -1;
|
||||
}
|
||||
if (dir == PACK)
|
||||
{
|
||||
if ((type == 2 || type == 3) &&
|
||||
!cuda_ensure_bssn_block_resident_for_pack(src->data->Bg, VarLists,
|
||||
state_count, uploaded))
|
||||
{
|
||||
cuda_host_batch_diag("resident_upload_failed", state_count, type);
|
||||
return -1;
|
||||
}
|
||||
if (!cuda_can_direct_pack(src->data, dst->data, type))
|
||||
{
|
||||
cuda_host_batch_diag("direct_pack_ineligible", state_count, type);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (!cuda_can_direct_unpack(dst->data, type))
|
||||
{
|
||||
cuda_host_batch_diag("direct_unpack_ineligible", state_count, type);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
total += state_count * dst->data->shape[0] * dst->data->shape[1] * dst->data->shape[2];
|
||||
}
|
||||
src = src->next;
|
||||
dst = dst->next;
|
||||
}
|
||||
if (!has_work)
|
||||
return 0;
|
||||
if (!has_amr)
|
||||
{
|
||||
cuda_host_batch_diag("no_amr_segment", state_count, -1);
|
||||
return -1;
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
int cuda_data_packer_host_staged_batched(double *host_data,
|
||||
MyList<Parallel::gridseg> *src,
|
||||
MyList<Parallel::gridseg> *dst,
|
||||
int rank_in,
|
||||
int dir,
|
||||
MyList<var> *VarLists,
|
||||
MyList<var> *VarListd,
|
||||
int Symmetry)
|
||||
{
|
||||
if (!host_data || !cuda_amr_host_staged_enabled())
|
||||
{
|
||||
cuda_host_batch_diag(!host_data ? "null_host_data" : "host_staged_off", -1, -1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
int myrank;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
||||
|
||||
const int state_count = cuda_state_var_count(VarLists, VarListd);
|
||||
if (state_count < 0)
|
||||
{
|
||||
cuda_host_batch_diag("var_list_mismatch", state_count, -1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
std::vector<Block *> uploaded;
|
||||
const int total = cuda_transfer_active_length_if_batched_eligible(src, dst, rank_in,
|
||||
dir, VarLists,
|
||||
state_count, myrank,
|
||||
uploaded);
|
||||
if (total <= 0)
|
||||
return total;
|
||||
|
||||
static double *stage_dev = 0;
|
||||
static int stage_cap = 0;
|
||||
if (total > stage_cap)
|
||||
{
|
||||
free_device_comm_buffer(stage_dev);
|
||||
stage_dev = alloc_device_comm_buffer(total);
|
||||
stage_cap = total;
|
||||
}
|
||||
|
||||
if (dir == UNPACK)
|
||||
{
|
||||
cudaError_t h2d = cudaMemcpy(stage_dev, host_data, (size_t)total * sizeof(double),
|
||||
cudaMemcpyHostToDevice);
|
||||
if (h2d != cudaSuccess)
|
||||
{
|
||||
fprintf(stderr, "Parallel: CUDA host-staged batched unpack cudaMemcpy failed, err=%d\n",
|
||||
(int)h2d);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
|
||||
const int packed = cuda_data_packer_device_batched(stage_dev, src, dst, rank_in, dir,
|
||||
VarLists, VarListd, Symmetry);
|
||||
if (packed != total)
|
||||
{
|
||||
cuda_host_batch_diag("device_batched_failed", state_count, -1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (dir == PACK)
|
||||
{
|
||||
cudaError_t d2h = cudaMemcpy(host_data, stage_dev, (size_t)total * sizeof(double),
|
||||
cudaMemcpyDeviceToHost);
|
||||
if (d2h != cudaSuccess)
|
||||
{
|
||||
fprintf(stderr, "Parallel: CUDA host-staged batched pack cudaMemcpy failed, err=%d\n",
|
||||
(int)d2h);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
if (sync_profile_enabled())
|
||||
{
|
||||
const double dt = MPI_Wtime() - t0;
|
||||
if (dir == PACK)
|
||||
sync_profile_stats().direct_pack_sec += dt;
|
||||
else
|
||||
sync_profile_stats().direct_unpack_sec += dt;
|
||||
}
|
||||
return total;
|
||||
}
|
||||
#endif
|
||||
|
||||
bool cuda_segments_device_eligible(MyList<Parallel::gridseg> *src,
|
||||
@@ -5347,6 +5572,16 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
|
||||
#if USE_CUDA_BSSN
|
||||
if (data && (dir == PACK || dir == UNPACK))
|
||||
{
|
||||
const int batched = cuda_data_packer_host_staged_batched(data, src, dst, rank_in, dir,
|
||||
VarLists, VarListd, Symmetry);
|
||||
if (batched >= 0)
|
||||
return batched;
|
||||
}
|
||||
#endif
|
||||
|
||||
int type; /* 1 copy, 2 restrict, 3 prolong */
|
||||
if (src->data->Bg->lev == dst->data->Bg->lev)
|
||||
type = 1;
|
||||
|
||||
@@ -167,6 +167,12 @@ def _gpu_runtime_env():
|
||||
"AMSS_INTERP_GPU": "0",
|
||||
"AMSS_CUDA_AWARE_MPI": "0",
|
||||
})
|
||||
if finite_difference == "8th-order" and getattr(input_data, "Equation_Class", "") == "BSSN-EM":
|
||||
defaults.update({
|
||||
"AMSS_CUDA_AMR_RESTRICT_DEVICE": "1",
|
||||
"AMSS_CUDA_AMR_RESTRICT_BATCH": "1",
|
||||
"AMSS_CUDA_DEVICE_SEGMENT_BATCH": "1",
|
||||
})
|
||||
if getattr(input_data, "Equation_Class", "") in ("BSSN", "BSSN-EScalar", "Z4C"):
|
||||
defaults["AMSS_CUDA_AMR_RESTRICT_DEVICE"] = "1"
|
||||
if getattr(input_data, "Equation_Class", "") == "Z4C":
|
||||
|
||||
Reference in New Issue
Block a user