Stabilize EScalar CUDA sync defaults
This commit is contained in:
@@ -1324,6 +1324,12 @@ int cuda_data_packer_device_batched(double *data,
|
||||
|
||||
while (src && dst)
|
||||
{
|
||||
if (!src->data || !dst->data || !src->data->Bg || !dst->data->Bg)
|
||||
{
|
||||
src = src->next;
|
||||
dst = dst->next;
|
||||
continue;
|
||||
}
|
||||
const bool active =
|
||||
(dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) ||
|
||||
(dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank);
|
||||
@@ -1433,12 +1439,16 @@ bool cuda_segments_device_eligible(MyList<Parallel::gridseg> *src,
|
||||
bool has_work = false;
|
||||
while (src && dst)
|
||||
{
|
||||
if (!src->data || !dst->data || !src->data->Bg || !dst->data->Bg)
|
||||
{
|
||||
src = src->next;
|
||||
dst = dst->next;
|
||||
continue;
|
||||
}
|
||||
if ((dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) ||
|
||||
(dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank))
|
||||
{
|
||||
has_work = true;
|
||||
if (!src->data || !dst->data || !src->data->Bg || !dst->data->Bg)
|
||||
return false;
|
||||
int type;
|
||||
if (src->data->Bg->lev == dst->data->Bg->lev)
|
||||
type = 1;
|
||||
|
||||
@@ -129,6 +129,17 @@ MyList<var> *clone_var_list_prefix(MyList<var> *src, int count)
|
||||
return dst;
|
||||
}
|
||||
|
||||
bool escalar_gpu_rk_enabled()
|
||||
{
|
||||
static int enabled = -1;
|
||||
if (enabled < 0)
|
||||
{
|
||||
const char *env = getenv("AMSS_ESCALAR_GPU_RK");
|
||||
enabled = (env && atoi(env) != 0) ? 1 : 0;
|
||||
}
|
||||
return enabled != 0;
|
||||
}
|
||||
|
||||
void clear_var_list(MyList<var> *&list)
|
||||
{
|
||||
if (list)
|
||||
@@ -175,6 +186,7 @@ int run_bssn_escalar_cuda_substep(Block *cg,
|
||||
int &co,
|
||||
double &chitiny,
|
||||
var *Sphi_in, var *Spi_in,
|
||||
var *Sphi_out, var *Spi_out,
|
||||
var *Sphi_rhs, var *Spi_rhs,
|
||||
var *rho, var *Sx, var *Sy, var *Sz,
|
||||
var *Sxx, var *Sxy, var *Sxz,
|
||||
@@ -220,6 +232,26 @@ int run_bssn_escalar_cuda_substep(Block *cg,
|
||||
apply_bam_bc = (lev == 0) ? 1 : 0;
|
||||
#endif
|
||||
#endif
|
||||
if (escalar_gpu_rk_enabled())
|
||||
{
|
||||
double scalar_propspeed[2] = {
|
||||
Sphi_in->propspeed, Spi_in->propspeed
|
||||
};
|
||||
double scalar_soa[6] = {
|
||||
Sphi_in->SoA[0], Sphi_in->SoA[1], Sphi_in->SoA[2],
|
||||
Spi_in->SoA[0], Spi_in->SoA[1], Spi_in->SoA[2]
|
||||
};
|
||||
if (bssn_cuda_escalar_finalize_scalar_fields(cg,
|
||||
cg->shape, cg->X[0], cg->X[1], cg->X[2],
|
||||
cg->fgfs[Sphi_out->sgfn],
|
||||
cg->fgfs[Spi_out->sgfn],
|
||||
scalar_propspeed,
|
||||
scalar_soa,
|
||||
patch->bbox,
|
||||
dT_lev, iter_count, apply_bam_bc,
|
||||
Symmetry, lev, ndeps, co))
|
||||
return 1;
|
||||
}
|
||||
int use_zero_matter = 0;
|
||||
int keep_resident_state = 1;
|
||||
double **matter_precomputed = nullptr;
|
||||
@@ -1003,7 +1035,7 @@ void bssnEScalar_class::Step(int lev, int YN)
|
||||
(run_bssn_escalar_cuda_substep(cg, StateList, SynchList_pre, Pp->data,
|
||||
dT_lev, TRK4, iter_count, Symmetry, lev,
|
||||
ndeps, pre, chitiny,
|
||||
Sphi0, Spi0, Sphi_rhs, Spi_rhs,
|
||||
Sphi0, Spi0, Sphi, Spi, Sphi_rhs, Spi_rhs,
|
||||
rho, Sx, Sy, Sz, Sxx, Sxy, Sxz, Syy, Syz, Szz) == 0))
|
||||
? 0
|
||||
: 1) ||
|
||||
@@ -1058,9 +1090,34 @@ void bssnEScalar_class::Step(int lev, int YN)
|
||||
#if USE_CUDA_BSSN
|
||||
if (used_gpu_substep)
|
||||
skip_bssn_cuda_prefix(varl0, varl, varlrhs);
|
||||
#endif
|
||||
const bool scalar_gpu_rk_done =
|
||||
#if USE_CUDA_BSSN
|
||||
used_gpu_substep && escalar_gpu_rk_enabled();
|
||||
#else
|
||||
false;
|
||||
#endif
|
||||
while (varl0)
|
||||
{
|
||||
if (scalar_gpu_rk_done)
|
||||
{
|
||||
#ifndef WithShell
|
||||
if (lev > 0) // fix BD point
|
||||
#endif
|
||||
f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2],
|
||||
Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2],
|
||||
Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
|
||||
dT_lev, cg->fgfs[phi0->sgfn],
|
||||
cg->fgfs[Lap0->sgfn],
|
||||
cg->fgfs[varl0->data->sgfn], cg->fgfs[varl->data->sgfn],
|
||||
varl0->data->SoA,
|
||||
Symmetry, cor);
|
||||
|
||||
varl0 = varl0->next;
|
||||
varl = varl->next;
|
||||
varlrhs = varlrhs->next;
|
||||
continue;
|
||||
}
|
||||
#ifndef WithShell
|
||||
if (lev == 0) // sommerfeld indeed
|
||||
f_sommerfeld_routbam(cg->shape, cg->X[0], cg->X[1], cg->X[2],
|
||||
@@ -1369,7 +1426,7 @@ void bssnEScalar_class::Step(int lev, int YN)
|
||||
(run_bssn_escalar_cuda_substep(cg, SynchList_pre, SynchList_cor, Pp->data,
|
||||
dT_lev, TRK4, iter_count, Symmetry, lev,
|
||||
ndeps, cor, chitiny,
|
||||
Sphi, Spi, Sphi_rhs, Spi_rhs,
|
||||
Sphi, Spi, Sphi1, Spi1, Sphi_rhs, Spi_rhs,
|
||||
rho, Sx, Sy, Sz, Sxx, Sxy, Sxz, Syy, Syz, Szz) == 0))
|
||||
? 0
|
||||
: 1) ||
|
||||
@@ -1426,9 +1483,35 @@ void bssnEScalar_class::Step(int lev, int YN)
|
||||
if (used_gpu_substep)
|
||||
skip_bssn_cuda_prefix(varl0, varl, varl1, varlrhs);
|
||||
#endif
|
||||
const bool scalar_gpu_rk_done =
|
||||
#if USE_CUDA_BSSN
|
||||
used_gpu_substep && escalar_gpu_rk_enabled();
|
||||
#else
|
||||
false;
|
||||
#endif
|
||||
|
||||
while (varl0)
|
||||
{
|
||||
if (scalar_gpu_rk_done)
|
||||
{
|
||||
#ifndef WithShell
|
||||
if (lev > 0) // fix BD point
|
||||
#endif
|
||||
f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2],
|
||||
Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2],
|
||||
Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
|
||||
dT_lev, cg->fgfs[phi0->sgfn],
|
||||
cg->fgfs[Lap0->sgfn],
|
||||
cg->fgfs[varl0->data->sgfn], cg->fgfs[varl1->data->sgfn],
|
||||
varl0->data->SoA,
|
||||
Symmetry, cor);
|
||||
|
||||
varl0 = varl0->next;
|
||||
varl = varl->next;
|
||||
varl1 = varl1->next;
|
||||
varlrhs = varlrhs->next;
|
||||
continue;
|
||||
}
|
||||
#ifndef WithShell
|
||||
if (lev == 0) // sommerfeld indeed
|
||||
f_sommerfeld_routbam(cg->shape, cg->X[0], cg->X[1], cg->X[2],
|
||||
|
||||
@@ -393,7 +393,8 @@ static const int k_bssn_cuda_bh_state_indices[3] = {18, 19, 20};
|
||||
bool fill_bssn_cuda_views(Block *cg, MyList<var> *vars,
|
||||
double **host_views,
|
||||
double *propspeeds = nullptr,
|
||||
double *soa_flat = nullptr)
|
||||
double *soa_flat = nullptr,
|
||||
bool allow_trailing_vars = false)
|
||||
{
|
||||
int idx = 0;
|
||||
while (vars && idx < BSSN_CUDA_STATE_COUNT)
|
||||
@@ -410,7 +411,7 @@ bool fill_bssn_cuda_views(Block *cg, MyList<var> *vars,
|
||||
vars = vars->next;
|
||||
++idx;
|
||||
}
|
||||
return idx == BSSN_CUDA_STATE_COUNT && vars == 0;
|
||||
return idx == BSSN_CUDA_STATE_COUNT && (allow_trailing_vars || vars == 0);
|
||||
}
|
||||
|
||||
bool bssn_cuda_use_resident_sync(int lev)
|
||||
@@ -687,7 +688,7 @@ void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int
|
||||
if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
|
||||
{
|
||||
double *state_out[BSSN_CUDA_STATE_COUNT];
|
||||
if (!fill_bssn_cuda_views(cg, vars, state_out))
|
||||
if (!fill_bssn_cuda_views(cg, vars, state_out, nullptr, nullptr, true))
|
||||
{
|
||||
cout << "CUDA BSSN state list mismatch on resident state download" << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
@@ -720,7 +721,7 @@ void bssn_cuda_download_level_state_if_present(MyList<Patch> *PatL, MyList<var>
|
||||
if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
|
||||
{
|
||||
double *state_out[BSSN_CUDA_STATE_COUNT];
|
||||
if (!fill_bssn_cuda_views(cg, vars, state_out))
|
||||
if (!fill_bssn_cuda_views(cg, vars, state_out, nullptr, nullptr, true))
|
||||
{
|
||||
cout << "CUDA BSSN state list mismatch on resident state conditional download" << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
|
||||
@@ -203,7 +203,16 @@ static bool escalar_host_pin_enabled() {
|
||||
static int enabled = -1;
|
||||
if (enabled < 0) {
|
||||
const char *env = getenv("AMSS_CUDA_PIN_ESCALAR_TRANSFERS");
|
||||
enabled = (!env || atoi(env) != 0) ? 1 : 0;
|
||||
enabled = (env && atoi(env) != 0) ? 1 : 0;
|
||||
}
|
||||
return enabled != 0;
|
||||
}
|
||||
|
||||
static bool escalar_gpu_rk_enabled() {
|
||||
static int enabled = -1;
|
||||
if (enabled < 0) {
|
||||
const char *env = getenv("AMSS_ESCALAR_GPU_RK");
|
||||
enabled = (env && atoi(env) != 0) ? 1 : 0;
|
||||
}
|
||||
return enabled != 0;
|
||||
}
|
||||
@@ -588,6 +597,8 @@ static const int k_lk_soa_signs[3 * BSSN_LK_FIELD_COUNT] = {
|
||||
struct StepContext {
|
||||
double *d_state0_mem;
|
||||
double *d_accum_mem;
|
||||
double *d_escalar0_mem;
|
||||
double *d_escalar_accum_mem;
|
||||
double *d_state_curr_mem;
|
||||
double *d_state_next_mem;
|
||||
std::array<double *, BSSN_RESIDENT_BANK_COUNT> d_resident_mem;
|
||||
@@ -596,6 +607,8 @@ struct StepContext {
|
||||
double *h_comm_mem;
|
||||
std::array<double *, BSSN_STATE_COUNT> d_state0;
|
||||
std::array<double *, BSSN_STATE_COUNT> d_accum;
|
||||
std::array<double *, 2> d_escalar0;
|
||||
std::array<double *, 2> d_escalar_accum;
|
||||
std::array<double *, BSSN_STATE_COUNT> d_state_curr;
|
||||
std::array<double *, BSSN_STATE_COUNT> d_state_next;
|
||||
std::array<std::array<double *, BSSN_STATE_COUNT>, BSSN_RESIDENT_BANK_COUNT> d_resident;
|
||||
@@ -615,6 +628,7 @@ struct StepContext {
|
||||
|
||||
StepContext()
|
||||
: d_state0_mem(nullptr), d_accum_mem(nullptr),
|
||||
d_escalar0_mem(nullptr), d_escalar_accum_mem(nullptr),
|
||||
d_state_curr_mem(nullptr), d_state_next_mem(nullptr),
|
||||
d_resident_mem{},
|
||||
d_matter_mem(nullptr), d_comm_mem(nullptr), h_comm_mem(nullptr),
|
||||
@@ -625,6 +639,8 @@ struct StepContext {
|
||||
d_resident_mem.fill(nullptr);
|
||||
d_state0.fill(nullptr);
|
||||
d_accum.fill(nullptr);
|
||||
d_escalar0.fill(nullptr);
|
||||
d_escalar_accum.fill(nullptr);
|
||||
d_state_curr.fill(nullptr);
|
||||
d_state_next.fill(nullptr);
|
||||
for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
|
||||
@@ -641,6 +657,8 @@ struct StepContext {
|
||||
struct StepAllocation {
|
||||
double *d_state0_mem;
|
||||
double *d_accum_mem;
|
||||
double *d_escalar0_mem;
|
||||
double *d_escalar_accum_mem;
|
||||
std::array<double *, BSSN_RESIDENT_BANK_COUNT> d_resident_mem;
|
||||
double *d_matter_mem;
|
||||
double *d_comm_mem;
|
||||
@@ -661,6 +679,8 @@ static StepAllocation empty_step_allocation()
|
||||
StepAllocation alloc = {};
|
||||
alloc.d_state0_mem = nullptr;
|
||||
alloc.d_accum_mem = nullptr;
|
||||
alloc.d_escalar0_mem = nullptr;
|
||||
alloc.d_escalar_accum_mem = nullptr;
|
||||
alloc.d_resident_mem.fill(nullptr);
|
||||
alloc.d_matter_mem = nullptr;
|
||||
alloc.d_comm_mem = nullptr;
|
||||
@@ -682,6 +702,8 @@ static StepAllocation detach_step_allocation(StepContext &ctx)
|
||||
StepAllocation alloc = {};
|
||||
alloc.d_state0_mem = ctx.d_state0_mem;
|
||||
alloc.d_accum_mem = ctx.d_accum_mem;
|
||||
alloc.d_escalar0_mem = ctx.d_escalar0_mem;
|
||||
alloc.d_escalar_accum_mem = ctx.d_escalar_accum_mem;
|
||||
alloc.d_resident_mem = ctx.d_resident_mem;
|
||||
alloc.d_matter_mem = ctx.d_matter_mem;
|
||||
alloc.d_comm_mem = ctx.d_comm_mem;
|
||||
@@ -692,6 +714,8 @@ static StepAllocation detach_step_allocation(StepContext &ctx)
|
||||
alloc.cap_h_comm = ctx.cap_h_comm;
|
||||
ctx.d_state0_mem = nullptr;
|
||||
ctx.d_accum_mem = nullptr;
|
||||
ctx.d_escalar0_mem = nullptr;
|
||||
ctx.d_escalar_accum_mem = nullptr;
|
||||
ctx.d_state_curr_mem = nullptr;
|
||||
ctx.d_state_next_mem = nullptr;
|
||||
ctx.d_resident_mem.fill(nullptr);
|
||||
@@ -708,6 +732,8 @@ static StepAllocation detach_step_allocation(StepContext &ctx)
|
||||
ctx.resident_clock = 0;
|
||||
ctx.d_state0.fill(nullptr);
|
||||
ctx.d_accum.fill(nullptr);
|
||||
ctx.d_escalar0.fill(nullptr);
|
||||
ctx.d_escalar_accum.fill(nullptr);
|
||||
ctx.d_state_curr.fill(nullptr);
|
||||
ctx.d_state_next.fill(nullptr);
|
||||
for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
|
||||
@@ -725,6 +751,8 @@ static void attach_step_allocation(StepContext &ctx, const StepAllocation &alloc
|
||||
{
|
||||
ctx.d_state0_mem = alloc.d_state0_mem;
|
||||
ctx.d_accum_mem = alloc.d_accum_mem;
|
||||
ctx.d_escalar0_mem = alloc.d_escalar0_mem;
|
||||
ctx.d_escalar_accum_mem = alloc.d_escalar_accum_mem;
|
||||
ctx.d_resident_mem = alloc.d_resident_mem;
|
||||
ctx.d_state_curr_mem = nullptr;
|
||||
ctx.d_state_next_mem = nullptr;
|
||||
@@ -849,6 +877,12 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all)
|
||||
ctx.d_resident[b][i] = ctx.d_resident_mem[b] + (size_t)i * all;
|
||||
}
|
||||
}
|
||||
if (ctx.d_escalar0_mem && ctx.d_escalar_accum_mem) {
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
ctx.d_escalar0[i] = ctx.d_escalar0_mem + (size_t)i * all;
|
||||
ctx.d_escalar_accum[i] = ctx.d_escalar_accum_mem + (size_t)i * all;
|
||||
}
|
||||
}
|
||||
if (ctx.current_bank >= 0) {
|
||||
ctx.d_state_curr_mem = ctx.d_resident_mem[ctx.current_bank];
|
||||
ctx.d_state_curr = ctx.d_resident[ctx.current_bank];
|
||||
@@ -859,6 +893,18 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all)
|
||||
return ctx;
|
||||
}
|
||||
|
||||
static void ensure_escalar_buffers(StepContext &ctx, size_t all)
|
||||
{
|
||||
if (!ctx.d_escalar0_mem)
|
||||
CUDA_CHECK(cudaMalloc(&ctx.d_escalar0_mem, 2 * ctx.cap_all * sizeof(double)));
|
||||
if (!ctx.d_escalar_accum_mem)
|
||||
CUDA_CHECK(cudaMalloc(&ctx.d_escalar_accum_mem, 2 * ctx.cap_all * sizeof(double)));
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
ctx.d_escalar0[i] = ctx.d_escalar0_mem + (size_t)i * all;
|
||||
ctx.d_escalar_accum[i] = ctx.d_escalar_accum_mem + (size_t)i * all;
|
||||
}
|
||||
}
|
||||
|
||||
static void release_step_ctx(void *block_tag)
|
||||
{
|
||||
auto it = g_step_ctx.find(block_tag);
|
||||
@@ -7113,14 +7159,78 @@ int bssn_cuda_compute_escalar_matter(void *block_tag,
|
||||
ctx.d_matter[4], ctx.d_matter[5], ctx.d_matter[6],
|
||||
ctx.d_matter[7], ctx.d_matter[8], ctx.d_matter[9],
|
||||
a2);
|
||||
if (!escalar_gpu_rk_enabled()) {
|
||||
CUDA_CHECK(cudaMemcpyAsync(Sphi_rhs_host, g_buf.slot[S_Gamxa], bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaMemcpyAsync(Spi_rhs_host, g_buf.slot[S_Gamya], bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
}
|
||||
ctx.matter_ready = true;
|
||||
(void)Lev;
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
int bssn_cuda_escalar_finalize_scalar_fields(void *block_tag,
|
||||
int *ex, double *X, double *Y, double *Z,
|
||||
double *Sphi_out_host,
|
||||
double *Spi_out_host,
|
||||
const double *propspeed,
|
||||
const double *soa_flat,
|
||||
const double *bbox,
|
||||
double &dT,
|
||||
int &RK4,
|
||||
int &apply_bam_bc,
|
||||
int &Symmetry,
|
||||
int &Lev,
|
||||
double &eps,
|
||||
int &precor)
|
||||
{
|
||||
if (!escalar_gpu_rk_enabled())
|
||||
return 1;
|
||||
if (RK4 < 0 || RK4 > 3)
|
||||
return 1;
|
||||
|
||||
init_gpu_dispatch();
|
||||
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
|
||||
|
||||
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
|
||||
const size_t bytes = all * sizeof(double);
|
||||
setup_grid_params(ex, X, Y, Z, Symmetry, eps, precor);
|
||||
StepContext &ctx = ensure_step_ctx(block_tag, all);
|
||||
ensure_escalar_buffers(ctx, all);
|
||||
|
||||
if (RK4 == 0) {
|
||||
CUDA_CHECK(cudaMemcpyAsync(ctx.d_escalar0[0], g_buf.slot[S_S_arr],
|
||||
bytes, cudaMemcpyDeviceToDevice));
|
||||
CUDA_CHECK(cudaMemcpyAsync(ctx.d_escalar0[1], g_buf.slot[S_f_arr],
|
||||
bytes, cudaMemcpyDeviceToDevice));
|
||||
}
|
||||
|
||||
if (apply_bam_bc) {
|
||||
gpu_sommerfeld_routbam(g_buf.slot[S_S_arr], g_buf.slot[S_Gamxa],
|
||||
propspeed[0],
|
||||
soa_flat[0], soa_flat[1], soa_flat[2],
|
||||
X, Y, Z, bbox, Symmetry);
|
||||
gpu_sommerfeld_routbam(g_buf.slot[S_f_arr], g_buf.slot[S_Gamya],
|
||||
propspeed[1],
|
||||
soa_flat[3], soa_flat[4], soa_flat[5],
|
||||
X, Y, Z, bbox, Symmetry);
|
||||
}
|
||||
|
||||
kern_rk4_finalize<<<grid(all), BLK>>>(ctx.d_escalar0[0], g_buf.slot[S_Gamxa],
|
||||
ctx.d_escalar_accum[0], dT, RK4);
|
||||
kern_rk4_finalize<<<grid(all), BLK>>>(ctx.d_escalar0[1], g_buf.slot[S_Gamya],
|
||||
ctx.d_escalar_accum[1], dT, RK4);
|
||||
|
||||
try_pin_escalar_host_buffer(Sphi_out_host, bytes);
|
||||
try_pin_escalar_host_buffer(Spi_out_host, bytes);
|
||||
CUDA_CHECK(cudaMemcpyAsync(Sphi_out_host, g_buf.slot[S_Gamxa], bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaMemcpyAsync(Spi_out_host, g_buf.slot[S_Gamya], bytes, cudaMemcpyDeviceToHost));
|
||||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
(void)Lev;
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
int bssn_cuda_rk4_substep(void *block_tag,
|
||||
int *ex, double *X, double *Y, double *Z,
|
||||
|
||||
@@ -69,6 +69,21 @@ int bssn_cuda_compute_escalar_matter(void *block_tag,
|
||||
int &co,
|
||||
int &apply_enforce_ga);
|
||||
|
||||
int bssn_cuda_escalar_finalize_scalar_fields(void *block_tag,
|
||||
int *ex, double *X, double *Y, double *Z,
|
||||
double *Sphi_out_host,
|
||||
double *Spi_out_host,
|
||||
const double *propspeed,
|
||||
const double *soa_flat,
|
||||
const double *bbox,
|
||||
double &dT,
|
||||
int &RK4,
|
||||
int &apply_bam_bc,
|
||||
int &Symmetry,
|
||||
int &Lev,
|
||||
double &eps,
|
||||
int &precor);
|
||||
|
||||
int bssn_cuda_copy_state_region_to_host(void *block_tag,
|
||||
int state_index,
|
||||
double *host_state,
|
||||
|
||||
@@ -151,6 +151,8 @@ def _gpu_runtime_env():
|
||||
"AMSS_CUDA_AMR_RESTRICT_DEVICE": "1",
|
||||
"AMSS_CUDA_AMR_RESTRICT_BATCH": "0",
|
||||
"AMSS_CUDA_DEVICE_SEGMENT_BATCH": "0",
|
||||
"AMSS_CUDA_PIN_ESCALAR_TRANSFERS": "0",
|
||||
"AMSS_ESCALAR_GPU_RK": "0",
|
||||
}
|
||||
if getattr(input_data, "Equation_Class", "") == "Z4C":
|
||||
defaults["AMSS_CUDA_Z4C_KEEP_RESIDENT_AFTER_STEP"] = "0"
|
||||
@@ -287,6 +289,8 @@ def run_ABE():
|
||||
print(f" AMSS_CUDA_AMR_RESTRICT_DEVICE={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_DEVICE', '')}")
|
||||
print(f" AMSS_CUDA_AMR_RESTRICT_BATCH={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_BATCH', '')}")
|
||||
print(f" AMSS_CUDA_DEVICE_SEGMENT_BATCH={mpi_env.get('AMSS_CUDA_DEVICE_SEGMENT_BATCH', '')}")
|
||||
print(f" AMSS_CUDA_PIN_ESCALAR_TRANSFERS={mpi_env.get('AMSS_CUDA_PIN_ESCALAR_TRANSFERS', '')}")
|
||||
print(f" AMSS_ESCALAR_GPU_RK={mpi_env.get('AMSS_ESCALAR_GPU_RK', '')}")
|
||||
if "CUDA_MPS_PIPE_DIRECTORY" in mpi_env:
|
||||
print(f" CUDA_MPS_PIPE_DIRECTORY={mpi_env['CUDA_MPS_PIPE_DIRECTORY']}")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user