Optimize BSSN-EScalar CUDA path

This commit is contained in:
2026-05-05 10:47:46 +08:00
parent 06f62dee36
commit 85fe29cc2e
9 changed files with 1821 additions and 276 deletions

View File

@@ -484,6 +484,10 @@ int main(int argc, char *argv[])
cout << endl;
}
// Let the process teardown reclaim the simulation object. Some derived
// equation classes keep MPI/CUDA-backed state whose destructor ordering
// is fragile at program shutdown.
if (getenv("AMSS_DELETE_ADM_ON_EXIT"))
delete ADM;
//=======================calculation done=============================================================

View File

@@ -18,6 +18,7 @@
#endif
#if USE_CUDA_BSSN
#include "bssn_rhs_cuda.h"
#define AMSS_BSSN_CUDA_MAX_STATE_COUNT BSSN_ESCALAR_CUDA_STATE_COUNT
#endif
#if USE_CUDA_Z4C
#include "z4c_rhs_cuda.h"
@@ -179,10 +180,12 @@ bool cuda_build_bssn_host_views(Block *block,
int state_count,
double **views)
{
if (!block || !vars || !views || state_count != BSSN_CUDA_STATE_COUNT)
if (!block || !vars || !views ||
(state_count != BSSN_CUDA_STATE_COUNT &&
state_count != BSSN_ESCALAR_CUDA_STATE_COUNT))
return false;
MyList<var> *v = vars;
for (int i = 0; i < BSSN_CUDA_STATE_COUNT; ++i)
for (int i = 0; i < state_count; ++i)
{
if (!v)
return false;
@@ -196,10 +199,12 @@ bool cuda_build_bssn_soa(MyList<var> *vars,
int state_count,
double *soa_flat)
{
if (!vars || !soa_flat || state_count != BSSN_CUDA_STATE_COUNT)
if (!vars || !soa_flat ||
(state_count != BSSN_CUDA_STATE_COUNT &&
state_count != BSSN_ESCALAR_CUDA_STATE_COUNT))
return false;
MyList<var> *v = vars;
for (int i = 0; i < BSSN_CUDA_STATE_COUNT; ++i)
for (int i = 0; i < state_count; ++i)
{
if (!v)
return false;
@@ -317,7 +322,7 @@ bool cuda_state_count_direct_supported(int state_count)
#if USE_CUDA_Z4C && (ABEtype == 2)
return state_count == Z4C_CUDA_STATE_COUNT;
#elif USE_CUDA_BSSN
return state_count > 0 && state_count <= BSSN_CUDA_STATE_COUNT;
return state_count > 0 && state_count <= BSSN_ESCALAR_CUDA_STATE_COUNT;
#else
(void)state_count;
return false;
@@ -372,22 +377,68 @@ bool cuda_can_direct_unpack(const Parallel::gridseg *dst, int type)
#endif
}
bool cuda_amr_host_staged_enabled();
double *alloc_device_comm_buffer(int length);
void free_device_comm_buffer(double *&ptr);
bool cuda_direct_pack_segment_to_device(double *buffer,
const Parallel::gridseg *src,
const Parallel::gridseg *dst,
int state_count,
int type,
MyList<var> *VarLists,
int Symmetry);
bool cuda_direct_pack_segment(double *buffer,
const Parallel::gridseg *src,
const Parallel::gridseg *dst,
int state_count,
MyList<var> *VarLists)
int type,
MyList<var> *VarLists,
int Symmetry)
{
#if USE_CUDA_Z4C && (ABEtype == 2)
if (state_count != Z4C_CUDA_STATE_COUNT)
return false;
#elif USE_CUDA_BSSN
if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
if (state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT)
return false;
#else
return false;
#endif
const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
if (type == 2 || type == 3)
{
#if USE_CUDA_BSSN
if (!cuda_amr_host_staged_enabled())
return false;
const int region_all = dst->shape[0] * dst->shape[1] * dst->shape[2];
const int total = state_count * region_all;
static double *stage_dev = 0;
static int stage_cap = 0;
if (total > stage_cap)
{
free_device_comm_buffer(stage_dev);
stage_dev = alloc_device_comm_buffer(total);
stage_cap = total;
}
if (!cuda_direct_pack_segment_to_device(stage_dev, src, dst, state_count, type, VarLists, Symmetry))
return false;
cudaError_t cerr = cudaMemcpy(buffer, stage_dev, (size_t)total * sizeof(double), cudaMemcpyDeviceToHost);
if (cerr != cudaSuccess)
{
fprintf(stderr, "Parallel: CUDA host-staged AMR pack cudaMemcpy failed, err=%d\n", (int)cerr);
return false;
}
if (sync_profile_enabled())
sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0;
return true;
#else
return false;
#endif
}
const int i0 = cuda_seg_begin(dst, src->Bg, 0);
const int j0 = cuda_seg_begin(dst, src->Bg, 1);
const int k0 = cuda_seg_begin(dst, src->Bg, 2);
@@ -396,7 +447,7 @@ bool cuda_direct_pack_segment(double *buffer,
i0, j0, k0,
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
#else
double *views[BSSN_CUDA_STATE_COUNT];
double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
const bool have_views = cuda_build_bssn_host_views(src->Bg, VarLists, state_count, views);
const bool ok = have_views
? bssn_cuda_pack_state_batch_to_host_buffer_for_host_views(
@@ -422,7 +473,7 @@ bool cuda_direct_unpack_segment(double *buffer,
if (state_count != Z4C_CUDA_STATE_COUNT)
return false;
#elif USE_CUDA_BSSN
if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
if (state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT)
return false;
#else
return false;
@@ -436,7 +487,7 @@ bool cuda_direct_unpack_segment(double *buffer,
i0, j0, k0,
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
#else
double *views[BSSN_CUDA_STATE_COUNT];
double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
const bool have_views = cuda_build_bssn_host_views(dst->Bg, VarListd, state_count, views);
const bool ok = have_views
? bssn_cuda_unpack_state_batch_from_host_buffer_for_host_views(
@@ -464,6 +515,17 @@ bool cuda_aware_mpi_enabled()
return enabled != 0;
}
// Decides whether the cached transfer paths may use device-side buffers for a
// variable list of `state_count` entries.
//
// With USE_CUDA_BSSN builds, a list whose size equals
// BSSN_ESCALAR_CUDA_STATE_COUNT always yields false (and
// cuda_aware_mpi_enabled() is not consulted). Every other case simply forwards
// the answer of cuda_aware_mpi_enabled().
bool cuda_cached_device_buffers_enabled(int state_count)
{
#if !USE_CUDA_BSSN
  (void)state_count;
  return cuda_aware_mpi_enabled();
#else
  // Short-circuit keeps the EScalar-sized list away from the CUDA-aware query.
  return (state_count != BSSN_ESCALAR_CUDA_STATE_COUNT) && cuda_aware_mpi_enabled();
#endif
}
bool cuda_amr_restrict_device_enabled()
{
static int enabled = -1;
@@ -486,6 +548,17 @@ bool cuda_amr_prolong_device_enabled()
return enabled != 0;
}
// Reports whether the environment variable AMSS_CUDA_AMR_HOST_STAGED holds a
// non-zero integer (as parsed by atoi). The environment is read on the first
// call only; later calls return the cached answer.
bool cuda_amr_host_staged_enabled()
{
  static int cached_flag = -1; // -1: not read yet, 0: off, 1: on
  if (cached_flag >= 0)
    return cached_flag != 0;

  const char *raw = getenv("AMSS_CUDA_AMR_HOST_STAGED");
  const int parsed = (raw != 0 && atoi(raw) != 0) ? 1 : 0;
  cached_flag = parsed;
  return parsed != 0;
}
bool cuda_amr_restrict_compare_enabled()
{
static int enabled = -1;
@@ -627,12 +700,12 @@ bool cuda_direct_pack_segment_to_device(double *buffer,
}
#endif
#if USE_CUDA_BSSN
if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
if (state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT)
return false;
const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
bool ok = false;
double *views[BSSN_CUDA_STATE_COUNT];
double soa_flat[3 * BSSN_CUDA_STATE_COUNT];
double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
double soa_flat[3 * AMSS_BSSN_CUDA_MAX_STATE_COUNT];
const bool have_views = cuda_build_bssn_host_views(src->Bg, VarLists, state_count, views);
const bool have_soa = cuda_build_bssn_soa(VarLists, state_count, soa_flat);
if (type == 1)
@@ -812,13 +885,13 @@ bool cuda_direct_unpack_segment_from_device(double *buffer,
}
#endif
#if USE_CUDA_BSSN
if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
if (state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT)
return false;
const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
const int i0 = cuda_seg_begin(dst, dst->Bg, 0);
const int j0 = cuda_seg_begin(dst, dst->Bg, 1);
const int k0 = cuda_seg_begin(dst, dst->Bg, 2);
double *views[BSSN_CUDA_STATE_COUNT];
double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
const bool have_views = cuda_build_bssn_host_views(dst->Bg, VarListd, state_count, views);
const bool ok = have_views
? bssn_cuda_unpack_state_batch_from_device_buffer_for_host_views(
@@ -843,12 +916,12 @@ bool cuda_download_resident_subset_to_host(Block *block,
int state_count)
{
#if USE_CUDA_BSSN
if (!block || state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT)
if (!block || state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT)
return false;
if (bssn_cuda_has_resident_state(block) == 0)
return true;
int indices[BSSN_CUDA_STATE_COUNT];
double *views[BSSN_CUDA_STATE_COUNT];
int indices[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
MyList<var> *v = vars;
for (int i = 0; i < state_count; ++i)
{
@@ -871,7 +944,7 @@ bool cuda_unpack_host_region_to_resident(Block *block,
const Parallel::gridseg *dst)
{
#if USE_CUDA_BSSN
if (!block || !dst || state_index < 0 || state_index >= BSSN_CUDA_STATE_COUNT)
if (!block || !dst || state_index < 0 || state_index >= AMSS_BSSN_CUDA_MAX_STATE_COUNT)
return false;
if (bssn_cuda_has_resident_state(block) == 0)
return true;
@@ -895,7 +968,7 @@ bool cuda_device_state_count_supported(int state_count)
return true;
#endif
#if USE_CUDA_BSSN
return state_count > 0 && state_count <= BSSN_CUDA_STATE_COUNT;
return state_count > 0 && state_count <= AMSS_BSSN_CUDA_MAX_STATE_COUNT;
#else
(void)state_count;
return false;
@@ -915,8 +988,8 @@ bool cuda_flush_device_segment_batch(Block *block,
return true;
const int stride = (dir == PACK && type == 3) ? 11 : 8;
const int segment_count = (int)(meta.size() / stride);
double *views[BSSN_CUDA_STATE_COUNT];
double soa_flat[3 * BSSN_CUDA_STATE_COUNT];
double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
double soa_flat[3 * AMSS_BSSN_CUDA_MAX_STATE_COUNT];
const bool have_views = cuda_build_bssn_host_views(block, vars, state_count, views);
const bool have_soa = cuda_build_bssn_soa(vars, state_count, soa_flat);
if (dir == PACK)
@@ -5022,14 +5095,17 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
{
#if USE_CUDA_BSSN || USE_CUDA_Z4C
bool handled_by_cuda = false;
if (dir == PACK && (type == 1 || s_cuda_aware_pack_active) &&
const bool host_staged_amr =
dir == PACK && !s_cuda_aware_pack_active && (type == 2 || type == 3) &&
cuda_amr_host_staged_enabled();
if (dir == PACK && (type == 1 || s_cuda_aware_pack_active || host_staged_amr) &&
cuda_state_count_direct_supported(state_count) &&
cuda_can_direct_pack(src->data, dst->data, type))
{
if (s_cuda_aware_pack_active) {
handled_by_cuda = cuda_direct_pack_segment_to_device(data + size_out, src->data, dst->data, state_count, type, VarLists, Symmetry);
} else {
handled_by_cuda = cuda_direct_pack_segment(data + size_out, src->data, dst->data, state_count, VarLists);
handled_by_cuda = cuda_direct_pack_segment(data + size_out, src->data, dst->data, state_count, type, VarLists, Symmetry);
}
if (!handled_by_cuda)
{
@@ -5037,7 +5113,7 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
else if (dir == UNPACK && (type == 1 || s_cuda_aware_pack_active) &&
else if (dir == UNPACK && (type == 1 || s_cuda_aware_pack_active || host_staged_amr) &&
cuda_state_count_direct_supported(state_count) &&
cuda_can_direct_unpack(dst->data, type))
{
@@ -5102,7 +5178,8 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
if (cuda_state_count_direct_supported(state_count) &&
dst->data && dst->data->Bg && bssn_cuda_has_resident_state(dst->data->Bg))
{
if (!cuda_unpack_host_region_to_resident(dst->data->Bg, state_idx, data + size_out, dst->data))
if (type != 2 && type != 3 &&
!cuda_unpack_host_region_to_resident(dst->data->Bg, state_idx, data + size_out, dst->data))
{
cout << "Parallel::data_packer: CUDA resident fallback upload failed." << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
@@ -5775,7 +5852,7 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
cout << "Parallel::transfer_cached: variable lists do not match." << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
if (cuda_aware_mpi_enabled())
if (cuda_cached_device_buffers_enabled(state_count))
{
for (int n = 0; n < cpusize; n++)
{
@@ -6094,7 +6171,7 @@ void Parallel::Sync_start(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetr
cout << "Parallel::Sync_start: variable lists do not match." << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
if (cuda_aware_mpi_enabled())
if (cuda_cached_device_buffers_enabled(state_count))
{
for (int n = 0; n < cpusize; n++)
{
@@ -6976,16 +7053,16 @@ void Parallel::prepare_inter_time_level(Patch *Pat,
if (myrank == cg->rank)
{
#if USE_CUDA_BSSN
double *src1_views[BSSN_CUDA_STATE_COUNT];
double *src2_views[BSSN_CUDA_STATE_COUNT];
double *dst_views[BSSN_CUDA_STATE_COUNT];
double *src1_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
double *src2_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
double *dst_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
const int state_count = cuda_state_var_count(VarList1, VarList2);
if (state_count == BSSN_CUDA_STATE_COUNT &&
if (cuda_state_count_direct_supported(state_count) &&
cuda_build_bssn_host_views(cg, VarList1, state_count, src1_views) &&
cuda_build_bssn_host_views(cg, VarList2, state_count, src2_views) &&
cuda_build_bssn_host_views(cg, VarList3, state_count, dst_views) &&
bssn_cuda_has_resident_state(cg) &&
bssn_cuda_prepare_inter_time_level(cg, cg->shape,
bssn_cuda_prepare_inter_time_level(cg, cg->shape, state_count,
src1_views, src2_views, 0, dst_views,
2, tindex) == 0)
{
@@ -7051,18 +7128,18 @@ void Parallel::prepare_inter_time_level(Patch *Pat,
if (myrank == cg->rank)
{
#if USE_CUDA_BSSN
double *src1_views[BSSN_CUDA_STATE_COUNT];
double *src2_views[BSSN_CUDA_STATE_COUNT];
double *src3_views[BSSN_CUDA_STATE_COUNT];
double *dst_views[BSSN_CUDA_STATE_COUNT];
double *src1_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
double *src2_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
double *src3_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
double *dst_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT];
const int state_count = cuda_state_var_count(VarList1, VarList2);
if (state_count == BSSN_CUDA_STATE_COUNT &&
if (cuda_state_count_direct_supported(state_count) &&
cuda_build_bssn_host_views(cg, VarList1, state_count, src1_views) &&
cuda_build_bssn_host_views(cg, VarList2, state_count, src2_views) &&
cuda_build_bssn_host_views(cg, VarList3, state_count, src3_views) &&
cuda_build_bssn_host_views(cg, VarList4, state_count, dst_views) &&
bssn_cuda_has_resident_state(cg) &&
bssn_cuda_prepare_inter_time_level(cg, cg->shape,
bssn_cuda_prepare_inter_time_level(cg, cg->shape, state_count,
src1_views, src2_views, src3_views, dst_views,
3, tindex) == 0)
{
@@ -7500,6 +7577,8 @@ void Parallel::Restrict_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
cache.tc_req_is_recv = new int[cache.max_reqs];
cache.tc_completed = new int[cache.max_reqs];
}
for (int i = 0; i < cpusize; i++)
cache.combined_src[i] = cache.combined_dst[i] = 0;
MyList<Parallel::gridseg> *dst = build_complete_gsl(PatcL);
for (int node = 0; node < cpusize; node++)
@@ -7561,6 +7640,8 @@ void Parallel::OutBdLow2Hi_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
cache.tc_req_is_recv = new int[cache.max_reqs];
cache.tc_completed = new int[cache.max_reqs];
}
for (int i = 0; i < cpusize; i++)
cache.combined_src[i] = cache.combined_dst[i] = 0;
MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
for (int node = 0; node < cpusize; node++)
@@ -7613,6 +7694,8 @@ void Parallel::OutBdLow2Himix_cached(MyList<Patch> *PatcL, MyList<Patch> *PatfL,
cache.tc_req_is_recv = new int[cache.max_reqs];
cache.tc_completed = new int[cache.max_reqs];
}
for (int i = 0; i < cpusize; i++)
cache.combined_src[i] = cache.combined_dst[i] = 0;
MyList<Parallel::gridseg> *dst = build_buffer_gsl(PatfL);
for (int node = 0; node < cpusize; node++)

View File

@@ -25,6 +25,9 @@ using namespace std;
#include "getnp4.h"
#include "shellfunctions.h"
#include "parameters.h"
#if USE_CUDA_BSSN
#include "bssn_rhs_cuda.h"
#endif
#ifdef With_AHF
#include "derivatives.h"
@@ -33,6 +36,152 @@ using namespace std;
//================================================================================================
namespace
{
#if USE_CUDA_BSSN
// Walks the variable list `vars` and records, for each entry in list order:
//   host_views[i]            <- cg->fgfs[ entry->sgfn ]
//   propspeeds[i]            <- entry->propspeed          (only if propspeeds != 0)
//   soa_flat[3*i .. 3*i + 2] <- entry->SoA[0..2]          (only if soa_flat != 0)
// At most BSSN_ESCALAR_CUDA_STATE_COUNT entries are consumed.
//
// Returns true only when the list holds exactly
// BSSN_ESCALAR_CUDA_STATE_COUNT entries; a shorter or longer list gives false
// (the output arrays may already be partially written in that case).
// `cg` and `host_views` are dereferenced without a null check.
bool fill_bssn_escalar_cuda_views(Block *cg, MyList<var> *vars,
                                  double **host_views,
                                  double *propspeeds = 0,
                                  double *soa_flat = 0)
{
  int filled = 0;
  for (; vars && filled < BSSN_ESCALAR_CUDA_STATE_COUNT; vars = vars->next, ++filled)
  {
    host_views[filled] = cg->fgfs[vars->data->sgfn];

    if (propspeeds)
      propspeeds[filled] = vars->data->propspeed;

    if (soa_flat)
    {
      double *triple = soa_flat + 3 * filled;
      for (int c = 0; c < 3; ++c)
        triple[c] = vars->data->SoA[c];
    }
  }

  // Leftover list nodes mean the list is longer than the expected state count.
  if (vars != 0)
    return false;
  return filled == BSSN_ESCALAR_CUDA_STATE_COUNT;
}
// Compile-time switch for the resident-state sync path: false when the build
// defines WithShell, true otherwise. The level argument is not inspected.
bool bssn_escalar_cuda_use_resident_sync(int lev)
{
  (void)lev;
#ifdef WithShell
  const bool resident_sync = false;
#else
  const bool resident_sync = true;
#endif
  return resident_sync;
}
// Decides whether a level's state should stay resident after a step.
//
// Two environment switches are read once (first call) and cached:
//   AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP - master switch; zero/unset => false
//   AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS          - when non-zero, every level other
//                                                than analysis_lev qualifies
// With the master switch on, the result is false for lev == analysis_lev;
// otherwise true if "keep all levels" is on or lev < trfls_in.
bool bssn_escalar_cuda_keep_resident_after_step(int lev, int trfls_in, int analysis_lev)
{
  static int all_levels_flag = -1; // -1 until the environment has been read
  static int feature_flag = -1;

  if (all_levels_flag < 0)
  {
    const char *raw = getenv("AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS");
    all_levels_flag = (raw != 0 && atoi(raw) != 0) ? 1 : 0;
  }
  if (feature_flag < 0)
  {
    const char *raw = getenv("AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP");
    feature_flag = (raw != 0 && atoi(raw) != 0) ? 1 : 0;
  }

  const bool eligible = (feature_flag != 0) && (lev != analysis_lev);
  return eligible && (all_levels_flag != 0 || lev < trfls_in);
}
// True when AMSS_ESCALAR_SYNC_MERGED is set to a non-zero integer (atoi).
// The variable is read on the first call and the answer is cached afterwards.
bool bssn_escalar_sync_merged_enabled()
{
  static int cached_flag = -1; // -1: not read yet
  if (cached_flag < 0)
  {
    const char *raw = getenv("AMSS_ESCALAR_SYNC_MERGED");
    int parsed = 0;
    if (raw != 0 && atoi(raw) != 0)
      parsed = 1;
    cached_flag = parsed;
  }
  return cached_flag == 1;
}
// Synchronizes `VarList` on the patches of one level. Dispatches to
// Parallel::Sync_merged when bssn_escalar_sync_merged_enabled() is true and to
// Parallel::Sync otherwise; arguments are forwarded unchanged.
void bssn_escalar_sync_level(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry)
{
  if (!bssn_escalar_sync_merged_enabled())
  {
    Parallel::Sync(PatL, VarList, Symmetry);
    return;
  }
  Parallel::Sync_merged(PatL, VarList, Symmetry);
}
// True when AMSS_ESCALAR_STEP_TIMING is set to a non-zero integer (atoi).
// Evaluated once; subsequent calls reuse the cached result.
bool bssn_escalar_timing_enabled()
{
  static int cached_flag = -1; // -1: environment not consulted yet
  if (cached_flag >= 0)
    return cached_flag != 0;

  const char *raw = getenv("AMSS_ESCALAR_STEP_TIMING");
  const int parsed = (raw != 0 && atoi(raw) != 0) ? 1 : 0;
  cached_flag = parsed;
  return parsed != 0;
}
// Prints one "[AMSS-ESCALAR-STEP]" line to stderr summarizing a step's timings.
//
// Does nothing unless bssn_escalar_timing_enabled() is true. Otherwise the
// eight timing buckets are reduced element-wise with MPI_MAX onto rank 0 of
// MPI_COMM_WORLD, and only the caller with myrank == 0 prints.
// Note: "other" is total minus the remaining seven buckets, all taken from the
// reduced maxima, so the individual maxima may originate from different ranks.
// MPI_Reduce is a collective call: every rank of MPI_COMM_WORLD must reach it.
void bssn_escalar_timing_report(int myrank, int lev, int YN, double total, double rhs,
                                double sync, double bh, double analysis, double swap,
                                double resident, double rp)
{
  if (!bssn_escalar_timing_enabled())
    return;

  enum { kBuckets = 8 };
  double mine[kBuckets] = {total, rhs, sync, bh, analysis, swap, resident, rp};
  double peak[kBuckets] = {};
  MPI_Reduce(mine, peak, kBuckets, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

  if (myrank != 0)
    return;

  // Subtract the buckets from the total one by one, left to right.
  double other = peak[0];
  for (int b = 1; b < kBuckets; ++b)
    other -= peak[b];

  fprintf(stderr,
          "[AMSS-ESCALAR-STEP] lev=%d YN=%d total=%.6f rhs=%.6f sync=%.6f "
          "bh=%.6f analysis=%.6f swap=%.6f resident=%.6f rp=%.6f other=%.6f\n",
          lev, YN, peak[0], peak[1], peak[2], peak[3], peak[4], peak[5],
          peak[6], peak[7], other);
}
// Visits every block of every patch in `PatL`. For each block owned by
// `myrank` for which bssn_cuda_has_resident_state() is non-zero:
//   1. builds the host views of `vars` for that block,
//   2. calls bssn_escalar_cuda_download_resident_state() with those views
//      (presumably copying the resident device state back into host storage),
//   3. if `release_ctx` is set, calls bssn_cuda_release_step_ctx() on the block.
// A variable-list mismatch or a non-zero download status prints a message and
// calls MPI_Abort on MPI_COMM_WORLD.
void bssn_escalar_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars,
                                            int myrank, bool release_ctx)
{
  for (MyList<Patch> *patch_it = PatL; patch_it; patch_it = patch_it->next)
  {
    for (MyList<Block> *block_it = patch_it->data->blb; block_it; block_it = block_it->next)
    {
      Block *cg = block_it->data;
      if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
      {
        double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
        const bool views_ok = fill_bssn_escalar_cuda_views(cg, vars, state_out);
        if (!views_ok)
        {
          cout << "CUDA BSSN-EScalar resident state list mismatch during download" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        if (bssn_escalar_cuda_download_resident_state(cg, cg->shape, state_out))
        {
          cout << "CUDA BSSN-EScalar resident state download failed" << endl;
          MPI_Abort(MPI_COMM_WORLD, 1);
        }
        if (release_ctx)
          bssn_cuda_release_step_ctx(cg);
      }

      // The patch's block range ends at `ble`; do not walk past it.
      if (block_it == patch_it->data->ble)
        break;
    }
  }
}
#endif
}
//================================================================================================
// Define bssnEScalar_class
// It inherits some members and methods from the parent class bssn_class and modifies others.
@@ -179,6 +328,11 @@ void bssnEScalar_class::Initialize()
bssnEScalar_class::~bssnEScalar_class()
{
#if USE_CUDA_BSSN
for (int lev = 0; GH && lev < GH->levels; ++lev)
bssn_escalar_cuda_download_level_state(GH->PatL[lev], StateList, myrank, true);
#endif
delete Sphio;
delete Spio;
delete Sphi0;
@@ -708,6 +862,11 @@ void bssnEScalar_class::Read_Pablo()
void bssnEScalar_class::Step(int lev, int YN)
{
double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
#if USE_CUDA_BSSN
const bool use_cuda_resident_sync = bssn_escalar_cuda_use_resident_sync(lev);
#else
const bool use_cuda_resident_sync = false;
#endif
#ifdef With_AHF
AH_Step_Find(lev, dT_lev);
#endif
@@ -719,9 +878,19 @@ void bssnEScalar_class::Step(int lev, int YN)
int iter_count = 0; // count RK4 substeps
int pre = 0, cor = 1;
int ERROR = 0;
const bool escalar_step_timing = bssn_escalar_timing_enabled();
const double escalar_step_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
double escalar_t_rhs = 0.0;
double escalar_t_sync = 0.0;
double escalar_t_bh = 0.0;
double escalar_t_analysis = 0.0;
double escalar_t_swap = 0.0;
double escalar_t_resident = 0.0;
double escalar_t_rp = 0.0;
MyList<ss_patch> *sPp;
// Predictor
double escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
MyList<Patch> *Pp = GH->PatL[lev];
while (Pp)
{
@@ -732,14 +901,59 @@ void bssnEScalar_class::Step(int lev, int YN)
if (myrank == cg->rank)
{
#if (AGM == 0)
#if !USE_CUDA_BSSN
f_enforce_ga(cg->shape,
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
#endif
#endif
if (f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
bool used_gpu_substep = false;
#if USE_CUDA_BSSN
{
double *state_in[BSSN_ESCALAR_CUDA_STATE_COUNT];
double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
double propspeed[BSSN_ESCALAR_CUDA_STATE_COUNT];
double soa_flat[3 * BSSN_ESCALAR_CUDA_STATE_COUNT];
if (!fill_bssn_escalar_cuda_views(cg, StateList, state_in, propspeed, soa_flat) ||
!fill_bssn_escalar_cuda_views(cg, SynchList_pre, state_out))
{
cout << "CUDA BSSN-EScalar state list mismatch on predictor step" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
int apply_bam_bc = 0;
int apply_enforce_ga = 0;
#if (AGM == 0)
apply_enforce_ga = 1;
#endif
#if (SommerType == 0)
#ifndef WithShell
apply_bam_bc = (lev == 0) ? 1 : 0;
#endif
#endif
int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
if (bssn_escalar_cuda_rk4_substep(cg,
cg->shape, cg->X[0], cg->X[1], cg->X[2],
state_in, state_out,
propspeed, soa_flat, Pp->data->bbox,
dT_lev, TRK4, iter_count, apply_bam_bc,
Symmetry, lev, ndeps, pre,
keep_resident_state, apply_enforce_ga, chitiny))
{
cout << "CUDA BSSN-EScalar predictor substep failed in domain: ("
<< cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
used_gpu_substep = true;
}
#endif
if (!used_gpu_substep &&
f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
@@ -783,6 +997,8 @@ void bssnEScalar_class::Step(int lev, int YN)
ERROR = 1;
}
if (!used_gpu_substep)
{
// rk4 substep and boundary
{
MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varlrhs = RHSList; // we do not check the correspondence here
@@ -823,6 +1039,7 @@ void bssnEScalar_class::Step(int lev, int YN)
}
f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny);
}
}
if (BP == Pp->data->ble)
break;
BP = BP->next;
@@ -845,6 +1062,8 @@ void bssnEScalar_class::Step(int lev, int YN)
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
if (escalar_step_timing)
escalar_t_rhs += MPI_Wtime() - escalar_t0;
#ifdef WithShell
// evolve Shell Patches
@@ -993,7 +1212,14 @@ void bssnEScalar_class::Step(int lev, int YN)
}
#endif
Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
#if USE_CUDA_BSSN
bssn_escalar_sync_level(GH->PatL[lev], SynchList_pre, Symmetry);
#else
Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
#endif
if (escalar_step_timing)
escalar_t_sync += MPI_Wtime() - escalar_t0;
#ifdef WithShell
if (lev == 0)
@@ -1016,6 +1242,10 @@ void bssnEScalar_class::Step(int lev, int YN)
// for black hole position
if (BH_num > 0 && lev == GH->levels - 1)
{
escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
#if USE_CUDA_BSSN
(void)use_cuda_resident_sync;
#endif
compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{
@@ -1044,16 +1274,26 @@ void bssnEScalar_class::Step(int lev, int YN)
DG_List->clearList();
}
}
if (escalar_step_timing)
escalar_t_bh += MPI_Wtime() - escalar_t0;
}
// data analysis part
// Warning NOTE: the variables1 are used as temp storage room
if (lev == a_lev)
{
escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
#if USE_CUDA_BSSN
if (use_cuda_resident_sync)
bssn_escalar_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false);
#endif
AnalysisStuff_EScalar(lev, dT_lev);
if (escalar_step_timing)
escalar_t_analysis += MPI_Wtime() - escalar_t0;
}
// corrector
for (iter_count = 1; iter_count < 4; iter_count++)
{
escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
// for RK4: t0, t0+dt/2, t0+dt/2, t0+dt;
if (iter_count == 1 || iter_count == 3)
TRK4 += dT_lev / 2;
@@ -1067,11 +1307,13 @@ void bssnEScalar_class::Step(int lev, int YN)
if (myrank == cg->rank)
{
#if (AGM == 0)
#if !USE_CUDA_BSSN
f_enforce_ga(cg->shape,
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
#endif
#elif (AGM == 1)
if (iter_count == 3)
f_enforce_ga(cg->shape,
@@ -1081,7 +1323,50 @@ void bssnEScalar_class::Step(int lev, int YN)
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
#endif
if (f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
bool used_gpu_substep = false;
#if USE_CUDA_BSSN
{
double *state_in[BSSN_ESCALAR_CUDA_STATE_COUNT];
double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
double propspeed[BSSN_ESCALAR_CUDA_STATE_COUNT];
double soa_flat[3 * BSSN_ESCALAR_CUDA_STATE_COUNT];
if (!fill_bssn_escalar_cuda_views(cg, SynchList_pre, state_in, propspeed, soa_flat) ||
!fill_bssn_escalar_cuda_views(cg, SynchList_cor, state_out))
{
cout << "CUDA BSSN-EScalar state list mismatch on corrector step" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
int apply_bam_bc = 0;
int apply_enforce_ga = 0;
#if (AGM == 0)
apply_enforce_ga = 1;
#endif
#if (SommerType == 0)
#ifndef WithShell
apply_bam_bc = (lev == 0) ? 1 : 0;
#endif
#endif
int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
if (bssn_escalar_cuda_rk4_substep(cg,
cg->shape, cg->X[0], cg->X[1], cg->X[2],
state_in, state_out,
propspeed, soa_flat, Pp->data->bbox,
dT_lev, TRK4, iter_count, apply_bam_bc,
Symmetry, lev, ndeps, cor,
keep_resident_state, apply_enforce_ga, chitiny))
{
cout << "CUDA BSSN-EScalar corrector substep failed in domain: ("
<< cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
used_gpu_substep = true;
}
#endif
if (!used_gpu_substep &&
f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn],
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
@@ -1125,6 +1410,8 @@ void bssnEScalar_class::Step(int lev, int YN)
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
if (!used_gpu_substep)
{
// rk4 substep and boundary
{
MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varl1 = SynchList_cor, *varlrhs = RHSList;
@@ -1168,6 +1455,7 @@ void bssnEScalar_class::Step(int lev, int YN)
}
f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny);
}
}
if (BP == Pp->data->ble)
break;
BP = BP->next;
@@ -1192,6 +1480,8 @@ void bssnEScalar_class::Step(int lev, int YN)
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
if (escalar_step_timing)
escalar_t_rhs += MPI_Wtime() - escalar_t0;
#ifdef WithShell
// evolve Shell Patches
@@ -1349,7 +1639,14 @@ void bssnEScalar_class::Step(int lev, int YN)
}
#endif
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
#if USE_CUDA_BSSN
bssn_escalar_sync_level(GH->PatL[lev], SynchList_cor, Symmetry);
#else
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
#endif
if (escalar_step_timing)
escalar_t_sync += MPI_Wtime() - escalar_t0;
#ifdef WithShell
if (lev == 0)
@@ -1371,6 +1668,10 @@ void bssnEScalar_class::Step(int lev, int YN)
// for black hole position
if (BH_num > 0 && lev == GH->levels - 1)
{
escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
#if USE_CUDA_BSSN
(void)use_cuda_resident_sync;
#endif
compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev);
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{
@@ -1399,10 +1700,13 @@ void bssnEScalar_class::Step(int lev, int YN)
DG_List->clearList();
}
}
if (escalar_step_timing)
escalar_t_bh += MPI_Wtime() - escalar_t0;
}
// swap time level
if (iter_count < 3)
{
escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
Pp = GH->PatL[lev];
while (Pp)
{
@@ -1446,12 +1750,28 @@ void bssnEScalar_class::Step(int lev, int YN)
Porg[ithBH][2] = Porg1[ithBH][2];
}
}
if (escalar_step_timing)
escalar_t_swap += MPI_Wtime() - escalar_t0;
}
}
#if USE_CUDA_BSSN
if (use_cuda_resident_sync)
{
escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
if (!bssn_escalar_cuda_keep_resident_after_step(lev, trfls, a_lev))
bssn_escalar_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, true);
if (escalar_step_timing)
escalar_t_resident += MPI_Wtime() - escalar_t0;
}
#endif
#if (RPS == 0)
// mesh refinement boundary part
escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
RestrictProlong(lev, YN, BB);
if (escalar_step_timing)
escalar_t_rp += MPI_Wtime() - escalar_t0;
#ifdef WithShell
if (lev == 0)
@@ -1478,6 +1798,7 @@ void bssnEScalar_class::Step(int lev, int YN)
//
// OldStateList old -----------
// update
escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0;
Pp = GH->PatL[lev];
while (Pp)
{
@@ -1523,6 +1844,14 @@ void bssnEScalar_class::Step(int lev, int YN)
Porg0[ithBH][2] = Porg1[ithBH][2];
}
}
if (escalar_step_timing)
{
escalar_t_swap += MPI_Wtime() - escalar_t0;
bssn_escalar_timing_report(myrank, lev, YN, MPI_Wtime() - escalar_step_t0,
escalar_t_rhs, escalar_t_sync, escalar_t_bh,
escalar_t_analysis, escalar_t_swap,
escalar_t_resident, escalar_t_rp);
}
}
//================================================================================================
@@ -2077,7 +2406,37 @@ void bssnEScalar_class::Constraint_Out()
Block *cg = BP->data;
if (myrank == cg->rank)
{
if (lev > 0)
bool used_cuda_constraints = false;
#if USE_CUDA_BSSN
{
double *state_in[BSSN_ESCALAR_CUDA_STATE_COUNT];
if (!fill_bssn_escalar_cuda_views(cg, StateList, state_in))
{
cout << "CUDA BSSN-EScalar constraint state list mismatch" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
double *constraint_out[8] = {
cg->fgfs[Cons_Ham->sgfn], cg->fgfs[Cons_Px->sgfn],
cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn],
cg->fgfs[Cons_Gz->sgfn], cg->fgfs[Cons_fR->sgfn]};
int lev_arg = lev;
int sym_arg = Symmetry;
double eps_arg = ndeps;
if (bssn_escalar_cuda_compute_constraints(cg->shape, cg->X[0], cg->X[1], cg->X[2],
state_in, constraint_out,
sym_arg, lev_arg, eps_arg))
{
cout << "CUDA BSSN-EScalar constraint compute failed in domain: ("
<< cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
used_cuda_constraints = true;
}
#endif
if (!used_cuda_constraints && lev > 0)
f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
@@ -2114,6 +2473,7 @@ void bssnEScalar_class::Constraint_Out()
cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
Symmetry, lev, ndeps, pre);
if (!used_cuda_constraints)
f_compute_constraint_fr(cg->shape, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
cg->fgfs[rho->sgfn], cg->fgfs[Sphi0->sgfn],

View File

@@ -70,6 +70,125 @@ int amss_analysis_map_every()
return every;
}
// True when AMSS_RP_TIMING is set to a non-zero integer (atoi). The
// environment is consulted on the first call only; the result is then cached.
bool amss_rp_timing_enabled()
{
  static int cached_flag = -1; // -1: not read yet, 0: off, 1: on
  if (cached_flag >= 0)
    return cached_flag != 0;

  const char *raw = getenv("AMSS_RP_TIMING");
  const int parsed = (raw != 0 && atoi(raw) != 0) ? 1 : 0;
  cached_flag = parsed;
  return parsed != 0;
}
// True when AMSS_RP_DETAIL_TIMING is set to a non-zero integer (atoi).
// Read once on first use and cached for all later calls.
bool amss_rp_detail_timing_enabled()
{
  static int cached_flag = -1; // -1 marks "environment not read yet"
  if (cached_flag < 0)
  {
    const char *raw = getenv("AMSS_RP_DETAIL_TIMING");
    int parsed = 0;
    if (raw != 0 && atoi(raw) != 0)
      parsed = 1;
    cached_flag = parsed;
  }
  return cached_flag == 1;
}
// Returns true when the environment variable `name` is set to text whose
// leading integer is non-zero.
//
// Returns false when `name` is null or empty, when the variable is unset, or
// when its value does not start with a non-zero integer.
bool amss_env_flag_enabled(const char *name)
{
  // getenv() must not be handed a null pointer; treat a missing name as "off".
  if (!name || !*name)
    return false;
  const char *env = getenv(name);
  if (!env)
    return false;
  // strtol has defined behaviour for out-of-range text, whereas atoi does not.
  return strtol(env, 0, 10) != 0;
}
// Lazily resolves the AMSS_RP_CACHED_RESTRICT switch via
// amss_env_flag_enabled() and remembers the answer for later calls.
bool amss_cached_rp_restrict_enabled()
{
  static int resolved = -1; // negative means the environment was not queried yet
  if (resolved < 0)
  {
    if (amss_env_flag_enabled("AMSS_RP_CACHED_RESTRICT"))
      resolved = 1;
    else
      resolved = 0;
  }
  return resolved > 0;
}
// Lazily resolves the AMSS_RP_CACHED_OUTBD switch via
// amss_env_flag_enabled() and remembers the answer for later calls.
bool amss_cached_rp_outbd_enabled()
{
  static int resolved = -1; // negative means the environment was not queried yet
  if (resolved < 0)
  {
    if (amss_env_flag_enabled("AMSS_RP_CACHED_OUTBD"))
      resolved = 1;
    else
      resolved = 0;
  }
  return resolved > 0;
}
// Lazily resolves the AMSS_RP_CACHED_FINE_SYNC switch via
// amss_env_flag_enabled() and remembers the answer for later calls.
bool amss_cached_rp_fine_sync_enabled()
{
  static int resolved = -1; // negative means the environment was not queried yet
  if (resolved < 0)
  {
    if (amss_env_flag_enabled("AMSS_RP_CACHED_FINE_SYNC"))
      resolved = 1;
    else
      resolved = 0;
  }
  return resolved > 0;
}
// Lazily resolves the AMSS_RP_CACHED_COARSE_SYNC switch via
// amss_env_flag_enabled() and remembers the answer for later calls.
bool amss_cached_rp_coarse_sync_enabled()
{
  static int resolved = -1; // negative means the environment was not queried yet
  if (resolved < 0)
  {
    if (amss_env_flag_enabled("AMSS_RP_CACHED_COARSE_SYNC"))
      resolved = 1;
    else
      resolved = 0;
  }
  return resolved > 0;
}
// Lazily resolves the AMSS_RP_SKIP_COARSE_SYNC switch via
// amss_env_flag_enabled() and remembers the answer for later calls.
bool amss_rp_skip_coarse_sync_enabled()
{
  static int resolved = -1; // negative means the environment was not queried yet
  if (resolved < 0)
  {
    if (amss_env_flag_enabled("AMSS_RP_SKIP_COARSE_SYNC"))
      resolved = 1;
    else
      resolved = 0;
  }
  return resolved > 0;
}
// Lazily resolves the AMSS_EVOLVE_TIMING switch via amss_env_flag_enabled()
// and remembers the answer for later calls.
bool amss_evolve_timing_enabled()
{
  static int resolved = -1; // negative means the environment was not queried yet
  if (resolved < 0)
  {
    if (amss_env_flag_enabled("AMSS_EVOLVE_TIMING"))
      resolved = 1;
    else
      resolved = 0;
  }
  return resolved > 0;
}
// Running totals filled in by the amss_evolve_timing_add_* helpers and
// zeroed by amss_evolve_timing_reset(). Each field is the sum of the `sec`
// values handed to the matching helper.
struct AmssEvolveTimingStats
{
// Sum of values passed to amss_evolve_timing_add_step().
double step;
// Sum of values passed to amss_evolve_timing_add_rp().
double rp;
// Sum of values passed to amss_evolve_timing_add_regrid().
double regrid;
// Sum of values passed to amss_evolve_timing_add_constraint().
double constraint;
};
// Gives access to the single shared AmssEvolveTimingStats accumulator.
// The object is a function-local static that starts with every field zero.
AmssEvolveTimingStats &amss_evolve_timing_stats()
{
  static AmssEvolveTimingStats totals = AmssEvolveTimingStats();
  return totals;
}
// Clears every accumulated total (step, rp, regrid, constraint) back to zero.
void amss_evolve_timing_reset()
{
  // A value-initialised aggregate has all of its double members set to 0.0.
  amss_evolve_timing_stats() = AmssEvolveTimingStats();
}
void amss_evolve_timing_add_step(double sec)
{
amss_evolve_timing_stats().step += sec;
}
void amss_evolve_timing_add_rp(double sec)
{
amss_evolve_timing_stats().rp += sec;
}
void amss_evolve_timing_add_regrid(double sec)
{
amss_evolve_timing_stats().regrid += sec;
}
void amss_evolve_timing_add_constraint(double sec)
{
amss_evolve_timing_stats().constraint += sec;
}
}
// Compile-time switch for per-timestep memory usage collection/printing.
@@ -288,6 +407,37 @@ bool fill_bssn_cuda_views(Block *cg, MyList<var> *vars,
return idx == BSSN_CUDA_STATE_COUNT && vars == 0;
}
// Counts the nodes of the `vars` list.
// Returns the node count, or -1 as soon as the count exceeds
// BSSN_ESCALAR_CUDA_STATE_COUNT (the walk stops early in that case).
int count_bssn_cuda_state_list(MyList<var> *vars)
{
  int seen = 0;
  for (MyList<var> *node = vars; node; node = node->next)
  {
    ++seen;
    if (seen > BSSN_ESCALAR_CUDA_STATE_COUNT)
      return -1;
  }
  return seen;
}
// Fills host_views[0 .. state_count-1] with the cg->fgfs arrays selected by
// the `sgfn` index of each entry in `vars`, in list order.
//
// Returns false when `cg` or `host_views` is null, when `state_count` is
// neither BSSN_CUDA_STATE_COUNT nor BSSN_ESCALAR_CUDA_STATE_COUNT, or when
// the list does not contain exactly `state_count` entries.
bool fill_bssn_cuda_views_count(Block *cg, MyList<var> *vars,
                                int state_count,
                                double **host_views)
{
  const bool count_supported = state_count == BSSN_CUDA_STATE_COUNT ||
                               state_count == BSSN_ESCALAR_CUDA_STATE_COUNT;
  if (cg == 0 || host_views == 0 || !count_supported)
    return false;

  int filled = 0;
  for (; vars != 0 && filled < state_count; vars = vars->next)
    host_views[filled++] = cg->fgfs[vars->data->sgfn];

  // Succeed only if the list ran out exactly when the expected count was reached.
  return vars == 0 && filled == state_count;
}
bool bssn_cuda_use_resident_sync(int lev)
{
#ifdef WithShell
@@ -467,6 +617,11 @@ bool bssn_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn)
{
var *vars[3] = {forx, fory, forz};
double *bh_host_key[3] = {
block->fgfs[forx->sgfn],
block->fgfs[fory->sgfn],
block->fgfs[forz->sgfn]
};
double soa3[9];
for (int f = 0; f < 3; f++)
{
@@ -482,6 +637,7 @@ bool bssn_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
DH[0], DH[1], DH[2],
x, y, z,
interp_ordn, interp_sym,
bh_host_key,
soa3, shellf) != 0)
{
const int sx = ordn;
@@ -552,6 +708,7 @@ bool bssn_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int myrank, bool release_ctx)
{
const int state_count = count_bssn_cuda_state_list(vars);
MyList<Patch> *Pp = PatL;
while (Pp)
{
@@ -561,13 +718,16 @@ void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int
Block *cg = BP->data;
if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
{
double *state_out[BSSN_CUDA_STATE_COUNT];
if (!fill_bssn_cuda_views(cg, vars, state_out))
double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
if (!fill_bssn_cuda_views_count(cg, vars, state_count, state_out))
{
cout << "CUDA BSSN state list mismatch on resident state download" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
if (bssn_cuda_download_resident_state(cg, cg->shape, state_out))
const int rc = (state_count == BSSN_ESCALAR_CUDA_STATE_COUNT)
? bssn_escalar_cuda_download_resident_state(cg, cg->shape, state_out)
: bssn_cuda_download_resident_state(cg, cg->shape, state_out);
if (rc)
{
cout << "CUDA resident state download failed" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
@@ -585,6 +745,7 @@ void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int
void bssn_cuda_download_level_state_if_present(MyList<Patch> *PatL, MyList<var> *vars, int myrank)
{
const int state_count = count_bssn_cuda_state_list(vars);
MyList<Patch> *Pp = PatL;
while (Pp)
{
@@ -594,13 +755,13 @@ void bssn_cuda_download_level_state_if_present(MyList<Patch> *PatL, MyList<var>
Block *cg = BP->data;
if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
{
double *state_out[BSSN_CUDA_STATE_COUNT];
if (!fill_bssn_cuda_views(cg, vars, state_out))
double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
if (!fill_bssn_cuda_views_count(cg, vars, state_count, state_out))
{
cout << "CUDA BSSN state list mismatch on resident state conditional download" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
if (bssn_cuda_download_resident_state_if_present(cg, cg->shape, state_out))
if (bssn_cuda_download_resident_state_count_if_present(cg, cg->shape, state_out, state_count))
{
cout << "CUDA resident state conditional download failed" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
@@ -2890,6 +3051,10 @@ void bssn_class::Evolve(int Steps)
for (int ncount = 1; ncount < Steps + 1; ncount++)
{
const bool evolve_timing = amss_evolve_timing_enabled();
const double evolve_t0 = evolve_timing ? MPI_Wtime() : 0.0;
if (evolve_timing)
amss_evolve_timing_reset();
cuda_level0_constraint_cache_valid = false;
#if BSSN_FINE_TIMING
step_timing::reset();
@@ -2918,9 +3083,12 @@ void bssn_class::Evolve(int Steps)
// misc::tillherecheck("before Constraint_Out");
const double constraint_t0 = evolve_timing ? MPI_Wtime() : 0.0;
STEP_TIMER_DECL(timer_constraint_out);
Constraint_Out(); // this will affect the Dump_List
STEP_TIMER_ADD(TB_CONSTRAINT_OUT, timer_constraint_out);
if (evolve_timing)
amss_evolve_timing_add_constraint(MPI_Wtime() - constraint_t0);
LastDump += dT_mon;
Last2dDump += dT_mon;
@@ -3093,6 +3261,22 @@ void bssn_class::Evolve(int Steps)
if (ncount % BSSN_FINE_TIMING_EVERY == 0)
rhs_kernel_timing_report::report(myrank, nprocs, ncount, MPI_Wtime() - step_wall_start);
#endif
if (evolve_timing)
{
const AmssEvolveTimingStats &stats = amss_evolve_timing_stats();
const double local[4] = {stats.step, stats.rp, stats.regrid, stats.constraint};
double maxv[4] = {};
MPI_Reduce((void *)local, maxv, 4, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
if (myrank == 0)
{
const double wall = MPI_Wtime() - evolve_t0;
const double known = maxv[0] + maxv[1] + maxv[2] + maxv[3];
fprintf(stderr,
"[AMSS-EVOLVE-TIMING] step=%d wall=%.6f step_fn=%.6f rp=%.6f "
"regrid=%.6f constraint=%.6f other=%.6f\n",
ncount, wall, maxv[0], maxv[1], maxv[2], maxv[3], wall - known);
}
}
}
/*
#ifdef With_AHF
@@ -3162,7 +3346,11 @@ void bssn_class::RecursiveStep(int lev)
{
// if(myrank==0) cout<<"level now = "<<lev<<" NoIteration = "<<i<<endl;
YN = (i == NoIterations - 1) ? 1 : 0; // 1: same time level for coarse level and fine level
const bool evolve_timing = amss_evolve_timing_enabled();
const double step_t0 = evolve_timing ? MPI_Wtime() : 0.0;
Step(lev, YN);
if (evolve_timing)
amss_evolve_timing_add_step(MPI_Wtime() - step_t0);
#if (AGM == 2)
if (GH->levels == 1)
@@ -3195,7 +3383,10 @@ void bssn_class::RecursiveStep(int lev)
//
// till here the PhysTime has updated dT_lev
// if(myrank==0) cout<<"level now = "<<lev<<", "<<fgt(PhysTime-dT_lev,StartTime,dT_lev/2)<<endl;
const double rp_t0 = evolve_timing ? MPI_Wtime() : 0.0;
RestrictProlong(lev, YN, fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), StateList, OldStateList, SynchList_cor);
if (evolve_timing)
amss_evolve_timing_add_rp(MPI_Wtime() - rp_t0);
// RestrictProlong(lev,YN,false,StateList,OldStateList,SynchList_cor);
#ifdef WithShell
@@ -3224,6 +3415,8 @@ void bssn_class::RecursiveStep(int lev)
#endif
#if (REGLEV == 0)
const bool evolve_timing = amss_evolve_timing_enabled();
const double regrid_t0 = evolve_timing ? MPI_Wtime() : 0.0;
STEP_TIMER_DECL(timer_regrid_onelevel);
#if USE_CUDA_BSSN
if (bssn_cuda_should_flush_before_regrid(GH, lev, Symmetry, BH_num, Porg0))
@@ -3242,6 +3435,8 @@ void bssn_class::RecursiveStep(int lev)
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
#endif
}
if (evolve_timing)
amss_evolve_timing_add_regrid(MPI_Wtime() - regrid_t0);
STEP_TIMER_ADD(TB_REGRID, timer_regrid_onelevel);
#endif
}
@@ -6847,6 +7042,15 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
//
// SynchList_cor old -----------
{
const bool rp_runtime_timing = amss_rp_timing_enabled();
const double rp_runtime_start = rp_runtime_timing ? MPI_Wtime() : 0.0;
const bool rp_detail_timing = amss_rp_detail_timing_enabled();
double rp_t_prepare = 0.0;
double rp_t_restrict = 0.0;
double rp_t_coarse_sync = 0.0;
double rp_t_outbd = 0.0;
double rp_t_fine_sync = 0.0;
double rp_t0 = 0.0;
STEP_TIMER_DECL(timer_restrict_prolong);
#if (PSTR == 1 || PSTR == 2)
// stringstream a_stream;
@@ -6858,6 +7062,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
MyList<Patch> *Pp, *Ppc;
if (lev > trfls && YN == 0) // time refinement levels and for intermediat time level
{
if (rp_detail_timing) rp_t0 = MPI_Wtime();
Pp = GH->PatL[lev - 1];
while (Pp)
{
@@ -6873,6 +7078,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif
Pp = Pp->next;
}
if (rp_detail_timing) rp_t_prepare += MPI_Wtime() - rp_t0;
#if (PSTR == 1 || PSTR == 2)
// Pp=GH->PatL[lev];
@@ -6889,14 +7095,18 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif
#if (RPB == 0)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
#if (ABEtype == 1 || ABEtype == 2)
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry);
#else
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
#endif
if (rp_detail_timing) rp_t_restrict += MPI_Wtime() - rp_t0;
#elif (RPB == 1)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
// Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SynchList_pre,Symmetry);
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry);
if (rp_detail_timing) rp_t_restrict += MPI_Wtime() - rp_t0;
#endif
#if (PSTR == 1 || PSTR == 2)
@@ -6907,10 +7117,14 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif
#if (ABEtype == 1 || ABEtype == 2)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
if (rp_detail_timing) rp_t_coarse_sync += MPI_Wtime() - rp_t0;
#else
#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
if (rp_detail_timing) rp_t_coarse_sync += MPI_Wtime() - rp_t0;
#endif
#endif
@@ -6922,6 +7136,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif
#if (RPB == 0)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
#if (MIXOUTB == 0)
#if (ABEtype == 1 || ABEtype == 2)
Ppc = GH->PatL[lev - 1];
@@ -6941,9 +7156,12 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry);
#endif
if (rp_detail_timing) rp_t_outbd += MPI_Wtime() - rp_t0;
#elif (RPB == 1)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry);
if (rp_detail_timing) rp_t_outbd += MPI_Wtime() - rp_t0;
#endif
#if (PSTR == 1 || PSTR == 2)
@@ -6964,14 +7182,18 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif
#if (RPB == 0)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
#if (ABEtype == 1 || ABEtype == 2)
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
#else
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]);
#endif
if (rp_detail_timing) rp_t_restrict += MPI_Wtime() - rp_t0;
#elif (RPB == 1)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
// Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry);
if (rp_detail_timing) rp_t_restrict += MPI_Wtime() - rp_t0;
#endif
#if (PSTR == 1 || PSTR == 2)
@@ -6982,10 +7204,14 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif
#if (ABEtype == 1 || ABEtype == 2)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry);
if (rp_detail_timing) rp_t_coarse_sync += MPI_Wtime() - rp_t0;
#else
#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]);
if (rp_detail_timing) rp_t_coarse_sync += MPI_Wtime() - rp_t0;
#endif
#endif
@@ -6997,6 +7223,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#endif
#if (RPB == 0)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
#if (MIXOUTB == 0)
#if (ABEtype == 1 || ABEtype == 2)
Ppc = GH->PatL[lev - 1];
@@ -7016,9 +7243,12 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
#elif (MIXOUTB == 1)
Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry);
#endif
if (rp_detail_timing) rp_t_outbd += MPI_Wtime() - rp_t0;
#elif (RPB == 1)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
// Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry);
Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry);
if (rp_detail_timing) rp_t_outbd += MPI_Wtime() - rp_t0;
#endif
#if (PSTR == 1 || PSTR == 2)
@@ -7030,9 +7260,13 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
}
#if (ABEtype == 1 || ABEtype == 2)
if (rp_detail_timing) rp_t0 = MPI_Wtime();
Parallel::Sync(GH->PatL[lev], SL, Symmetry);
if (rp_detail_timing) rp_t_fine_sync += MPI_Wtime() - rp_t0;
#else
if (rp_detail_timing) rp_t0 = MPI_Wtime();
Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]);
if (rp_detail_timing) rp_t_fine_sync += MPI_Wtime() - rp_t0;
#endif
#if (PSTR == 1 || PSTR == 2)
@@ -7042,6 +7276,27 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
// misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
#endif
}
if (rp_runtime_timing)
{
const double local_sec = MPI_Wtime() - rp_runtime_start;
double max_sec = 0.0;
MPI_Reduce((void *)&local_sec, &max_sec, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
if (myrank == 0)
fprintf(stderr, "[AMSS-RP-TIMING] lev=%d YN=%d BB=%d sec=%.6f\n",
lev, YN, BB ? 1 : 0, max_sec);
}
if (rp_detail_timing)
{
double local_detail[5] = {rp_t_prepare, rp_t_restrict, rp_t_coarse_sync, rp_t_outbd, rp_t_fine_sync};
double max_detail[5] = {};
MPI_Reduce(local_detail, max_detail, 5, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
if (myrank == 0)
fprintf(stderr,
"[AMSS-RP-DETAIL] lev=%d YN=%d BB=%d prepare=%.6f restrict=%.6f "
"coarse_sync=%.6f outbd=%.6f fine_sync=%.6f\n",
lev, YN, BB ? 1 : 0, max_detail[0], max_detail[1],
max_detail[2], max_detail[3], max_detail[4]);
}
STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong);
}
@@ -7229,6 +7484,9 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
#if (RPB == 0)
#if (ABEtype == 1 || ABEtype == 2)
if (amss_cached_rp_restrict_enabled())
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
else
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry);
#else
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry, sync_cache_restrict[lev]);
@@ -7239,6 +7497,12 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
#endif
#if (ABEtype == 1 || ABEtype == 2)
if (amss_rp_skip_coarse_sync_enabled())
{
}
else if (amss_cached_rp_coarse_sync_enabled())
Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]);
else
Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry);
#else
#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
@@ -7249,6 +7513,12 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
#if (RPB == 0)
#if (MIXOUTB == 0)
#if (ABEtype == 1 || ABEtype == 2)
if (amss_cached_rp_outbd_enabled())
{
Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
}
else
{
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
@@ -7260,6 +7530,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
}
Ppc = Ppc->next;
}
}
#else
Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
#endif
@@ -7277,6 +7548,9 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
cout << "===: " << GH->Lt[lev - 1] << "," << GH->Lt[lev] + dT_lev << endl;
#if (RPB == 0)
#if (ABEtype == 1 || ABEtype == 2)
if (amss_cached_rp_restrict_enabled())
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry, sync_cache_restrict[lev]);
else
Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry);
#else
Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry, sync_cache_restrict[lev]);
@@ -7287,6 +7561,12 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
#endif
#if (ABEtype == 1 || ABEtype == 2)
if (amss_rp_skip_coarse_sync_enabled())
{
}
else if (amss_cached_rp_coarse_sync_enabled())
Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]);
else
Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry);
#else
#if (RP_SYNC_COARSE_AFTER_RESTRICT == 1)
@@ -7297,6 +7577,12 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
#if (RPB == 0)
#if (MIXOUTB == 0)
#if (ABEtype == 1 || ABEtype == 2)
if (amss_cached_rp_outbd_enabled())
{
Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
}
else
{
Ppc = GH->PatL[lev - 1];
while (Ppc)
{
@@ -7308,6 +7594,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
}
Ppc = Ppc->next;
}
}
#else
Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]);
#endif
@@ -7321,6 +7608,9 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
}
#if (ABEtype == 1 || ABEtype == 2)
if (amss_cached_rp_fine_sync_enabled())
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
else
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
#else
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);

View File

@@ -144,7 +144,7 @@ public:
bssn_class(double Couranti, double StartTimei, double TotalTimei, double DumpTimei, double d2DumpTimei, double CheckTimei, double AnasTimei,
int Symmetryi, int checkruni, char *checkfilenamei, double numepssi, double numepsbi, double numepshi,
int a_levi, int maxli, int decni, double maxrexi, double drexi);
~bssn_class();
virtual ~bssn_class();
void Evolve(int Steps);
void RecursiveStep(int lev);

File diff suppressed because it is too large Load Diff

View File

@@ -7,6 +7,7 @@ extern "C" {
// Element counts used to size the pointer arrays handed to the CUDA BSSN
// entry points.
enum {
// Length of the state-pointer array for the plain BSSN path.
BSSN_CUDA_STATE_COUNT = 24,
// Length of the state-pointer array for the BSSN-EScalar path.
BSSN_ESCALAR_CUDA_STATE_COUNT = 26,
BSSN_CUDA_MATTER_COUNT = 10
};
@@ -55,6 +56,32 @@ int bssn_cuda_rk4_substep(void *block_tag,
int &apply_enforce_ga,
double &chitiny);
// NOTE(review): presumably the BSSN-EScalar counterpart of
// bssn_cuda_rk4_substep, taking state arrays of
// BSSN_ESCALAR_CUDA_STATE_COUNT entries — confirm against the implementation.
// The meaning of the int return value is not documented here; TODO confirm
// whether non-zero signals failure as it does for the other entry points.
int bssn_escalar_cuda_rk4_substep(void *block_tag,
int *ex, double *X, double *Y, double *Z,
double **state_host_in,
double **state_host_out,
const double *propspeed,
const double *soa_flat,
const double *bbox,
double &dT,
double &T,
int &RK4,
int &apply_bam_bc,
int &Symmetry,
int &Lev,
double &eps,
int &co,
int &keep_resident_state,
int &apply_enforce_ga,
double &chitiny);
// Computes the BSSN-EScalar constraint fields on the CUDA path.
//   ex, X, Y, Z         : callers pass the block's shape and its three
//                         coordinate arrays.
//   state_host_in       : host pointers to the state fields; callers pass an
//                         array of BSSN_ESCALAR_CUDA_STATE_COUNT entries.
//   constraint_host_out : host pointers receiving the results; callers pass
//                         8 entries ordered Ham, Px, Py, Pz, Gx, Gy, Gz, fR.
//   Symmetry, Lev, eps  : passed by reference; callers supply copies of the
//                         symmetry flag, level index and dissipation epsilon.
// Returns non-zero on failure (callers treat any non-zero result as fatal).
int bssn_escalar_cuda_compute_constraints(int *ex, double *X, double *Y, double *Z,
double **state_host_in,
double **constraint_host_out,
int &Symmetry,
int &Lev,
double &eps);
int bssn_cuda_copy_state_region_to_host(void *block_tag,
int state_index,
double *host_state,
@@ -73,6 +100,15 @@ int bssn_cuda_download_resident_state(void *block_tag,
int *ex,
double **state_host_out);
// BSSN-EScalar variant of bssn_cuda_download_resident_state: callers select
// it when the state list holds BSSN_ESCALAR_CUDA_STATE_COUNT entries and pass
// the block as `block_tag`, its shape as `ex`, and one host destination
// pointer per state field in `state_host_out`.
// Returns non-zero on failure (callers treat any non-zero result as fatal).
int bssn_escalar_cuda_download_resident_state(void *block_tag,
int *ex,
double **state_host_out);
// Variant of bssn_cuda_download_resident_state_if_present that takes the
// number of entries in `state_host_out` explicitly via `state_count`.
// Callers pass the block as `block_tag` and its shape as `ex`.
// Returns non-zero on failure (callers treat any non-zero result as fatal).
// NOTE(review): the "if present" behaviour when no resident state exists is
// not visible from this declaration — confirm against the implementation.
int bssn_cuda_download_resident_state_count_if_present(void *block_tag,
int *ex,
double **state_host_out,
int state_count);
// Conditional download of a block's resident state into the host pointers in
// `state_host_out`; callers pass the block as `block_tag` and its shape as `ex`.
// Returns non-zero on failure (callers treat any non-zero result as fatal).
// NOTE(review): presumably expects BSSN_CUDA_STATE_COUNT entries in
// `state_host_out` — confirm against the implementation.
int bssn_cuda_download_resident_state_if_present(void *block_tag,
int *ex,
double **state_host_out);
@@ -103,6 +139,7 @@ int bssn_cuda_interp_state_point3(void *block_tag,
double pz,
int ordn,
int symmetry,
double **state_host_key,
const double *soa3,
double *out3);
@@ -302,6 +339,7 @@ int bssn_cuda_upload_state_subset(void *block_tag,
int bssn_cuda_prepare_inter_time_level(void *block_tag,
int *ex,
int state_count,
double **src1_host_key,
double **src2_host_key,
double **src3_host_key,

View File

@@ -35,7 +35,6 @@ f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
endif
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=$(TP_PROFDATA) \
-Dfortran3 -Dnewc $(MKL_INC)
else
## NVHPC defaults: mpicc/mpicxx/mpifort wrappers

View File

@@ -146,6 +146,7 @@ def _gpu_runtime_env():
"AMSS_CUDA_AWARE_MPI": "1",
"AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP": "1",
"AMSS_CUDA_KEEP_ALL_LEVELS": "1",
"AMSS_CUDA_AMR_HOST_STAGED": "1",
"AMSS_CUDA_AMR_RESTRICT_DEVICE": "1",
"AMSS_CUDA_AMR_RESTRICT_BATCH": "0",
"AMSS_CUDA_DEVICE_SEGMENT_BATCH": "0",
@@ -277,6 +278,7 @@ def run_ABE():
print(f" AMSS_CUDA_AWARE_MPI={mpi_env.get('AMSS_CUDA_AWARE_MPI', '')}")
print(f" AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP={mpi_env.get('AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP', '')}")
print(f" AMSS_CUDA_KEEP_ALL_LEVELS={mpi_env.get('AMSS_CUDA_KEEP_ALL_LEVELS', '')}")
print(f" AMSS_CUDA_AMR_HOST_STAGED={mpi_env.get('AMSS_CUDA_AMR_HOST_STAGED', '')}")
print(f" AMSS_CUDA_AMR_RESTRICT_DEVICE={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_DEVICE', '')}")
print(f" AMSS_CUDA_AMR_RESTRICT_BATCH={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_BATCH', '')}")
print(f" AMSS_CUDA_DEVICE_SEGMENT_BATCH={mpi_env.get('AMSS_CUDA_DEVICE_SEGMENT_BATCH', '')}")