Add EM GPU fast paths and defaults
This commit is contained in:
@@ -18,7 +18,7 @@
|
||||
#endif
|
||||
#if USE_CUDA_BSSN
|
||||
#include "bssn_rhs_cuda.h"
|
||||
#define AMSS_BSSN_CUDA_MAX_STATE_COUNT BSSN_ESCALAR_CUDA_STATE_COUNT
|
||||
#define AMSS_BSSN_CUDA_MAX_STATE_COUNT BSSN_EM_CUDA_STATE_COUNT
|
||||
#endif
|
||||
#if USE_CUDA_Z4C
|
||||
#include "z4c_rhs_cuda.h"
|
||||
@@ -181,8 +181,7 @@ bool cuda_build_bssn_host_views(Block *block,
|
||||
double **views)
|
||||
{
|
||||
if (!block || !vars || !views ||
|
||||
(state_count != BSSN_CUDA_STATE_COUNT &&
|
||||
state_count != BSSN_ESCALAR_CUDA_STATE_COUNT))
|
||||
state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT)
|
||||
return false;
|
||||
MyList<var> *v = vars;
|
||||
for (int i = 0; i < state_count; ++i)
|
||||
@@ -200,8 +199,7 @@ bool cuda_build_bssn_soa(MyList<var> *vars,
|
||||
double *soa_flat)
|
||||
{
|
||||
if (!vars || !soa_flat ||
|
||||
(state_count != BSSN_CUDA_STATE_COUNT &&
|
||||
state_count != BSSN_ESCALAR_CUDA_STATE_COUNT))
|
||||
state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT)
|
||||
return false;
|
||||
MyList<var> *v = vars;
|
||||
for (int i = 0; i < state_count; ++i)
|
||||
@@ -322,7 +320,7 @@ bool cuda_state_count_direct_supported(int state_count)
|
||||
#if USE_CUDA_Z4C && (ABEtype == 2)
|
||||
return state_count == Z4C_CUDA_STATE_COUNT;
|
||||
#elif USE_CUDA_BSSN
|
||||
return state_count > 0 && state_count <= BSSN_ESCALAR_CUDA_STATE_COUNT;
|
||||
return state_count > 0 && state_count <= AMSS_BSSN_CUDA_MAX_STATE_COUNT;
|
||||
#else
|
||||
(void)state_count;
|
||||
return false;
|
||||
@@ -550,7 +548,8 @@ bool cuda_uncached_device_buffers_enabled(int state_count)
|
||||
}
|
||||
if (!enabled)
|
||||
return false;
|
||||
if (state_count != BSSN_ESCALAR_CUDA_STATE_COUNT)
|
||||
if (state_count != BSSN_ESCALAR_CUDA_STATE_COUNT &&
|
||||
state_count != BSSN_EM_CUDA_STATE_COUNT)
|
||||
return false;
|
||||
return cuda_aware_mpi_enabled();
|
||||
#else
|
||||
@@ -6136,6 +6135,7 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
|
||||
MyList<var> *VarList1, MyList<var> *VarList2,
|
||||
int Symmetry, SyncCache &cache)
|
||||
{
|
||||
const double t_transfer = sync_profile_enabled() ? MPI_Wtime() : 0.0;
|
||||
int myrank;
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize);
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
||||
@@ -6324,6 +6324,13 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
|
||||
else
|
||||
data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
|
||||
}
|
||||
if (sync_profile_enabled())
|
||||
{
|
||||
SyncProfileStats &stats = sync_profile_stats();
|
||||
stats.finish_calls++;
|
||||
stats.finish_sec += MPI_Wtime() - t_transfer;
|
||||
sync_profile_maybe_log();
|
||||
}
|
||||
}
|
||||
void Parallel::Sync_ensure_cache(MyList<Patch> *PatL, int Symmetry, SyncCache &cache)
|
||||
{
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
#ifdef newc
|
||||
#include <sstream>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <map>
|
||||
using namespace std;
|
||||
#else
|
||||
@@ -10,6 +11,7 @@ using namespace std;
|
||||
#endif
|
||||
|
||||
#include <time.h>
|
||||
#include <cstring>
|
||||
|
||||
#include "macrodef.h"
|
||||
#include "misc.h"
|
||||
@@ -19,6 +21,9 @@ using namespace std;
|
||||
#include "bssnEM_class.h"
|
||||
#include "bssn_rhs.h"
|
||||
#include "empart.h"
|
||||
#if USE_CUDA_BSSN
|
||||
#include "bssn_rhs_cuda.h"
|
||||
#endif
|
||||
#include "initial_puncture.h"
|
||||
#include "initial_maxwell.h"
|
||||
#include "enforce_algebra.h"
|
||||
@@ -36,6 +41,363 @@ using namespace std;
|
||||
|
||||
//================================================================================================
|
||||
|
||||
namespace
|
||||
{
|
||||
MyList<var> *advance_var_list(MyList<var> *vars, int count)
|
||||
{
|
||||
while (vars && count > 0)
|
||||
{
|
||||
vars = vars->next;
|
||||
--count;
|
||||
}
|
||||
return vars;
|
||||
}
|
||||
|
||||
bool bssn_em_step_timing_enabled()
|
||||
{
|
||||
static int enabled = -1;
|
||||
if (enabled < 0)
|
||||
{
|
||||
const char *env = getenv("AMSS_EM_STEP_TIMING");
|
||||
enabled = (env && atoi(env) != 0) ? 1 : 0;
|
||||
}
|
||||
return enabled != 0;
|
||||
}
|
||||
|
||||
bool bssn_em_step_timing_all_levels_enabled()
|
||||
{
|
||||
static int enabled = -1;
|
||||
if (enabled < 0)
|
||||
{
|
||||
const char *env = getenv("AMSS_EM_STEP_TIMING_ALL_LEVELS");
|
||||
enabled = (env && atoi(env) != 0) ? 1 : 0;
|
||||
}
|
||||
return enabled != 0;
|
||||
}
|
||||
|
||||
#if USE_CUDA_BSSN
|
||||
bool bssn_em_zero_analysis_fastpath_enabled()
|
||||
{
|
||||
static int enabled = -1;
|
||||
if (enabled < 0)
|
||||
{
|
||||
const char *env = getenv("AMSS_EM_ZERO_ANALYSIS_FASTPATH");
|
||||
enabled = (!env || atoi(env) != 0) ? 1 : 0;
|
||||
}
|
||||
return enabled != 0;
|
||||
}
|
||||
|
||||
bool bssn_em_analysis_zero_fastpath_ready(MyList<Patch> *PatL,
|
||||
#ifdef WithShell
|
||||
ShellPatch *shell,
|
||||
#else
|
||||
ShellPatch * /*shell*/,
|
||||
#endif
|
||||
int rank)
|
||||
{
|
||||
if (!bssn_em_zero_analysis_fastpath_enabled())
|
||||
return false;
|
||||
int local_ok = 1;
|
||||
int local_seen = 0;
|
||||
MyList<Patch> *Pp = PatL;
|
||||
while (Pp)
|
||||
{
|
||||
MyList<Block> *BP = Pp->data->blb;
|
||||
while (BP)
|
||||
{
|
||||
Block *cg = BP->data;
|
||||
if (rank == cg->rank)
|
||||
{
|
||||
local_seen = 1;
|
||||
if (!bssn_em_cuda_resident_zero_fast_state(cg))
|
||||
local_ok = 0;
|
||||
}
|
||||
if (BP == Pp->data->ble)
|
||||
break;
|
||||
BP = BP->next;
|
||||
}
|
||||
Pp = Pp->next;
|
||||
}
|
||||
#ifdef WithShell
|
||||
if (shell && shell->PatL)
|
||||
{
|
||||
MyList<ss_patch> *SP = shell->PatL;
|
||||
while (SP)
|
||||
{
|
||||
MyList<Block> *BP = SP->data->blb;
|
||||
while (BP)
|
||||
{
|
||||
Block *cg = BP->data;
|
||||
if (rank == cg->rank)
|
||||
{
|
||||
local_seen = 1;
|
||||
if (!bssn_em_cuda_resident_zero_fast_state(cg))
|
||||
local_ok = 0;
|
||||
}
|
||||
if (BP == SP->data->ble)
|
||||
break;
|
||||
BP = BP->next;
|
||||
}
|
||||
SP = SP->next;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
int global_ok = 0;
|
||||
int global_seen = 0;
|
||||
MPI_Allreduce(&local_ok, &global_ok, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
|
||||
MPI_Allreduce(&local_seen, &global_seen, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
|
||||
return global_seen && global_ok;
|
||||
}
|
||||
|
||||
void zero_em_analysis_outputs(MyList<Patch> *PatL,
|
||||
#ifdef WithShell
|
||||
ShellPatch *shell,
|
||||
#else
|
||||
ShellPatch * /*shell*/,
|
||||
#endif
|
||||
int rank,
|
||||
var *Rphi2_var, var *Iphi2_var,
|
||||
var *Rphi1_var, var *Iphi1_var)
|
||||
{
|
||||
MyList<Patch> *Pp = PatL;
|
||||
while (Pp)
|
||||
{
|
||||
MyList<Block> *BP = Pp->data->blb;
|
||||
while (BP)
|
||||
{
|
||||
Block *cg = BP->data;
|
||||
if (rank == cg->rank)
|
||||
{
|
||||
const size_t all = (size_t)cg->shape[0] * cg->shape[1] * cg->shape[2];
|
||||
std::memset(cg->fgfs[Rphi2_var->sgfn], 0, all * sizeof(double));
|
||||
std::memset(cg->fgfs[Iphi2_var->sgfn], 0, all * sizeof(double));
|
||||
std::memset(cg->fgfs[Rphi1_var->sgfn], 0, all * sizeof(double));
|
||||
std::memset(cg->fgfs[Iphi1_var->sgfn], 0, all * sizeof(double));
|
||||
}
|
||||
if (BP == Pp->data->ble)
|
||||
break;
|
||||
BP = BP->next;
|
||||
}
|
||||
Pp = Pp->next;
|
||||
}
|
||||
#ifdef WithShell
|
||||
if (shell && shell->PatL)
|
||||
{
|
||||
MyList<ss_patch> *SP = shell->PatL;
|
||||
while (SP)
|
||||
{
|
||||
MyList<Block> *BP = SP->data->blb;
|
||||
while (BP)
|
||||
{
|
||||
Block *cg = BP->data;
|
||||
if (rank == cg->rank)
|
||||
{
|
||||
const size_t all = (size_t)cg->shape[0] * cg->shape[1] * cg->shape[2];
|
||||
std::memset(cg->fgfs[Rphi2_var->sgfn], 0, all * sizeof(double));
|
||||
std::memset(cg->fgfs[Iphi2_var->sgfn], 0, all * sizeof(double));
|
||||
std::memset(cg->fgfs[Rphi1_var->sgfn], 0, all * sizeof(double));
|
||||
std::memset(cg->fgfs[Iphi1_var->sgfn], 0, all * sizeof(double));
|
||||
}
|
||||
if (BP == SP->data->ble)
|
||||
break;
|
||||
BP = BP->next;
|
||||
}
|
||||
SP = SP->next;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
int bssn_em_step_timing_every()
|
||||
{
|
||||
static int every = -1;
|
||||
if (every < 0)
|
||||
{
|
||||
const char *env = getenv("AMSS_EM_STEP_TIMING_EVERY");
|
||||
every = (env && atoi(env) > 0) ? atoi(env) : 1;
|
||||
}
|
||||
return every;
|
||||
}
|
||||
|
||||
#if USE_CUDA_BSSN
|
||||
bool fill_bssn_em_bssn_cuda_views(Block *cg, MyList<var> *vars,
|
||||
double **host_views,
|
||||
double *propspeeds = 0,
|
||||
double *soa_flat = 0)
|
||||
{
|
||||
int idx = 0;
|
||||
while (vars && idx < BSSN_CUDA_STATE_COUNT)
|
||||
{
|
||||
host_views[idx] = cg->fgfs[vars->data->sgfn];
|
||||
if (propspeeds)
|
||||
propspeeds[idx] = vars->data->propspeed;
|
||||
if (soa_flat)
|
||||
{
|
||||
soa_flat[3 * idx + 0] = vars->data->SoA[0];
|
||||
soa_flat[3 * idx + 1] = vars->data->SoA[1];
|
||||
soa_flat[3 * idx + 2] = vars->data->SoA[2];
|
||||
}
|
||||
vars = vars->next;
|
||||
++idx;
|
||||
}
|
||||
return idx == BSSN_CUDA_STATE_COUNT;
|
||||
}
|
||||
|
||||
bool fill_bssn_em_cuda_views(Block *cg, MyList<var> *vars,
|
||||
double **host_views,
|
||||
double *propspeeds = 0,
|
||||
double *soa_flat = 0)
|
||||
{
|
||||
int idx = 0;
|
||||
while (vars && idx < BSSN_EM_CUDA_STATE_COUNT)
|
||||
{
|
||||
host_views[idx] = cg->fgfs[vars->data->sgfn];
|
||||
if (propspeeds)
|
||||
propspeeds[idx] = vars->data->propspeed;
|
||||
if (soa_flat)
|
||||
{
|
||||
soa_flat[3 * idx + 0] = vars->data->SoA[0];
|
||||
soa_flat[3 * idx + 1] = vars->data->SoA[1];
|
||||
soa_flat[3 * idx + 2] = vars->data->SoA[2];
|
||||
}
|
||||
vars = vars->next;
|
||||
++idx;
|
||||
}
|
||||
return idx == BSSN_EM_CUDA_STATE_COUNT && vars == 0;
|
||||
}
|
||||
|
||||
void fill_bssn_em_fixed_source_cuda_views(Block *cg, double **sources,
|
||||
var *Jx, var *Jy, var *Jz, var *qchar)
|
||||
{
|
||||
sources[0] = cg->fgfs[Jx->sgfn];
|
||||
sources[1] = cg->fgfs[Jy->sgfn];
|
||||
sources[2] = cg->fgfs[Jz->sgfn];
|
||||
sources[3] = cg->fgfs[qchar->sgfn];
|
||||
}
|
||||
|
||||
void fill_bssn_em_matter_cuda_views(Block *cg, double **matter,
|
||||
var *rho, var *Sx, var *Sy, var *Sz,
|
||||
var *Sxx, var *Sxy, var *Sxz,
|
||||
var *Syy, var *Syz, var *Szz)
|
||||
{
|
||||
matter[0] = cg->fgfs[rho->sgfn];
|
||||
matter[1] = cg->fgfs[Sx->sgfn];
|
||||
matter[2] = cg->fgfs[Sy->sgfn];
|
||||
matter[3] = cg->fgfs[Sz->sgfn];
|
||||
matter[4] = cg->fgfs[Sxx->sgfn];
|
||||
matter[5] = cg->fgfs[Sxy->sgfn];
|
||||
matter[6] = cg->fgfs[Sxz->sgfn];
|
||||
matter[7] = cg->fgfs[Syy->sgfn];
|
||||
matter[8] = cg->fgfs[Syz->sgfn];
|
||||
matter[9] = cg->fgfs[Szz->sgfn];
|
||||
}
|
||||
|
||||
bool bssn_em_cuda_use_resident_sync(int lev)
|
||||
{
|
||||
#ifdef WithShell
|
||||
(void)lev;
|
||||
return false;
|
||||
#else
|
||||
return true;
|
||||
#endif
|
||||
}
|
||||
|
||||
bool bssn_em_cuda_keep_resident_after_step(int lev, int trfls_in, int analysis_lev)
|
||||
{
|
||||
static int keep_all_levels = -1;
|
||||
if (keep_all_levels < 0)
|
||||
{
|
||||
const char *env = getenv("AMSS_CUDA_KEEP_ALL_LEVELS");
|
||||
keep_all_levels = (env && atoi(env) != 0) ? 1 : 0;
|
||||
}
|
||||
static int enabled = -1;
|
||||
if (enabled < 0)
|
||||
{
|
||||
const char *env = getenv("AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP");
|
||||
enabled = (env && atoi(env) != 0) ? 1 : 0;
|
||||
}
|
||||
if (!enabled)
|
||||
return false;
|
||||
if (lev == analysis_lev)
|
||||
return false;
|
||||
if (keep_all_levels)
|
||||
return true;
|
||||
return lev < trfls_in;
|
||||
}
|
||||
|
||||
void bssn_em_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars,
|
||||
int myrank, bool release_ctx)
|
||||
{
|
||||
MyList<Patch> *Pp = PatL;
|
||||
while (Pp)
|
||||
{
|
||||
MyList<Block> *BP = Pp->data->blb;
|
||||
while (BP)
|
||||
{
|
||||
Block *cg = BP->data;
|
||||
if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
|
||||
{
|
||||
double *state_out[BSSN_EM_CUDA_STATE_COUNT];
|
||||
if (!fill_bssn_em_cuda_views(cg, vars, state_out))
|
||||
{
|
||||
cout << "CUDA BSSN-EM resident state list mismatch during download" << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
if (bssn_cuda_download_resident_state_count_if_present(cg, cg->shape,
|
||||
state_out,
|
||||
BSSN_EM_CUDA_STATE_COUNT))
|
||||
{
|
||||
cout << "CUDA BSSN-EM resident state download failed" << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
if (release_ctx)
|
||||
bssn_cuda_release_step_ctx(cg);
|
||||
}
|
||||
if (BP == Pp->data->ble)
|
||||
break;
|
||||
BP = BP->next;
|
||||
}
|
||||
Pp = Pp->next;
|
||||
}
|
||||
}
|
||||
|
||||
void bssn_em_cuda_keep_only_level_state(MyList<Patch> *PatL, MyList<var> *vars,
|
||||
int myrank)
|
||||
{
|
||||
MyList<Patch> *Pp = PatL;
|
||||
while (Pp)
|
||||
{
|
||||
MyList<Block> *BP = Pp->data->blb;
|
||||
while (BP)
|
||||
{
|
||||
Block *cg = BP->data;
|
||||
if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
|
||||
{
|
||||
double *state_key[BSSN_EM_CUDA_STATE_COUNT];
|
||||
if (!fill_bssn_em_cuda_views(cg, vars, state_key))
|
||||
{
|
||||
cout << "CUDA BSSN-EM resident state list mismatch during prune" << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
if (bssn_cuda_keep_only_resident_state_count(cg, cg->shape,
|
||||
state_key,
|
||||
BSSN_EM_CUDA_STATE_COUNT))
|
||||
{
|
||||
cout << "CUDA BSSN-EM keep-only resident state failed" << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
}
|
||||
if (BP == Pp->data->ble)
|
||||
break;
|
||||
BP = BP->next;
|
||||
}
|
||||
Pp = Pp->next;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// Define bssnEM_class
|
||||
|
||||
// It inherits some members and methods from the parent class bssn_class and modifies others.
|
||||
@@ -258,6 +620,13 @@ void bssnEM_class::Initialize()
|
||||
PhysTime = StartTime;
|
||||
Setup_Black_Hole_position();
|
||||
}
|
||||
|
||||
sync_cache_pre = new Parallel::SyncCache[GH->levels];
|
||||
sync_cache_cor = new Parallel::SyncCache[GH->levels];
|
||||
sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
|
||||
sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
|
||||
sync_cache_restrict = new Parallel::SyncCache[GH->levels];
|
||||
sync_cache_outbd = new Parallel::SyncCache[GH->levels];
|
||||
}
|
||||
|
||||
//================================================================================================
|
||||
@@ -833,9 +1202,25 @@ void bssnEM_class::Step(int lev, int YN)
|
||||
int iter_count = 0; // count RK4 substeps
|
||||
int pre = 0, cor = 1;
|
||||
int ERROR = 0;
|
||||
#if USE_CUDA_BSSN
|
||||
const bool use_cuda_resident_sync = bssn_em_cuda_use_resident_sync(lev);
|
||||
#endif
|
||||
const bool em_step_timing = bssn_em_step_timing_enabled();
|
||||
const double em_step_t0 = em_step_timing ? MPI_Wtime() : 0.0;
|
||||
double em_t0 = 0.0;
|
||||
double em_t_predictor = 0.0;
|
||||
double em_t_predictor_sync = 0.0;
|
||||
double em_t_corrector = 0.0;
|
||||
double em_t_corrector_sync = 0.0;
|
||||
double em_t_analysis = 0.0;
|
||||
double em_t_bh = 0.0;
|
||||
double em_t_swap = 0.0;
|
||||
double em_t_resident = 0.0;
|
||||
double em_t_rp = 0.0;
|
||||
|
||||
MyList<ss_patch> *sPp;
|
||||
// Predictor
|
||||
em_t0 = em_step_timing ? MPI_Wtime() : 0.0;
|
||||
MyList<Patch> *Pp = GH->PatL[lev];
|
||||
while (Pp)
|
||||
{
|
||||
@@ -845,15 +1230,20 @@ void bssnEM_class::Step(int lev, int YN)
|
||||
Block *cg = BP->data;
|
||||
if (myrank == cg->rank)
|
||||
{
|
||||
#if !USE_CUDA_BSSN
|
||||
#if (AGM == 0)
|
||||
f_enforce_ga(cg->shape,
|
||||
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
|
||||
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
|
||||
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
|
||||
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
if (
|
||||
int em_rhs_error = 0;
|
||||
bool used_gpu_substep = false;
|
||||
#if !USE_CUDA_BSSN
|
||||
em_rhs_error =
|
||||
f_compute_rhs_empart(cg->shape, cg->X[0], cg->X[1], cg->X[2],
|
||||
cg->fgfs[phi0->sgfn],
|
||||
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
|
||||
@@ -873,7 +1263,51 @@ void bssnEM_class::Step(int lev, int YN)
|
||||
cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn],
|
||||
cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn],
|
||||
cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn],
|
||||
Symmetry, lev, ndeps) ||
|
||||
Symmetry, lev, ndeps);
|
||||
#endif
|
||||
|
||||
#if USE_CUDA_BSSN
|
||||
if (!em_rhs_error)
|
||||
{
|
||||
double *state_in[BSSN_EM_CUDA_STATE_COUNT];
|
||||
double *state_out[BSSN_EM_CUDA_STATE_COUNT];
|
||||
double *sources[BSSN_EM_CUDA_SOURCE_COUNT];
|
||||
double propspeed[BSSN_EM_CUDA_STATE_COUNT];
|
||||
double soa_flat[3 * BSSN_EM_CUDA_STATE_COUNT];
|
||||
if (!fill_bssn_em_cuda_views(cg, StateList, state_in, propspeed, soa_flat) ||
|
||||
!fill_bssn_em_cuda_views(cg, SynchList_pre, state_out))
|
||||
{
|
||||
cout << "CUDA BSSN-EM state list mismatch on predictor step" << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
fill_bssn_em_fixed_source_cuda_views(cg, sources, Jx, Jy, Jz, qchar);
|
||||
int apply_bam_bc = 0;
|
||||
#if (SommerType == 0)
|
||||
#ifndef WithShell
|
||||
apply_bam_bc = (lev == 0) ? 1 : 0;
|
||||
#endif
|
||||
#endif
|
||||
int apply_enforce_ga = 0;
|
||||
#if (AGM == 0)
|
||||
apply_enforce_ga = 1;
|
||||
#endif
|
||||
int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
|
||||
if (bssn_em_cuda_rk4_substep(cg,
|
||||
cg->shape, cg->X[0], cg->X[1], cg->X[2],
|
||||
state_in, state_out, sources,
|
||||
propspeed, soa_flat, Pp->data->bbox,
|
||||
dT_lev, TRK4, iter_count, apply_bam_bc,
|
||||
Symmetry, lev, ndeps, pre,
|
||||
keep_resident_state, apply_enforce_ga, chitiny))
|
||||
{
|
||||
ERROR = 1;
|
||||
}
|
||||
used_gpu_substep = true;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (em_rhs_error ||
|
||||
(!used_gpu_substep &&
|
||||
f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
|
||||
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
|
||||
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
|
||||
@@ -907,7 +1341,7 @@ void bssnEM_class::Step(int lev, int YN)
|
||||
cg->fgfs[Cons_Ham->sgfn],
|
||||
cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
|
||||
cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
|
||||
Symmetry, lev, ndeps, pre))
|
||||
Symmetry, lev, ndeps, pre)))
|
||||
{
|
||||
cout << "find NaN in domain: ("
|
||||
<< cg->bbox[0] << ":" << cg->bbox[3] << ","
|
||||
@@ -916,6 +1350,8 @@ void bssnEM_class::Step(int lev, int YN)
|
||||
ERROR = 1;
|
||||
}
|
||||
|
||||
if (!used_gpu_substep)
|
||||
{
|
||||
// rk4 substep and boundary
|
||||
{
|
||||
MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varlrhs = RHSList;
|
||||
@@ -958,12 +1394,15 @@ void bssnEM_class::Step(int lev, int YN)
|
||||
}
|
||||
f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny);
|
||||
}
|
||||
}
|
||||
if (BP == Pp->data->ble)
|
||||
break;
|
||||
BP = BP->next;
|
||||
}
|
||||
Pp = Pp->next;
|
||||
}
|
||||
if (em_step_timing)
|
||||
em_t_predictor += MPI_Wtime() - em_t0;
|
||||
// check error information
|
||||
{
|
||||
int erh = ERROR;
|
||||
@@ -1221,7 +1660,11 @@ void bssnEM_class::Step(int lev, int YN)
|
||||
}
|
||||
#endif
|
||||
|
||||
if (em_step_timing)
|
||||
em_t0 = MPI_Wtime();
|
||||
Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
|
||||
if (em_step_timing)
|
||||
em_t_predictor_sync += MPI_Wtime() - em_t0;
|
||||
|
||||
#ifdef WithShell
|
||||
if (lev == 0)
|
||||
@@ -1244,6 +1687,8 @@ void bssnEM_class::Step(int lev, int YN)
|
||||
// for black hole position
|
||||
if (BH_num > 0 && lev == GH->levels - 1)
|
||||
{
|
||||
if (em_step_timing)
|
||||
em_t0 = MPI_Wtime();
|
||||
compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
|
||||
for (int ithBH = 0; ithBH < BH_num; ithBH++)
|
||||
{
|
||||
@@ -1272,16 +1717,24 @@ void bssnEM_class::Step(int lev, int YN)
|
||||
DG_List->clearList();
|
||||
}
|
||||
}
|
||||
if (em_step_timing)
|
||||
em_t_bh += MPI_Wtime() - em_t0;
|
||||
}
|
||||
// data analysis part
|
||||
// Warning NOTE: the variables1 are used as temp storege room
|
||||
if (lev == a_lev)
|
||||
{
|
||||
if (em_step_timing)
|
||||
em_t0 = MPI_Wtime();
|
||||
AnalysisStuff_EM(lev, dT_lev);
|
||||
if (em_step_timing)
|
||||
em_t_analysis += MPI_Wtime() - em_t0;
|
||||
}
|
||||
// corrector
|
||||
for (iter_count = 1; iter_count < 4; iter_count++)
|
||||
{
|
||||
if (em_step_timing)
|
||||
em_t0 = MPI_Wtime();
|
||||
// for RK4: t0, t0+dt/2, t0+dt/2, t0+dt;
|
||||
if (iter_count == 1 || iter_count == 3)
|
||||
TRK4 += dT_lev / 2;
|
||||
@@ -1294,6 +1747,7 @@ void bssnEM_class::Step(int lev, int YN)
|
||||
Block *cg = BP->data;
|
||||
if (myrank == cg->rank)
|
||||
{
|
||||
#if !USE_CUDA_BSSN
|
||||
#if (AGM == 0)
|
||||
f_enforce_ga(cg->shape,
|
||||
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
|
||||
@@ -1307,9 +1761,13 @@ void bssnEM_class::Step(int lev, int YN)
|
||||
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
|
||||
cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
|
||||
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
if (
|
||||
int em_rhs_error = 0;
|
||||
bool used_gpu_substep = false;
|
||||
#if !USE_CUDA_BSSN
|
||||
em_rhs_error =
|
||||
f_compute_rhs_empart(cg->shape, cg->X[0], cg->X[1], cg->X[2],
|
||||
cg->fgfs[phi->sgfn],
|
||||
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
|
||||
@@ -1329,7 +1787,54 @@ void bssnEM_class::Step(int lev, int YN)
|
||||
cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn],
|
||||
cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn],
|
||||
cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn],
|
||||
Symmetry, lev, ndeps) ||
|
||||
Symmetry, lev, ndeps);
|
||||
#endif
|
||||
|
||||
#if USE_CUDA_BSSN
|
||||
if (!em_rhs_error)
|
||||
{
|
||||
double *state_in[BSSN_EM_CUDA_STATE_COUNT];
|
||||
double *state_out[BSSN_EM_CUDA_STATE_COUNT];
|
||||
double *sources[BSSN_EM_CUDA_SOURCE_COUNT];
|
||||
double propspeed[BSSN_EM_CUDA_STATE_COUNT];
|
||||
double soa_flat[3 * BSSN_EM_CUDA_STATE_COUNT];
|
||||
if (!fill_bssn_em_cuda_views(cg, SynchList_pre, state_in, propspeed, soa_flat) ||
|
||||
!fill_bssn_em_cuda_views(cg, SynchList_cor, state_out))
|
||||
{
|
||||
cout << "CUDA BSSN-EM state list mismatch on corrector step" << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
fill_bssn_em_fixed_source_cuda_views(cg, sources, Jx, Jy, Jz, qchar);
|
||||
int apply_bam_bc = 0;
|
||||
#if (SommerType == 0)
|
||||
#ifndef WithShell
|
||||
apply_bam_bc = (lev == 0) ? 1 : 0;
|
||||
#endif
|
||||
#endif
|
||||
int apply_enforce_ga = 0;
|
||||
#if (AGM == 0)
|
||||
apply_enforce_ga = 1;
|
||||
#elif (AGM == 1)
|
||||
if (iter_count == 3)
|
||||
apply_enforce_ga = 1;
|
||||
#endif
|
||||
int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
|
||||
if (bssn_em_cuda_rk4_substep(cg,
|
||||
cg->shape, cg->X[0], cg->X[1], cg->X[2],
|
||||
state_in, state_out, sources,
|
||||
propspeed, soa_flat, Pp->data->bbox,
|
||||
dT_lev, TRK4, iter_count, apply_bam_bc,
|
||||
Symmetry, lev, ndeps, cor,
|
||||
keep_resident_state, apply_enforce_ga, chitiny))
|
||||
{
|
||||
ERROR = 1;
|
||||
}
|
||||
used_gpu_substep = true;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (em_rhs_error ||
|
||||
(!used_gpu_substep &&
|
||||
f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
|
||||
cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn],
|
||||
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
|
||||
@@ -1362,7 +1867,7 @@ void bssnEM_class::Step(int lev, int YN)
|
||||
cg->fgfs[Cons_Ham->sgfn],
|
||||
cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
|
||||
cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
|
||||
Symmetry, lev, ndeps, cor))
|
||||
Symmetry, lev, ndeps, cor)))
|
||||
{
|
||||
cout << "find NaN in domain: ("
|
||||
<< cg->bbox[0] << ":" << cg->bbox[3] << ","
|
||||
@@ -1370,6 +1875,8 @@ void bssnEM_class::Step(int lev, int YN)
|
||||
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
||||
ERROR = 1;
|
||||
}
|
||||
if (!used_gpu_substep)
|
||||
{
|
||||
// rk4 substep and boundary
|
||||
{
|
||||
MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varl1 = SynchList_cor, *varlrhs = RHSList;
|
||||
@@ -1413,6 +1920,7 @@ void bssnEM_class::Step(int lev, int YN)
|
||||
}
|
||||
f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny);
|
||||
}
|
||||
}
|
||||
if (BP == Pp->data->ble)
|
||||
break;
|
||||
BP = BP->next;
|
||||
@@ -1683,7 +2191,13 @@ void bssnEM_class::Step(int lev, int YN)
|
||||
}
|
||||
#endif
|
||||
|
||||
if (em_step_timing)
|
||||
em_t_corrector += MPI_Wtime() - em_t0;
|
||||
if (em_step_timing)
|
||||
em_t0 = MPI_Wtime();
|
||||
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
|
||||
if (em_step_timing)
|
||||
em_t_corrector_sync += MPI_Wtime() - em_t0;
|
||||
|
||||
#ifdef WithShell
|
||||
if (lev == 0)
|
||||
@@ -1705,6 +2219,8 @@ void bssnEM_class::Step(int lev, int YN)
|
||||
// for black hole position
|
||||
if (BH_num > 0 && lev == GH->levels - 1)
|
||||
{
|
||||
if (em_step_timing)
|
||||
em_t0 = MPI_Wtime();
|
||||
compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev);
|
||||
for (int ithBH = 0; ithBH < BH_num; ithBH++)
|
||||
{
|
||||
@@ -1733,10 +2249,14 @@ void bssnEM_class::Step(int lev, int YN)
|
||||
DG_List->clearList();
|
||||
}
|
||||
}
|
||||
if (em_step_timing)
|
||||
em_t_bh += MPI_Wtime() - em_t0;
|
||||
}
|
||||
// swap time level
|
||||
if (iter_count < 3)
|
||||
{
|
||||
if (em_step_timing)
|
||||
em_t0 = MPI_Wtime();
|
||||
Pp = GH->PatL[lev];
|
||||
while (Pp)
|
||||
{
|
||||
@@ -1780,12 +2300,30 @@ void bssnEM_class::Step(int lev, int YN)
|
||||
Porg[ithBH][2] = Porg1[ithBH][2];
|
||||
}
|
||||
}
|
||||
if (em_step_timing)
|
||||
em_t_swap += MPI_Wtime() - em_t0;
|
||||
}
|
||||
}
|
||||
|
||||
#if USE_CUDA_BSSN
|
||||
if (use_cuda_resident_sync)
|
||||
{
|
||||
if (em_step_timing)
|
||||
em_t0 = MPI_Wtime();
|
||||
if (!bssn_em_cuda_keep_resident_after_step(lev, trfls, a_lev))
|
||||
bssn_em_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, true);
|
||||
if (em_step_timing)
|
||||
em_t_resident += MPI_Wtime() - em_t0;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if (RPS == 0)
|
||||
// mesh refinement boundary part
|
||||
if (em_step_timing)
|
||||
em_t0 = MPI_Wtime();
|
||||
RestrictProlong(lev, YN, BB);
|
||||
if (em_step_timing)
|
||||
em_t_rp += MPI_Wtime() - em_t0;
|
||||
|
||||
#ifdef WithShell
|
||||
if (lev == 0)
|
||||
@@ -1813,6 +2351,8 @@ void bssnEM_class::Step(int lev, int YN)
|
||||
//
|
||||
// OldStateList old -----------
|
||||
// update
|
||||
if (em_step_timing)
|
||||
em_t0 = MPI_Wtime();
|
||||
Pp = GH->PatL[lev];
|
||||
while (Pp)
|
||||
{
|
||||
@@ -1858,6 +2398,26 @@ void bssnEM_class::Step(int lev, int YN)
|
||||
Porg0[ithBH][2] = Porg1[ithBH][2];
|
||||
}
|
||||
}
|
||||
if (em_step_timing)
|
||||
{
|
||||
em_t_swap += MPI_Wtime() - em_t0;
|
||||
static int em_step_report_count = 0;
|
||||
const int em_timing_every = bssn_em_step_timing_every();
|
||||
const bool report_all_levels = bssn_em_step_timing_all_levels_enabled();
|
||||
if (lev == GH->levels - 1)
|
||||
++em_step_report_count;
|
||||
if ((report_all_levels || lev == GH->levels - 1) &&
|
||||
(em_timing_every <= 1 || em_step_report_count % em_timing_every == 0))
|
||||
{
|
||||
fprintf(stderr,
|
||||
"[AMSS-EM-STEP-TIMING] lev=%d wall=%.6f predictor=%.6f pre_sync=%.6f "
|
||||
"analysis=%.6f corrector=%.6f cor_sync=%.6f bh=%.6f swap=%.6f resident=%.6f rp=%.6f\n",
|
||||
lev, MPI_Wtime() - em_step_t0,
|
||||
em_t_predictor, em_t_predictor_sync,
|
||||
em_t_analysis, em_t_corrector, em_t_corrector_sync,
|
||||
em_t_bh, em_t_swap, em_t_resident, em_t_rp);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//================================================================================================
|
||||
@@ -2036,6 +2596,59 @@ void bssnEM_class::AnalysisStuff_EM(int lev, double dT_lev)
|
||||
|
||||
if (LastAnas >= AnasTime)
|
||||
{
|
||||
#if USE_CUDA_BSSN
|
||||
const bool zero_em_analysis =
|
||||
bssn_em_analysis_zero_fastpath_ready(GH->PatL[lev],
|
||||
#ifdef WithShell
|
||||
SH
|
||||
#else
|
||||
0
|
||||
#endif
|
||||
, myrank
|
||||
);
|
||||
#else
|
||||
const bool zero_em_analysis = false;
|
||||
#endif
|
||||
if (zero_em_analysis)
|
||||
{
|
||||
#if USE_CUDA_BSSN
|
||||
zero_em_analysis_outputs(GH->PatL[lev],
|
||||
#ifdef WithShell
|
||||
SH,
|
||||
#else
|
||||
0,
|
||||
#endif
|
||||
myrank,
|
||||
Rphi2, Iphi2, Rphi1, Iphi1);
|
||||
#endif
|
||||
int NN = 0;
|
||||
for (int pl = 1; pl < maxl + 1; pl++)
|
||||
for (int pm = -pl; pm < pl + 1; pm++)
|
||||
NN++;
|
||||
double *RP = new double[NN];
|
||||
double *IP = new double[NN];
|
||||
std::memset(RP, 0, NN * sizeof(double));
|
||||
std::memset(IP, 0, NN * sizeof(double));
|
||||
for (int i = 0; i < decn; i++)
|
||||
Phi2Monitor->writefile(PhysTime, NN, RP, IP);
|
||||
delete[] RP;
|
||||
delete[] IP;
|
||||
|
||||
NN = 0;
|
||||
for (int pl = 0; pl < maxl + 1; pl++)
|
||||
for (int pm = -pl; pm < pl + 1; pm++)
|
||||
NN++;
|
||||
RP = new double[NN];
|
||||
IP = new double[NN];
|
||||
std::memset(RP, 0, NN * sizeof(double));
|
||||
std::memset(IP, 0, NN * sizeof(double));
|
||||
for (int i = 0; i < decn; i++)
|
||||
Phi1Monitor->writefile(PhysTime, NN, RP, IP);
|
||||
delete[] RP;
|
||||
delete[] IP;
|
||||
}
|
||||
else
|
||||
{
|
||||
Compute_Phi2(lev);
|
||||
double *RP, *IP;
|
||||
int NN = 0;
|
||||
@@ -2125,6 +2738,7 @@ void bssnEM_class::AnalysisStuff_EM(int lev, double dT_lev)
|
||||
delete[] RP;
|
||||
delete[] IP;
|
||||
}
|
||||
}
|
||||
|
||||
AnalysisStuff(lev, dT_lev); // LastAnas need and only need control here
|
||||
|
||||
|
||||
@@ -438,7 +438,7 @@ int count_bssn_cuda_state_list(MyList<var> *vars)
|
||||
{
|
||||
++count;
|
||||
vars = vars->next;
|
||||
if (count > BSSN_ESCALAR_CUDA_STATE_COUNT)
|
||||
if (count > BSSN_EM_CUDA_STATE_COUNT)
|
||||
return -1;
|
||||
}
|
||||
return count;
|
||||
@@ -449,8 +449,7 @@ bool fill_bssn_cuda_views_count(Block *cg, MyList<var> *vars,
|
||||
double **host_views)
|
||||
{
|
||||
if (!cg || !host_views ||
|
||||
(state_count != BSSN_CUDA_STATE_COUNT &&
|
||||
state_count != BSSN_ESCALAR_CUDA_STATE_COUNT))
|
||||
state_count <= 0 || state_count > BSSN_EM_CUDA_STATE_COUNT)
|
||||
return false;
|
||||
int idx = 0;
|
||||
while (vars && idx < state_count)
|
||||
@@ -742,7 +741,7 @@ void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int
|
||||
Block *cg = BP->data;
|
||||
if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
|
||||
{
|
||||
double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
|
||||
double *state_out[BSSN_EM_CUDA_STATE_COUNT];
|
||||
if (!fill_bssn_cuda_views_count(cg, vars, state_count, state_out))
|
||||
{
|
||||
cout << "CUDA BSSN state list mismatch on resident state download" << endl;
|
||||
@@ -750,7 +749,9 @@ void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int
|
||||
}
|
||||
const int rc = (state_count == BSSN_ESCALAR_CUDA_STATE_COUNT)
|
||||
? bssn_escalar_cuda_download_resident_state(cg, cg->shape, state_out)
|
||||
: bssn_cuda_download_resident_state(cg, cg->shape, state_out);
|
||||
: ((state_count == BSSN_CUDA_STATE_COUNT)
|
||||
? bssn_cuda_download_resident_state(cg, cg->shape, state_out)
|
||||
: bssn_cuda_download_resident_state_count_if_present(cg, cg->shape, state_out, state_count));
|
||||
if (rc)
|
||||
{
|
||||
cout << "CUDA resident state download failed" << endl;
|
||||
@@ -779,7 +780,7 @@ void bssn_cuda_download_level_state_if_present(MyList<Patch> *PatL, MyList<var>
|
||||
Block *cg = BP->data;
|
||||
if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
|
||||
{
|
||||
double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
|
||||
double *state_out[BSSN_EM_CUDA_STATE_COUNT];
|
||||
if (!fill_bssn_cuda_views_count(cg, vars, state_count, state_out))
|
||||
{
|
||||
cout << "CUDA BSSN state list mismatch on resident state conditional download" << endl;
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -8,6 +8,8 @@ extern "C" {
|
||||
enum {
|
||||
BSSN_CUDA_STATE_COUNT = 24,
|
||||
BSSN_ESCALAR_CUDA_STATE_COUNT = 26,
|
||||
BSSN_EM_CUDA_STATE_COUNT = 32,
|
||||
BSSN_EM_CUDA_SOURCE_COUNT = 4,
|
||||
BSSN_CUDA_MATTER_COUNT = 10
|
||||
};
|
||||
|
||||
@@ -82,6 +84,28 @@ int bssn_escalar_cuda_compute_constraints(int *ex, double *X, double *Y, double
|
||||
int &Lev,
|
||||
double &eps);
|
||||
|
||||
int bssn_em_cuda_rk4_substep(void *block_tag,
|
||||
int *ex, double *X, double *Y, double *Z,
|
||||
double **state_host_in,
|
||||
double **state_host_out,
|
||||
double **source_host,
|
||||
const double *propspeed,
|
||||
const double *soa_flat,
|
||||
const double *bbox,
|
||||
double &dT,
|
||||
double &T,
|
||||
int &RK4,
|
||||
int &apply_bam_bc,
|
||||
int &Symmetry,
|
||||
int &Lev,
|
||||
double &eps,
|
||||
int &co,
|
||||
int &keep_resident_state,
|
||||
int &apply_enforce_ga,
|
||||
double &chitiny);
|
||||
|
||||
int bssn_em_cuda_resident_zero_fast_state(void *block_tag);
|
||||
|
||||
int bssn_cuda_copy_state_region_to_host(void *block_tag,
|
||||
int state_index,
|
||||
double *host_state,
|
||||
|
||||
@@ -150,8 +150,11 @@ def _gpu_runtime_env():
|
||||
"AMSS_CUDA_KEEP_ALL_LEVELS": "1",
|
||||
"AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP": "1",
|
||||
"AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS": "1",
|
||||
"AMSS_CUDA_EM_CACHE_SOURCES": "1",
|
||||
"AMSS_CUDA_EM_ZERO_FASTPATH": "1",
|
||||
"AMSS_EM_ZERO_ANALYSIS_FASTPATH": "1",
|
||||
"AMSS_CUDA_AMR_HOST_STAGED": "1",
|
||||
"AMSS_CUDA_AMR_RESTRICT_DEVICE": "1",
|
||||
"AMSS_CUDA_AMR_RESTRICT_DEVICE": "0",
|
||||
"AMSS_CUDA_AMR_RESTRICT_BATCH": "0",
|
||||
"AMSS_CUDA_DEVICE_SEGMENT_BATCH": "0",
|
||||
"AMSS_CUDA_UNCACHED_DEVICE_BUFFERS": "1",
|
||||
@@ -287,6 +290,9 @@ def run_ABE():
|
||||
print(f" AMSS_CUDA_KEEP_ALL_LEVELS={mpi_env.get('AMSS_CUDA_KEEP_ALL_LEVELS', '')}")
|
||||
print(f" AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP={mpi_env.get('AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP', '')}")
|
||||
print(f" AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS={mpi_env.get('AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS', '')}")
|
||||
print(f" AMSS_CUDA_EM_CACHE_SOURCES={mpi_env.get('AMSS_CUDA_EM_CACHE_SOURCES', '')}")
|
||||
print(f" AMSS_CUDA_EM_ZERO_FASTPATH={mpi_env.get('AMSS_CUDA_EM_ZERO_FASTPATH', '')}")
|
||||
print(f" AMSS_EM_ZERO_ANALYSIS_FASTPATH={mpi_env.get('AMSS_EM_ZERO_ANALYSIS_FASTPATH', '')}")
|
||||
print(f" AMSS_CUDA_AMR_HOST_STAGED={mpi_env.get('AMSS_CUDA_AMR_HOST_STAGED', '')}")
|
||||
print(f" AMSS_CUDA_AMR_RESTRICT_DEVICE={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_DEVICE', '')}")
|
||||
print(f" AMSS_CUDA_AMR_RESTRICT_BATCH={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_BATCH', '')}")
|
||||
|
||||
Reference in New Issue
Block a user