Add EM GPU fast paths and defaults

This commit is contained in:
2026-05-07 12:18:56 +08:00
parent dd0e20d8c7
commit cb911dec06
6 changed files with 1720 additions and 183 deletions

View File

@@ -18,7 +18,7 @@
#endif
#if USE_CUDA_BSSN
#include "bssn_rhs_cuda.h"
#define AMSS_BSSN_CUDA_MAX_STATE_COUNT BSSN_ESCALAR_CUDA_STATE_COUNT
#define AMSS_BSSN_CUDA_MAX_STATE_COUNT BSSN_EM_CUDA_STATE_COUNT
#endif
#if USE_CUDA_Z4C
#include "z4c_rhs_cuda.h"
@@ -181,8 +181,7 @@ bool cuda_build_bssn_host_views(Block *block,
double **views)
{
if (!block || !vars || !views ||
(state_count != BSSN_CUDA_STATE_COUNT &&
state_count != BSSN_ESCALAR_CUDA_STATE_COUNT))
state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT)
return false;
MyList<var> *v = vars;
for (int i = 0; i < state_count; ++i)
@@ -200,8 +199,7 @@ bool cuda_build_bssn_soa(MyList<var> *vars,
double *soa_flat)
{
if (!vars || !soa_flat ||
(state_count != BSSN_CUDA_STATE_COUNT &&
state_count != BSSN_ESCALAR_CUDA_STATE_COUNT))
state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT)
return false;
MyList<var> *v = vars;
for (int i = 0; i < state_count; ++i)
@@ -322,7 +320,7 @@ bool cuda_state_count_direct_supported(int state_count)
#if USE_CUDA_Z4C && (ABEtype == 2)
return state_count == Z4C_CUDA_STATE_COUNT;
#elif USE_CUDA_BSSN
return state_count > 0 && state_count <= BSSN_ESCALAR_CUDA_STATE_COUNT;
return state_count > 0 && state_count <= AMSS_BSSN_CUDA_MAX_STATE_COUNT;
#else
(void)state_count;
return false;
@@ -550,7 +548,8 @@ bool cuda_uncached_device_buffers_enabled(int state_count)
}
if (!enabled)
return false;
if (state_count != BSSN_ESCALAR_CUDA_STATE_COUNT)
if (state_count != BSSN_ESCALAR_CUDA_STATE_COUNT &&
state_count != BSSN_EM_CUDA_STATE_COUNT)
return false;
return cuda_aware_mpi_enabled();
#else
@@ -6136,6 +6135,7 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
MyList<var> *VarList1, MyList<var> *VarList2,
int Symmetry, SyncCache &cache)
{
const double t_transfer = sync_profile_enabled() ? MPI_Wtime() : 0.0;
int myrank;
MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
@@ -6324,6 +6324,13 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
else
data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
}
if (sync_profile_enabled())
{
SyncProfileStats &stats = sync_profile_stats();
stats.finish_calls++;
stats.finish_sec += MPI_Wtime() - t_transfer;
sync_profile_maybe_log();
}
}
void Parallel::Sync_ensure_cache(MyList<Patch> *PatL, int Symmetry, SyncCache &cache)
{

View File

@@ -2,6 +2,7 @@
#ifdef newc
#include <sstream>
#include <cstdio>
#include <cstdlib>
#include <map>
using namespace std;
#else
@@ -10,6 +11,7 @@ using namespace std;
#endif
#include <time.h>
#include <cstring>
#include "macrodef.h"
#include "misc.h"
@@ -19,6 +21,9 @@ using namespace std;
#include "bssnEM_class.h"
#include "bssn_rhs.h"
#include "empart.h"
#if USE_CUDA_BSSN
#include "bssn_rhs_cuda.h"
#endif
#include "initial_puncture.h"
#include "initial_maxwell.h"
#include "enforce_algebra.h"
@@ -36,6 +41,363 @@ using namespace std;
//================================================================================================
namespace
{
MyList<var> *advance_var_list(MyList<var> *vars, int count)
{
while (vars && count > 0)
{
vars = vars->next;
--count;
}
return vars;
}
bool bssn_em_step_timing_enabled()
{
static int enabled = -1;
if (enabled < 0)
{
const char *env = getenv("AMSS_EM_STEP_TIMING");
enabled = (env && atoi(env) != 0) ? 1 : 0;
}
return enabled != 0;
}
bool bssn_em_step_timing_all_levels_enabled()
{
static int enabled = -1;
if (enabled < 0)
{
const char *env = getenv("AMSS_EM_STEP_TIMING_ALL_LEVELS");
enabled = (env && atoi(env) != 0) ? 1 : 0;
}
return enabled != 0;
}
#if USE_CUDA_BSSN
bool bssn_em_zero_analysis_fastpath_enabled()
{
static int enabled = -1;
if (enabled < 0)
{
const char *env = getenv("AMSS_EM_ZERO_ANALYSIS_FASTPATH");
enabled = (!env || atoi(env) != 0) ? 1 : 0;
}
return enabled != 0;
}
bool bssn_em_analysis_zero_fastpath_ready(MyList<Patch> *PatL,
#ifdef WithShell
ShellPatch *shell,
#else
ShellPatch * /*shell*/,
#endif
int rank)
{
if (!bssn_em_zero_analysis_fastpath_enabled())
return false;
int local_ok = 1;
int local_seen = 0;
MyList<Patch> *Pp = PatL;
while (Pp)
{
MyList<Block> *BP = Pp->data->blb;
while (BP)
{
Block *cg = BP->data;
if (rank == cg->rank)
{
local_seen = 1;
if (!bssn_em_cuda_resident_zero_fast_state(cg))
local_ok = 0;
}
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
#ifdef WithShell
if (shell && shell->PatL)
{
MyList<ss_patch> *SP = shell->PatL;
while (SP)
{
MyList<Block> *BP = SP->data->blb;
while (BP)
{
Block *cg = BP->data;
if (rank == cg->rank)
{
local_seen = 1;
if (!bssn_em_cuda_resident_zero_fast_state(cg))
local_ok = 0;
}
if (BP == SP->data->ble)
break;
BP = BP->next;
}
SP = SP->next;
}
}
#endif
int global_ok = 0;
int global_seen = 0;
MPI_Allreduce(&local_ok, &global_ok, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
MPI_Allreduce(&local_seen, &global_seen, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
return global_seen && global_ok;
}
void zero_em_analysis_outputs(MyList<Patch> *PatL,
#ifdef WithShell
ShellPatch *shell,
#else
ShellPatch * /*shell*/,
#endif
int rank,
var *Rphi2_var, var *Iphi2_var,
var *Rphi1_var, var *Iphi1_var)
{
MyList<Patch> *Pp = PatL;
while (Pp)
{
MyList<Block> *BP = Pp->data->blb;
while (BP)
{
Block *cg = BP->data;
if (rank == cg->rank)
{
const size_t all = (size_t)cg->shape[0] * cg->shape[1] * cg->shape[2];
std::memset(cg->fgfs[Rphi2_var->sgfn], 0, all * sizeof(double));
std::memset(cg->fgfs[Iphi2_var->sgfn], 0, all * sizeof(double));
std::memset(cg->fgfs[Rphi1_var->sgfn], 0, all * sizeof(double));
std::memset(cg->fgfs[Iphi1_var->sgfn], 0, all * sizeof(double));
}
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
#ifdef WithShell
if (shell && shell->PatL)
{
MyList<ss_patch> *SP = shell->PatL;
while (SP)
{
MyList<Block> *BP = SP->data->blb;
while (BP)
{
Block *cg = BP->data;
if (rank == cg->rank)
{
const size_t all = (size_t)cg->shape[0] * cg->shape[1] * cg->shape[2];
std::memset(cg->fgfs[Rphi2_var->sgfn], 0, all * sizeof(double));
std::memset(cg->fgfs[Iphi2_var->sgfn], 0, all * sizeof(double));
std::memset(cg->fgfs[Rphi1_var->sgfn], 0, all * sizeof(double));
std::memset(cg->fgfs[Iphi1_var->sgfn], 0, all * sizeof(double));
}
if (BP == SP->data->ble)
break;
BP = BP->next;
}
SP = SP->next;
}
}
#endif
}
#endif
int bssn_em_step_timing_every()
{
static int every = -1;
if (every < 0)
{
const char *env = getenv("AMSS_EM_STEP_TIMING_EVERY");
every = (env && atoi(env) > 0) ? atoi(env) : 1;
}
return every;
}
#if USE_CUDA_BSSN
bool fill_bssn_em_bssn_cuda_views(Block *cg, MyList<var> *vars,
double **host_views,
double *propspeeds = 0,
double *soa_flat = 0)
{
int idx = 0;
while (vars && idx < BSSN_CUDA_STATE_COUNT)
{
host_views[idx] = cg->fgfs[vars->data->sgfn];
if (propspeeds)
propspeeds[idx] = vars->data->propspeed;
if (soa_flat)
{
soa_flat[3 * idx + 0] = vars->data->SoA[0];
soa_flat[3 * idx + 1] = vars->data->SoA[1];
soa_flat[3 * idx + 2] = vars->data->SoA[2];
}
vars = vars->next;
++idx;
}
return idx == BSSN_CUDA_STATE_COUNT;
}
bool fill_bssn_em_cuda_views(Block *cg, MyList<var> *vars,
double **host_views,
double *propspeeds = 0,
double *soa_flat = 0)
{
int idx = 0;
while (vars && idx < BSSN_EM_CUDA_STATE_COUNT)
{
host_views[idx] = cg->fgfs[vars->data->sgfn];
if (propspeeds)
propspeeds[idx] = vars->data->propspeed;
if (soa_flat)
{
soa_flat[3 * idx + 0] = vars->data->SoA[0];
soa_flat[3 * idx + 1] = vars->data->SoA[1];
soa_flat[3 * idx + 2] = vars->data->SoA[2];
}
vars = vars->next;
++idx;
}
return idx == BSSN_EM_CUDA_STATE_COUNT && vars == 0;
}
void fill_bssn_em_fixed_source_cuda_views(Block *cg, double **sources,
var *Jx, var *Jy, var *Jz, var *qchar)
{
sources[0] = cg->fgfs[Jx->sgfn];
sources[1] = cg->fgfs[Jy->sgfn];
sources[2] = cg->fgfs[Jz->sgfn];
sources[3] = cg->fgfs[qchar->sgfn];
}
void fill_bssn_em_matter_cuda_views(Block *cg, double **matter,
var *rho, var *Sx, var *Sy, var *Sz,
var *Sxx, var *Sxy, var *Sxz,
var *Syy, var *Syz, var *Szz)
{
matter[0] = cg->fgfs[rho->sgfn];
matter[1] = cg->fgfs[Sx->sgfn];
matter[2] = cg->fgfs[Sy->sgfn];
matter[3] = cg->fgfs[Sz->sgfn];
matter[4] = cg->fgfs[Sxx->sgfn];
matter[5] = cg->fgfs[Sxy->sgfn];
matter[6] = cg->fgfs[Sxz->sgfn];
matter[7] = cg->fgfs[Syy->sgfn];
matter[8] = cg->fgfs[Syz->sgfn];
matter[9] = cg->fgfs[Szz->sgfn];
}
bool bssn_em_cuda_use_resident_sync(int lev)
{
#ifdef WithShell
(void)lev;
return false;
#else
return true;
#endif
}
bool bssn_em_cuda_keep_resident_after_step(int lev, int trfls_in, int analysis_lev)
{
static int keep_all_levels = -1;
if (keep_all_levels < 0)
{
const char *env = getenv("AMSS_CUDA_KEEP_ALL_LEVELS");
keep_all_levels = (env && atoi(env) != 0) ? 1 : 0;
}
static int enabled = -1;
if (enabled < 0)
{
const char *env = getenv("AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP");
enabled = (env && atoi(env) != 0) ? 1 : 0;
}
if (!enabled)
return false;
if (lev == analysis_lev)
return false;
if (keep_all_levels)
return true;
return lev < trfls_in;
}
void bssn_em_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars,
int myrank, bool release_ctx)
{
MyList<Patch> *Pp = PatL;
while (Pp)
{
MyList<Block> *BP = Pp->data->blb;
while (BP)
{
Block *cg = BP->data;
if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
{
double *state_out[BSSN_EM_CUDA_STATE_COUNT];
if (!fill_bssn_em_cuda_views(cg, vars, state_out))
{
cout << "CUDA BSSN-EM resident state list mismatch during download" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
if (bssn_cuda_download_resident_state_count_if_present(cg, cg->shape,
state_out,
BSSN_EM_CUDA_STATE_COUNT))
{
cout << "CUDA BSSN-EM resident state download failed" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
if (release_ctx)
bssn_cuda_release_step_ctx(cg);
}
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
}
void bssn_em_cuda_keep_only_level_state(MyList<Patch> *PatL, MyList<var> *vars,
int myrank)
{
MyList<Patch> *Pp = PatL;
while (Pp)
{
MyList<Block> *BP = Pp->data->blb;
while (BP)
{
Block *cg = BP->data;
if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
{
double *state_key[BSSN_EM_CUDA_STATE_COUNT];
if (!fill_bssn_em_cuda_views(cg, vars, state_key))
{
cout << "CUDA BSSN-EM resident state list mismatch during prune" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
if (bssn_cuda_keep_only_resident_state_count(cg, cg->shape,
state_key,
BSSN_EM_CUDA_STATE_COUNT))
{
cout << "CUDA BSSN-EM keep-only resident state failed" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
}
#endif
}
// Define bssnEM_class
// It inherits some members and methods from the parent class bssn_class and modifies others.
@@ -258,6 +620,13 @@ void bssnEM_class::Initialize()
PhysTime = StartTime;
Setup_Black_Hole_position();
}
sync_cache_pre = new Parallel::SyncCache[GH->levels];
sync_cache_cor = new Parallel::SyncCache[GH->levels];
sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
sync_cache_restrict = new Parallel::SyncCache[GH->levels];
sync_cache_outbd = new Parallel::SyncCache[GH->levels];
}
//================================================================================================
@@ -833,9 +1202,25 @@ void bssnEM_class::Step(int lev, int YN)
int iter_count = 0; // count RK4 substeps
int pre = 0, cor = 1;
int ERROR = 0;
#if USE_CUDA_BSSN
const bool use_cuda_resident_sync = bssn_em_cuda_use_resident_sync(lev);
#endif
const bool em_step_timing = bssn_em_step_timing_enabled();
const double em_step_t0 = em_step_timing ? MPI_Wtime() : 0.0;
double em_t0 = 0.0;
double em_t_predictor = 0.0;
double em_t_predictor_sync = 0.0;
double em_t_corrector = 0.0;
double em_t_corrector_sync = 0.0;
double em_t_analysis = 0.0;
double em_t_bh = 0.0;
double em_t_swap = 0.0;
double em_t_resident = 0.0;
double em_t_rp = 0.0;
MyList<ss_patch> *sPp;
// Predictor
em_t0 = em_step_timing ? MPI_Wtime() : 0.0;
MyList<Patch> *Pp = GH->PatL[lev];
while (Pp)
{
@@ -845,15 +1230,20 @@ void bssnEM_class::Step(int lev, int YN)
Block *cg = BP->data;
if (myrank == cg->rank)
{
#if !USE_CUDA_BSSN
#if (AGM == 0)
f_enforce_ga(cg->shape,
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
#endif
#endif
if (
int em_rhs_error = 0;
bool used_gpu_substep = false;
#if !USE_CUDA_BSSN
em_rhs_error =
f_compute_rhs_empart(cg->shape, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi0->sgfn],
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
@@ -873,7 +1263,51 @@ void bssnEM_class::Step(int lev, int YN)
cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn],
cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn],
cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn],
Symmetry, lev, ndeps) ||
Symmetry, lev, ndeps);
#endif
#if USE_CUDA_BSSN
if (!em_rhs_error)
{
double *state_in[BSSN_EM_CUDA_STATE_COUNT];
double *state_out[BSSN_EM_CUDA_STATE_COUNT];
double *sources[BSSN_EM_CUDA_SOURCE_COUNT];
double propspeed[BSSN_EM_CUDA_STATE_COUNT];
double soa_flat[3 * BSSN_EM_CUDA_STATE_COUNT];
if (!fill_bssn_em_cuda_views(cg, StateList, state_in, propspeed, soa_flat) ||
!fill_bssn_em_cuda_views(cg, SynchList_pre, state_out))
{
cout << "CUDA BSSN-EM state list mismatch on predictor step" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
fill_bssn_em_fixed_source_cuda_views(cg, sources, Jx, Jy, Jz, qchar);
int apply_bam_bc = 0;
#if (SommerType == 0)
#ifndef WithShell
apply_bam_bc = (lev == 0) ? 1 : 0;
#endif
#endif
int apply_enforce_ga = 0;
#if (AGM == 0)
apply_enforce_ga = 1;
#endif
int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
if (bssn_em_cuda_rk4_substep(cg,
cg->shape, cg->X[0], cg->X[1], cg->X[2],
state_in, state_out, sources,
propspeed, soa_flat, Pp->data->bbox,
dT_lev, TRK4, iter_count, apply_bam_bc,
Symmetry, lev, ndeps, pre,
keep_resident_state, apply_enforce_ga, chitiny))
{
ERROR = 1;
}
used_gpu_substep = true;
}
#endif
if (em_rhs_error ||
(!used_gpu_substep &&
f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
@@ -907,7 +1341,7 @@ void bssnEM_class::Step(int lev, int YN)
cg->fgfs[Cons_Ham->sgfn],
cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
Symmetry, lev, ndeps, pre))
Symmetry, lev, ndeps, pre)))
{
cout << "find NaN in domain: ("
<< cg->bbox[0] << ":" << cg->bbox[3] << ","
@@ -916,6 +1350,8 @@ void bssnEM_class::Step(int lev, int YN)
ERROR = 1;
}
if (!used_gpu_substep)
{
// rk4 substep and boundary
{
MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varlrhs = RHSList;
@@ -958,12 +1394,15 @@ void bssnEM_class::Step(int lev, int YN)
}
f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny);
}
}
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
if (em_step_timing)
em_t_predictor += MPI_Wtime() - em_t0;
// check error information
{
int erh = ERROR;
@@ -1221,7 +1660,11 @@ void bssnEM_class::Step(int lev, int YN)
}
#endif
if (em_step_timing)
em_t0 = MPI_Wtime();
Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
if (em_step_timing)
em_t_predictor_sync += MPI_Wtime() - em_t0;
#ifdef WithShell
if (lev == 0)
@@ -1244,6 +1687,8 @@ void bssnEM_class::Step(int lev, int YN)
// for black hole position
if (BH_num > 0 && lev == GH->levels - 1)
{
if (em_step_timing)
em_t0 = MPI_Wtime();
compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{
@@ -1272,16 +1717,24 @@ void bssnEM_class::Step(int lev, int YN)
DG_List->clearList();
}
}
if (em_step_timing)
em_t_bh += MPI_Wtime() - em_t0;
}
// data analysis part
// Warning NOTE: the variables1 are used as temp storege room
if (lev == a_lev)
{
if (em_step_timing)
em_t0 = MPI_Wtime();
AnalysisStuff_EM(lev, dT_lev);
if (em_step_timing)
em_t_analysis += MPI_Wtime() - em_t0;
}
// corrector
for (iter_count = 1; iter_count < 4; iter_count++)
{
if (em_step_timing)
em_t0 = MPI_Wtime();
// for RK4: t0, t0+dt/2, t0+dt/2, t0+dt;
if (iter_count == 1 || iter_count == 3)
TRK4 += dT_lev / 2;
@@ -1294,6 +1747,7 @@ void bssnEM_class::Step(int lev, int YN)
Block *cg = BP->data;
if (myrank == cg->rank)
{
#if !USE_CUDA_BSSN
#if (AGM == 0)
f_enforce_ga(cg->shape,
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
@@ -1307,9 +1761,13 @@ void bssnEM_class::Step(int lev, int YN)
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
#endif
#endif
if (
int em_rhs_error = 0;
bool used_gpu_substep = false;
#if !USE_CUDA_BSSN
em_rhs_error =
f_compute_rhs_empart(cg->shape, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi->sgfn],
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
@@ -1329,7 +1787,54 @@ void bssnEM_class::Step(int lev, int YN)
cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn],
cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn],
cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn],
Symmetry, lev, ndeps) ||
Symmetry, lev, ndeps);
#endif
#if USE_CUDA_BSSN
if (!em_rhs_error)
{
double *state_in[BSSN_EM_CUDA_STATE_COUNT];
double *state_out[BSSN_EM_CUDA_STATE_COUNT];
double *sources[BSSN_EM_CUDA_SOURCE_COUNT];
double propspeed[BSSN_EM_CUDA_STATE_COUNT];
double soa_flat[3 * BSSN_EM_CUDA_STATE_COUNT];
if (!fill_bssn_em_cuda_views(cg, SynchList_pre, state_in, propspeed, soa_flat) ||
!fill_bssn_em_cuda_views(cg, SynchList_cor, state_out))
{
cout << "CUDA BSSN-EM state list mismatch on corrector step" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
fill_bssn_em_fixed_source_cuda_views(cg, sources, Jx, Jy, Jz, qchar);
int apply_bam_bc = 0;
#if (SommerType == 0)
#ifndef WithShell
apply_bam_bc = (lev == 0) ? 1 : 0;
#endif
#endif
int apply_enforce_ga = 0;
#if (AGM == 0)
apply_enforce_ga = 1;
#elif (AGM == 1)
if (iter_count == 3)
apply_enforce_ga = 1;
#endif
int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
if (bssn_em_cuda_rk4_substep(cg,
cg->shape, cg->X[0], cg->X[1], cg->X[2],
state_in, state_out, sources,
propspeed, soa_flat, Pp->data->bbox,
dT_lev, TRK4, iter_count, apply_bam_bc,
Symmetry, lev, ndeps, cor,
keep_resident_state, apply_enforce_ga, chitiny))
{
ERROR = 1;
}
used_gpu_substep = true;
}
#endif
if (em_rhs_error ||
(!used_gpu_substep &&
f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn],
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
@@ -1362,7 +1867,7 @@ void bssnEM_class::Step(int lev, int YN)
cg->fgfs[Cons_Ham->sgfn],
cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
Symmetry, lev, ndeps, cor))
Symmetry, lev, ndeps, cor)))
{
cout << "find NaN in domain: ("
<< cg->bbox[0] << ":" << cg->bbox[3] << ","
@@ -1370,6 +1875,8 @@ void bssnEM_class::Step(int lev, int YN)
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
if (!used_gpu_substep)
{
// rk4 substep and boundary
{
MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varl1 = SynchList_cor, *varlrhs = RHSList;
@@ -1413,6 +1920,7 @@ void bssnEM_class::Step(int lev, int YN)
}
f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny);
}
}
if (BP == Pp->data->ble)
break;
BP = BP->next;
@@ -1683,7 +2191,13 @@ void bssnEM_class::Step(int lev, int YN)
}
#endif
if (em_step_timing)
em_t_corrector += MPI_Wtime() - em_t0;
if (em_step_timing)
em_t0 = MPI_Wtime();
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
if (em_step_timing)
em_t_corrector_sync += MPI_Wtime() - em_t0;
#ifdef WithShell
if (lev == 0)
@@ -1705,6 +2219,8 @@ void bssnEM_class::Step(int lev, int YN)
// for black hole position
if (BH_num > 0 && lev == GH->levels - 1)
{
if (em_step_timing)
em_t0 = MPI_Wtime();
compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev);
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{
@@ -1733,10 +2249,14 @@ void bssnEM_class::Step(int lev, int YN)
DG_List->clearList();
}
}
if (em_step_timing)
em_t_bh += MPI_Wtime() - em_t0;
}
// swap time level
if (iter_count < 3)
{
if (em_step_timing)
em_t0 = MPI_Wtime();
Pp = GH->PatL[lev];
while (Pp)
{
@@ -1780,12 +2300,30 @@ void bssnEM_class::Step(int lev, int YN)
Porg[ithBH][2] = Porg1[ithBH][2];
}
}
if (em_step_timing)
em_t_swap += MPI_Wtime() - em_t0;
}
}
#if USE_CUDA_BSSN
if (use_cuda_resident_sync)
{
if (em_step_timing)
em_t0 = MPI_Wtime();
if (!bssn_em_cuda_keep_resident_after_step(lev, trfls, a_lev))
bssn_em_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, true);
if (em_step_timing)
em_t_resident += MPI_Wtime() - em_t0;
}
#endif
#if (RPS == 0)
// mesh refinement boundary part
if (em_step_timing)
em_t0 = MPI_Wtime();
RestrictProlong(lev, YN, BB);
if (em_step_timing)
em_t_rp += MPI_Wtime() - em_t0;
#ifdef WithShell
if (lev == 0)
@@ -1813,6 +2351,8 @@ void bssnEM_class::Step(int lev, int YN)
//
// OldStateList old -----------
// update
if (em_step_timing)
em_t0 = MPI_Wtime();
Pp = GH->PatL[lev];
while (Pp)
{
@@ -1858,6 +2398,26 @@ void bssnEM_class::Step(int lev, int YN)
Porg0[ithBH][2] = Porg1[ithBH][2];
}
}
if (em_step_timing)
{
em_t_swap += MPI_Wtime() - em_t0;
static int em_step_report_count = 0;
const int em_timing_every = bssn_em_step_timing_every();
const bool report_all_levels = bssn_em_step_timing_all_levels_enabled();
if (lev == GH->levels - 1)
++em_step_report_count;
if ((report_all_levels || lev == GH->levels - 1) &&
(em_timing_every <= 1 || em_step_report_count % em_timing_every == 0))
{
fprintf(stderr,
"[AMSS-EM-STEP-TIMING] lev=%d wall=%.6f predictor=%.6f pre_sync=%.6f "
"analysis=%.6f corrector=%.6f cor_sync=%.6f bh=%.6f swap=%.6f resident=%.6f rp=%.6f\n",
lev, MPI_Wtime() - em_step_t0,
em_t_predictor, em_t_predictor_sync,
em_t_analysis, em_t_corrector, em_t_corrector_sync,
em_t_bh, em_t_swap, em_t_resident, em_t_rp);
}
}
}
//================================================================================================
@@ -2036,6 +2596,59 @@ void bssnEM_class::AnalysisStuff_EM(int lev, double dT_lev)
if (LastAnas >= AnasTime)
{
#if USE_CUDA_BSSN
const bool zero_em_analysis =
bssn_em_analysis_zero_fastpath_ready(GH->PatL[lev],
#ifdef WithShell
SH
#else
0
#endif
, myrank
);
#else
const bool zero_em_analysis = false;
#endif
if (zero_em_analysis)
{
#if USE_CUDA_BSSN
zero_em_analysis_outputs(GH->PatL[lev],
#ifdef WithShell
SH,
#else
0,
#endif
myrank,
Rphi2, Iphi2, Rphi1, Iphi1);
#endif
int NN = 0;
for (int pl = 1; pl < maxl + 1; pl++)
for (int pm = -pl; pm < pl + 1; pm++)
NN++;
double *RP = new double[NN];
double *IP = new double[NN];
std::memset(RP, 0, NN * sizeof(double));
std::memset(IP, 0, NN * sizeof(double));
for (int i = 0; i < decn; i++)
Phi2Monitor->writefile(PhysTime, NN, RP, IP);
delete[] RP;
delete[] IP;
NN = 0;
for (int pl = 0; pl < maxl + 1; pl++)
for (int pm = -pl; pm < pl + 1; pm++)
NN++;
RP = new double[NN];
IP = new double[NN];
std::memset(RP, 0, NN * sizeof(double));
std::memset(IP, 0, NN * sizeof(double));
for (int i = 0; i < decn; i++)
Phi1Monitor->writefile(PhysTime, NN, RP, IP);
delete[] RP;
delete[] IP;
}
else
{
Compute_Phi2(lev);
double *RP, *IP;
int NN = 0;
@@ -2125,6 +2738,7 @@ void bssnEM_class::AnalysisStuff_EM(int lev, double dT_lev)
delete[] RP;
delete[] IP;
}
}
AnalysisStuff(lev, dT_lev); // LastAnas need and only need control here

View File

@@ -438,7 +438,7 @@ int count_bssn_cuda_state_list(MyList<var> *vars)
{
++count;
vars = vars->next;
if (count > BSSN_ESCALAR_CUDA_STATE_COUNT)
if (count > BSSN_EM_CUDA_STATE_COUNT)
return -1;
}
return count;
@@ -449,8 +449,7 @@ bool fill_bssn_cuda_views_count(Block *cg, MyList<var> *vars,
double **host_views)
{
if (!cg || !host_views ||
(state_count != BSSN_CUDA_STATE_COUNT &&
state_count != BSSN_ESCALAR_CUDA_STATE_COUNT))
state_count <= 0 || state_count > BSSN_EM_CUDA_STATE_COUNT)
return false;
int idx = 0;
while (vars && idx < state_count)
@@ -742,7 +741,7 @@ void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int
Block *cg = BP->data;
if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
{
double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
double *state_out[BSSN_EM_CUDA_STATE_COUNT];
if (!fill_bssn_cuda_views_count(cg, vars, state_count, state_out))
{
cout << "CUDA BSSN state list mismatch on resident state download" << endl;
@@ -750,7 +749,9 @@ void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int
}
const int rc = (state_count == BSSN_ESCALAR_CUDA_STATE_COUNT)
? bssn_escalar_cuda_download_resident_state(cg, cg->shape, state_out)
: bssn_cuda_download_resident_state(cg, cg->shape, state_out);
: ((state_count == BSSN_CUDA_STATE_COUNT)
? bssn_cuda_download_resident_state(cg, cg->shape, state_out)
: bssn_cuda_download_resident_state_count_if_present(cg, cg->shape, state_out, state_count));
if (rc)
{
cout << "CUDA resident state download failed" << endl;
@@ -779,7 +780,7 @@ void bssn_cuda_download_level_state_if_present(MyList<Patch> *PatL, MyList<var>
Block *cg = BP->data;
if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
{
double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
double *state_out[BSSN_EM_CUDA_STATE_COUNT];
if (!fill_bssn_cuda_views_count(cg, vars, state_count, state_out))
{
cout << "CUDA BSSN state list mismatch on resident state conditional download" << endl;

File diff suppressed because it is too large Load Diff

View File

@@ -8,6 +8,8 @@ extern "C" {
enum {
BSSN_CUDA_STATE_COUNT = 24,
BSSN_ESCALAR_CUDA_STATE_COUNT = 26,
BSSN_EM_CUDA_STATE_COUNT = 32,
BSSN_EM_CUDA_SOURCE_COUNT = 4,
BSSN_CUDA_MATTER_COUNT = 10
};
@@ -82,6 +84,28 @@ int bssn_escalar_cuda_compute_constraints(int *ex, double *X, double *Y, double
int &Lev,
double &eps);
int bssn_em_cuda_rk4_substep(void *block_tag,
int *ex, double *X, double *Y, double *Z,
double **state_host_in,
double **state_host_out,
double **source_host,
const double *propspeed,
const double *soa_flat,
const double *bbox,
double &dT,
double &T,
int &RK4,
int &apply_bam_bc,
int &Symmetry,
int &Lev,
double &eps,
int &co,
int &keep_resident_state,
int &apply_enforce_ga,
double &chitiny);
int bssn_em_cuda_resident_zero_fast_state(void *block_tag);
int bssn_cuda_copy_state_region_to_host(void *block_tag,
int state_index,
double *host_state,

View File

@@ -150,8 +150,11 @@ def _gpu_runtime_env():
"AMSS_CUDA_KEEP_ALL_LEVELS": "1",
"AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP": "1",
"AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS": "1",
"AMSS_CUDA_EM_CACHE_SOURCES": "1",
"AMSS_CUDA_EM_ZERO_FASTPATH": "1",
"AMSS_EM_ZERO_ANALYSIS_FASTPATH": "1",
"AMSS_CUDA_AMR_HOST_STAGED": "1",
"AMSS_CUDA_AMR_RESTRICT_DEVICE": "1",
"AMSS_CUDA_AMR_RESTRICT_DEVICE": "0",
"AMSS_CUDA_AMR_RESTRICT_BATCH": "0",
"AMSS_CUDA_DEVICE_SEGMENT_BATCH": "0",
"AMSS_CUDA_UNCACHED_DEVICE_BUFFERS": "1",
@@ -287,6 +290,9 @@ def run_ABE():
print(f" AMSS_CUDA_KEEP_ALL_LEVELS={mpi_env.get('AMSS_CUDA_KEEP_ALL_LEVELS', '')}")
print(f" AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP={mpi_env.get('AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP', '')}")
print(f" AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS={mpi_env.get('AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS', '')}")
print(f" AMSS_CUDA_EM_CACHE_SOURCES={mpi_env.get('AMSS_CUDA_EM_CACHE_SOURCES', '')}")
print(f" AMSS_CUDA_EM_ZERO_FASTPATH={mpi_env.get('AMSS_CUDA_EM_ZERO_FASTPATH', '')}")
print(f" AMSS_EM_ZERO_ANALYSIS_FASTPATH={mpi_env.get('AMSS_EM_ZERO_ANALYSIS_FASTPATH', '')}")
print(f" AMSS_CUDA_AMR_HOST_STAGED={mpi_env.get('AMSS_CUDA_AMR_HOST_STAGED', '')}")
print(f" AMSS_CUDA_AMR_RESTRICT_DEVICE={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_DEVICE', '')}")
print(f" AMSS_CUDA_AMR_RESTRICT_BATCH={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_BATCH', '')}")