From cb911dec06b6a68101d8f66c6c26cc72ad58aabb Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Thu, 7 May 2026 12:18:56 +0800 Subject: [PATCH] Add EM GPU fast paths and defaults --- AMSS_NCKU_source/Parallel.C | 21 +- AMSS_NCKU_source/bssnEM_class.C | 906 ++++++++++++++++++++++++----- AMSS_NCKU_source/bssn_class.C | 13 +- AMSS_NCKU_source/bssn_rhs_cuda.cu | 931 +++++++++++++++++++++++++++++- AMSS_NCKU_source/bssn_rhs_cuda.h | 24 + makefile_and_run.py | 8 +- 6 files changed, 1720 insertions(+), 183 deletions(-) diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index bc37930..b7f2a51 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -18,7 +18,7 @@ #endif #if USE_CUDA_BSSN #include "bssn_rhs_cuda.h" -#define AMSS_BSSN_CUDA_MAX_STATE_COUNT BSSN_ESCALAR_CUDA_STATE_COUNT +#define AMSS_BSSN_CUDA_MAX_STATE_COUNT BSSN_EM_CUDA_STATE_COUNT #endif #if USE_CUDA_Z4C #include "z4c_rhs_cuda.h" @@ -181,8 +181,7 @@ bool cuda_build_bssn_host_views(Block *block, double **views) { if (!block || !vars || !views || - (state_count != BSSN_CUDA_STATE_COUNT && - state_count != BSSN_ESCALAR_CUDA_STATE_COUNT)) + state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT) return false; MyList *v = vars; for (int i = 0; i < state_count; ++i) @@ -200,8 +199,7 @@ bool cuda_build_bssn_soa(MyList *vars, double *soa_flat) { if (!vars || !soa_flat || - (state_count != BSSN_CUDA_STATE_COUNT && - state_count != BSSN_ESCALAR_CUDA_STATE_COUNT)) + state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT) return false; MyList *v = vars; for (int i = 0; i < state_count; ++i) @@ -322,7 +320,7 @@ bool cuda_state_count_direct_supported(int state_count) #if USE_CUDA_Z4C && (ABEtype == 2) return state_count == Z4C_CUDA_STATE_COUNT; #elif USE_CUDA_BSSN - return state_count > 0 && state_count <= BSSN_ESCALAR_CUDA_STATE_COUNT; + return state_count > 0 && state_count <= AMSS_BSSN_CUDA_MAX_STATE_COUNT; #else (void)state_count; return false; @@ -550,7 +548,8 @@ bool cuda_uncached_device_buffers_enabled(int state_count) } if (!enabled) return false; - if (state_count != BSSN_ESCALAR_CUDA_STATE_COUNT) + if (state_count != BSSN_ESCALAR_CUDA_STATE_COUNT && + state_count != BSSN_EM_CUDA_STATE_COUNT) return false; return cuda_aware_mpi_enabled(); #else @@ -6136,6 +6135,7 @@ void Parallel::transfer_cached(MyList **src, MyList *VarList1, MyList *VarList2, int Symmetry, SyncCache &cache) { + const double t_transfer = sync_profile_enabled() ? MPI_Wtime() : 0.0; int myrank; MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize); MPI_Comm_rank(MPI_COMM_WORLD, &myrank); @@ -6324,6 +6324,13 @@ void Parallel::transfer_cached(MyList **src, MyList *PatL, int Symmetry, SyncCache &cache) { diff --git a/AMSS_NCKU_source/bssnEM_class.C b/AMSS_NCKU_source/bssnEM_class.C index 0a9e199..763cf0f 100644 --- a/AMSS_NCKU_source/bssnEM_class.C +++ b/AMSS_NCKU_source/bssnEM_class.C @@ -1,25 +1,30 @@ #ifdef newc -#include -#include -#include +#include +#include +#include +#include using namespace std; #else #include #include #endif -#include +#include +#include #include "macrodef.h" #include "misc.h" #include "Ansorg.h" #include "fmisc.h" #include "Parallel.h" -#include "bssnEM_class.h" -#include "bssn_rhs.h" -#include "empart.h" -#include "initial_puncture.h" +#include "bssnEM_class.h" +#include "bssn_rhs.h" +#include "empart.h" +#if USE_CUDA_BSSN +#include "bssn_rhs_cuda.h" +#endif +#include "initial_puncture.h" #include "initial_maxwell.h" #include "enforce_algebra.h" #include "rungekutta4_rout.h" @@ -34,11 +39,368 @@ using namespace std; #include "myglobal.h" #endif -//================================================================================================ - -// Define bssnEM_class - -// It inherits some members and methods from the parent class bssn_class and modifies others. +//================================================================================================ + +namespace +{ +MyList *advance_var_list(MyList *vars, int count) +{ + while (vars && count > 0) + { + vars = vars->next; + --count; + } + return vars; +} + +bool bssn_em_step_timing_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_EM_STEP_TIMING"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +bool bssn_em_step_timing_all_levels_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_EM_STEP_TIMING_ALL_LEVELS"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +#if USE_CUDA_BSSN +bool bssn_em_zero_analysis_fastpath_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_EM_ZERO_ANALYSIS_FASTPATH"); + enabled = (!env || atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +bool bssn_em_analysis_zero_fastpath_ready(MyList *PatL, +#ifdef WithShell + ShellPatch *shell, +#else + ShellPatch * /*shell*/, +#endif + int rank) +{ + if (!bssn_em_zero_analysis_fastpath_enabled()) + return false; + int local_ok = 1; + int local_seen = 0; + MyList *Pp = PatL; + while (Pp) + { + MyList *BP = Pp->data->blb; + while (BP) + { + Block *cg = BP->data; + if (rank == cg->rank) + { + local_seen = 1; + if (!bssn_em_cuda_resident_zero_fast_state(cg)) + local_ok = 0; + } + if (BP == Pp->data->ble) + break; + BP = BP->next; + } + Pp = Pp->next; + } +#ifdef WithShell + if (shell && shell->PatL) + { + MyList *SP = shell->PatL; + while (SP) + { + MyList *BP = SP->data->blb; + while (BP) + { + Block *cg = BP->data; + if (rank == cg->rank) + { + local_seen = 1; + if (!bssn_em_cuda_resident_zero_fast_state(cg)) + local_ok = 0; + } + if (BP == SP->data->ble) + break; + BP = BP->next; + } + SP = SP->next; + } + } +#endif + int global_ok = 0; + int global_seen = 0; + MPI_Allreduce(&local_ok, &global_ok, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD); + MPI_Allreduce(&local_seen, &global_seen, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD); + return global_seen && global_ok; +} + +void zero_em_analysis_outputs(MyList *PatL, +#ifdef WithShell + ShellPatch *shell, +#else + ShellPatch * /*shell*/, +#endif + int rank, + var *Rphi2_var, var *Iphi2_var, + var *Rphi1_var, var *Iphi1_var) +{ + MyList *Pp = PatL; + while (Pp) + { + MyList *BP = Pp->data->blb; + while (BP) + { + Block *cg = BP->data; + if (rank == cg->rank) + { + const size_t all = (size_t)cg->shape[0] * cg->shape[1] * cg->shape[2]; + std::memset(cg->fgfs[Rphi2_var->sgfn], 0, all * sizeof(double)); + std::memset(cg->fgfs[Iphi2_var->sgfn], 0, all * sizeof(double)); + std::memset(cg->fgfs[Rphi1_var->sgfn], 0, all * sizeof(double)); + std::memset(cg->fgfs[Iphi1_var->sgfn], 0, all * sizeof(double)); + } + if (BP == Pp->data->ble) + break; + BP = BP->next; + } + Pp = Pp->next; + } +#ifdef WithShell + if (shell && shell->PatL) + { + MyList *SP = shell->PatL; + while (SP) + { + MyList *BP = SP->data->blb; + while (BP) + { + Block *cg = BP->data; + if (rank == cg->rank) + { + const size_t all = (size_t)cg->shape[0] * cg->shape[1] * cg->shape[2]; + std::memset(cg->fgfs[Rphi2_var->sgfn], 0, all * sizeof(double)); + std::memset(cg->fgfs[Iphi2_var->sgfn], 0, all * sizeof(double)); + std::memset(cg->fgfs[Rphi1_var->sgfn], 0, all * sizeof(double)); + std::memset(cg->fgfs[Iphi1_var->sgfn], 0, all * sizeof(double)); + } + if (BP == SP->data->ble) + break; + BP = BP->next; + } + SP = SP->next; + } + } +#endif +} +#endif + +int bssn_em_step_timing_every() +{ + static int every = -1; + if (every < 0) + { + const char *env = getenv("AMSS_EM_STEP_TIMING_EVERY"); + every = (env && atoi(env) > 0) ? atoi(env) : 1; + } + return every; +} + +#if USE_CUDA_BSSN +bool fill_bssn_em_bssn_cuda_views(Block *cg, MyList *vars, + double **host_views, + double *propspeeds = 0, + double *soa_flat = 0) +{ + int idx = 0; + while (vars && idx < BSSN_CUDA_STATE_COUNT) + { + host_views[idx] = cg->fgfs[vars->data->sgfn]; + if (propspeeds) + propspeeds[idx] = vars->data->propspeed; + if (soa_flat) + { + soa_flat[3 * idx + 0] = vars->data->SoA[0]; + soa_flat[3 * idx + 1] = vars->data->SoA[1]; + soa_flat[3 * idx + 2] = vars->data->SoA[2]; + } + vars = vars->next; + ++idx; + } + return idx == BSSN_CUDA_STATE_COUNT; +} + +bool fill_bssn_em_cuda_views(Block *cg, MyList *vars, + double **host_views, + double *propspeeds = 0, + double *soa_flat = 0) +{ + int idx = 0; + while (vars && idx < BSSN_EM_CUDA_STATE_COUNT) + { + host_views[idx] = cg->fgfs[vars->data->sgfn]; + if (propspeeds) + propspeeds[idx] = vars->data->propspeed; + if (soa_flat) + { + soa_flat[3 * idx + 0] = vars->data->SoA[0]; + soa_flat[3 * idx + 1] = vars->data->SoA[1]; + soa_flat[3 * idx + 2] = vars->data->SoA[2]; + } + vars = vars->next; + ++idx; + } + return idx == BSSN_EM_CUDA_STATE_COUNT && vars == 0; +} + +void fill_bssn_em_fixed_source_cuda_views(Block *cg, double **sources, + var *Jx, var *Jy, var *Jz, var *qchar) +{ + sources[0] = cg->fgfs[Jx->sgfn]; + sources[1] = cg->fgfs[Jy->sgfn]; + sources[2] = cg->fgfs[Jz->sgfn]; + sources[3] = cg->fgfs[qchar->sgfn]; +} + +void fill_bssn_em_matter_cuda_views(Block *cg, double **matter, + var *rho, var *Sx, var *Sy, var *Sz, + var *Sxx, var *Sxy, var *Sxz, + var *Syy, var *Syz, var *Szz) +{ + matter[0] = cg->fgfs[rho->sgfn]; + matter[1] = cg->fgfs[Sx->sgfn]; + matter[2] = cg->fgfs[Sy->sgfn]; + matter[3] = cg->fgfs[Sz->sgfn]; + matter[4] = cg->fgfs[Sxx->sgfn]; + matter[5] = cg->fgfs[Sxy->sgfn]; + matter[6] = cg->fgfs[Sxz->sgfn]; + matter[7] = cg->fgfs[Syy->sgfn]; + matter[8] = cg->fgfs[Syz->sgfn]; + matter[9] = cg->fgfs[Szz->sgfn]; +} + +bool bssn_em_cuda_use_resident_sync(int lev) +{ +#ifdef WithShell + (void)lev; + return false; +#else + return true; +#endif +} + +bool bssn_em_cuda_keep_resident_after_step(int lev, int trfls_in, int analysis_lev) +{ + static int keep_all_levels = -1; + if (keep_all_levels < 0) + { + const char *env = getenv("AMSS_CUDA_KEEP_ALL_LEVELS"); + keep_all_levels = (env && atoi(env) != 0) ? 1 : 0; + } + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + if (!enabled) + return false; + if (lev == analysis_lev) + return false; + if (keep_all_levels) + return true; + return lev < trfls_in; +} + +void bssn_em_cuda_download_level_state(MyList *PatL, MyList *vars, + int myrank, bool release_ctx) +{ + MyList *Pp = PatL; + while (Pp) + { + MyList *BP = Pp->data->blb; + while (BP) + { + Block *cg = BP->data; + if (myrank == cg->rank && bssn_cuda_has_resident_state(cg)) + { + double *state_out[BSSN_EM_CUDA_STATE_COUNT]; + if (!fill_bssn_em_cuda_views(cg, vars, state_out)) + { + cout << "CUDA BSSN-EM resident state list mismatch during download" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + if (bssn_cuda_download_resident_state_count_if_present(cg, cg->shape, + state_out, + BSSN_EM_CUDA_STATE_COUNT)) + { + cout << "CUDA BSSN-EM resident state download failed" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + if (release_ctx) + bssn_cuda_release_step_ctx(cg); + } + if (BP == Pp->data->ble) + break; + BP = BP->next; + } + Pp = Pp->next; + } +} + +void bssn_em_cuda_keep_only_level_state(MyList *PatL, MyList *vars, + int myrank) +{ + MyList *Pp = PatL; + while (Pp) + { + MyList *BP = Pp->data->blb; + while (BP) + { + Block *cg = BP->data; + if (myrank == cg->rank && bssn_cuda_has_resident_state(cg)) + { + double *state_key[BSSN_EM_CUDA_STATE_COUNT]; + if (!fill_bssn_em_cuda_views(cg, vars, state_key)) + { + cout << "CUDA BSSN-EM resident state list mismatch during prune" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + if (bssn_cuda_keep_only_resident_state_count(cg, cg->shape, + state_key, + BSSN_EM_CUDA_STATE_COUNT)) + { + cout << "CUDA BSSN-EM keep-only resident state failed" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + if (BP == Pp->data->ble) + break; + BP = BP->next; + } + Pp = Pp->next; + } +} +#endif +} + +// Define bssnEM_class + +// It inherits some members and methods from the parent class bssn_class and modifies others. // The modified members and methods are defined below (and in the header bssnEM_class.h). // The remaining members are inherited from the parent class bssn_class (declared in bssn_class.h). @@ -253,12 +615,19 @@ void bssnEM_class::Initialize() { CheckPoint->read_Black_Hole_position(BH_num_input, BH_num, Porg0, Pmom, Spin, Mass, Porgbr, Porg, Porg1, Porg_rhs); } - else - { - PhysTime = StartTime; - Setup_Black_Hole_position(); - } -} + else + { + PhysTime = StartTime; + Setup_Black_Hole_position(); + } + + sync_cache_pre = new Parallel::SyncCache[GH->levels]; + sync_cache_cor = new Parallel::SyncCache[GH->levels]; + sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels]; + sync_cache_rp_fine = new Parallel::SyncCache[GH->levels]; + sync_cache_restrict = new Parallel::SyncCache[GH->levels]; + sync_cache_outbd = new Parallel::SyncCache[GH->levels]; +} //================================================================================================ @@ -829,14 +1198,30 @@ void bssnEM_class::Step(int lev, int YN) double ndeps = numepss; if (lev < GH->movls) ndeps = numepsb; - double TRK4 = PhysTime; - int iter_count = 0; // count RK4 substeps - int pre = 0, cor = 1; - int ERROR = 0; - - MyList *sPp; - // Predictor - MyList *Pp = GH->PatL[lev]; + double TRK4 = PhysTime; + int iter_count = 0; // count RK4 substeps + int pre = 0, cor = 1; + int ERROR = 0; +#if USE_CUDA_BSSN + const bool use_cuda_resident_sync = bssn_em_cuda_use_resident_sync(lev); +#endif + const bool em_step_timing = bssn_em_step_timing_enabled(); + const double em_step_t0 = em_step_timing ? MPI_Wtime() : 0.0; + double em_t0 = 0.0; + double em_t_predictor = 0.0; + double em_t_predictor_sync = 0.0; + double em_t_corrector = 0.0; + double em_t_corrector_sync = 0.0; + double em_t_analysis = 0.0; + double em_t_bh = 0.0; + double em_t_swap = 0.0; + double em_t_resident = 0.0; + double em_t_rp = 0.0; + + MyList *sPp; + // Predictor + em_t0 = em_step_timing ? MPI_Wtime() : 0.0; + MyList *Pp = GH->PatL[lev]; while (Pp) { MyList *BP = Pp->data->blb; @@ -845,16 +1230,21 @@ void bssnEM_class::Step(int lev, int YN) Block *cg = BP->data; if (myrank == cg->rank) { -#if (AGM == 0) - f_enforce_ga(cg->shape, - cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], - cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], - cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], - cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]); -#endif +#if !USE_CUDA_BSSN +#if (AGM == 0) + f_enforce_ga(cg->shape, + cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], + cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], + cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], + cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]); +#endif +#endif - if ( - f_compute_rhs_empart(cg->shape, cg->X[0], cg->X[1], cg->X[2], + int em_rhs_error = 0; + bool used_gpu_substep = false; +#if !USE_CUDA_BSSN + em_rhs_error = + f_compute_rhs_empart(cg->shape, cg->X[0], cg->X[1], cg->X[2], cg->fgfs[phi0->sgfn], cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], @@ -871,10 +1261,54 @@ void bssnEM_class::Step(int lev, int YN) cg->fgfs[Kpsi_rhs->sgfn], cg->fgfs[Kphi_rhs->sgfn], cg->fgfs[rho->sgfn], cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn], - cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn], - cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn], - Symmetry, lev, ndeps) || - f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], + cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn], + cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn], + Symmetry, lev, ndeps); +#endif + +#if USE_CUDA_BSSN + if (!em_rhs_error) + { + double *state_in[BSSN_EM_CUDA_STATE_COUNT]; + double *state_out[BSSN_EM_CUDA_STATE_COUNT]; + double *sources[BSSN_EM_CUDA_SOURCE_COUNT]; + double propspeed[BSSN_EM_CUDA_STATE_COUNT]; + double soa_flat[3 * BSSN_EM_CUDA_STATE_COUNT]; + if (!fill_bssn_em_cuda_views(cg, StateList, state_in, propspeed, soa_flat) || + !fill_bssn_em_cuda_views(cg, SynchList_pre, state_out)) + { + cout << "CUDA BSSN-EM state list mismatch on predictor step" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + fill_bssn_em_fixed_source_cuda_views(cg, sources, Jx, Jy, Jz, qchar); + int apply_bam_bc = 0; +#if (SommerType == 0) +#ifndef WithShell + apply_bam_bc = (lev == 0) ? 1 : 0; +#endif +#endif + int apply_enforce_ga = 0; +#if (AGM == 0) + apply_enforce_ga = 1; +#endif + int keep_resident_state = use_cuda_resident_sync ? 1 : 0; + if (bssn_em_cuda_rk4_substep(cg, + cg->shape, cg->X[0], cg->X[1], cg->X[2], + state_in, state_out, sources, + propspeed, soa_flat, Pp->data->bbox, + dT_lev, TRK4, iter_count, apply_bam_bc, + Symmetry, lev, ndeps, pre, + keep_resident_state, apply_enforce_ga, chitiny)) + { + ERROR = 1; + } + used_gpu_substep = true; + } +#endif + + if (em_rhs_error || + (!used_gpu_substep && + f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn], cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], @@ -907,7 +1341,7 @@ void bssnEM_class::Step(int lev, int YN) cg->fgfs[Cons_Ham->sgfn], cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn], cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn], - Symmetry, lev, ndeps, pre)) + Symmetry, lev, ndeps, pre))) { cout << "find NaN in domain: (" << cg->bbox[0] << ":" << cg->bbox[3] << "," @@ -916,9 +1350,11 @@ void bssnEM_class::Step(int lev, int YN) ERROR = 1; } - // rk4 substep and boundary - { - MyList *varl0 = StateList, *varl = SynchList_pre, *varlrhs = RHSList; + if (!used_gpu_substep) + { + // rk4 substep and boundary + { + MyList *varl0 = StateList, *varl = SynchList_pre, *varlrhs = RHSList; // we do not check the correspondence here while (varl0) @@ -955,17 +1391,20 @@ void bssnEM_class::Step(int lev, int YN) varl = varl->next; varlrhs = varlrhs->next; } - } - f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny); - } + } + f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny); + } + } if (BP == Pp->data->ble) break; BP = BP->next; - } - Pp = Pp->next; - } - // check error information - { + } + Pp = Pp->next; + } + if (em_step_timing) + em_t_predictor += MPI_Wtime() - em_t0; + // check error information + { int erh = ERROR; MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); } @@ -1221,7 +1660,11 @@ void bssnEM_class::Step(int lev, int YN) } #endif - Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]); + if (em_step_timing) + em_t0 = MPI_Wtime(); + Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]); + if (em_step_timing) + em_t_predictor_sync += MPI_Wtime() - em_t0; #ifdef WithShell if (lev == 0) @@ -1241,12 +1684,14 @@ void bssnEM_class::Step(int lev, int YN) } #endif - // for black hole position - if (BH_num > 0 && lev == GH->levels - 1) - { - compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev); - for (int ithBH = 0; ithBH < BH_num; ithBH++) - { + // for black hole position + if (BH_num > 0 && lev == GH->levels - 1) + { + if (em_step_timing) + em_t0 = MPI_Wtime(); + compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev); + for (int ithBH = 0; ithBH < BH_num; ithBH++) + { f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count); f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg[ithBH][1], Porg_rhs[ithBH][1], iter_count); f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg[ithBH][2], Porg_rhs[ithBH][2], iter_count); @@ -1269,20 +1714,28 @@ void bssnEM_class::Step(int lev, int YN) DG_List->insert(Sfy0); DG_List->insert(Sfz0); Parallel::Dump_Data(GH->PatL[lev], DG_List, 0, PhysTime, dT_lev); - DG_List->clearList(); - } - } - } - // data analysis part - // Warning NOTE: the variables1 are used as temp storege room - if (lev == a_lev) - { - AnalysisStuff_EM(lev, dT_lev); - } - // corrector - for (iter_count = 1; iter_count < 4; iter_count++) - { - // for RK4: t0, t0+dt/2, t0+dt/2, t0+dt; + DG_List->clearList(); + } + } + if (em_step_timing) + em_t_bh += MPI_Wtime() - em_t0; + } + // data analysis part + // Warning NOTE: the variables1 are used as temp storege room + if (lev == a_lev) + { + if (em_step_timing) + em_t0 = MPI_Wtime(); + AnalysisStuff_EM(lev, dT_lev); + if (em_step_timing) + em_t_analysis += MPI_Wtime() - em_t0; + } + // corrector + for (iter_count = 1; iter_count < 4; iter_count++) + { + if (em_step_timing) + em_t0 = MPI_Wtime(); + // for RK4: t0, t0+dt/2, t0+dt/2, t0+dt; if (iter_count == 1 || iter_count == 3) TRK4 += dT_lev / 2; Pp = GH->PatL[lev]; @@ -1294,10 +1747,11 @@ void bssnEM_class::Step(int lev, int YN) Block *cg = BP->data; if (myrank == cg->rank) { -#if (AGM == 0) - f_enforce_ga(cg->shape, - cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], - cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], +#if !USE_CUDA_BSSN +#if (AGM == 0) + f_enforce_ga(cg->shape, + cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], + cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]); #elif (AGM == 1) @@ -1305,12 +1759,16 @@ void bssnEM_class::Step(int lev, int YN) f_enforce_ga(cg->shape, cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], - cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], - cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]); -#endif + cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], + cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]); +#endif +#endif - if ( - f_compute_rhs_empart(cg->shape, cg->X[0], cg->X[1], cg->X[2], + int em_rhs_error = 0; + bool used_gpu_substep = false; +#if !USE_CUDA_BSSN + em_rhs_error = + f_compute_rhs_empart(cg->shape, cg->X[0], cg->X[1], cg->X[2], cg->fgfs[phi->sgfn], cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], @@ -1327,10 +1785,57 @@ void bssnEM_class::Step(int lev, int YN) cg->fgfs[Kpsi1->sgfn], cg->fgfs[Kphi1->sgfn], cg->fgfs[rho->sgfn], cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn], - cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn], - cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn], - Symmetry, lev, ndeps) || - f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], + cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn], + cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn], + Symmetry, lev, ndeps); +#endif + +#if USE_CUDA_BSSN + if (!em_rhs_error) + { + double *state_in[BSSN_EM_CUDA_STATE_COUNT]; + double *state_out[BSSN_EM_CUDA_STATE_COUNT]; + double *sources[BSSN_EM_CUDA_SOURCE_COUNT]; + double propspeed[BSSN_EM_CUDA_STATE_COUNT]; + double soa_flat[3 * BSSN_EM_CUDA_STATE_COUNT]; + if (!fill_bssn_em_cuda_views(cg, SynchList_pre, state_in, propspeed, soa_flat) || + !fill_bssn_em_cuda_views(cg, SynchList_cor, state_out)) + { + cout << "CUDA BSSN-EM state list mismatch on corrector step" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + fill_bssn_em_fixed_source_cuda_views(cg, sources, Jx, Jy, Jz, qchar); + int apply_bam_bc = 0; +#if (SommerType == 0) +#ifndef WithShell + apply_bam_bc = (lev == 0) ? 1 : 0; +#endif +#endif + int apply_enforce_ga = 0; +#if (AGM == 0) + apply_enforce_ga = 1; +#elif (AGM == 1) + if (iter_count == 3) + apply_enforce_ga = 1; +#endif + int keep_resident_state = use_cuda_resident_sync ? 1 : 0; + if (bssn_em_cuda_rk4_substep(cg, + cg->shape, cg->X[0], cg->X[1], cg->X[2], + state_in, state_out, sources, + propspeed, soa_flat, Pp->data->bbox, + dT_lev, TRK4, iter_count, apply_bam_bc, + Symmetry, lev, ndeps, cor, + keep_resident_state, apply_enforce_ga, chitiny)) + { + ERROR = 1; + } + used_gpu_substep = true; + } +#endif + + if (em_rhs_error || + (!used_gpu_substep && + f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn], cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], @@ -1362,7 +1867,7 @@ void bssnEM_class::Step(int lev, int YN) cg->fgfs[Cons_Ham->sgfn], cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn], cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn], - Symmetry, lev, ndeps, cor)) + Symmetry, lev, ndeps, cor))) { cout << "find NaN in domain: (" << cg->bbox[0] << ":" << cg->bbox[3] << "," @@ -1370,9 +1875,11 @@ void bssnEM_class::Step(int lev, int YN) << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; ERROR = 1; } - // rk4 substep and boundary - { - MyList *varl0 = StateList, *varl = SynchList_pre, *varl1 = SynchList_cor, *varlrhs = RHSList; + if (!used_gpu_substep) + { + // rk4 substep and boundary + { + MyList *varl0 = StateList, *varl = SynchList_pre, *varl1 = SynchList_cor, *varlrhs = RHSList; // we do not check the correspondence here while (varl0) @@ -1410,9 +1917,10 @@ void bssnEM_class::Step(int lev, int YN) varl1 = varl1->next; varlrhs = varlrhs->next; } - } - f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny); - } + } + f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny); + } + } if (BP == Pp->data->ble) break; BP = BP->next; @@ -1683,7 +2191,13 @@ void bssnEM_class::Step(int lev, int YN) } #endif + if (em_step_timing) + em_t_corrector += MPI_Wtime() - em_t0; + if (em_step_timing) + em_t0 = MPI_Wtime(); Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]); + if (em_step_timing) + em_t_corrector_sync += MPI_Wtime() - em_t0; #ifdef WithShell if (lev == 0) @@ -1702,12 +2216,14 @@ void bssnEM_class::Step(int lev, int YN) } } #endif - // for black hole position - if (BH_num > 0 && lev == GH->levels - 1) - { - compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev); - for (int ithBH = 0; ithBH < BH_num; ithBH++) - { + // for black hole position + if (BH_num > 0 && lev == GH->levels - 1) + { + if (em_step_timing) + em_t0 = MPI_Wtime(); + compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev); + for (int ithBH = 0; ithBH < BH_num; ithBH++) + { f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg1[ithBH][0], Porg_rhs[ithBH][0], iter_count); f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg1[ithBH][1], Porg_rhs[ithBH][1], iter_count); f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg1[ithBH][2], Porg_rhs[ithBH][2], iter_count); @@ -1730,16 +2246,20 @@ void bssnEM_class::Step(int lev, int YN) DG_List->insert(Sfy0); DG_List->insert(Sfz0); Parallel::Dump_Data(GH->PatL[lev], DG_List, 0, PhysTime, dT_lev); - DG_List->clearList(); - } - } - } - // swap time level - if (iter_count < 3) - { - Pp = GH->PatL[lev]; - while (Pp) - { + DG_List->clearList(); + } + } + if (em_step_timing) + em_t_bh += MPI_Wtime() - em_t0; + } + // swap time level + if (iter_count < 3) + { + if (em_step_timing) + em_t0 = MPI_Wtime(); + Pp = GH->PatL[lev]; + while (Pp) + { MyList *BP = Pp->data->blb; while (BP) { @@ -1777,15 +2297,33 @@ void bssnEM_class::Step(int lev, int YN) { Porg[ithBH][0] = Porg1[ithBH][0]; Porg[ithBH][1] = Porg1[ithBH][1]; - Porg[ithBH][2] = Porg1[ithBH][2]; - } - } - } - } - -#if (RPS == 0) - // mesh refinement boundary part - RestrictProlong(lev, YN, BB); + Porg[ithBH][2] = Porg1[ithBH][2]; + } + } + if (em_step_timing) + em_t_swap += MPI_Wtime() - em_t0; + } + } + +#if USE_CUDA_BSSN + if (use_cuda_resident_sync) + { + if (em_step_timing) + em_t0 = MPI_Wtime(); + if (!bssn_em_cuda_keep_resident_after_step(lev, trfls, a_lev)) + bssn_em_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, true); + if (em_step_timing) + em_t_resident += MPI_Wtime() - em_t0; + } +#endif + +#if (RPS == 0) + // mesh refinement boundary part + if (em_step_timing) + em_t0 = MPI_Wtime(); + RestrictProlong(lev, YN, BB); + if (em_step_timing) + em_t_rp += MPI_Wtime() - em_t0; #ifdef WithShell if (lev == 0) @@ -1807,15 +2345,17 @@ void bssnEM_class::Step(int lev, int YN) #endif // note the data structure before update - // SynchList_cor 1 ----------- - // - // StateList 0 ----------- - // - // OldStateList old ----------- - // update - Pp = GH->PatL[lev]; - while (Pp) - { + // SynchList_cor 1 ----------- + // + // StateList 0 ----------- + // + // OldStateList old ----------- + // update + if (em_step_timing) + em_t0 = MPI_Wtime(); + Pp = GH->PatL[lev]; + while (Pp) + { MyList *BP = Pp->data->blb; while (BP) { @@ -1849,16 +2389,36 @@ void bssnEM_class::Step(int lev, int YN) } #endif // for black hole position - if (BH_num > 0 && lev == GH->levels - 1) - { - for (int ithBH = 0; ithBH < BH_num; ithBH++) - { - Porg0[ithBH][0] = Porg1[ithBH][0]; - Porg0[ithBH][1] = Porg1[ithBH][1]; - Porg0[ithBH][2] = Porg1[ithBH][2]; - } - } -} + if (BH_num > 0 && lev == GH->levels - 1) + { + for (int ithBH = 0; ithBH < BH_num; ithBH++) + { + Porg0[ithBH][0] = Porg1[ithBH][0]; + Porg0[ithBH][1] = Porg1[ithBH][1]; + Porg0[ithBH][2] = Porg1[ithBH][2]; + } + } + if (em_step_timing) + { + em_t_swap += MPI_Wtime() - em_t0; + static int em_step_report_count = 0; + const int em_timing_every = bssn_em_step_timing_every(); + const bool report_all_levels = bssn_em_step_timing_all_levels_enabled(); + if (lev == GH->levels - 1) + ++em_step_report_count; + if ((report_all_levels || lev == GH->levels - 1) && + (em_timing_every <= 1 || em_step_report_count % em_timing_every == 0)) + { + fprintf(stderr, + "[AMSS-EM-STEP-TIMING] lev=%d wall=%.6f predictor=%.6f pre_sync=%.6f " + "analysis=%.6f corrector=%.6f cor_sync=%.6f bh=%.6f swap=%.6f resident=%.6f rp=%.6f\n", + lev, MPI_Wtime() - em_step_t0, + em_t_predictor, em_t_predictor_sync, + em_t_analysis, em_t_corrector, em_t_corrector_sync, + em_t_bh, em_t_swap, em_t_resident, em_t_rp); + } + } +} //================================================================================================ @@ -2034,11 +2594,64 @@ void bssnEM_class::AnalysisStuff_EM(int lev, double dT_lev) { LastAnas += dT_lev; - if (LastAnas >= AnasTime) - { - Compute_Phi2(lev); - double *RP, *IP; - int NN = 0; + if (LastAnas >= AnasTime) + { +#if USE_CUDA_BSSN + const bool zero_em_analysis = + bssn_em_analysis_zero_fastpath_ready(GH->PatL[lev], +#ifdef WithShell + SH +#else + 0 +#endif + , myrank + ); +#else + const bool zero_em_analysis = false; +#endif + if (zero_em_analysis) + { +#if USE_CUDA_BSSN + zero_em_analysis_outputs(GH->PatL[lev], +#ifdef WithShell + SH, +#else + 0, +#endif + myrank, + Rphi2, Iphi2, Rphi1, Iphi1); +#endif + int NN = 0; + for (int pl = 1; pl < maxl + 1; pl++) + for (int pm = -pl; pm < pl + 1; pm++) + NN++; + double *RP = new double[NN]; + double *IP = new double[NN]; + std::memset(RP, 0, NN * sizeof(double)); + std::memset(IP, 0, NN * sizeof(double)); + for (int i = 0; i < decn; i++) + Phi2Monitor->writefile(PhysTime, NN, RP, IP); + delete[] RP; + delete[] IP; + + NN = 0; + for (int pl = 0; pl < maxl + 1; pl++) + for (int pm = -pl; pm < pl + 1; pm++) + NN++; + RP = new double[NN]; + IP = new double[NN]; + std::memset(RP, 0, NN * sizeof(double)); + std::memset(IP, 0, NN * sizeof(double)); + for (int i = 0; i < decn; i++) + Phi1Monitor->writefile(PhysTime, NN, RP, IP); + delete[] RP; + delete[] IP; + } + else + { + Compute_Phi2(lev); + double *RP, *IP; + int NN = 0; // for phi2 for (int pl = 1; pl < maxl + 1; pl++) for (int pm = -pl; pm < pl + 1; pm++) @@ -2121,10 +2734,11 @@ void bssnEM_class::AnalysisStuff_EM(int lev, double dT_lev) #endif Phi1Monitor->writefile(PhysTime, NN, RP, IP); Rex = Rex - drex; - } - delete[] RP; - delete[] IP; - } + } + delete[] RP; + delete[] IP; + } + } AnalysisStuff(lev, dT_lev); // LastAnas need and only need control here diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index d966a74..e4d6510 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -438,7 +438,7 @@ int count_bssn_cuda_state_list(MyList *vars) { ++count; vars = vars->next; - if (count > BSSN_ESCALAR_CUDA_STATE_COUNT) + if (count > BSSN_EM_CUDA_STATE_COUNT) return -1; } return count; @@ -449,8 +449,7 @@ bool fill_bssn_cuda_views_count(Block *cg, MyList *vars, double **host_views) { if (!cg || !host_views || - (state_count != BSSN_CUDA_STATE_COUNT && - state_count != BSSN_ESCALAR_CUDA_STATE_COUNT)) + state_count <= 0 || state_count > BSSN_EM_CUDA_STATE_COUNT) return false; int idx = 0; while (vars && idx < state_count) @@ -742,7 +741,7 @@ void bssn_cuda_download_level_state(MyList *PatL, MyList *vars, int Block *cg = BP->data; if (myrank == cg->rank && bssn_cuda_has_resident_state(cg)) { - double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT]; + double *state_out[BSSN_EM_CUDA_STATE_COUNT]; if (!fill_bssn_cuda_views_count(cg, vars, state_count, state_out)) { cout << "CUDA BSSN state list mismatch on resident state download" << endl; @@ -750,7 +749,9 @@ void bssn_cuda_download_level_state(MyList *PatL, MyList *vars, int } const int rc = (state_count == BSSN_ESCALAR_CUDA_STATE_COUNT) ? bssn_escalar_cuda_download_resident_state(cg, cg->shape, state_out) - : bssn_cuda_download_resident_state(cg, cg->shape, state_out); + : ((state_count == BSSN_CUDA_STATE_COUNT) + ? bssn_cuda_download_resident_state(cg, cg->shape, state_out) + : bssn_cuda_download_resident_state_count_if_present(cg, cg->shape, state_out, state_count)); if (rc) { cout << "CUDA resident state download failed" << endl; @@ -779,7 +780,7 @@ void bssn_cuda_download_level_state_if_present(MyList *PatL, MyList Block *cg = BP->data; if (myrank == cg->rank && bssn_cuda_has_resident_state(cg)) { - double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT]; + double *state_out[BSSN_EM_CUDA_STATE_COUNT]; if (!fill_bssn_cuda_views_count(cg, vars, state_count, state_out)) { cout << "CUDA BSSN state list mismatch on resident state conditional download" << endl; diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index 4168f71..2e18a48 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -413,7 +413,7 @@ __device__ __forceinline__ double fetch_sym_ord3_direct(const double *src, */ /* Total number of "all"-sized slots */ -#define NUM_SLOTS 192 +#define NUM_SLOTS 256 struct GpuBuffers { double *d_mem; /* single big allocation */ @@ -486,6 +486,19 @@ enum { S_Sphi_x, S_Sphi_y, S_Sphi_z, S_Sphi_xx, S_Sphi_xy, S_Sphi_xz, S_Sphi_yy, S_Sphi_yz, S_Sphi_zz, S_trK_x, S_trK_y, S_trK_z, + S_EM_Kpsi, S_EM_Kphi, + S_EM_Ex, S_EM_Ey, S_EM_Ez, S_EM_Bx, S_EM_By, S_EM_Bz, + S_EM_Jx, S_EM_Jy, S_EM_Jz, S_EM_qchar, + S_EM_Kpsi_rhs, S_EM_Kphi_rhs, + S_EM_Ex_rhs, S_EM_Ey_rhs, S_EM_Ez_rhs, S_EM_Bx_rhs, S_EM_By_rhs, S_EM_Bz_rhs, + S_EM_Kpsix, S_EM_Kpsiy, S_EM_Kpsiz, + S_EM_Kphix, S_EM_Kphiy, S_EM_Kphiz, + S_EM_Exx, S_EM_Exy, S_EM_Exz, + S_EM_Eyx, S_EM_Eyy, S_EM_Eyz, + S_EM_Ezx, S_EM_Ezy, S_EM_Ezz, + S_EM_Bxx, S_EM_Bxy, S_EM_Bxz, + S_EM_Byx, S_EM_Byy, S_EM_Byz, + S_EM_Bzx, S_EM_Bzy, S_EM_Bzz, NUM_USED_SLOTS }; @@ -503,9 +516,12 @@ static constexpr int BSSN_STATE_COUNT = 24; static constexpr int BSSN_MATTER_COUNT = 10; static constexpr int BSSN_LK_FIELD_COUNT = 24; static constexpr int BSSN_ESCALAR_LK_FIELD_COUNT = 26; +static constexpr int BSSN_EM_STATE_COUNT = 32; +static constexpr int BSSN_EM_SOURCE_COUNT = 4; +static constexpr int BSSN_EM_LK_FIELD_COUNT = 32; static constexpr int BSSN_RESIDENT_BANK_COUNT = 6; static constexpr int BSSN_ESCALAR_STATE_COUNT = 26; -static constexpr int BSSN_RESIDENT_STATE_CAPACITY = BSSN_ESCALAR_STATE_COUNT; +static constexpr int BSSN_RESIDENT_STATE_CAPACITY = BSSN_EM_STATE_COUNT; static const int k_state_input_slots[BSSN_STATE_COUNT] = { S_chi, S_trK, S_dxx, S_gxy, S_gxz, S_dyy, S_gyz, S_dzz, @@ -543,6 +559,29 @@ static const int k_escalar_state_rhs_slots[BSSN_ESCALAR_STATE_COUNT] = { S_Sphi_rhs, S_Spi_rhs }; +static const int k_em_state_input_slots[BSSN_EM_STATE_COUNT] = { + S_chi, S_trK, S_dxx, S_gxy, S_gxz, S_dyy, S_gyz, S_dzz, + S_Axx, S_Axy, S_Axz, S_Ayy, S_Ayz, S_Azz, + S_Gamx, S_Gamy, S_Gamz, + S_Lap, S_betax, S_betay, S_betaz, + S_dtSfx, S_dtSfy, S_dtSfz, + S_EM_Kpsi, S_EM_Kphi, + S_EM_Ex, S_EM_Ey, S_EM_Ez, + S_EM_Bx, S_EM_By, S_EM_Bz +}; + +static const int k_em_state_rhs_slots[BSSN_EM_STATE_COUNT] = { + S_chi_rhs, S_trK_rhs, + S_gxx_rhs, S_gxy_rhs, S_gxz_rhs, S_gyy_rhs, S_gyz_rhs, S_gzz_rhs, + S_Axx_rhs, S_Axy_rhs, S_Axz_rhs, S_Ayy_rhs, S_Ayz_rhs, S_Azz_rhs, + S_Gamx_rhs, S_Gamy_rhs, S_Gamz_rhs, + S_Lap_rhs, S_betax_rhs, S_betay_rhs, S_betaz_rhs, + S_dtSfx_rhs, S_dtSfy_rhs, S_dtSfz_rhs, + S_EM_Kpsi_rhs, S_EM_Kphi_rhs, + S_EM_Ex_rhs, S_EM_Ey_rhs, S_EM_Ez_rhs, + S_EM_Bx_rhs, S_EM_By_rhs, S_EM_Bz_rhs +}; + static const int k_matter_slots[BSSN_MATTER_COUNT] = { S_rho, S_Sx, S_Sy, S_Sz, S_Sxx, S_Sxy, S_Sxz, S_Syy, S_Syz, S_Szz }; @@ -603,6 +642,7 @@ struct StepContext { double *d_state_next_mem; std::array d_resident_mem; double *d_matter_mem; + double *d_em_source_mem; double *d_comm_mem; double *h_comm_mem; std::array d_state0; @@ -615,11 +655,16 @@ struct StepContext { std::array resident_age; std::array resident_valid; std::array d_matter; + std::array d_em_source; + std::array em_source_host; size_t cap_all; size_t cap_comm; bool h_comm_pinned; size_t cap_h_comm; bool matter_ready; + bool em_source_ready; + bool em_zero_fast_known; + bool em_zero_fast; bool state_ready; int current_bank; unsigned long long resident_clock; @@ -628,9 +673,11 @@ struct StepContext { : d_state0_mem(nullptr), d_accum_mem(nullptr), d_state_curr_mem(nullptr), d_state_next_mem(nullptr), d_resident_mem{}, - d_matter_mem(nullptr), d_comm_mem(nullptr), h_comm_mem(nullptr), + d_matter_mem(nullptr), d_em_source_mem(nullptr), + d_comm_mem(nullptr), h_comm_mem(nullptr), cap_all(0), cap_comm(0), h_comm_pinned(false), cap_h_comm(0), - matter_ready(false), state_ready(false), + matter_ready(false), em_source_ready(false), + em_zero_fast_known(false), em_zero_fast(false), state_ready(false), current_bank(-1), resident_clock(0) { d_resident_mem.fill(nullptr); @@ -646,6 +693,8 @@ struct StepContext { resident_age.fill(0); resident_valid.fill(false); d_matter.fill(nullptr); + d_em_source.fill(nullptr); + em_source_host.fill(nullptr); } }; @@ -654,6 +703,7 @@ struct StepAllocation { double *d_accum_mem; std::array d_resident_mem; double *d_matter_mem; + double *d_em_source_mem; double *d_comm_mem; double *h_comm_mem; size_t cap_all; @@ -666,6 +716,7 @@ static std::unordered_map g_step_ctx; static std::vector g_step_pool; static int *g_comm_segment_meta = nullptr; static size_t g_comm_segment_meta_cap = 0; +static int *g_em_zero_flag = nullptr; static StepAllocation empty_step_allocation() { @@ -674,6 +725,7 @@ static StepAllocation empty_step_allocation() alloc.d_accum_mem = nullptr; alloc.d_resident_mem.fill(nullptr); alloc.d_matter_mem = nullptr; + alloc.d_em_source_mem = nullptr; alloc.d_comm_mem = nullptr; alloc.h_comm_mem = nullptr; alloc.cap_all = 0; @@ -695,6 +747,7 @@ static StepAllocation detach_step_allocation(StepContext &ctx) alloc.d_accum_mem = ctx.d_accum_mem; alloc.d_resident_mem = ctx.d_resident_mem; alloc.d_matter_mem = ctx.d_matter_mem; + alloc.d_em_source_mem = ctx.d_em_source_mem; alloc.d_comm_mem = ctx.d_comm_mem; alloc.h_comm_mem = ctx.h_comm_mem; alloc.cap_all = ctx.cap_all; @@ -707,6 +760,7 @@ static StepAllocation detach_step_allocation(StepContext &ctx) ctx.d_state_next_mem = nullptr; ctx.d_resident_mem.fill(nullptr); ctx.d_matter_mem = nullptr; + ctx.d_em_source_mem = nullptr; ctx.d_comm_mem = nullptr; ctx.h_comm_mem = nullptr; ctx.cap_all = 0; @@ -714,6 +768,9 @@ static StepAllocation detach_step_allocation(StepContext &ctx) ctx.h_comm_pinned = false; ctx.cap_h_comm = 0; ctx.matter_ready = false; + ctx.em_source_ready = false; + ctx.em_zero_fast_known = false; + ctx.em_zero_fast = false; ctx.state_ready = false; ctx.current_bank = -1; ctx.resident_clock = 0; @@ -729,6 +786,8 @@ static StepAllocation detach_step_allocation(StepContext &ctx) ctx.resident_age.fill(0); ctx.resident_valid.fill(false); ctx.d_matter.fill(nullptr); + ctx.d_em_source.fill(nullptr); + ctx.em_source_host.fill(nullptr); return alloc; } @@ -740,6 +799,7 @@ static void attach_step_allocation(StepContext &ctx, const StepAllocation &alloc ctx.d_state_curr_mem = nullptr; ctx.d_state_next_mem = nullptr; ctx.d_matter_mem = alloc.d_matter_mem; + ctx.d_em_source_mem = alloc.d_em_source_mem; ctx.d_comm_mem = alloc.d_comm_mem; ctx.h_comm_mem = alloc.h_comm_mem; ctx.cap_all = alloc.cap_all; @@ -747,6 +807,9 @@ static void attach_step_allocation(StepContext &ctx, const StepAllocation &alloc ctx.h_comm_pinned = alloc.h_comm_pinned; ctx.cap_h_comm = alloc.cap_h_comm; ctx.matter_ready = false; + ctx.em_source_ready = false; + ctx.em_zero_fast_known = false; + ctx.em_zero_fast = false; ctx.state_ready = false; ctx.current_bank = -1; ctx.resident_clock = 0; @@ -756,6 +819,7 @@ static void attach_step_allocation(StepContext &ctx, const StepAllocation &alloc } ctx.resident_age.fill(0); ctx.resident_valid.fill(false); + ctx.em_source_host.fill(nullptr); } static void recycle_step_allocation(StepAllocation &alloc) @@ -849,6 +913,7 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all) BSSN_RESIDENT_STATE_CAPACITY * all * sizeof(double))); } CUDA_CHECK(cudaMalloc(&alloc.d_matter_mem, BSSN_MATTER_COUNT * all * sizeof(double))); + CUDA_CHECK(cudaMalloc(&alloc.d_em_source_mem, BSSN_EM_SOURCE_COUNT * all * sizeof(double))); alloc.cap_all = all; } attach_step_allocation(ctx, alloc); @@ -867,6 +932,9 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all) for (int i = 0; i < BSSN_MATTER_COUNT; ++i) { ctx.d_matter[i] = ctx.d_matter_mem + (size_t)i * all; } + for (int i = 0; i < BSSN_EM_SOURCE_COUNT; ++i) { + ctx.d_em_source[i] = ctx.d_em_source_mem + (size_t)i * all; + } return ctx; } @@ -1569,10 +1637,10 @@ void kern_kodis(const double * __restrict__ fh, /* Host wrapper helpers */ /* ================================================================== */ struct LopsidedKodisTables { - const double *adv_fields[BSSN_ESCALAR_LK_FIELD_COUNT]; - const double *ko_fields[BSSN_ESCALAR_LK_FIELD_COUNT]; - double *rhs_fields[BSSN_ESCALAR_LK_FIELD_COUNT]; - int soa_signs[3 * BSSN_ESCALAR_LK_FIELD_COUNT]; + const double *adv_fields[BSSN_EM_LK_FIELD_COUNT]; + const double *ko_fields[BSSN_EM_LK_FIELD_COUNT]; + double *rhs_fields[BSSN_EM_LK_FIELD_COUNT]; + int soa_signs[3 * BSSN_EM_LK_FIELD_COUNT]; }; struct FDerivTables { @@ -1603,19 +1671,19 @@ struct Phase10RicciTables { }; struct Rk4FinalizeTables { - const double *f0_fields[BSSN_STATE_COUNT]; - double *rhs_fields[BSSN_STATE_COUNT]; - double *accum_fields[BSSN_STATE_COUNT]; + const double *f0_fields[BSSN_EM_STATE_COUNT]; + double *rhs_fields[BSSN_EM_STATE_COUNT]; + double *accum_fields[BSSN_EM_STATE_COUNT]; }; struct PatchBoundaryTables { - const double *src_fields[BSSN_STATE_COUNT]; - double *dst_fields[BSSN_STATE_COUNT]; + const double *src_fields[BSSN_EM_STATE_COUNT]; + double *dst_fields[BSSN_EM_STATE_COUNT]; }; struct EScalarBoundaryTables { - const double *f0_fields[BSSN_ESCALAR_STATE_COUNT]; - double *out_fields[BSSN_ESCALAR_STATE_COUNT]; + const double *f0_fields[BSSN_EM_STATE_COUNT]; + double *out_fields[BSSN_EM_STATE_COUNT]; }; static const int BLK = 128; @@ -2731,6 +2799,190 @@ static void gpu_escalar_sources(int all) #undef D } +__global__ __launch_bounds__(128, 3) +void kern_em_rhs_sources( + const double * __restrict__ chi, + const double * __restrict__ dxx, const double * __restrict__ dxy, const double * __restrict__ dxz, + const double * __restrict__ dyy, const double * __restrict__ dyz, const double * __restrict__ dzz, + const double * __restrict__ Lap, + const double * __restrict__ betax, const double * __restrict__ betay, const double * __restrict__ betaz, + const double * __restrict__ trK, + const double * __restrict__ Ex, const double * __restrict__ Ey, const double * __restrict__ Ez, + const double * __restrict__ Bx, const double * __restrict__ By, const double * __restrict__ Bz, + const double * __restrict__ Kpsi, const double * __restrict__ Kphi, + const double * __restrict__ Jx, const double * __restrict__ Jy, const double * __restrict__ Jz, + const double * __restrict__ qchar, + const double * __restrict__ chix, const double * __restrict__ chiy, const double * __restrict__ chiz, + const double * __restrict__ gxxx, const double * __restrict__ gxyx, const double * __restrict__ gxzx, + const double * __restrict__ gyyx, const double * __restrict__ gyzx, const double * __restrict__ gzzx, + const double * __restrict__ gxxy, const double * __restrict__ gxyy, const double * __restrict__ gxzy, + const double * __restrict__ gyyy, const double * __restrict__ gyzy, const double * __restrict__ gzzy, + const double * __restrict__ gxxz, const double * __restrict__ gxyz, const double * __restrict__ gxzz, + const double * __restrict__ gyyz, const double * __restrict__ gyzz, const double * __restrict__ gzzz, + const double * __restrict__ Lapx, const double * __restrict__ Lapy, const double * __restrict__ Lapz, + const double * __restrict__ betaxx, const double * __restrict__ betaxy, const double * __restrict__ betaxz, + const double * __restrict__ betayx, const double * __restrict__ betayy, const double * __restrict__ betayz, + const double * __restrict__ betazx, const double * __restrict__ betazy, const double * __restrict__ betazz, + const double * __restrict__ Kpsix, const double * __restrict__ Kpsiy, const double * __restrict__ Kpsiz, + const double * __restrict__ Kphix, const double * __restrict__ Kphiy, const double * __restrict__ Kphiz, + const double * __restrict__ Exx, const double * __restrict__ Exy, const double * __restrict__ Exz, + const double * __restrict__ Eyx, const double * __restrict__ Eyy, const double * __restrict__ Eyz, + const double * __restrict__ Ezx, const double * __restrict__ Ezy, const double * __restrict__ Ezz, + const double * __restrict__ Bxx, const double * __restrict__ Bxy, const double * __restrict__ Bxz, + const double * __restrict__ Byx, const double * __restrict__ Byy, const double * __restrict__ Byz, + const double * __restrict__ Bzx, const double * __restrict__ Bzy, const double * __restrict__ Bzz, + double * __restrict__ Kpsi_rhs, double * __restrict__ Kphi_rhs, + double * __restrict__ Ex_rhs, double * __restrict__ Ey_rhs, double * __restrict__ Ez_rhs, + double * __restrict__ Bx_rhs, double * __restrict__ By_rhs, double * __restrict__ Bz_rhs, + double * __restrict__ rho, double * __restrict__ Sx, double * __restrict__ Sy, double * __restrict__ Sz, + double * __restrict__ Sxx, double * __restrict__ Sxy, double * __restrict__ Sxz, + double * __restrict__ Syy, double * __restrict__ Syz, double * __restrict__ Szz) +{ + const int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= d_gp.all) return; + + constexpr double ONE = 1.0, TWO = 2.0, FOUR = 4.0, F3o2 = 1.5, EIT = 8.0, kappa = 1.0; + constexpr double PI = 3.141592653589793238462643383279502884; + const double alpn1 = Lap[i] + ONE; + const double chin1 = chi[i] + ONE; + const double sqc = sqrt(chin1); + const double chi3o2 = sqc * sqc * sqc; + const double gxxv = (dxx[i] + ONE) / chin1; + const double gxyv = dxy[i] / chin1; + const double gxzv = dxz[i] / chin1; + const double gyyv = (dyy[i] + ONE) / chin1; + const double gyzv = dyz[i] / chin1; + const double gzzv = (dzz[i] + ONE) / chin1; + const double gxxxv = (gxxx[i] - gxxv * chix[i]) / chin1; + const double gxxyv = (gxxy[i] - gxxv * chiy[i]) / chin1; + const double gxxzv = (gxxz[i] - gxxv * chiz[i]) / chin1; + const double gxyxv = (gxyx[i] - gxyv * chix[i]) / chin1; + const double gxyyv = (gxyy[i] - gxyv * chiy[i]) / chin1; + const double gxyzv = (gxyz[i] - gxyv * chiz[i]) / chin1; + const double gxzxv = (gxzx[i] - gxzv * chix[i]) / chin1; + const double gxzyv = (gxzy[i] - gxzv * chiy[i]) / chin1; + const double gxzzv = (gxzz[i] - gxzv * chiz[i]) / chin1; + const double gyyxv = (gyyx[i] - gyyv * chix[i]) / chin1; + const double gyyyv = (gyyy[i] - gyyv * chiy[i]) / chin1; + const double gyyzv = (gyyz[i] - gyyv * chiz[i]) / chin1; + const double gyzxv = (gyzx[i] - gyzv * chix[i]) / chin1; + const double gyzyv = (gyzy[i] - gyzv * chiy[i]) / chin1; + const double gyzzv = (gyzz[i] - gyzv * chiz[i]) / chin1; + const double gzzxv = (gzzx[i] - gzzv * chix[i]) / chin1; + const double gzzyv = (gzzy[i] - gzzv * chiy[i]) / chin1; + const double gzzzv = (gzzz[i] - gzzv * chiz[i]) / chin1; + const double det = gxxv*gyyv*gzzv + gxyv*gyzv*gxzv + gxzv*gxyv*gyzv - gxzv*gyyv*gxzv - gxyv*gxyv*gzzv - gxxv*gyzv*gyzv; + const double gupxx = (gyyv*gzzv - gyzv*gyzv) / det; + const double gupxy = -(gxyv*gzzv - gyzv*gxzv) / det; + const double gupxz = (gxyv*gyzv - gyyv*gxzv) / det; + const double gupyy = (gxxv*gzzv - gxzv*gxzv) / det; + const double gupyz = -(gxxv*gyzv - gxyv*gxzv) / det; + const double gupzz = (gxxv*gyyv - gxyv*gxyv) / det; + + Ex_rhs[i] = alpn1*trK[i]*Ex[i]-(Ex[i]*betaxx[i]+Ey[i]*betaxy[i]+Ez[i]*betaxz[i])-FOUR*PI*alpn1*Jx[i]-alpn1*(gupxx*Kpsix[i]+gupxy*Kpsiy[i]+gupxz*Kpsiz[i]) + + chi3o2*(((gxzv*Bx[i]+gyzv*By[i]+gzzv*Bz[i])*Lapy[i]+alpn1*(gxzv*Bxy[i]+gyzv*Byy[i]+gzzv*Bzy[i])+alpn1*(Bx[i]*gxzyv+By[i]*gyzyv+Bz[i]*gzzyv))-((gxyv*Bx[i]+gyyv*By[i]+gyzv*Bz[i])*Lapz[i]+alpn1*(gxyv*Bxz[i]+gyyv*Byz[i]+gyzv*Bzz[i])+alpn1*(Bx[i]*gxyzv+By[i]*gyyzv+Bz[i]*gyzzv))); + Ey_rhs[i] = alpn1*trK[i]*Ey[i]-(Ex[i]*betayx[i]+Ey[i]*betayy[i]+Ez[i]*betayz[i])-FOUR*PI*alpn1*Jy[i]-alpn1*(gupxy*Kpsix[i]+gupyy*Kpsiy[i]+gupyz*Kpsiz[i]) + + chi3o2*(((gxxv*Bx[i]+gxyv*By[i]+gxzv*Bz[i])*Lapz[i]+alpn1*(gxxv*Bxz[i]+gxyv*Byz[i]+gxzv*Bzz[i])+alpn1*(Bx[i]*gxxzv+By[i]*gxyzv+Bz[i]*gxzzv))-((gxzv*Bx[i]+gyzv*By[i]+gzzv*Bz[i])*Lapx[i]+alpn1*(gxzv*Bxx[i]+gyzv*Byx[i]+gzzv*Bzx[i])+alpn1*(Bx[i]*gxzxv+By[i]*gyzxv+Bz[i]*gzzxv))); + Ez_rhs[i] = alpn1*trK[i]*Ez[i]-(Ex[i]*betazx[i]+Ey[i]*betazy[i]+Ez[i]*betazz[i])-FOUR*PI*alpn1*Jz[i]-alpn1*(gupxz*Kpsix[i]+gupyz*Kpsiy[i]+gupzz*Kpsiz[i]) + + chi3o2*(((gxyv*Bx[i]+gyyv*By[i]+gyzv*Bz[i])*Lapx[i]+alpn1*(gxyv*Bxx[i]+gyyv*Byx[i]+gyzv*Bzx[i])+alpn1*(Bx[i]*gxyxv+By[i]*gyyxv+Bz[i]*gyzxv))-((gxxv*Bx[i]+gxyv*By[i]+gxzv*Bz[i])*Lapy[i]+alpn1*(gxxv*Bxy[i]+gxyv*Byy[i]+gxzv*Bzy[i])+alpn1*(Bx[i]*gxxyv+By[i]*gxyyv+Bz[i]*gxzyv))); + Bx_rhs[i] = alpn1*trK[i]*Bx[i]-(Bx[i]*betaxx[i]+By[i]*betaxy[i]+Bz[i]*betaxz[i])-alpn1*(gupxx*Kphix[i]+gupxy*Kphiy[i]+gupxz*Kphiz[i]) + - chi3o2*(((gxzv*Ex[i]+gyzv*Ey[i]+gzzv*Ez[i])*Lapy[i]+alpn1*(gxzv*Exy[i]+gyzv*Eyy[i]+gzzv*Ezy[i])+alpn1*(Ex[i]*gxzyv+Ey[i]*gyzyv+Ez[i]*gzzyv))-((gxyv*Ex[i]+gyyv*Ey[i]+gyzv*Ez[i])*Lapz[i]+alpn1*(gxyv*Exz[i]+gyyv*Eyz[i]+gyzv*Ezz[i])+alpn1*(Ex[i]*gxyzv+Ey[i]*gyyzv+Ez[i]*gyzzv))); + By_rhs[i] = alpn1*trK[i]*By[i]-(Bx[i]*betayx[i]+By[i]*betayy[i]+Bz[i]*betayz[i])-alpn1*(gupxy*Kphix[i]+gupyy*Kphiy[i]+gupyz*Kphiz[i]) + - chi3o2*(((gxxv*Ex[i]+gxyv*Ey[i]+gxzv*Ez[i])*Lapz[i]+alpn1*(gxxv*Exz[i]+gxyv*Eyz[i]+gxzv*Ezz[i])+alpn1*(Ex[i]*gxxzv+Ey[i]*gxyzv+Ez[i]*gxzzv))-((gxzv*Ex[i]+gyzv*Ey[i]+gzzv*Ez[i])*Lapx[i]+alpn1*(gxzv*Exx[i]+gyzv*Eyx[i]+gzzv*Ezx[i])+alpn1*(Ex[i]*gxzxv+Ey[i]*gyzxv+Ez[i]*gzzxv))); + Bz_rhs[i] = alpn1*trK[i]*Bz[i]-(Bx[i]*betazx[i]+By[i]*betazy[i]+Bz[i]*betazz[i])-alpn1*(gupxz*Kphix[i]+gupyz*Kphiy[i]+gupzz*Kphiz[i]) + - chi3o2*(((gxyv*Ex[i]+gyyv*Ey[i]+gyzv*Ez[i])*Lapx[i]+alpn1*(gxyv*Exx[i]+gyyv*Eyx[i]+gyzv*Ezx[i])+alpn1*(Ex[i]*gxyxv+Ey[i]*gyyxv+Ez[i]*gyzxv))-((gxxv*Ex[i]+gxyv*Ey[i]+gxzv*Ez[i])*Lapy[i]+alpn1*(gxxv*Exy[i]+gxyv*Eyy[i]+gxzv*Ezy[i])+alpn1*(Ex[i]*gxxyv+Ey[i]*gxyyv+Ez[i]*gxzyv))); + Kpsi_rhs[i] = FOUR*PI*alpn1*qchar[i]-alpn1*kappa*Kpsi[i]-alpn1*(Exx[i]+Eyy[i]+Ezz[i]-F3o2/chin1*(chix[i]*Ex[i]+chiy[i]*Ey[i]+chiz[i]*Ez[i])); + Kphi_rhs[i] = -alpn1*kappa*Kphi[i]-alpn1*(Bxx[i]+Byy[i]+Bzz[i]-F3o2/chin1*(chix[i]*Bx[i]+chiy[i]*By[i]+chiz[i]*Bz[i])); + + const double lrho = (gxxv*(Ex[i]*Ex[i]+Bx[i]*Bx[i])+gyyv*(Ey[i]*Ey[i]+By[i]*By[i])+gzzv*(Ez[i]*Ez[i]+Bz[i]*Bz[i])+TWO*(gxyv*(Ex[i]*Ey[i]+Bx[i]*By[i])+gxzv*(Ex[i]*Ez[i]+Bx[i]*Bz[i])+gyzv*(Ey[i]*Ez[i]+By[i]*Bz[i])))/EIT/PI; + rho[i] = lrho; + Sx[i] = (Ey[i]*Bz[i]-Ez[i]*By[i])/FOUR/PI/chi3o2; + Sy[i] = (Ez[i]*Bx[i]-Ex[i]*Bz[i])/FOUR/PI/chi3o2; + Sz[i] = (Ex[i]*By[i]-Ey[i]*Bx[i])/FOUR/PI/chi3o2; + const double lEx = gxxv*Ex[i]+gxyv*Ey[i]+gxzv*Ez[i], lEy = gxyv*Ex[i]+gyyv*Ey[i]+gyzv*Ez[i], lEz = gxzv*Ex[i]+gyzv*Ey[i]+gzzv*Ez[i]; + const double lBx = gxxv*Bx[i]+gxyv*By[i]+gxzv*Bz[i], lBy = gxyv*Bx[i]+gyyv*By[i]+gyzv*Bz[i], lBz = gxzv*Bx[i]+gyzv*By[i]+gzzv*Bz[i]; + Sxx[i] = lrho*gxxv-(lEx*lEx+lBx*lBx)/FOUR/PI; + Sxy[i] = lrho*gxyv-(lEx*lEy+lBx*lBy)/FOUR/PI; + Sxz[i] = lrho*gxzv-(lEx*lEz+lBx*lBz)/FOUR/PI; + Syy[i] = lrho*gyyv-(lEy*lEy+lBy*lBy)/FOUR/PI; + Syz[i] = lrho*gyzv-(lEy*lEz+lBy*lBz)/FOUR/PI; + Szz[i] = lrho*gzzv-(lEz*lEz+lBz*lBz)/FOUR/PI; +} + +static void gpu_lopsided_kodis_em_batch(double eps_val, int all) +{ + LopsidedKodisTables tables = {}; + const int adv_slots[8] = {S_EM_Kpsi, S_EM_Kphi, S_EM_Ex, S_EM_Ey, + S_EM_Ez, S_EM_Bx, S_EM_By, S_EM_Bz}; + const int rhs_slots[8] = {S_EM_Kpsi_rhs, S_EM_Kphi_rhs, S_EM_Ex_rhs, S_EM_Ey_rhs, + S_EM_Ez_rhs, S_EM_Bx_rhs, S_EM_By_rhs, S_EM_Bz_rhs}; + const int signs[24] = { + 1, 1, 1, 1, 1, 1, + -1, 1, 1, 1,-1, 1, 1, 1,-1, + 1,-1,-1, -1, 1,-1, -1,-1, 1 + }; + for (int i = 0; i < 8; ++i) { + tables.adv_fields[i] = g_buf.slot[adv_slots[i]]; + tables.ko_fields[i] = g_buf.slot[adv_slots[i]]; + tables.rhs_fields[i] = g_buf.slot[rhs_slots[i]]; + tables.soa_signs[3*i+0] = signs[3*i+0]; + tables.soa_signs[3*i+1] = signs[3*i+1]; + tables.soa_signs[3*i+2] = signs[3*i+2]; + } + dim3 launch_grid((unsigned int)grid((size_t)all), 8u); + kern_lopsided_kodis_batched<<>>( + g_buf.slot[S_betax], g_buf.slot[S_betay], g_buf.slot[S_betaz], tables, eps_val); +} + +static void gpu_em_rhs_sources(int all, double eps) +{ +#define D(s) g_buf.slot[s] + double *src[] = {D(S_Lap), D(S_betax), D(S_betay), D(S_betaz), D(S_chi), + D(S_dxx), D(S_gxy), D(S_gxz), D(S_dyy), D(S_gyz), D(S_dzz), + D(S_EM_Kpsi), D(S_EM_Kphi), + D(S_EM_Ex), D(S_EM_Ey), D(S_EM_Ez), D(S_EM_Bx), D(S_EM_By), D(S_EM_Bz)}; + double *fx[] = {D(S_Lapx), D(S_betaxx), D(S_betayx), D(S_betazx), D(S_chix), + D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), D(S_gyzx), D(S_gzzx), + D(S_EM_Kpsix), D(S_EM_Kphix), + D(S_EM_Exx), D(S_EM_Eyx), D(S_EM_Ezx), D(S_EM_Bxx), D(S_EM_Byx), D(S_EM_Bzx)}; + double *fy[] = {D(S_Lapy), D(S_betaxy), D(S_betayy), D(S_betazy), D(S_chiy), + D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), D(S_gyzy), D(S_gzzy), + D(S_EM_Kpsiy), D(S_EM_Kphiy), + D(S_EM_Exy), D(S_EM_Eyy), D(S_EM_Ezy), D(S_EM_Bxy), D(S_EM_Byy), D(S_EM_Bzy)}; + double *fz[] = {D(S_Lapz), D(S_betaxz), D(S_betayz), D(S_betazz), D(S_chiz), + D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), D(S_gyzz), D(S_gzzz), + D(S_EM_Kpsiz), D(S_EM_Kphiz), + D(S_EM_Exz), D(S_EM_Eyz), D(S_EM_Ezz), D(S_EM_Bxz), D(S_EM_Byz), D(S_EM_Bzz)}; + const int soa[] = { + 1,1,1, -1,1,1, 1,-1,1, 1,1,-1, 1,1,1, + 1,1,1, -1,-1,1, -1,1,-1, 1,1,1, 1,-1,-1, 1,1,1, + 1,1,1, 1,1,1, + -1,1,1, 1,-1,1, 1,1,-1, 1,-1,-1, -1,1,-1, -1,-1,1 + }; + gpu_fderivs_batch(19, src, fx, fy, fz, soa, all); + kern_em_rhs_sources<<>>( + D(S_chi), D(S_dxx), D(S_gxy), D(S_gxz), D(S_dyy), D(S_gyz), D(S_dzz), + D(S_Lap), D(S_betax), D(S_betay), D(S_betaz), D(S_trK), + D(S_EM_Ex), D(S_EM_Ey), D(S_EM_Ez), D(S_EM_Bx), D(S_EM_By), D(S_EM_Bz), + D(S_EM_Kpsi), D(S_EM_Kphi), D(S_EM_Jx), D(S_EM_Jy), D(S_EM_Jz), D(S_EM_qchar), + D(S_chix), D(S_chiy), D(S_chiz), + D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), D(S_gyzx), D(S_gzzx), + D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), D(S_gyzy), D(S_gzzy), + D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), D(S_gyzz), D(S_gzzz), + D(S_Lapx), D(S_Lapy), D(S_Lapz), + D(S_betaxx), D(S_betaxy), D(S_betaxz), D(S_betayx), D(S_betayy), D(S_betayz), + D(S_betazx), D(S_betazy), D(S_betazz), + D(S_EM_Kpsix), D(S_EM_Kpsiy), D(S_EM_Kpsiz), D(S_EM_Kphix), D(S_EM_Kphiy), D(S_EM_Kphiz), + D(S_EM_Exx), D(S_EM_Exy), D(S_EM_Exz), D(S_EM_Eyx), D(S_EM_Eyy), D(S_EM_Eyz), + D(S_EM_Ezx), D(S_EM_Ezy), D(S_EM_Ezz), D(S_EM_Bxx), D(S_EM_Bxy), D(S_EM_Bxz), + D(S_EM_Byx), D(S_EM_Byy), D(S_EM_Byz), D(S_EM_Bzx), D(S_EM_Bzy), D(S_EM_Bzz), + D(S_EM_Kpsi_rhs), D(S_EM_Kphi_rhs), D(S_EM_Ex_rhs), D(S_EM_Ey_rhs), D(S_EM_Ez_rhs), + D(S_EM_Bx_rhs), D(S_EM_By_rhs), D(S_EM_Bz_rhs), + D(S_rho), D(S_Sx), D(S_Sy), D(S_Sz), D(S_Sxx), D(S_Sxy), D(S_Sxz), D(S_Syy), D(S_Syz), D(S_Szz)); + gpu_lopsided_kodis_em_batch(eps, all); +#undef D +} + __global__ void kern_rk4_finalize(const double * __restrict__ f0, double * __restrict__ frhs, double * __restrict__ accum, @@ -2836,6 +3088,23 @@ static void gpu_escalar_rk4_finalize_batch(const StepContext &ctx, ctx.d_accum[25], dT, rk4_stage); } +static void gpu_em_rk4_finalize_batch(const StepContext &ctx, + size_t all, + double dT, + int rk4_stage, + double chitiny) +{ + Rk4FinalizeTables tables = {}; + for (int i = 0; i < BSSN_EM_STATE_COUNT; ++i) { + tables.f0_fields[i] = ctx.d_state0[i]; + tables.rhs_fields[i] = g_buf.slot[k_em_state_rhs_slots[i]]; + tables.accum_fields[i] = ctx.d_accum[i]; + } + + dim3 launch_grid((unsigned int)grid(all), (unsigned int)BSSN_EM_STATE_COUNT); + kern_rk4_finalize_batched<<>>(tables, dT, rk4_stage, chitiny); +} + __global__ __launch_bounds__(128, 4) void kern_copy_patch_boundary_batched(PatchBoundaryTables tables, int touch_xmin, int touch_xmax, @@ -2936,6 +3205,27 @@ static void gpu_escalar_restore_patch_boundary_batch(const StepContext &ctx, touch_xmin, touch_xmax, touch_ymin, touch_ymax, touch_zmin, touch_zmax); } +static void gpu_em_restore_patch_boundary_batch(const StepContext &ctx, + int all, + int touch_xmin, int touch_xmax, + int touch_ymin, int touch_ymax, + int touch_zmin, int touch_zmax) +{ + if (!(touch_xmin || touch_xmax || touch_ymax || touch_ymin || touch_zmin || touch_zmax)) + return; + + EScalarBoundaryTables tables = {}; + for (int i = 0; i < BSSN_EM_STATE_COUNT; ++i) { + tables.f0_fields[i] = ctx.d_state0[i]; + tables.out_fields[i] = g_buf.slot[k_em_state_rhs_slots[i]]; + } + + dim3 launch_grid((unsigned int)grid((size_t)all), (unsigned int)BSSN_EM_STATE_COUNT); + kern_escalar_restore_patch_boundary_batched<<>>( + tables, + touch_xmin, touch_xmax, touch_ymin, touch_ymax, touch_zmin, touch_zmax); +} + __global__ void kern_enforce_ga_cuda(double * __restrict__ dxx, double * __restrict__ gxy, double * __restrict__ gxz, @@ -5294,7 +5584,22 @@ static void bind_escalar_state_output_slots(const std::array &state) +{ + for (int i = 0; i < BSSN_EM_STATE_COUNT; ++i) { + g_buf.slot[k_em_state_input_slots[i]] = state[i]; + } +} + +static void bind_em_state_output_slots(const std::array &state) +{ + for (int i = 0; i < BSSN_EM_STATE_COUNT; ++i) { + g_buf.slot[k_em_state_rhs_slots[i]] = state[i]; + } +} + static void upload_escalar_state_inputs(double **state_host, size_t all); +static void upload_em_state_inputs(double **state_host, size_t all); static bool resident_key_matches_count(const StepContext &ctx, int bank, double **host_key, int state_count) { @@ -5564,6 +5869,37 @@ static int choose_escalar_resident_bank_for_reuse(StepContext &ctx, int avoid_ba return best; } +static int choose_em_resident_bank_for_reuse(StepContext &ctx, int avoid_bank, size_t all) +{ + for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { + if (b != avoid_bank && !ctx.resident_valid[b]) + return b; + } + + int best = -1; + unsigned long long best_age = 0; + for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { + if (b == avoid_bank) continue; + if (best < 0 || ctx.resident_age[b] < best_age) { + best = b; + best_age = ctx.resident_age[b]; + } + } + if (best < 0) best = 0; + writeback_resident_bank_count(ctx, best, all, BSSN_EM_STATE_COUNT); + ctx.resident_valid[best] = false; + ctx.resident_host[best].fill(nullptr); + ctx.resident_host_clean[best].fill(0); + ctx.resident_age[best] = 0; + if (ctx.current_bank == best) { + ctx.current_bank = -1; + ctx.d_state_curr_mem = nullptr; + ctx.d_state_curr.fill(nullptr); + } + update_state_ready(ctx); + return best; +} + static void assign_resident_key_count(StepContext &ctx, int bank, double **host_key, int state_count) { for (int i = 0; i < state_count; ++i) { @@ -5658,6 +5994,47 @@ static int ensure_escalar_resident_bank(StepContext &ctx, return bank; } +static int ensure_em_resident_bank(StepContext &ctx, + double **host_key, + size_t all, + bool upload_if_missing, + int avoid_bank = -1) +{ + if (!resident_key_usable_count(host_key, BSSN_EM_STATE_COUNT)) { + if (ctx.current_bank >= 0) + return ctx.current_bank; + return 0; + } + + int bank = find_resident_bank_count(ctx, host_key, BSSN_EM_STATE_COUNT); + if (bank >= 0) { + ctx.resident_age[bank] = ++ctx.resident_clock; + if (!ctx.resident_valid[bank] && upload_if_missing) { + bind_em_state_input_slots(ctx.d_resident[bank]); + upload_em_state_inputs(host_key, all); + CUDA_CHECK(cudaDeviceSynchronize()); + ctx.resident_valid[bank] = true; + set_resident_host_clean(ctx, bank, true); + } + return bank; + } + + bank = choose_em_resident_bank_for_reuse(ctx, avoid_bank, all); + assign_resident_key_count(ctx, bank, host_key, BSSN_EM_STATE_COUNT); + if (upload_if_missing) { + bind_em_state_input_slots(ctx.d_resident[bank]); + upload_em_state_inputs(host_key, all); + CUDA_CHECK(cudaDeviceSynchronize()); + ctx.resident_valid[bank] = true; + set_resident_host_clean(ctx, bank, true); + } else { + ctx.resident_valid[bank] = false; + set_resident_host_clean(ctx, bank, false); + } + update_state_ready(ctx); + return bank; +} + static int reserve_resident_output_bank(StepContext &ctx, double **host_key, size_t all, @@ -5698,6 +6075,26 @@ static int reserve_escalar_resident_output_bank(StepContext &ctx, return bank; } +static int reserve_em_resident_output_bank(StepContext &ctx, + double **host_key, + size_t all, + int input_bank) +{ + if (!resident_key_usable_count(host_key, BSSN_EM_STATE_COUNT)) + return (ctx.current_bank >= 0) ? ctx.current_bank : 0; + if (resident_key_matches_count(ctx, input_bank, host_key, BSSN_EM_STATE_COUNT)) + return input_bank; + + int bank = find_resident_bank_count(ctx, host_key, BSSN_EM_STATE_COUNT); + if (bank < 0) + bank = choose_em_resident_bank_for_reuse(ctx, input_bank, all); + assign_resident_key_count(ctx, bank, host_key, BSSN_EM_STATE_COUNT); + ctx.resident_valid[bank] = false; + ctx.resident_age[bank] = ++ctx.resident_clock; + update_state_ready(ctx); + return bank; +} + static bool bank_is_avoided(int bank, int avoid_a, int avoid_b, int avoid_c); static int choose_escalar_resident_bank_for_reuse_avoiding(StepContext &ctx, @@ -5706,6 +6103,12 @@ static int choose_escalar_resident_bank_for_reuse_avoiding(StepContext &ctx, int avoid_c, size_t all); +static int choose_em_resident_bank_for_reuse_avoiding(StepContext &ctx, + int avoid_a, + int avoid_b, + int avoid_c, + size_t all); + static int reserve_escalar_resident_output_bank_avoiding(StepContext &ctx, double **host_key, size_t all, @@ -5740,6 +6143,40 @@ static int reserve_escalar_resident_output_bank_avoiding(StepContext &ctx, return bank; } +static int reserve_em_resident_output_bank_avoiding(StepContext &ctx, + double **host_key, + size_t all, + int avoid_a, + int avoid_b, + int avoid_c) +{ + if (!resident_key_usable_count(host_key, BSSN_EM_STATE_COUNT)) + return (ctx.current_bank >= 0) ? ctx.current_bank : 0; + if (resident_key_matches_count(ctx, avoid_a, host_key, BSSN_EM_STATE_COUNT)) + return avoid_a; + if (resident_key_matches_count(ctx, avoid_b, host_key, BSSN_EM_STATE_COUNT)) + return avoid_b; + if (resident_key_matches_count(ctx, avoid_c, host_key, BSSN_EM_STATE_COUNT)) + return avoid_c; + + int bank = find_resident_bank_count(ctx, host_key, BSSN_EM_STATE_COUNT); + if (bank < 0) { + for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { + if (!bank_is_avoided(b, avoid_a, avoid_b, avoid_c) && !ctx.resident_valid[b]) { + bank = b; + break; + } + } + } + if (bank < 0) + bank = choose_em_resident_bank_for_reuse_avoiding(ctx, avoid_a, avoid_b, avoid_c, all); + assign_resident_key_count(ctx, bank, host_key, BSSN_EM_STATE_COUNT); + ctx.resident_valid[bank] = false; + ctx.resident_age[bank] = ++ctx.resident_clock; + update_state_ready(ctx); + return bank; +} + static bool bank_is_avoided(int bank, int avoid_a, int avoid_b, int avoid_c) { return bank == avoid_a || bank == avoid_b || bank == avoid_c; @@ -5845,6 +6282,43 @@ static int choose_escalar_resident_bank_for_reuse_avoiding(StepContext &ctx, return best; } +static int choose_em_resident_bank_for_reuse_avoiding(StepContext &ctx, + int avoid_a, + int avoid_b, + int avoid_c, + size_t all) +{ + for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { + if (!bank_is_avoided(b, avoid_a, avoid_b, avoid_c) && !ctx.resident_valid[b]) + return b; + } + + int best = -1; + unsigned long long best_age = 0; + for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { + if (bank_is_avoided(b, avoid_a, avoid_b, avoid_c)) continue; + if (best < 0 || ctx.resident_age[b] < best_age) { + best = b; + best_age = ctx.resident_age[b]; + } + } + if (best < 0) + return choose_em_resident_bank_for_reuse(ctx, avoid_a, all); + + writeback_resident_bank_count(ctx, best, all, BSSN_EM_STATE_COUNT); + ctx.resident_valid[best] = false; + ctx.resident_host[best].fill(nullptr); + ctx.resident_host_clean[best].fill(0); + ctx.resident_age[best] = 0; + if (ctx.current_bank == best) { + ctx.current_bank = -1; + ctx.d_state_curr_mem = nullptr; + ctx.d_state_curr.fill(nullptr); + } + update_state_ready(ctx); + return best; +} + static int ensure_resident_bank_avoiding(StepContext &ctx, double **host_key, size_t all, @@ -5886,6 +6360,49 @@ static int ensure_resident_bank_avoiding(StepContext &ctx, return bank; } +static int ensure_em_resident_bank_avoiding(StepContext &ctx, + double **host_key, + size_t all, + bool upload_if_missing, + int avoid_a, + int avoid_b, + int avoid_c) +{ + if (!resident_key_usable_count(host_key, BSSN_EM_STATE_COUNT)) { + if (ctx.current_bank >= 0) + return ctx.current_bank; + return 0; + } + + int bank = find_resident_bank_count(ctx, host_key, BSSN_EM_STATE_COUNT); + if (bank >= 0) { + ctx.resident_age[bank] = ++ctx.resident_clock; + if (!ctx.resident_valid[bank] && upload_if_missing) { + bind_em_state_input_slots(ctx.d_resident[bank]); + upload_em_state_inputs(host_key, all); + CUDA_CHECK(cudaDeviceSynchronize()); + ctx.resident_valid[bank] = true; + set_resident_host_clean(ctx, bank, true); + } + return bank; + } + + bank = choose_em_resident_bank_for_reuse_avoiding(ctx, avoid_a, avoid_b, avoid_c, all); + assign_resident_key_count(ctx, bank, host_key, BSSN_EM_STATE_COUNT); + if (upload_if_missing) { + bind_em_state_input_slots(ctx.d_resident[bank]); + upload_em_state_inputs(host_key, all); + CUDA_CHECK(cudaDeviceSynchronize()); + ctx.resident_valid[bank] = true; + set_resident_host_clean(ctx, bank, true); + } else { + ctx.resident_valid[bank] = false; + set_resident_host_clean(ctx, bank, false); + } + update_state_ready(ctx); + return bank; +} + static int ensure_escalar_resident_bank_avoiding(StepContext &ctx, double **host_key, size_t all, @@ -5941,6 +6458,12 @@ static int active_or_keyed_bank(StepContext &ctx, mark_resident_current_bank(ctx, bank); return bank; } + if (state_count == BSSN_EM_STATE_COUNT && + resident_key_usable_count(host_key, BSSN_EM_STATE_COUNT)) { + int bank = ensure_em_resident_bank(ctx, host_key, all, upload_if_missing); + mark_resident_current_bank(ctx, bank); + return bank; + } if (state_count == BSSN_STATE_COUNT && resident_key_usable(host_key)) { int bank = ensure_resident_bank(ctx, host_key, all, upload_if_missing); mark_resident_current_bank(ctx, bank); @@ -6306,6 +6829,17 @@ static void download_escalar_state_outputs(double **state_host_out, size_t all) CUDA_CHECK(cudaDeviceSynchronize()); } +static void download_em_state_outputs(double **state_host_out, size_t all) +{ + const size_t bytes = all * sizeof(double); + for (int i = 0; i < BSSN_EM_STATE_COUNT; ++i) { + CUDA_CHECK(cudaMemcpyAsync(state_host_out[i], + g_buf.slot[k_em_state_rhs_slots[i]], + bytes, cudaMemcpyDeviceToHost)); + } + CUDA_CHECK(cudaDeviceSynchronize()); +} + static void upload_escalar_state_inputs(double **state_host, size_t all) { const size_t bytes = all * sizeof(double); @@ -6315,6 +6849,146 @@ static void upload_escalar_state_inputs(double **state_host, size_t all) } } +static void upload_em_state_inputs(double **state_host, size_t all) +{ + const size_t bytes = all * sizeof(double); + for (int i = 0; i < BSSN_EM_STATE_COUNT; ++i) { + CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[k_em_state_input_slots[i]], + state_host[i], bytes, cudaMemcpyHostToDevice)); + } +} + +static bool em_source_cache_enabled() +{ + static int enabled = -1; + if (enabled < 0) { + const char *env = getenv("AMSS_CUDA_EM_CACHE_SOURCES"); + enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 1; + } + return enabled != 0; +} + +static void bind_em_fixed_source_slots(StepContext &ctx) +{ + const int slots[BSSN_EM_SOURCE_COUNT] = { + S_EM_Jx, S_EM_Jy, S_EM_Jz, S_EM_qchar + }; + for (int i = 0; i < BSSN_EM_SOURCE_COUNT; ++i) + g_buf.slot[slots[i]] = ctx.d_em_source[i]; +} + +static bool em_fixed_source_cache_matches(const StepContext &ctx, + double **source_host) +{ + if (!ctx.em_source_ready || !source_host) + return false; + for (int i = 0; i < BSSN_EM_SOURCE_COUNT; ++i) { + if (!source_host[i] || ctx.em_source_host[i] != source_host[i]) + return false; + } + return true; +} + +static void upload_em_fixed_sources(StepContext &ctx, + double **source_host, + size_t all) +{ + const size_t bytes = all * sizeof(double); + bind_em_fixed_source_slots(ctx); + if (em_source_cache_enabled() && + em_fixed_source_cache_matches(ctx, source_host)) { + return; + } + for (int i = 0; i < BSSN_EM_SOURCE_COUNT; ++i) { + CUDA_CHECK(cudaMemcpyAsync(ctx.d_em_source[i], + source_host[i], bytes, cudaMemcpyHostToDevice)); + } + if (em_source_cache_enabled()) { + for (int i = 0; i < BSSN_EM_SOURCE_COUNT; ++i) + ctx.em_source_host[i] = source_host[i]; + ctx.em_source_ready = true; + } else { + ctx.em_source_host.fill(nullptr); + ctx.em_source_ready = false; + } +} + +__global__ void kern_check_em_zero_fields(int *flag, + const double * __restrict__ Kpsi, + const double * __restrict__ Kphi, + const double * __restrict__ Ex, + const double * __restrict__ Ey, + const double * __restrict__ Ez, + const double * __restrict__ Bx, + const double * __restrict__ By, + const double * __restrict__ Bz, + const double * __restrict__ Jx, + const double * __restrict__ Jy, + const double * __restrict__ Jz, + const double * __restrict__ qchar, + int all) +{ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; + i < all; + i += blockDim.x * gridDim.x) + { + if (Kpsi[i] != 0.0 || Kphi[i] != 0.0 || + Ex[i] != 0.0 || Ey[i] != 0.0 || Ez[i] != 0.0 || + Bx[i] != 0.0 || By[i] != 0.0 || Bz[i] != 0.0 || + Jx[i] != 0.0 || Jy[i] != 0.0 || Jz[i] != 0.0 || + qchar[i] != 0.0) + { + atomicExch(flag, 0); + } + } +} + +static bool em_zero_fast_path_enabled() +{ + static int enabled = -1; + if (enabled < 0) { + const char *env = getenv("AMSS_CUDA_EM_ZERO_FASTPATH"); + enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 1; + } + return enabled != 0; +} + +static bool em_detect_zero_fast_path(StepContext &ctx, + int input_bank, + size_t all) +{ + if (!em_zero_fast_path_enabled()) + return false; + if (ctx.em_zero_fast_known) + return ctx.em_zero_fast; + if (input_bank < 0 || input_bank >= BSSN_RESIDENT_BANK_COUNT) + return false; + + if (!g_em_zero_flag) + CUDA_CHECK(cudaMalloc(&g_em_zero_flag, sizeof(int))); + + int h_flag = 1; + CUDA_CHECK(cudaMemcpy(g_em_zero_flag, &h_flag, sizeof(int), cudaMemcpyHostToDevice)); + kern_check_em_zero_fields<<>>( + g_em_zero_flag, + ctx.d_resident[input_bank][24], ctx.d_resident[input_bank][25], + ctx.d_resident[input_bank][26], ctx.d_resident[input_bank][27], + ctx.d_resident[input_bank][28], ctx.d_resident[input_bank][29], + ctx.d_resident[input_bank][30], ctx.d_resident[input_bank][31], + ctx.d_em_source[0], ctx.d_em_source[1], ctx.d_em_source[2], ctx.d_em_source[3], + (int)all); + CUDA_CHECK(cudaMemcpy(&h_flag, g_em_zero_flag, sizeof(int), cudaMemcpyDeviceToHost)); + ctx.em_zero_fast = (h_flag != 0); + ctx.em_zero_fast_known = true; + return ctx.em_zero_fast; +} + +static void zero_em_output_slots_async(size_t all) +{ + for (int i = BSSN_STATE_COUNT; i < BSSN_EM_STATE_COUNT; ++i) + CUDA_CHECK(cudaMemsetAsync(g_buf.slot[k_em_state_rhs_slots[i]], 0, all * sizeof(double))); +} + static void download_constraint_outputs(double **constraint_host_out, size_t all) { const size_t bytes = all * sizeof(double); @@ -7342,6 +8016,10 @@ static void upload_resident_state_count(void *block_tag, int *ex, double **state bank = ensure_escalar_resident_bank(ctx, state_host_in, all, false); bind_escalar_state_input_slots(ctx.d_resident[bank]); upload_escalar_state_inputs(state_host_in, all); + } else if (state_count == BSSN_EM_STATE_COUNT) { + bank = ensure_em_resident_bank(ctx, state_host_in, all, false); + bind_em_state_input_slots(ctx.d_resident[bank]); + upload_em_state_inputs(state_host_in, all); } else if (state_count == BSSN_STATE_COUNT) { bank = ensure_resident_bank(ctx, state_host_in, all, false); bind_state_input_slots(ctx.d_resident[bank]); @@ -8006,9 +8684,10 @@ int bssn_cuda_rk4_substep(void *block_tag, CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, state0_src, (size_t)BSSN_STATE_COUNT * bytes, cudaMemcpyDeviceToDevice)); - } else if (!ctx.matter_ready) { - if (use_zero_matter) zero_matter_cache(ctx, all); - else upload_matter_cache(ctx, matter_host, all); + } else if (use_zero_matter) { + if (!ctx.matter_ready) zero_matter_cache(ctx, all); + } else { + upload_matter_cache(ctx, matter_host, all); } bind_matter_slots(ctx); if (profile) { @@ -8083,6 +8762,203 @@ int bssn_cuda_rk4_substep(void *block_tag, return 0; } +extern "C" +int bssn_em_cuda_rk4_substep(void *block_tag, + int *ex, double *X, double *Y, double *Z, + double **state_host_in, + double **state_host_out, + double **source_host, + const double *propspeed, + const double *soa_flat, + const double *bbox, + double &dT, + double &T, + int &RK4, + int &apply_bam_bc, + int &Symmetry, + int &Lev, + double &eps, + int &co, + int &keep_resident_state, + int &apply_enforce_ga, + double &chitiny) +{ + (void)T; + if (RK4 < 0 || RK4 > 3) return 1; + + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + const bool profile = cuda_profile_enabled(); + const double t_total0 = profile ? cuda_profile_now_ms() : 0.0; + double state_ms = 0.0; + double em_ms = 0.0; + double rhs_ms = 0.0; + double bc_ms = 0.0; + double finalize_ms = 0.0; + double output_ms = 0.0; + + const size_t all = (size_t)ex[0] * ex[1] * ex[2]; + const size_t bytes = all * sizeof(double); + int touch_xmin = 0, touch_xmax = 0; + int touch_ymin = 0, touch_ymax = 0; + int touch_zmin = 0, touch_zmax = 0; + + setup_grid_params(ex, X, Y, Z, Symmetry, eps, co); + if (Lev > 0) { + compute_patch_boundary_flags(ex, X, Y, Z, bbox, Symmetry, + touch_xmin, touch_xmax, + touch_ymin, touch_ymax, + touch_zmin, touch_zmax); + } + + StepContext &ctx = ensure_step_ctx(block_tag, all); + double t0 = profile ? cuda_profile_now_ms() : 0.0; + const bool use_resident_state = (keep_resident_state != 0); + int input_bank = -1; + int output_bank = -1; + if (use_resident_state) { + input_bank = ensure_em_resident_bank(ctx, state_host_in, all, true); + output_bank = reserve_em_resident_output_bank(ctx, state_host_out, all, input_bank); + mark_resident_current_bank(ctx, input_bank); + mark_resident_next_bank(ctx, output_bank); + bind_em_state_input_slots(ctx.d_resident[input_bank]); + bind_em_state_output_slots(ctx.d_resident[output_bank]); + } else { + upload_em_state_inputs(state_host_in, all); + } + upload_em_fixed_sources(ctx, source_host, all); + const bool use_em_zero_fast = + use_resident_state && em_detect_zero_fast_path(ctx, input_bank, all); + + if (apply_enforce_ga) { + kern_enforce_ga_cuda<<>>(g_buf.slot[S_dxx], g_buf.slot[S_gxy], g_buf.slot[S_gxz], + g_buf.slot[S_dyy], g_buf.slot[S_gyz], g_buf.slot[S_dzz], + g_buf.slot[S_Axx], g_buf.slot[S_Axy], g_buf.slot[S_Axz], + g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]); + if (use_resident_state && input_bank >= 0) + set_resident_host_clean(ctx, input_bank, false); + } + if (profile) { + cuda_profile_sync(); + state_ms += cuda_profile_now_ms() - t0; + } + + t0 = profile ? cuda_profile_now_ms() : 0.0; + if (RK4 == 0) { + if (use_resident_state) { + CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, ctx.d_resident_mem[input_bank], + (size_t)BSSN_EM_STATE_COUNT * bytes, + cudaMemcpyDeviceToDevice)); + } else { + for (int i = 0; i < BSSN_EM_STATE_COUNT; ++i) { + CUDA_CHECK(cudaMemcpy(ctx.d_state0[i], g_buf.slot[k_em_state_input_slots[i]], + bytes, cudaMemcpyDeviceToDevice)); + } + } + } + if (profile) { + cuda_profile_sync(); + state_ms += cuda_profile_now_ms() - t0; + } + + t0 = profile ? cuda_profile_now_ms() : 0.0; + if (use_em_zero_fast) { + zero_matter_cache(ctx, all); + bind_matter_slots(ctx); + zero_em_output_slots_async(all); + } else { + gpu_em_rhs_sources((int)all, eps); + } + if (profile) { + cuda_profile_sync(); + em_ms += cuda_profile_now_ms() - t0; + } + + t0 = profile ? cuda_profile_now_ms() : 0.0; + launch_rhs_pipeline((int)all, eps, co); + if (profile) { + cuda_profile_sync(); + rhs_ms += cuda_profile_now_ms() - t0; + } + + t0 = profile ? cuda_profile_now_ms() : 0.0; + if (apply_bam_bc) { + for (int i = 0; i < BSSN_EM_STATE_COUNT; ++i) { + gpu_sommerfeld_routbam(g_buf.slot[k_em_state_input_slots[i]], + g_buf.slot[k_em_state_rhs_slots[i]], + propspeed[i], + soa_flat[3 * i + 0], + soa_flat[3 * i + 1], + soa_flat[3 * i + 2], + X, Y, Z, bbox, Symmetry); + } + } + if (profile) { + cuda_profile_sync(); + bc_ms += cuda_profile_now_ms() - t0; + } + + t0 = profile ? cuda_profile_now_ms() : 0.0; + if (use_em_zero_fast) + gpu_rk4_finalize_batch(ctx, all, dT, RK4, chitiny); + else + gpu_em_rk4_finalize_batch(ctx, all, dT, RK4, chitiny); + if (Lev > 0) { + if (use_em_zero_fast) + gpu_copy_patch_boundary_batch((int)all, + touch_xmin, touch_xmax, + touch_ymin, touch_ymax, + touch_zmin, touch_zmax); + else + gpu_em_restore_patch_boundary_batch(ctx, (int)all, + touch_xmin, touch_xmax, + touch_ymin, touch_ymax, + touch_zmin, touch_zmax); + } + if (profile) { + cuda_profile_sync(); + finalize_ms += cuda_profile_now_ms() - t0; + } + + t0 = profile ? cuda_profile_now_ms() : 0.0; + if (use_resident_state) { + ctx.resident_valid[output_bank] = true; + ctx.resident_age[output_bank] = ++ctx.resident_clock; + set_resident_host_clean(ctx, output_bank, false); + mark_resident_current_bank(ctx, output_bank); + update_state_ready(ctx); + } else { + download_em_state_outputs(state_host_out, all); + } + if (profile) { + cuda_profile_sync(); + output_ms += cuda_profile_now_ms() - t0; + CudaProfileStats &stats = cuda_profile_stats(); + stats.calls++; + stats.total_ms += cuda_profile_now_ms() - t_total0; + stats.state_ms += state_ms; + stats.matter_ms += em_ms; + stats.rhs_ms += rhs_ms; + stats.bc_ms += bc_ms; + stats.finalize_ms += finalize_ms; + stats.output_ms += output_ms; + cuda_profile_maybe_log(); + } + if (RK4 == 3) + ctx.matter_ready = false; + return 0; +} + +extern "C" +int bssn_em_cuda_resident_zero_fast_state(void *block_tag) +{ + auto it = g_step_ctx.find(block_tag); + if (it == g_step_ctx.end()) + return 0; + const StepContext &ctx = it->second; + return (ctx.em_zero_fast_known && ctx.em_zero_fast) ? 1 : 0; +} + extern "C" int bssn_cuda_copy_state_region_to_host(void *block_tag, int state_index, @@ -8143,7 +9019,7 @@ int bssn_cuda_upload_resident_state_count(void *block_tag, { init_gpu_dispatch(); CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); - if (state_count != BSSN_STATE_COUNT && state_count != BSSN_ESCALAR_STATE_COUNT) + if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return 1; upload_resident_state_count(block_tag, ex, state_host_in, state_count); return 0; @@ -8166,7 +9042,7 @@ int bssn_cuda_keep_only_resident_state_count(void *block_tag, { init_gpu_dispatch(); CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); - if (state_count != BSSN_STATE_COUNT && state_count != BSSN_ESCALAR_STATE_COUNT) + if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return 1; keep_only_resident_state_count(block_tag, ex, state_host_key, state_count); return 0; @@ -8189,7 +9065,7 @@ int bssn_cuda_download_resident_state_count_if_present(void *block_tag, { init_gpu_dispatch(); CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); - if (state_count != BSSN_STATE_COUNT && state_count != BSSN_ESCALAR_STATE_COUNT) + if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return 1; download_resident_state_count_if_present(block_tag, ex, state_host_out, state_count); return 0; @@ -8392,7 +9268,7 @@ int bssn_cuda_unpack_state_region_from_host_buffer_for_host_views(void *block_ta init_gpu_dispatch(); CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); if (!state_host_key || - (state_count != BSSN_STATE_COUNT && state_count != BSSN_ESCALAR_STATE_COUNT)) + state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return 1; copy_state_region_packed_cuda(block_tag, state_index, host_buffer, ex, i0, j0, k0, sx, sy, sz, @@ -8986,7 +9862,7 @@ int bssn_cuda_prepare_inter_time_level(void *block_tag, CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); const bool profile = cuda_aux_profile_enabled(); const double t0 = profile ? cuda_profile_now_ms() : 0.0; - if (state_count != BSSN_STATE_COUNT && state_count != BSSN_ESCALAR_STATE_COUNT) + if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return 1; if (source_count != 2 && source_count != 3) return 1; if (!resident_key_usable_count(src1_host_key, state_count) || @@ -9029,6 +9905,15 @@ int bssn_cuda_prepare_inter_time_level(void *block_tag, : -1; dst_bank = reserve_escalar_resident_output_bank_avoiding(ctx, dst_host_key, all, src1_bank, src2_bank, src3_bank); + } else if (state_count == BSSN_EM_STATE_COUNT) { + src1_bank = ensure_em_resident_bank(ctx, src1_host_key, all, true); + src2_bank = ensure_em_resident_bank(ctx, src2_host_key, all, true, src1_bank); + src3_bank = (source_count == 3) + ? ensure_em_resident_bank_avoiding(ctx, src3_host_key, all, true, + src1_bank, src2_bank, -1) + : -1; + dst_bank = reserve_em_resident_output_bank_avoiding(ctx, dst_host_key, all, + src1_bank, src2_bank, src3_bank); } else { src1_bank = ensure_resident_bank(ctx, src1_host_key, all, true); src2_bank = ensure_resident_bank(ctx, src2_host_key, all, true, src1_bank); diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.h b/AMSS_NCKU_source/bssn_rhs_cuda.h index 946137d..9995bb5 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.h +++ b/AMSS_NCKU_source/bssn_rhs_cuda.h @@ -8,6 +8,8 @@ extern "C" { enum { BSSN_CUDA_STATE_COUNT = 24, BSSN_ESCALAR_CUDA_STATE_COUNT = 26, + BSSN_EM_CUDA_STATE_COUNT = 32, + BSSN_EM_CUDA_SOURCE_COUNT = 4, BSSN_CUDA_MATTER_COUNT = 10 }; @@ -82,6 +84,28 @@ int bssn_escalar_cuda_compute_constraints(int *ex, double *X, double *Y, double int &Lev, double &eps); +int bssn_em_cuda_rk4_substep(void *block_tag, + int *ex, double *X, double *Y, double *Z, + double **state_host_in, + double **state_host_out, + double **source_host, + const double *propspeed, + const double *soa_flat, + const double *bbox, + double &dT, + double &T, + int &RK4, + int &apply_bam_bc, + int &Symmetry, + int &Lev, + double &eps, + int &co, + int &keep_resident_state, + int &apply_enforce_ga, + double &chitiny); + +int bssn_em_cuda_resident_zero_fast_state(void *block_tag); + int bssn_cuda_copy_state_region_to_host(void *block_tag, int state_index, double *host_state, diff --git a/makefile_and_run.py b/makefile_and_run.py index c3ab492..494ea56 100755 --- a/makefile_and_run.py +++ b/makefile_and_run.py @@ -150,8 +150,11 @@ def _gpu_runtime_env(): "AMSS_CUDA_KEEP_ALL_LEVELS": "1", "AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP": "1", "AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS": "1", + "AMSS_CUDA_EM_CACHE_SOURCES": "1", + "AMSS_CUDA_EM_ZERO_FASTPATH": "1", + "AMSS_EM_ZERO_ANALYSIS_FASTPATH": "1", "AMSS_CUDA_AMR_HOST_STAGED": "1", - "AMSS_CUDA_AMR_RESTRICT_DEVICE": "1", + "AMSS_CUDA_AMR_RESTRICT_DEVICE": "0", "AMSS_CUDA_AMR_RESTRICT_BATCH": "0", "AMSS_CUDA_DEVICE_SEGMENT_BATCH": "0", "AMSS_CUDA_UNCACHED_DEVICE_BUFFERS": "1", @@ -287,6 +290,9 @@ def run_ABE(): print(f" AMSS_CUDA_KEEP_ALL_LEVELS={mpi_env.get('AMSS_CUDA_KEEP_ALL_LEVELS', '')}") print(f" AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP={mpi_env.get('AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP', '')}") print(f" AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS={mpi_env.get('AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS', '')}") + print(f" AMSS_CUDA_EM_CACHE_SOURCES={mpi_env.get('AMSS_CUDA_EM_CACHE_SOURCES', '')}") + print(f" AMSS_CUDA_EM_ZERO_FASTPATH={mpi_env.get('AMSS_CUDA_EM_ZERO_FASTPATH', '')}") + print(f" AMSS_EM_ZERO_ANALYSIS_FASTPATH={mpi_env.get('AMSS_EM_ZERO_ANALYSIS_FASTPATH', '')}") print(f" AMSS_CUDA_AMR_HOST_STAGED={mpi_env.get('AMSS_CUDA_AMR_HOST_STAGED', '')}") print(f" AMSS_CUDA_AMR_RESTRICT_DEVICE={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_DEVICE', '')}") print(f" AMSS_CUDA_AMR_RESTRICT_BATCH={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_BATCH', '')}")