From 85fe29cc2ee710518b70d426aed7fe9958ad3666 Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Tue, 5 May 2026 10:47:46 +0800 Subject: [PATCH] Optimize BSSN-EScalar CUDA path --- AMSS_NCKU_source/ABE.C | 6 +- AMSS_NCKU_source/Parallel.C | 161 +++-- AMSS_NCKU_source/bssnEScalar_class.C | 612 ++++++++++++++---- AMSS_NCKU_source/bssn_class.C | 340 +++++++++- AMSS_NCKU_source/bssn_class.h | 2 +- AMSS_NCKU_source/bssn_rhs_cuda.cu | 935 ++++++++++++++++++++++++--- AMSS_NCKU_source/bssn_rhs_cuda.h | 38 ++ AMSS_NCKU_source/makefile | 1 - makefile_and_run.py | 2 + 9 files changed, 1821 insertions(+), 276 deletions(-) diff --git a/AMSS_NCKU_source/ABE.C b/AMSS_NCKU_source/ABE.C index 9a4874e..82a30fe 100644 --- a/AMSS_NCKU_source/ABE.C +++ b/AMSS_NCKU_source/ABE.C @@ -484,7 +484,11 @@ int main(int argc, char *argv[]) cout << endl; } - delete ADM; + // Let the process teardown reclaim the simulation object. Some derived + // equation classes keep MPI/CUDA-backed state whose destructor ordering + // is fragile at program shutdown. + if (getenv("AMSS_DELETE_ADM_ON_EXIT")) + delete ADM; //=======================caculation done============================================================= diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index 3174cc2..8c7d3fd 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -18,6 +18,7 @@ #endif #if USE_CUDA_BSSN #include "bssn_rhs_cuda.h" +#define AMSS_BSSN_CUDA_MAX_STATE_COUNT BSSN_ESCALAR_CUDA_STATE_COUNT #endif #if USE_CUDA_Z4C #include "z4c_rhs_cuda.h" @@ -179,10 +180,12 @@ bool cuda_build_bssn_host_views(Block *block, int state_count, double **views) { - if (!block || !vars || !views || state_count != BSSN_CUDA_STATE_COUNT) + if (!block || !vars || !views || + (state_count != BSSN_CUDA_STATE_COUNT && + state_count != BSSN_ESCALAR_CUDA_STATE_COUNT)) return false; MyList *v = vars; - for (int i = 0; i < BSSN_CUDA_STATE_COUNT; ++i) + for (int i = 0; i < state_count; ++i) { if (!v) return false; @@ -196,10 +199,12 @@ bool cuda_build_bssn_soa(MyList *vars, int state_count, double *soa_flat) { - if (!vars || !soa_flat || state_count != BSSN_CUDA_STATE_COUNT) + if (!vars || !soa_flat || + (state_count != BSSN_CUDA_STATE_COUNT && + state_count != BSSN_ESCALAR_CUDA_STATE_COUNT)) return false; MyList *v = vars; - for (int i = 0; i < BSSN_CUDA_STATE_COUNT; ++i) + for (int i = 0; i < state_count; ++i) { if (!v) return false; @@ -317,7 +322,7 @@ bool cuda_state_count_direct_supported(int state_count) #if USE_CUDA_Z4C && (ABEtype == 2) return state_count == Z4C_CUDA_STATE_COUNT; #elif USE_CUDA_BSSN - return state_count > 0 && state_count <= BSSN_CUDA_STATE_COUNT; + return state_count > 0 && state_count <= BSSN_ESCALAR_CUDA_STATE_COUNT; #else (void)state_count; return false; @@ -372,22 +377,68 @@ bool cuda_can_direct_unpack(const Parallel::gridseg *dst, int type) #endif } +bool cuda_amr_host_staged_enabled(); +double *alloc_device_comm_buffer(int length); +void free_device_comm_buffer(double *&ptr); + +bool cuda_direct_pack_segment_to_device(double *buffer, + const Parallel::gridseg *src, + const Parallel::gridseg *dst, + int state_count, + int type, + MyList *VarLists, + int Symmetry); + bool cuda_direct_pack_segment(double *buffer, const Parallel::gridseg *src, const Parallel::gridseg *dst, int state_count, - MyList *VarLists) + int type, + MyList *VarLists, + int Symmetry) { #if USE_CUDA_Z4C && (ABEtype == 2) if (state_count != Z4C_CUDA_STATE_COUNT) return false; #elif USE_CUDA_BSSN - if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT) + if (state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT) return false; #else return false; #endif const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; + + if (type == 2 || type == 3) + { +#if USE_CUDA_BSSN + if (!cuda_amr_host_staged_enabled()) + return false; + const int region_all = dst->shape[0] * dst->shape[1] * dst->shape[2]; + const int total = state_count * region_all; + static double *stage_dev = 0; + static int stage_cap = 0; + if (total > stage_cap) + { + free_device_comm_buffer(stage_dev); + stage_dev = alloc_device_comm_buffer(total); + stage_cap = total; + } + if (!cuda_direct_pack_segment_to_device(stage_dev, src, dst, state_count, type, VarLists, Symmetry)) + return false; + cudaError_t cerr = cudaMemcpy(buffer, stage_dev, (size_t)total * sizeof(double), cudaMemcpyDeviceToHost); + if (cerr != cudaSuccess) + { + fprintf(stderr, "Parallel: CUDA host-staged AMR pack cudaMemcpy failed, err=%d\n", (int)cerr); + return false; + } + if (sync_profile_enabled()) + sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0; + return true; +#else + return false; +#endif + } + const int i0 = cuda_seg_begin(dst, src->Bg, 0); const int j0 = cuda_seg_begin(dst, src->Bg, 1); const int k0 = cuda_seg_begin(dst, src->Bg, 2); @@ -396,7 +447,7 @@ bool cuda_direct_pack_segment(double *buffer, i0, j0, k0, dst->shape[0], dst->shape[1], dst->shape[2]) == 0; #else - double *views[BSSN_CUDA_STATE_COUNT]; + double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT]; const bool have_views = cuda_build_bssn_host_views(src->Bg, VarLists, state_count, views); const bool ok = have_views ? bssn_cuda_pack_state_batch_to_host_buffer_for_host_views( @@ -422,7 +473,7 @@ bool cuda_direct_unpack_segment(double *buffer, if (state_count != Z4C_CUDA_STATE_COUNT) return false; #elif USE_CUDA_BSSN - if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT) + if (state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT) return false; #else return false; @@ -436,7 +487,7 @@ bool cuda_direct_unpack_segment(double *buffer, i0, j0, k0, dst->shape[0], dst->shape[1], dst->shape[2]) == 0; #else - double *views[BSSN_CUDA_STATE_COUNT]; + double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT]; const bool have_views = cuda_build_bssn_host_views(dst->Bg, VarListd, state_count, views); const bool ok = have_views ? bssn_cuda_unpack_state_batch_from_host_buffer_for_host_views( @@ -464,6 +515,17 @@ bool cuda_aware_mpi_enabled() return enabled != 0; } +bool cuda_cached_device_buffers_enabled(int state_count) +{ +#if USE_CUDA_BSSN + if (state_count == BSSN_ESCALAR_CUDA_STATE_COUNT) + return false; +#else + (void)state_count; +#endif + return cuda_aware_mpi_enabled(); +} + bool cuda_amr_restrict_device_enabled() { static int enabled = -1; @@ -486,6 +548,17 @@ bool cuda_amr_prolong_device_enabled() return enabled != 0; } +bool cuda_amr_host_staged_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_CUDA_AMR_HOST_STAGED"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + bool cuda_amr_restrict_compare_enabled() { static int enabled = -1; @@ -627,12 +700,12 @@ bool cuda_direct_pack_segment_to_device(double *buffer, } #endif #if USE_CUDA_BSSN - if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT) + if (state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT) return false; const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; bool ok = false; - double *views[BSSN_CUDA_STATE_COUNT]; - double soa_flat[3 * BSSN_CUDA_STATE_COUNT]; + double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT]; + double soa_flat[3 * AMSS_BSSN_CUDA_MAX_STATE_COUNT]; const bool have_views = cuda_build_bssn_host_views(src->Bg, VarLists, state_count, views); const bool have_soa = cuda_build_bssn_soa(VarLists, state_count, soa_flat); if (type == 1) @@ -812,13 +885,13 @@ bool cuda_direct_unpack_segment_from_device(double *buffer, } #endif #if USE_CUDA_BSSN - if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT) + if (state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT) return false; const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; const int i0 = cuda_seg_begin(dst, dst->Bg, 0); const int j0 = cuda_seg_begin(dst, dst->Bg, 1); const int k0 = cuda_seg_begin(dst, dst->Bg, 2); - double *views[BSSN_CUDA_STATE_COUNT]; + double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT]; const bool have_views = cuda_build_bssn_host_views(dst->Bg, VarListd, state_count, views); const bool ok = have_views ? bssn_cuda_unpack_state_batch_from_device_buffer_for_host_views( @@ -843,12 +916,12 @@ bool cuda_download_resident_subset_to_host(Block *block, int state_count) { #if USE_CUDA_BSSN - if (!block || state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT) + if (!block || state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT) return false; if (bssn_cuda_has_resident_state(block) == 0) return true; - int indices[BSSN_CUDA_STATE_COUNT]; - double *views[BSSN_CUDA_STATE_COUNT]; + int indices[AMSS_BSSN_CUDA_MAX_STATE_COUNT]; + double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT]; MyList *v = vars; for (int i = 0; i < state_count; ++i) { @@ -871,7 +944,7 @@ bool cuda_unpack_host_region_to_resident(Block *block, const Parallel::gridseg *dst) { #if USE_CUDA_BSSN - if (!block || !dst || state_index < 0 || state_index >= BSSN_CUDA_STATE_COUNT) + if (!block || !dst || state_index < 0 || state_index >= AMSS_BSSN_CUDA_MAX_STATE_COUNT) return false; if (bssn_cuda_has_resident_state(block) == 0) return true; @@ -895,7 +968,7 @@ bool cuda_device_state_count_supported(int state_count) return true; #endif #if USE_CUDA_BSSN - return state_count > 0 && state_count <= BSSN_CUDA_STATE_COUNT; + return state_count > 0 && state_count <= AMSS_BSSN_CUDA_MAX_STATE_COUNT; #else (void)state_count; return false; @@ -915,8 +988,8 @@ bool cuda_flush_device_segment_batch(Block *block, return true; const int stride = (dir == PACK && type == 3) ? 11 : 8; const int segment_count = (int)(meta.size() / stride); - double *views[BSSN_CUDA_STATE_COUNT]; - double soa_flat[3 * BSSN_CUDA_STATE_COUNT]; + double *views[AMSS_BSSN_CUDA_MAX_STATE_COUNT]; + double soa_flat[3 * AMSS_BSSN_CUDA_MAX_STATE_COUNT]; const bool have_views = cuda_build_bssn_host_views(block, vars, state_count, views); const bool have_soa = cuda_build_bssn_soa(vars, state_count, soa_flat); if (dir == PACK) @@ -5022,14 +5095,17 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data, dst->data, type)) { if (s_cuda_aware_pack_active) { handled_by_cuda = cuda_direct_pack_segment_to_device(data + size_out, src->data, dst->data, state_count, type, VarLists, Symmetry); } else { - handled_by_cuda = cuda_direct_pack_segment(data + size_out, src->data, dst->data, state_count, VarLists); + handled_by_cuda = cuda_direct_pack_segment(data + size_out, src->data, dst->data, state_count, type, VarLists, Symmetry); } if (!handled_by_cuda) { @@ -5037,7 +5113,7 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data, type)) { @@ -5102,7 +5178,8 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data && dst->data->Bg && bssn_cuda_has_resident_state(dst->data->Bg)) { - if (!cuda_unpack_host_region_to_resident(dst->data->Bg, state_idx, data + size_out, dst->data)) + if (type != 2 && type != 3 && + !cuda_unpack_host_region_to_resident(dst->data->Bg, state_idx, data + size_out, dst->data)) { cout << "Parallel::data_packer: CUDA resident fallback upload failed." << endl; MPI_Abort(MPI_COMM_WORLD, 1); @@ -5775,7 +5852,7 @@ void Parallel::transfer_cached(MyList **src, MyList *PatL, MyList *VarList, int Symmetr cout << "Parallel::Sync_start: variable lists do not match." << endl; MPI_Abort(MPI_COMM_WORLD, 1); } - if (cuda_aware_mpi_enabled()) + if (cuda_cached_device_buffers_enabled(state_count)) { for (int n = 0; n < cpusize; n++) { @@ -6976,16 +7053,16 @@ void Parallel::prepare_inter_time_level(Patch *Pat, if (myrank == cg->rank) { #if USE_CUDA_BSSN - double *src1_views[BSSN_CUDA_STATE_COUNT]; - double *src2_views[BSSN_CUDA_STATE_COUNT]; - double *dst_views[BSSN_CUDA_STATE_COUNT]; + double *src1_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT]; + double *src2_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT]; + double *dst_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT]; const int state_count = cuda_state_var_count(VarList1, VarList2); - if (state_count == BSSN_CUDA_STATE_COUNT && + if (cuda_state_count_direct_supported(state_count) && cuda_build_bssn_host_views(cg, VarList1, state_count, src1_views) && cuda_build_bssn_host_views(cg, VarList2, state_count, src2_views) && cuda_build_bssn_host_views(cg, VarList3, state_count, dst_views) && bssn_cuda_has_resident_state(cg) && - bssn_cuda_prepare_inter_time_level(cg, cg->shape, + bssn_cuda_prepare_inter_time_level(cg, cg->shape, state_count, src1_views, src2_views, 0, dst_views, 2, tindex) == 0) { @@ -7051,18 +7128,18 @@ void Parallel::prepare_inter_time_level(Patch *Pat, if (myrank == cg->rank) { #if USE_CUDA_BSSN - double *src1_views[BSSN_CUDA_STATE_COUNT]; - double *src2_views[BSSN_CUDA_STATE_COUNT]; - double *src3_views[BSSN_CUDA_STATE_COUNT]; - double *dst_views[BSSN_CUDA_STATE_COUNT]; + double *src1_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT]; + double *src2_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT]; + double *src3_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT]; + double *dst_views[AMSS_BSSN_CUDA_MAX_STATE_COUNT]; const int state_count = cuda_state_var_count(VarList1, VarList2); - if (state_count == BSSN_CUDA_STATE_COUNT && + if (cuda_state_count_direct_supported(state_count) && cuda_build_bssn_host_views(cg, VarList1, state_count, src1_views) && cuda_build_bssn_host_views(cg, VarList2, state_count, src2_views) && cuda_build_bssn_host_views(cg, VarList3, state_count, src3_views) && cuda_build_bssn_host_views(cg, VarList4, state_count, dst_views) && bssn_cuda_has_resident_state(cg) && - bssn_cuda_prepare_inter_time_level(cg, cg->shape, + bssn_cuda_prepare_inter_time_level(cg, cg->shape, state_count, src1_views, src2_views, src3_views, dst_views, 3, tindex) == 0) { @@ -7500,6 +7577,8 @@ void Parallel::Restrict_cached(MyList *PatcL, MyList *PatfL, cache.tc_req_is_recv = new int[cache.max_reqs]; cache.tc_completed = new int[cache.max_reqs]; } + for (int i = 0; i < cpusize; i++) + cache.combined_src[i] = cache.combined_dst[i] = 0; MyList *dst = build_complete_gsl(PatcL); for (int node = 0; node < cpusize; node++) @@ -7561,6 +7640,8 @@ void Parallel::OutBdLow2Hi_cached(MyList *PatcL, MyList *PatfL, cache.tc_req_is_recv = new int[cache.max_reqs]; cache.tc_completed = new int[cache.max_reqs]; } + for (int i = 0; i < cpusize; i++) + cache.combined_src[i] = cache.combined_dst[i] = 0; MyList *dst = build_buffer_gsl(PatfL); for (int node = 0; node < cpusize; node++) @@ -7613,6 +7694,8 @@ void Parallel::OutBdLow2Himix_cached(MyList *PatcL, MyList *PatfL, cache.tc_req_is_recv = new int[cache.max_reqs]; cache.tc_completed = new int[cache.max_reqs]; } + for (int i = 0; i < cpusize; i++) + cache.combined_src[i] = cache.combined_dst[i] = 0; MyList *dst = build_buffer_gsl(PatfL); for (int node = 0; node < cpusize; node++) diff --git a/AMSS_NCKU_source/bssnEScalar_class.C b/AMSS_NCKU_source/bssnEScalar_class.C index 48c70bc..37ce797 100644 --- a/AMSS_NCKU_source/bssnEScalar_class.C +++ b/AMSS_NCKU_source/bssnEScalar_class.C @@ -24,16 +24,165 @@ using namespace std; #include "sommerfeld_rout.h" #include "getnp4.h" #include "shellfunctions.h" -#include "parameters.h" +#include "parameters.h" +#if USE_CUDA_BSSN +#include "bssn_rhs_cuda.h" +#endif #ifdef With_AHF #include "derivatives.h" #include "myglobal.h" #endif -//================================================================================================ - -// Define bssnEScalar_class +//================================================================================================ + +namespace +{ +#if USE_CUDA_BSSN +bool fill_bssn_escalar_cuda_views(Block *cg, MyList *vars, + double **host_views, + double *propspeeds = 0, + double *soa_flat = 0) +{ + int idx = 0; + while (vars && idx < BSSN_ESCALAR_CUDA_STATE_COUNT) + { + host_views[idx] = cg->fgfs[vars->data->sgfn]; + if (propspeeds) + propspeeds[idx] = vars->data->propspeed; + if (soa_flat) + { + soa_flat[3 * idx + 0] = vars->data->SoA[0]; + soa_flat[3 * idx + 1] = vars->data->SoA[1]; + soa_flat[3 * idx + 2] = vars->data->SoA[2]; + } + vars = vars->next; + ++idx; + } + return idx == BSSN_ESCALAR_CUDA_STATE_COUNT && vars == 0; +} + +bool bssn_escalar_cuda_use_resident_sync(int lev) +{ +#ifdef WithShell + (void)lev; + return false; +#else + return true; +#endif +} + +bool bssn_escalar_cuda_keep_resident_after_step(int lev, int trfls_in, int analysis_lev) +{ + static int keep_all_levels = -1; + if (keep_all_levels < 0) + { + const char *env = getenv("AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS"); + keep_all_levels = (env && atoi(env) != 0) ? 1 : 0; + } + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + if (!enabled) + return false; + if (lev == analysis_lev) + return false; + if (keep_all_levels) + return true; + return lev < trfls_in; +} + +bool bssn_escalar_sync_merged_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_ESCALAR_SYNC_MERGED"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +void bssn_escalar_sync_level(MyList *PatL, MyList *VarList, int Symmetry) +{ + if (bssn_escalar_sync_merged_enabled()) + Parallel::Sync_merged(PatL, VarList, Symmetry); + else + Parallel::Sync(PatL, VarList, Symmetry); +} + +bool bssn_escalar_timing_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_ESCALAR_STEP_TIMING"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +void bssn_escalar_timing_report(int myrank, int lev, int YN, double total, double rhs, + double sync, double bh, double analysis, double swap, + double resident, double rp) +{ + if (!bssn_escalar_timing_enabled()) + return; + + double local[8] = {total, rhs, sync, bh, analysis, swap, resident, rp}; + double maxv[8] = {}; + MPI_Reduce(local, maxv, 8, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + if (myrank == 0) + fprintf(stderr, + "[AMSS-ESCALAR-STEP] lev=%d YN=%d total=%.6f rhs=%.6f sync=%.6f " + "bh=%.6f analysis=%.6f swap=%.6f resident=%.6f rp=%.6f other=%.6f\n", + lev, YN, maxv[0], maxv[1], maxv[2], maxv[3], maxv[4], maxv[5], + maxv[6], maxv[7], + maxv[0] - maxv[1] - maxv[2] - maxv[3] - maxv[4] - maxv[5] - maxv[6] - maxv[7]); +} + +void bssn_escalar_cuda_download_level_state(MyList *PatL, MyList *vars, + int myrank, bool release_ctx) +{ + MyList *Pp = PatL; + while (Pp) + { + MyList *BP = Pp->data->blb; + while (BP) + { + Block *cg = BP->data; + if (myrank == cg->rank && bssn_cuda_has_resident_state(cg)) + { + double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT]; + if (!fill_bssn_escalar_cuda_views(cg, vars, state_out)) + { + cout << "CUDA BSSN-EScalar resident state list mismatch during download" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + if (bssn_escalar_cuda_download_resident_state(cg, cg->shape, state_out)) + { + cout << "CUDA BSSN-EScalar resident state download failed" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + if (release_ctx) + bssn_cuda_release_step_ctx(cg); + } + if (BP == Pp->data->ble) + break; + BP = BP->next; + } + Pp = Pp->next; + } +} +#endif +} + +//================================================================================================ + +// Define bssnEScalar_class // It inherits some members and methods from the parent class bssn_class and modifies others. // The modified members and methods are defined below (and in the header bssnEScalar_class.h). @@ -177,11 +326,16 @@ void bssnEScalar_class::Initialize() //================================================================================================ -bssnEScalar_class::~bssnEScalar_class() -{ - delete Sphio; - delete Spio; - delete Sphi0; +bssnEScalar_class::~bssnEScalar_class() +{ +#if USE_CUDA_BSSN + for (int lev = 0; GH && lev < GH->levels; ++lev) + bssn_escalar_cuda_download_level_state(GH->PatL[lev], StateList, myrank, true); +#endif + + delete Sphio; + delete Spio; + delete Sphi0; delete Spi0; delete Sphi; delete Spi; @@ -707,7 +861,12 @@ void bssnEScalar_class::Read_Pablo() void bssnEScalar_class::Step(int lev, int YN) { - double dT_lev = dT * pow(0.5, Mymax(lev, trfls)); + double dT_lev = dT * pow(0.5, Mymax(lev, trfls)); +#if USE_CUDA_BSSN + const bool use_cuda_resident_sync = bssn_escalar_cuda_use_resident_sync(lev); +#else + const bool use_cuda_resident_sync = false; +#endif #ifdef With_AHF AH_Step_Find(lev, dT_lev); #endif @@ -716,13 +875,23 @@ void bssnEScalar_class::Step(int lev, int YN) if (lev < GH->movls) ndeps = numepsb; double TRK4 = PhysTime; - int iter_count = 0; // count RK4 substeps - int pre = 0, cor = 1; - int ERROR = 0; - - MyList *sPp; - // Predictor - MyList *Pp = GH->PatL[lev]; + int iter_count = 0; // count RK4 substeps + int pre = 0, cor = 1; + int ERROR = 0; + const bool escalar_step_timing = bssn_escalar_timing_enabled(); + const double escalar_step_t0 = escalar_step_timing ? MPI_Wtime() : 0.0; + double escalar_t_rhs = 0.0; + double escalar_t_sync = 0.0; + double escalar_t_bh = 0.0; + double escalar_t_analysis = 0.0; + double escalar_t_swap = 0.0; + double escalar_t_resident = 0.0; + double escalar_t_rp = 0.0; + + MyList *sPp; + // Predictor + double escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0; + MyList *Pp = GH->PatL[lev]; while (Pp) { MyList *BP = Pp->data->blb; @@ -731,15 +900,60 @@ void bssnEScalar_class::Step(int lev, int YN) Block *cg = BP->data; if (myrank == cg->rank) { -#if (AGM == 0) - f_enforce_ga(cg->shape, - cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], - cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], - cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], - cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]); -#endif - - if (f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], +#if (AGM == 0) +#if !USE_CUDA_BSSN + f_enforce_ga(cg->shape, + cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], + cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], + cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], + cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]); +#endif +#endif + + bool used_gpu_substep = false; +#if USE_CUDA_BSSN + { + double *state_in[BSSN_ESCALAR_CUDA_STATE_COUNT]; + double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT]; + double propspeed[BSSN_ESCALAR_CUDA_STATE_COUNT]; + double soa_flat[3 * BSSN_ESCALAR_CUDA_STATE_COUNT]; + if (!fill_bssn_escalar_cuda_views(cg, StateList, state_in, propspeed, soa_flat) || + !fill_bssn_escalar_cuda_views(cg, SynchList_pre, state_out)) + { + cout << "CUDA BSSN-EScalar state list mismatch on predictor step" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + int apply_bam_bc = 0; + int apply_enforce_ga = 0; +#if (AGM == 0) + apply_enforce_ga = 1; +#endif +#if (SommerType == 0) +#ifndef WithShell + apply_bam_bc = (lev == 0) ? 1 : 0; +#endif +#endif + int keep_resident_state = use_cuda_resident_sync ? 1 : 0; + if (bssn_escalar_cuda_rk4_substep(cg, + cg->shape, cg->X[0], cg->X[1], cg->X[2], + state_in, state_out, + propspeed, soa_flat, Pp->data->bbox, + dT_lev, TRK4, iter_count, apply_bam_bc, + Symmetry, lev, ndeps, pre, + keep_resident_state, apply_enforce_ga, chitiny)) + { + cout << "CUDA BSSN-EScalar predictor substep failed in domain: (" + << cg->bbox[0] << ":" << cg->bbox[3] << "," + << cg->bbox[1] << ":" << cg->bbox[4] << "," + << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; + ERROR = 1; + } + used_gpu_substep = true; + } +#endif + + if (!used_gpu_substep && + f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn], cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], @@ -783,9 +997,11 @@ void bssnEScalar_class::Step(int lev, int YN) ERROR = 1; } - // rk4 substep and boundary - { - MyList *varl0 = StateList, *varl = SynchList_pre, *varlrhs = RHSList; // we do not check the correspondence here + if (!used_gpu_substep) + { + // rk4 substep and boundary + { + MyList *varl0 = StateList, *varl = SynchList_pre, *varlrhs = RHSList; // we do not check the correspondence here while (varl0) { #ifndef WithShell @@ -820,9 +1036,10 @@ void bssnEScalar_class::Step(int lev, int YN) varl = varl->next; varlrhs = varlrhs->next; } - } - f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny); - } + } + f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny); + } + } if (BP == Pp->data->ble) break; BP = BP->next; @@ -834,19 +1051,21 @@ void bssnEScalar_class::Step(int lev, int YN) int erh = ERROR; MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); } - if (ERROR) - { + if (ERROR) + { Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev); if (myrank == 0) { if (ErrorMonitor->outfile) ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime << ", lev = " << lev << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - } - -#ifdef WithShell + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + if (escalar_step_timing) + escalar_t_rhs += MPI_Wtime() - escalar_t0; + +#ifdef WithShell // evolve Shell Patches if (lev == 0) { @@ -993,7 +1212,14 @@ void bssnEScalar_class::Step(int lev, int YN) } #endif - Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]); + escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0; +#if USE_CUDA_BSSN + bssn_escalar_sync_level(GH->PatL[lev], SynchList_pre, Symmetry); +#else + Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry); +#endif + if (escalar_step_timing) + escalar_t_sync += MPI_Wtime() - escalar_t0; #ifdef WithShell if (lev == 0) @@ -1013,10 +1239,14 @@ void bssnEScalar_class::Step(int lev, int YN) } #endif - // for black hole position - if (BH_num > 0 && lev == GH->levels - 1) - { - compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev); + // for black hole position + if (BH_num > 0 && lev == GH->levels - 1) + { + escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0; +#if USE_CUDA_BSSN + (void)use_cuda_resident_sync; +#endif + compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev); for (int ithBH = 0; ithBH < BH_num; ithBH++) { f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count); @@ -1041,19 +1271,29 @@ void bssnEScalar_class::Step(int lev, int YN) DG_List->insert(Sfy0); DG_List->insert(Sfz0); Parallel::Dump_Data(GH->PatL[lev], DG_List, 0, PhysTime, dT_lev); - DG_List->clearList(); - } - } - } + DG_List->clearList(); + } + } + if (escalar_step_timing) + escalar_t_bh += MPI_Wtime() - escalar_t0; + } // data analysis part // Warning NOTE: the variables1 are used as temp storege room - if (lev == a_lev) - { - AnalysisStuff_EScalar(lev, dT_lev); - } - // corrector - for (iter_count = 1; iter_count < 4; iter_count++) - { + if (lev == a_lev) + { + escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0; +#if USE_CUDA_BSSN + if (use_cuda_resident_sync) + bssn_escalar_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false); +#endif + AnalysisStuff_EScalar(lev, dT_lev); + if (escalar_step_timing) + escalar_t_analysis += MPI_Wtime() - escalar_t0; + } + // corrector + for (iter_count = 1; iter_count < 4; iter_count++) + { + escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0; // for RK4: t0, t0+dt/2, t0+dt/2, t0+dt; if (iter_count == 1 || iter_count == 3) TRK4 += dT_lev / 2; @@ -1066,22 +1306,67 @@ void bssnEScalar_class::Step(int lev, int YN) Block *cg = BP->data; if (myrank == cg->rank) { -#if (AGM == 0) - f_enforce_ga(cg->shape, - cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], - cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], - cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], - cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]); -#elif (AGM == 1) - if (iter_count == 3) - f_enforce_ga(cg->shape, +#if (AGM == 0) +#if !USE_CUDA_BSSN + f_enforce_ga(cg->shape, + cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], + cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], + cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], + cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]); +#endif +#elif (AGM == 1) + if (iter_count == 3) + f_enforce_ga(cg->shape, cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]); -#endif - - if (f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], +#endif + + bool used_gpu_substep = false; +#if USE_CUDA_BSSN + { + double *state_in[BSSN_ESCALAR_CUDA_STATE_COUNT]; + double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT]; + double propspeed[BSSN_ESCALAR_CUDA_STATE_COUNT]; + double soa_flat[3 * BSSN_ESCALAR_CUDA_STATE_COUNT]; + if (!fill_bssn_escalar_cuda_views(cg, SynchList_pre, state_in, propspeed, soa_flat) || + !fill_bssn_escalar_cuda_views(cg, SynchList_cor, state_out)) + { + cout << "CUDA BSSN-EScalar state list mismatch on corrector step" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + int apply_bam_bc = 0; + int apply_enforce_ga = 0; +#if (AGM == 0) + apply_enforce_ga = 1; +#endif +#if (SommerType == 0) +#ifndef WithShell + apply_bam_bc = (lev == 0) ? 1 : 0; +#endif +#endif + int keep_resident_state = use_cuda_resident_sync ? 1 : 0; + if (bssn_escalar_cuda_rk4_substep(cg, + cg->shape, cg->X[0], cg->X[1], cg->X[2], + state_in, state_out, + propspeed, soa_flat, Pp->data->bbox, + dT_lev, TRK4, iter_count, apply_bam_bc, + Symmetry, lev, ndeps, cor, + keep_resident_state, apply_enforce_ga, chitiny)) + { + cout << "CUDA BSSN-EScalar corrector substep failed in domain: (" + << cg->bbox[0] << ":" << cg->bbox[3] << "," + << cg->bbox[1] << ":" << cg->bbox[4] << "," + << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; + ERROR = 1; + } + used_gpu_substep = true; + } +#endif + + if (!used_gpu_substep && + f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn], cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], @@ -1125,9 +1410,11 @@ void bssnEScalar_class::Step(int lev, int YN) << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; ERROR = 1; } - // rk4 substep and boundary - { - MyList *varl0 = StateList, *varl = SynchList_pre, *varl1 = SynchList_cor, *varlrhs = RHSList; + if (!used_gpu_substep) + { + // rk4 substep and boundary + { + MyList *varl0 = StateList, *varl = SynchList_pre, *varl1 = SynchList_cor, *varlrhs = RHSList; // we do not check the correspondence here while (varl0) @@ -1165,9 +1452,10 @@ void bssnEScalar_class::Step(int lev, int YN) varl1 = varl1->next; varlrhs = varlrhs->next; } - } - f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny); - } + } + f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny); + } + } if (BP == Pp->data->ble) break; BP = BP->next; @@ -1180,8 +1468,8 @@ void bssnEScalar_class::Step(int lev, int YN) int erh = ERROR; MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); } - if (ERROR) - { + if (ERROR) + { Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev); if (myrank == 0) { @@ -1189,11 +1477,13 @@ void bssnEScalar_class::Step(int lev, int YN) ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count << " variables at t = " << PhysTime << ", lev = " << lev << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - } - -#ifdef WithShell + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + if (escalar_step_timing) + escalar_t_rhs += MPI_Wtime() - escalar_t0; + +#ifdef WithShell // evolve Shell Patches if (lev == 0) { @@ -1349,7 +1639,14 @@ void bssnEScalar_class::Step(int lev, int YN) } #endif - Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]); + escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0; +#if USE_CUDA_BSSN + bssn_escalar_sync_level(GH->PatL[lev], SynchList_cor, Symmetry); +#else + Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry); +#endif + if (escalar_step_timing) + escalar_t_sync += MPI_Wtime() - escalar_t0; #ifdef WithShell if (lev == 0) @@ -1368,10 +1665,14 @@ void bssnEScalar_class::Step(int lev, int YN) } } #endif - // for black hole position - if (BH_num > 0 && lev == GH->levels - 1) - { - compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev); + // for black hole position + if (BH_num > 0 && lev == GH->levels - 1) + { + escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0; +#if USE_CUDA_BSSN + (void)use_cuda_resident_sync; +#endif + compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev); for (int ithBH = 0; ithBH < BH_num; ithBH++) { f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg1[ithBH][0], Porg_rhs[ithBH][0], iter_count); @@ -1396,14 +1697,17 @@ void bssnEScalar_class::Step(int lev, int YN) DG_List->insert(Sfy0); DG_List->insert(Sfz0); Parallel::Dump_Data(GH->PatL[lev], DG_List, 0, PhysTime, dT_lev); - DG_List->clearList(); - } - } - } - // swap time level - if (iter_count < 3) - { - Pp = GH->PatL[lev]; + DG_List->clearList(); + } + } + if (escalar_step_timing) + escalar_t_bh += MPI_Wtime() - escalar_t0; + } + // swap time level + if (iter_count < 3) + { + escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0; + Pp = GH->PatL[lev]; while (Pp) { MyList *BP = Pp->data->blb; @@ -1444,16 +1748,32 @@ void bssnEScalar_class::Step(int lev, int YN) Porg[ithBH][0] = Porg1[ithBH][0]; Porg[ithBH][1] = Porg1[ithBH][1]; Porg[ithBH][2] = Porg1[ithBH][2]; - } - } - } - } - -#if (RPS == 0) - // mesh refinement boundary part - RestrictProlong(lev, YN, BB); - -#ifdef WithShell + } + } + if (escalar_step_timing) + escalar_t_swap += MPI_Wtime() - escalar_t0; + } + } + +#if USE_CUDA_BSSN + if (use_cuda_resident_sync) + { + escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0; + if (!bssn_escalar_cuda_keep_resident_after_step(lev, trfls, a_lev)) + bssn_escalar_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, true); + if (escalar_step_timing) + escalar_t_resident += MPI_Wtime() - escalar_t0; + } +#endif + +#if (RPS == 0) + // mesh refinement boundary part + escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0; + RestrictProlong(lev, YN, BB); + if (escalar_step_timing) + escalar_t_rp += MPI_Wtime() - escalar_t0; + +#ifdef WithShell if (lev == 0) { clock_t prev_clock, curr_clock; @@ -1477,8 +1797,9 @@ void bssnEScalar_class::Step(int lev, int YN) // StateList 0 ----------- // // OldStateList old ----------- - // update - Pp = GH->PatL[lev]; + // update + escalar_t0 = escalar_step_timing ? MPI_Wtime() : 0.0; + Pp = GH->PatL[lev]; while (Pp) { MyList *BP = Pp->data->blb; @@ -1520,10 +1841,18 @@ void bssnEScalar_class::Step(int lev, int YN) { Porg0[ithBH][0] = Porg1[ithBH][0]; Porg0[ithBH][1] = Porg1[ithBH][1]; - Porg0[ithBH][2] = Porg1[ithBH][2]; - } - } -} + Porg0[ithBH][2] = Porg1[ithBH][2]; + } + } + if (escalar_step_timing) + { + escalar_t_swap += MPI_Wtime() - escalar_t0; + bssn_escalar_timing_report(myrank, lev, YN, MPI_Wtime() - escalar_step_t0, + escalar_t_rhs, escalar_t_sync, escalar_t_bh, + escalar_t_analysis, escalar_t_swap, + escalar_t_resident, escalar_t_rp); + } +} //================================================================================================ @@ -2074,14 +2403,44 @@ void bssnEScalar_class::Constraint_Out() MyList *BP = Pp->data->blb; while (BP) { - Block *cg = BP->data; - if (myrank == cg->rank) - { - if (lev > 0) - f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], - cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn], - cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], - cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], + Block *cg = BP->data; + if (myrank == cg->rank) + { + bool used_cuda_constraints = false; +#if USE_CUDA_BSSN + { + double *state_in[BSSN_ESCALAR_CUDA_STATE_COUNT]; + if (!fill_bssn_escalar_cuda_views(cg, StateList, state_in)) + { + cout << "CUDA BSSN-EScalar constraint state list mismatch" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + double *constraint_out[8] = { + cg->fgfs[Cons_Ham->sgfn], cg->fgfs[Cons_Px->sgfn], + cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn], + cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], + cg->fgfs[Cons_Gz->sgfn], cg->fgfs[Cons_fR->sgfn]}; + int lev_arg = lev; + int sym_arg = Symmetry; + double eps_arg = ndeps; + if (bssn_escalar_cuda_compute_constraints(cg->shape, cg->X[0], cg->X[1], cg->X[2], + state_in, constraint_out, + sym_arg, lev_arg, eps_arg)) + { + cout << "CUDA BSSN-EScalar constraint compute failed in domain: (" + << cg->bbox[0] << ":" << cg->bbox[3] << "," + << cg->bbox[1] << ":" << cg->bbox[4] << "," + << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + used_cuda_constraints = true; + } +#endif + if (!used_cuda_constraints && lev > 0) + f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], + cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn], + cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], + cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn], cg->fgfs[Gmx0->sgfn], cg->fgfs[Gmy0->sgfn], cg->fgfs[Gmz0->sgfn], @@ -2110,15 +2469,16 @@ void bssnEScalar_class::Constraint_Out() cg->fgfs[Gamzyy->sgfn], cg->fgfs[Gamzyz->sgfn], cg->fgfs[Gamzzz->sgfn], cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn], cg->fgfs[Ryy->sgfn], cg->fgfs[Ryz->sgfn], cg->fgfs[Rzz->sgfn], - cg->fgfs[Cons_Ham->sgfn], - cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn], - cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn], - Symmetry, lev, ndeps, pre); - f_compute_constraint_fr(cg->shape, cg->X[0], cg->X[1], cg->X[2], - cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn], - cg->fgfs[rho->sgfn], cg->fgfs[Sphi0->sgfn], - cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], - cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], + cg->fgfs[Cons_Ham->sgfn], + cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn], + cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn], + Symmetry, lev, ndeps, pre); + if (!used_cuda_constraints) + f_compute_constraint_fr(cg->shape, cg->X[0], cg->X[1], cg->X[2], + cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn], + cg->fgfs[rho->sgfn], cg->fgfs[Sphi0->sgfn], + cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], + cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn], cg->fgfs[Rxx->sgfn], cg->fgfs[Rxy->sgfn], cg->fgfs[Rxz->sgfn], diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index 04576c3..123a175 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -70,6 +70,125 @@ int amss_analysis_map_every() return every; } +bool amss_rp_timing_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_RP_TIMING"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +bool amss_rp_detail_timing_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_RP_DETAIL_TIMING"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +bool amss_env_flag_enabled(const char *name) +{ + const char *env = getenv(name); + return env && atoi(env) != 0; +} + +bool amss_cached_rp_restrict_enabled() +{ + static int enabled = -1; + if (enabled < 0) + enabled = amss_env_flag_enabled("AMSS_RP_CACHED_RESTRICT") ? 1 : 0; + return enabled != 0; +} + +bool amss_cached_rp_outbd_enabled() +{ + static int enabled = -1; + if (enabled < 0) + enabled = amss_env_flag_enabled("AMSS_RP_CACHED_OUTBD") ? 1 : 0; + return enabled != 0; +} + +bool amss_cached_rp_fine_sync_enabled() +{ + static int enabled = -1; + if (enabled < 0) + enabled = amss_env_flag_enabled("AMSS_RP_CACHED_FINE_SYNC") ? 1 : 0; + return enabled != 0; +} + +bool amss_cached_rp_coarse_sync_enabled() +{ + static int enabled = -1; + if (enabled < 0) + enabled = amss_env_flag_enabled("AMSS_RP_CACHED_COARSE_SYNC") ? 1 : 0; + return enabled != 0; +} + +bool amss_rp_skip_coarse_sync_enabled() +{ + static int enabled = -1; + if (enabled < 0) + enabled = amss_env_flag_enabled("AMSS_RP_SKIP_COARSE_SYNC") ? 1 : 0; + return enabled != 0; +} + +bool amss_evolve_timing_enabled() +{ + static int enabled = -1; + if (enabled < 0) + enabled = amss_env_flag_enabled("AMSS_EVOLVE_TIMING") ? 1 : 0; + return enabled != 0; +} + +struct AmssEvolveTimingStats +{ + double step; + double rp; + double regrid; + double constraint; +}; + +AmssEvolveTimingStats &amss_evolve_timing_stats() +{ + static AmssEvolveTimingStats stats = {}; + return stats; +} + +void amss_evolve_timing_reset() +{ + AmssEvolveTimingStats &stats = amss_evolve_timing_stats(); + stats.step = 0.0; + stats.rp = 0.0; + stats.regrid = 0.0; + stats.constraint = 0.0; +} + +void amss_evolve_timing_add_step(double sec) +{ + amss_evolve_timing_stats().step += sec; +} + +void amss_evolve_timing_add_rp(double sec) +{ + amss_evolve_timing_stats().rp += sec; +} + +void amss_evolve_timing_add_regrid(double sec) +{ + amss_evolve_timing_stats().regrid += sec; +} + +void amss_evolve_timing_add_constraint(double sec) +{ + amss_evolve_timing_stats().constraint += sec; +} + } // Compile-time switch for per-timestep memory usage collection/printing. @@ -288,6 +407,37 @@ bool fill_bssn_cuda_views(Block *cg, MyList *vars, return idx == BSSN_CUDA_STATE_COUNT && vars == 0; } +int count_bssn_cuda_state_list(MyList *vars) +{ + int count = 0; + while (vars) + { + ++count; + vars = vars->next; + if (count > BSSN_ESCALAR_CUDA_STATE_COUNT) + return -1; + } + return count; +} + +bool fill_bssn_cuda_views_count(Block *cg, MyList *vars, + int state_count, + double **host_views) +{ + if (!cg || !host_views || + (state_count != BSSN_CUDA_STATE_COUNT && + state_count != BSSN_ESCALAR_CUDA_STATE_COUNT)) + return false; + int idx = 0; + while (vars && idx < state_count) + { + host_views[idx] = cg->fgfs[vars->data->sgfn]; + vars = vars->next; + ++idx; + } + return idx == state_count && vars == 0; +} + bool bssn_cuda_use_resident_sync(int lev) { #ifdef WithShell @@ -467,6 +617,11 @@ bool bssn_cuda_interp_bh_point_resident(MyList *PatL, block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn) { var *vars[3] = {forx, fory, forz}; + double *bh_host_key[3] = { + block->fgfs[forx->sgfn], + block->fgfs[fory->sgfn], + block->fgfs[forz->sgfn] + }; double soa3[9]; for (int f = 0; f < 3; f++) { @@ -482,6 +637,7 @@ bool bssn_cuda_interp_bh_point_resident(MyList *PatL, DH[0], DH[1], DH[2], x, y, z, interp_ordn, interp_sym, + bh_host_key, soa3, shellf) != 0) { const int sx = ordn; @@ -552,6 +708,7 @@ bool bssn_cuda_interp_bh_point_resident(MyList *PatL, void bssn_cuda_download_level_state(MyList *PatL, MyList *vars, int myrank, bool release_ctx) { + const int state_count = count_bssn_cuda_state_list(vars); MyList *Pp = PatL; while (Pp) { @@ -561,13 +718,16 @@ void bssn_cuda_download_level_state(MyList *PatL, MyList *vars, int Block *cg = BP->data; if (myrank == cg->rank && bssn_cuda_has_resident_state(cg)) { - double *state_out[BSSN_CUDA_STATE_COUNT]; - if (!fill_bssn_cuda_views(cg, vars, state_out)) + double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT]; + if (!fill_bssn_cuda_views_count(cg, vars, state_count, state_out)) { cout << "CUDA BSSN state list mismatch on resident state download" << endl; MPI_Abort(MPI_COMM_WORLD, 1); } - if (bssn_cuda_download_resident_state(cg, cg->shape, state_out)) + const int rc = (state_count == BSSN_ESCALAR_CUDA_STATE_COUNT) + ? bssn_escalar_cuda_download_resident_state(cg, cg->shape, state_out) + : bssn_cuda_download_resident_state(cg, cg->shape, state_out); + if (rc) { cout << "CUDA resident state download failed" << endl; MPI_Abort(MPI_COMM_WORLD, 1); @@ -585,6 +745,7 @@ void bssn_cuda_download_level_state(MyList *PatL, MyList *vars, int void bssn_cuda_download_level_state_if_present(MyList *PatL, MyList *vars, int myrank) { + const int state_count = count_bssn_cuda_state_list(vars); MyList *Pp = PatL; while (Pp) { @@ -594,13 +755,13 @@ void bssn_cuda_download_level_state_if_present(MyList *PatL, MyList Block *cg = BP->data; if (myrank == cg->rank && bssn_cuda_has_resident_state(cg)) { - double *state_out[BSSN_CUDA_STATE_COUNT]; - if (!fill_bssn_cuda_views(cg, vars, state_out)) + double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT]; + if (!fill_bssn_cuda_views_count(cg, vars, state_count, state_out)) { cout << "CUDA BSSN state list mismatch on resident state conditional download" << endl; MPI_Abort(MPI_COMM_WORLD, 1); } - if (bssn_cuda_download_resident_state_if_present(cg, cg->shape, state_out)) + if (bssn_cuda_download_resident_state_count_if_present(cg, cg->shape, state_out, state_count)) { cout << "CUDA resident state conditional download failed" << endl; MPI_Abort(MPI_COMM_WORLD, 1); @@ -2890,6 +3051,10 @@ void bssn_class::Evolve(int Steps) for (int ncount = 1; ncount < Steps + 1; ncount++) { + const bool evolve_timing = amss_evolve_timing_enabled(); + const double evolve_t0 = evolve_timing ? MPI_Wtime() : 0.0; + if (evolve_timing) + amss_evolve_timing_reset(); cuda_level0_constraint_cache_valid = false; #if BSSN_FINE_TIMING step_timing::reset(); @@ -2918,9 +3083,12 @@ void bssn_class::Evolve(int Steps) // misc::tillherecheck("before Constraint_Out"); + const double constraint_t0 = evolve_timing ? MPI_Wtime() : 0.0; STEP_TIMER_DECL(timer_constraint_out); Constraint_Out(); // this will affect the Dump_List STEP_TIMER_ADD(TB_CONSTRAINT_OUT, timer_constraint_out); + if (evolve_timing) + amss_evolve_timing_add_constraint(MPI_Wtime() - constraint_t0); LastDump += dT_mon; Last2dDump += dT_mon; @@ -3093,6 +3261,22 @@ void bssn_class::Evolve(int Steps) if (ncount % BSSN_FINE_TIMING_EVERY == 0) rhs_kernel_timing_report::report(myrank, nprocs, ncount, MPI_Wtime() - step_wall_start); #endif + if (evolve_timing) + { + const AmssEvolveTimingStats &stats = amss_evolve_timing_stats(); + const double local[4] = {stats.step, stats.rp, stats.regrid, stats.constraint}; + double maxv[4] = {}; + MPI_Reduce((void *)local, maxv, 4, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + if (myrank == 0) + { + const double wall = MPI_Wtime() - evolve_t0; + const double known = maxv[0] + maxv[1] + maxv[2] + maxv[3]; + fprintf(stderr, + "[AMSS-EVOLVE-TIMING] step=%d wall=%.6f step_fn=%.6f rp=%.6f " + "regrid=%.6f constraint=%.6f other=%.6f\n", + ncount, wall, maxv[0], maxv[1], maxv[2], maxv[3], wall - known); + } + } } /* #ifdef With_AHF @@ -3162,7 +3346,11 @@ void bssn_class::RecursiveStep(int lev) { // if(myrank==0) cout<<"level now = "<levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } #endif } + if (evolve_timing) + amss_evolve_timing_add_regrid(MPI_Wtime() - regrid_t0); STEP_TIMER_ADD(TB_REGRID, timer_regrid_onelevel); #endif } @@ -6847,6 +7042,15 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, // // SynchList_cor old ----------- { + const bool rp_runtime_timing = amss_rp_timing_enabled(); + const double rp_runtime_start = rp_runtime_timing ? MPI_Wtime() : 0.0; + const bool rp_detail_timing = amss_rp_detail_timing_enabled(); + double rp_t_prepare = 0.0; + double rp_t_restrict = 0.0; + double rp_t_coarse_sync = 0.0; + double rp_t_outbd = 0.0; + double rp_t_fine_sync = 0.0; + double rp_t0 = 0.0; STEP_TIMER_DECL(timer_restrict_prolong); #if (PSTR == 1 || PSTR == 2) // stringstream a_stream; @@ -6858,6 +7062,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, MyList *Pp, *Ppc; if (lev > trfls && YN == 0) // time refinement levels and for intermediat time level { + if (rp_detail_timing) rp_t0 = MPI_Wtime(); Pp = GH->PatL[lev - 1]; while (Pp) { @@ -6873,6 +7078,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, #endif Pp = Pp->next; } + if (rp_detail_timing) rp_t_prepare += MPI_Wtime() - rp_t0; #if (PSTR == 1 || PSTR == 2) // Pp=GH->PatL[lev]; @@ -6889,14 +7095,18 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, #endif #if (RPB == 0) + if (rp_detail_timing) rp_t0 = MPI_Wtime(); #if (ABEtype == 1 || ABEtype == 2) Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry); #else Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]); #endif + if (rp_detail_timing) rp_t_restrict += MPI_Wtime() - rp_t0; #elif (RPB == 1) + if (rp_detail_timing) rp_t0 = MPI_Wtime(); // Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SynchList_pre,Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry); + if (rp_detail_timing) rp_t_restrict += MPI_Wtime() - rp_t0; #endif #if (PSTR == 1 || PSTR == 2) @@ -6907,10 +7117,14 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, #endif #if (ABEtype == 1 || ABEtype == 2) + if (rp_detail_timing) rp_t0 = MPI_Wtime(); Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry); + if (rp_detail_timing) rp_t_coarse_sync += MPI_Wtime() - rp_t0; #else #if (RP_SYNC_COARSE_AFTER_RESTRICT == 1) + if (rp_detail_timing) rp_t0 = MPI_Wtime(); Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]); + if (rp_detail_timing) rp_t_coarse_sync += MPI_Wtime() - rp_t0; #endif #endif @@ -6922,6 +7136,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, #endif #if (RPB == 0) + if (rp_detail_timing) rp_t0 = MPI_Wtime(); #if (MIXOUTB == 0) #if (ABEtype == 1 || ABEtype == 2) Ppc = GH->PatL[lev - 1]; @@ -6941,9 +7156,12 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, #elif (MIXOUTB == 1) Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry); #endif + if (rp_detail_timing) rp_t_outbd += MPI_Wtime() - rp_t0; #elif (RPB == 1) + if (rp_detail_timing) rp_t0 = MPI_Wtime(); // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SL,Symmetry); Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, GH->bdsul[lev], Symmetry); + if (rp_detail_timing) rp_t_outbd += MPI_Wtime() - rp_t0; #endif #if (PSTR == 1 || PSTR == 2) @@ -6964,14 +7182,18 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, #endif #if (RPB == 0) + if (rp_detail_timing) rp_t0 = MPI_Wtime(); #if (ABEtype == 1 || ABEtype == 2) Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); #else Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]); #endif + if (rp_detail_timing) rp_t_restrict += MPI_Wtime() - rp_t0; #elif (RPB == 1) + if (rp_detail_timing) rp_t0 = MPI_Wtime(); // Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry); Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry); + if (rp_detail_timing) rp_t_restrict += MPI_Wtime() - rp_t0; #endif #if (PSTR == 1 || PSTR == 2) @@ -6982,10 +7204,14 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, #endif #if (ABEtype == 1 || ABEtype == 2) + if (rp_detail_timing) rp_t0 = MPI_Wtime(); Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry); + if (rp_detail_timing) rp_t_coarse_sync += MPI_Wtime() - rp_t0; #else #if (RP_SYNC_COARSE_AFTER_RESTRICT == 1) + if (rp_detail_timing) rp_t0 = MPI_Wtime(); Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]); + if (rp_detail_timing) rp_t_coarse_sync += MPI_Wtime() - rp_t0; #endif #endif @@ -6997,6 +7223,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, #endif #if (RPB == 0) + if (rp_detail_timing) rp_t0 = MPI_Wtime(); #if (MIXOUTB == 0) #if (ABEtype == 1 || ABEtype == 2) Ppc = GH->PatL[lev - 1]; @@ -7016,9 +7243,12 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, #elif (MIXOUTB == 1) Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); #endif + if (rp_detail_timing) rp_t_outbd += MPI_Wtime() - rp_t0; #elif (RPB == 1) + if (rp_detail_timing) rp_t0 = MPI_Wtime(); // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry); Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry); + if (rp_detail_timing) rp_t_outbd += MPI_Wtime() - rp_t0; #endif #if (PSTR == 1 || PSTR == 2) @@ -7030,9 +7260,13 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, } #if (ABEtype == 1 || ABEtype == 2) + if (rp_detail_timing) rp_t0 = MPI_Wtime(); Parallel::Sync(GH->PatL[lev], SL, Symmetry); + if (rp_detail_timing) rp_t_fine_sync += MPI_Wtime() - rp_t0; #else + if (rp_detail_timing) rp_t0 = MPI_Wtime(); Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]); + if (rp_detail_timing) rp_t_fine_sync += MPI_Wtime() - rp_t0; #endif #if (PSTR == 1 || PSTR == 2) @@ -7042,6 +7276,27 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, // misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str()); #endif } + if (rp_runtime_timing) + { + const double local_sec = MPI_Wtime() - rp_runtime_start; + double max_sec = 0.0; + MPI_Reduce((void *)&local_sec, &max_sec, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + if (myrank == 0) + fprintf(stderr, "[AMSS-RP-TIMING] lev=%d YN=%d BB=%d sec=%.6f\n", + lev, YN, BB ? 1 : 0, max_sec); + } + if (rp_detail_timing) + { + double local_detail[5] = {rp_t_prepare, rp_t_restrict, rp_t_coarse_sync, rp_t_outbd, rp_t_fine_sync}; + double max_detail[5] = {}; + MPI_Reduce(local_detail, max_detail, 5, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + if (myrank == 0) + fprintf(stderr, + "[AMSS-RP-DETAIL] lev=%d YN=%d BB=%d prepare=%.6f restrict=%.6f " + "coarse_sync=%.6f outbd=%.6f fine_sync=%.6f\n", + lev, YN, BB ? 1 : 0, max_detail[0], max_detail[1], + max_detail[2], max_detail[3], max_detail[4]); + } STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong); } @@ -7229,7 +7484,10 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB) #if (RPB == 0) #if (ABEtype == 1 || ABEtype == 2) - Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry); + if (amss_cached_rp_restrict_enabled()) + Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry, sync_cache_restrict[lev]); + else + Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry); #else Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry, sync_cache_restrict[lev]); #endif @@ -7239,7 +7497,13 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB) #endif #if (ABEtype == 1 || ABEtype == 2) - Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry); + if (amss_rp_skip_coarse_sync_enabled()) + { + } + else if (amss_cached_rp_coarse_sync_enabled()) + Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]); + else + Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry); #else #if (RP_SYNC_COARSE_AFTER_RESTRICT == 1) Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]); @@ -7249,16 +7513,23 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB) #if (RPB == 0) #if (MIXOUTB == 0) #if (ABEtype == 1 || ABEtype == 2) - Ppc = GH->PatL[lev - 1]; - while (Ppc) + if (amss_cached_rp_outbd_enabled()) { - Pp = GH->PatL[lev]; - while (Pp) + Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]); + } + else + { + Ppc = GH->PatL[lev - 1]; + while (Ppc) { - Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry); - Pp = Pp->next; + Pp = GH->PatL[lev]; + while (Pp) + { + Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry); + Pp = Pp->next; + } + Ppc = Ppc->next; } - Ppc = Ppc->next; } #else Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]); @@ -7277,7 +7548,10 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB) cout << "===: " << GH->Lt[lev - 1] << "," << GH->Lt[lev] + dT_lev << endl; #if (RPB == 0) #if (ABEtype == 1 || ABEtype == 2) - Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry); + if (amss_cached_rp_restrict_enabled()) + Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry, sync_cache_restrict[lev]); + else + Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry); #else Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry, sync_cache_restrict[lev]); #endif @@ -7287,7 +7561,13 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB) #endif #if (ABEtype == 1 || ABEtype == 2) - Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry); + if (amss_rp_skip_coarse_sync_enabled()) + { + } + else if (amss_cached_rp_coarse_sync_enabled()) + Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]); + else + Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry); #else #if (RP_SYNC_COARSE_AFTER_RESTRICT == 1) Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]); @@ -7297,16 +7577,23 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB) #if (RPB == 0) #if (MIXOUTB == 0) #if (ABEtype == 1 || ABEtype == 2) - Ppc = GH->PatL[lev - 1]; - while (Ppc) + if (amss_cached_rp_outbd_enabled()) { - Pp = GH->PatL[lev]; - while (Pp) + Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]); + } + else + { + Ppc = GH->PatL[lev - 1]; + while (Ppc) { - Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry); - Pp = Pp->next; + Pp = GH->PatL[lev]; + while (Pp) + { + Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry); + Pp = Pp->next; + } + Ppc = Ppc->next; } - Ppc = Ppc->next; } #else Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]); @@ -7321,7 +7608,10 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB) } #if (ABEtype == 1 || ABEtype == 2) - Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry); + if (amss_cached_rp_fine_sync_enabled()) + Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]); + else + Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry); #else Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]); #endif diff --git a/AMSS_NCKU_source/bssn_class.h b/AMSS_NCKU_source/bssn_class.h index f799dc0..d390bd1 100644 --- a/AMSS_NCKU_source/bssn_class.h +++ b/AMSS_NCKU_source/bssn_class.h @@ -144,7 +144,7 @@ public: bssn_class(double Couranti, double StartTimei, double TotalTimei, double DumpTimei, double d2DumpTimei, double CheckTimei, double AnasTimei, int Symmetryi, int checkruni, char *checkfilenamei, double numepssi, double numepsbi, double numepshi, int a_levi, int maxli, int decni, double maxrexi, double drexi); - ~bssn_class(); + virtual ~bssn_class(); void Evolve(int Steps); void RecursiveStep(int lev); diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index 378c79e..da0ccd3 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -17,6 +17,18 @@ #include "macrodef.h" #include "bssn_rhs.h" +extern "C" { +#ifdef fortran1 +void set_escalar_parameter(double &, double &, double &, double &, double &); +#endif +#ifdef fortran2 +void SET_ESCALAR_PARAMETER(double &, double &, double &, double &, double &); +#endif +#ifdef fortran3 +void set_escalar_parameter_(double &, double &, double &, double &, double &); +#endif +} + /* ------------------------------------------------------------------ */ /* Multi-GPU dispatch: distribute ranks across available GPUs */ /* ------------------------------------------------------------------ */ @@ -401,7 +413,7 @@ __device__ __forceinline__ double fetch_sym_ord3_direct(const double *src, */ /* Total number of "all"-sized slots */ -#define NUM_SLOTS 160 +#define NUM_SLOTS 192 struct GpuBuffers { double *d_mem; /* single big allocation */ @@ -468,6 +480,12 @@ enum { S_Gamxa, S_Gamya, S_Gamza, S_gupxx, S_gupxy, S_gupxz, S_gupyy, S_gupyz, S_gupzz, + S_Sphi, S_Spi, S_Sphi_rhs, S_Spi_rhs, + S_Sphi0, S_Spi0, S_Sphi_accum, S_Spi_accum, + S_Sphi_next, S_Spi_next, + S_Sphi_x, S_Sphi_y, S_Sphi_z, + S_Sphi_xx, S_Sphi_xy, S_Sphi_xz, S_Sphi_yy, S_Sphi_yz, S_Sphi_zz, + S_trK_x, S_trK_y, S_trK_z, NUM_USED_SLOTS }; @@ -485,6 +503,8 @@ static constexpr int BSSN_STATE_COUNT = 24; static constexpr int BSSN_MATTER_COUNT = 10; static constexpr int BSSN_LK_FIELD_COUNT = 24; static constexpr int BSSN_RESIDENT_BANK_COUNT = 4; +static constexpr int BSSN_ESCALAR_STATE_COUNT = 26; +static constexpr int BSSN_RESIDENT_STATE_CAPACITY = BSSN_ESCALAR_STATE_COUNT; static const int k_state_input_slots[BSSN_STATE_COUNT] = { S_chi, S_trK, S_dxx, S_gxy, S_gxz, S_dyy, S_gyz, S_dzz, @@ -503,6 +523,25 @@ static const int k_state_rhs_slots[BSSN_STATE_COUNT] = { S_dtSfx_rhs, S_dtSfy_rhs, S_dtSfz_rhs }; +static const int k_escalar_state_input_slots[BSSN_ESCALAR_STATE_COUNT] = { + S_chi, S_trK, S_dxx, S_gxy, S_gxz, S_dyy, S_gyz, S_dzz, + S_Axx, S_Axy, S_Axz, S_Ayy, S_Ayz, S_Azz, + S_Gamx, S_Gamy, S_Gamz, + S_Lap, S_betax, S_betay, S_betaz, + S_dtSfx, S_dtSfy, S_dtSfz, + S_Sphi, S_Spi +}; + +static const int k_escalar_state_rhs_slots[BSSN_ESCALAR_STATE_COUNT] = { + S_chi_rhs, S_trK_rhs, + S_gxx_rhs, S_gxy_rhs, S_gxz_rhs, S_gyy_rhs, S_gyz_rhs, S_gzz_rhs, + S_Axx_rhs, S_Axy_rhs, S_Axz_rhs, S_Ayy_rhs, S_Ayz_rhs, S_Azz_rhs, + S_Gamx_rhs, S_Gamy_rhs, S_Gamz_rhs, + S_Lap_rhs, S_betax_rhs, S_betay_rhs, S_betaz_rhs, + S_dtSfx_rhs, S_dtSfy_rhs, S_dtSfz_rhs, + S_Sphi_rhs, S_Spi_rhs +}; + static const int k_matter_slots[BSSN_MATTER_COUNT] = { S_rho, S_Sx, S_Sy, S_Sz, S_Sxx, S_Sxy, S_Sxz, S_Syy, S_Syz, S_Szz }; @@ -526,8 +565,8 @@ static const int k_lk_rhs_slots[BSSN_LK_FIELD_COUNT] = { S_Ayz_rhs, S_Azz_rhs, S_chi_rhs, S_trK_rhs, S_Gamx_rhs, S_Gamy_rhs }; -__constant__ int d_subset_state_indices[BSSN_STATE_COUNT]; -__constant__ double d_comm_state_soa[3 * BSSN_STATE_COUNT]; +__constant__ int d_subset_state_indices[BSSN_RESIDENT_STATE_CAPACITY]; +__constant__ double d_comm_state_soa[3 * BSSN_RESIDENT_STATE_CAPACITY]; static const int k_lk_soa_signs[3 * BSSN_LK_FIELD_COUNT] = { 1, 1, 1, @@ -565,13 +604,13 @@ struct StepContext { double *d_matter_mem; double *d_comm_mem; double *h_comm_mem; - std::array d_state0; - std::array d_accum; - std::array d_state_curr; - std::array d_state_next; - std::array, BSSN_RESIDENT_BANK_COUNT> d_resident; - std::array, BSSN_RESIDENT_BANK_COUNT> resident_host; - std::array, BSSN_RESIDENT_BANK_COUNT> resident_host_clean; + std::array d_state0; + std::array d_accum; + std::array d_state_curr; + std::array d_state_next; + std::array, BSSN_RESIDENT_BANK_COUNT> d_resident; + std::array, BSSN_RESIDENT_BANK_COUNT> resident_host; + std::array, BSSN_RESIDENT_BANK_COUNT> resident_host_clean; std::array resident_age; std::array resident_valid; std::array d_matter; @@ -802,18 +841,18 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all) StepAllocation alloc = acquire_step_allocation(all); if (!has_step_allocation(alloc)) { - CUDA_CHECK(cudaMalloc(&alloc.d_state0_mem, BSSN_STATE_COUNT * all * sizeof(double))); - CUDA_CHECK(cudaMalloc(&alloc.d_accum_mem, BSSN_STATE_COUNT * all * sizeof(double))); + CUDA_CHECK(cudaMalloc(&alloc.d_state0_mem, BSSN_RESIDENT_STATE_CAPACITY * all * sizeof(double))); + CUDA_CHECK(cudaMalloc(&alloc.d_accum_mem, BSSN_RESIDENT_STATE_CAPACITY * all * sizeof(double))); for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { CUDA_CHECK(cudaMalloc(&alloc.d_resident_mem[b], - BSSN_STATE_COUNT * all * sizeof(double))); + BSSN_RESIDENT_STATE_CAPACITY * all * sizeof(double))); } CUDA_CHECK(cudaMalloc(&alloc.d_matter_mem, BSSN_MATTER_COUNT * all * sizeof(double))); alloc.cap_all = all; } attach_step_allocation(ctx, alloc); } - for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + for (int i = 0; i < BSSN_RESIDENT_STATE_CAPACITY; ++i) { ctx.d_state0[i] = ctx.d_state0_mem + (size_t)i * all; ctx.d_accum[i] = ctx.d_accum_mem + (size_t)i * all; for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { @@ -897,14 +936,14 @@ static int *ensure_comm_segment_meta_buffer(size_t needed_ints) static void upload_comm_state_soa(const double *state_soa, int state_count) { - double soa[3 * BSSN_STATE_COUNT]; - for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + double soa[3 * BSSN_RESIDENT_STATE_CAPACITY]; + for (int i = 0; i < BSSN_RESIDENT_STATE_CAPACITY; ++i) { soa[3 * i + 0] = 1.0; soa[3 * i + 1] = 1.0; soa[3 * i + 2] = 1.0; } if (state_soa) { - const int n = (state_count < BSSN_STATE_COUNT) ? state_count : BSSN_STATE_COUNT; + const int n = (state_count < BSSN_RESIDENT_STATE_CAPACITY) ? state_count : BSSN_RESIDENT_STATE_CAPACITY; for (int i = 0; i < n; ++i) { soa[3 * i + 0] = state_soa[3 * i + 0]; soa[3 * i + 1] = state_soa[3 * i + 1]; @@ -2535,6 +2574,138 @@ static void gpu_lopsided_kodis_state_batch(double eps_val, int all) g_buf.slot[S_betax], g_buf.slot[S_betay], g_buf.slot[S_betaz], tables, eps_val); } +__global__ __launch_bounds__(128, 4) +void kern_escalar_sources( + const double * __restrict__ Sphi, + const double * __restrict__ Spi, + const double * __restrict__ alpn1, + const double * __restrict__ chin1, + const double * __restrict__ gxx, + const double * __restrict__ gxy, + const double * __restrict__ gxz, + const double * __restrict__ gyy, + const double * __restrict__ gyz, + const double * __restrict__ gzz, + const double * __restrict__ gupxx, + const double * __restrict__ gupxy, + const double * __restrict__ gupxz, + const double * __restrict__ gupyy, + const double * __restrict__ gupyz, + const double * __restrict__ gupzz, + const double * __restrict__ chix, + const double * __restrict__ chiy, + const double * __restrict__ chiz, + const double * __restrict__ Lapx, + const double * __restrict__ Lapy, + const double * __restrict__ Lapz, + const double * __restrict__ trK, + const double * __restrict__ Gamx, + const double * __restrict__ Gamy, + const double * __restrict__ Gamz, + const double * __restrict__ Kx, + const double * __restrict__ Ky, + const double * __restrict__ Kz, + const double * __restrict__ fxx, + const double * __restrict__ fxy, + const double * __restrict__ fxz, + const double * __restrict__ fyy, + const double * __restrict__ fyz, + const double * __restrict__ fzz, + double * __restrict__ Sphi_rhs, + double * __restrict__ Spi_rhs, + double * __restrict__ rho, + double * __restrict__ Sx, + double * __restrict__ Sy, + double * __restrict__ Sz, + double * __restrict__ Sxx, + double * __restrict__ Sxy, + double * __restrict__ Sxz, + double * __restrict__ Syy, + double * __restrict__ Syz, + double * __restrict__ Szz) +{ + constexpr double PI_V = 3.141592653589793238462643383279502884; + constexpr double TWO = 2.0; + constexpr double HALF = 0.5; + constexpr double A2 = 3.0; + + for (int i = blockIdx.x * blockDim.x + threadIdx.x; + i < d_gp.all; + i += blockDim.x * gridDim.x) { + const double c1 = chin1[i]; + const double a = alpn1[i]; + const double sx = Kx[i]; + const double sy = Ky[i]; + const double sz = Kz[i]; + const double sp = Spi[i]; + + const double uxx = gupxx[i]; + const double uxy = gupxy[i]; + const double uxz = gupxz[i]; + const double uyy = gupyy[i]; + const double uyz = gupyz[i]; + const double uzz = gupzz[i]; + + const double sqpi3 = sqrt(PI_V / 3.0); + const double e4 = exp(4.0 * sqpi3 * Sphi[i]); + const double em8 = exp(-8.0 * sqpi3 * Sphi[i]); + const double V = em8 * (1.0 - e4) * (1.0 - e4) / (32.0 * PI_V * A2); + const double dV = (1.0 / A2 / 12.0) * sqrt(3.0 / PI_V) * em8 * (-1.0 + e4); + + Sphi_rhs[i] = a * sp; + + double pi_rhs = uxx * fxx[i] + uyy * fyy[i] + uzz * fzz[i] + + TWO * (uxy * fxy[i] + uxz * fxz[i] + uyz * fyz[i]); + pi_rhs -= (Gamx[i] + (uxx * chix[i] + uxy * chiy[i] + uxz * chiz[i]) / (TWO * c1)) * sx + + (Gamy[i] + (uxy * chix[i] + uyy * chiy[i] + uyz * chiz[i]) / (TWO * c1)) * sy + + (Gamz[i] + (uxz * chix[i] + uyz * chiy[i] + uzz * chiz[i]) / (TWO * c1)) * sz; + pi_rhs = pi_rhs * a + + (uxx * Lapx[i] * sx + uxy * Lapx[i] * sy + uxz * Lapx[i] * sz + + uxy * Lapy[i] * sx + uyy * Lapy[i] * sy + uyz * Lapy[i] * sz + + uxz * Lapz[i] * sx + uyz * Lapz[i] * sy + uzz * Lapz[i] * sz); + Spi_rhs[i] = pi_rhs * c1 + a * (trK[i] * sp - dV); + + const double grad = HALF * (uxx * sx * sx + uyy * sy * sy + uzz * sz * sz) + + uxy * sx * sy + uxz * sx * sz + uyz * sy * sz; + const double rho_v = c1 * grad + HALF * sp * sp + V; + rho[i] = rho_v; + Sx[i] = -sp * sx; + Sy[i] = -sp * sy; + Sz[i] = -sp * sz; + const double f = (rho_v - sp * sp) / c1; + Sxx[i] = sx * sx - f * gxx[i]; + Sxy[i] = sx * sy - f * gxy[i]; + Sxz[i] = sx * sz - f * gxz[i]; + Syy[i] = sy * sy - f * gyy[i]; + Syz[i] = sy * sz - f * gyz[i]; + Szz[i] = sz * sz - f * gzz[i]; + } +} + +static void gpu_escalar_sources(int all) +{ + #define D(s) g_buf.slot[s] + gpu_fderivs(D(S_Sphi), D(S_Sphi_x), D(S_Sphi_y), D(S_Sphi_z), 1.0, 1.0, 1.0, all); + gpu_fdderivs(D(S_Sphi), D(S_Sphi_xx), D(S_Sphi_xy), D(S_Sphi_xz), + D(S_Sphi_yy), D(S_Sphi_yz), D(S_Sphi_zz), 1.0, 1.0, 1.0, all); + + kern_escalar_sources<<>>( + D(S_Sphi), D(S_Spi), + D(S_alpn1), D(S_chin1), + D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz), + D(S_gupxx), D(S_gupxy), D(S_gupxz), D(S_gupyy), D(S_gupyz), D(S_gupzz), + D(S_chix), D(S_chiy), D(S_chiz), + D(S_Lapx), D(S_Lapy), D(S_Lapz), + D(S_trK), D(S_Gamx), D(S_Gamy), D(S_Gamz), + D(S_Sphi_x), D(S_Sphi_y), D(S_Sphi_z), + D(S_Sphi_xx), D(S_Sphi_xy), D(S_Sphi_xz), + D(S_Sphi_yy), D(S_Sphi_yz), D(S_Sphi_zz), + D(S_Sphi_rhs), D(S_Spi_rhs), + D(S_rho), D(S_Sx), D(S_Sy), D(S_Sz), + D(S_Sxx), D(S_Sxy), D(S_Sxz), D(S_Syy), D(S_Syz), D(S_Szz)); + #undef D +} + __global__ void kern_rk4_finalize(const double * __restrict__ f0, double * __restrict__ frhs, double * __restrict__ accum, @@ -2619,6 +2790,27 @@ static void gpu_rk4_finalize_batch(const StepContext &ctx, kern_rk4_finalize_batched<<>>(tables, dT, rk4_stage, chitiny); } +static void gpu_escalar_rk4_finalize_batch(const StepContext &ctx, + size_t all, + double dT, + int rk4_stage, + double chitiny) +{ + Rk4FinalizeTables tables = {}; + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + tables.f0_fields[i] = ctx.d_state0[i]; + tables.rhs_fields[i] = g_buf.slot[k_state_rhs_slots[i]]; + tables.accum_fields[i] = ctx.d_accum[i]; + } + + dim3 launch_grid((unsigned int)grid(all), (unsigned int)BSSN_STATE_COUNT); + kern_rk4_finalize_batched<<>>(tables, dT, rk4_stage, chitiny); + kern_rk4_finalize<<>>(ctx.d_state0[24], g_buf.slot[S_Sphi_rhs], + ctx.d_accum[24], dT, rk4_stage); + kern_rk4_finalize<<>>(ctx.d_state0[25], g_buf.slot[S_Spi_rhs], + ctx.d_accum[25], dT, rk4_stage); +} + __global__ __launch_bounds__(128, 4) void kern_copy_patch_boundary_batched(PatchBoundaryTables tables, int touch_xmin, int touch_xmax, @@ -2669,6 +2861,49 @@ static void gpu_copy_patch_boundary_batch(int all, touch_zmin, touch_zmax); } +__global__ __launch_bounds__(128, 4) +void kern_copy_patch_boundary_scalar(const double * __restrict__ src, + double * __restrict__ dst, + int touch_xmin, int touch_xmax, + int touch_ymin, int touch_ymax, + int touch_zmin, int touch_zmax) +{ + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= d_gp.all) return; + + const int nx = d_gp.ex[0]; + const int ny = d_gp.ex[1]; + const int i0 = tid % nx; + const int j0 = (tid / nx) % ny; + const int k0 = tid / (nx * ny); + const bool on_boundary = + (touch_xmin && i0 == 0) || + (touch_xmax && i0 == nx - 1) || + (touch_ymin && j0 == 0) || + (touch_ymax && j0 == ny - 1) || + (touch_zmin && k0 == 0) || + (touch_zmax && k0 == d_gp.ex[2] - 1); + if (on_boundary) dst[tid] = src[tid]; +} + +static void gpu_escalar_copy_patch_boundary_batch(int all, + int touch_xmin, int touch_xmax, + int touch_ymin, int touch_ymax, + int touch_zmin, int touch_zmax) +{ + if (!(touch_xmin || touch_xmax || touch_ymax || touch_ymin || touch_zmin || touch_zmax)) + return; + gpu_copy_patch_boundary_batch(all, touch_xmin, touch_xmax, + touch_ymin, touch_ymax, + touch_zmin, touch_zmax); + kern_copy_patch_boundary_scalar<<>>( + g_buf.slot[S_Sphi], g_buf.slot[S_Sphi_rhs], + touch_xmin, touch_xmax, touch_ymin, touch_ymax, touch_zmin, touch_zmax); + kern_copy_patch_boundary_scalar<<>>( + g_buf.slot[S_Spi], g_buf.slot[S_Spi_rhs], + touch_xmin, touch_xmax, touch_ymin, touch_ymax, touch_zmin, touch_zmax); +} + __global__ void kern_enforce_ga_cuda(double * __restrict__ dxx, double * __restrict__ gxy, double * __restrict__ gxz, @@ -4773,6 +5008,72 @@ void kern_phase18_constraints( } } +__global__ __launch_bounds__(128, 4) +void kern_escalar_constraint_fr( + const double* __restrict__ chin1, + const double* __restrict__ gupxx, const double* __restrict__ gupxy, + const double* __restrict__ gupxz, const double* __restrict__ gupyy, + const double* __restrict__ gupyz, const double* __restrict__ gupzz, + const double* __restrict__ trK, + const double* __restrict__ Axx, const double* __restrict__ Axy, + const double* __restrict__ Axz, const double* __restrict__ Ayy, + const double* __restrict__ Ayz, const double* __restrict__ Azz, + const double* __restrict__ Rxx, const double* __restrict__ Rxy, + const double* __restrict__ Rxz, const double* __restrict__ Ryy, + const double* __restrict__ Ryz, const double* __restrict__ Rzz, + const double* __restrict__ rho, + const double* __restrict__ Sxx, const double* __restrict__ Sxy, + const double* __restrict__ Sxz, const double* __restrict__ Syy, + const double* __restrict__ Syz, const double* __restrict__ Szz, + const double* __restrict__ Sphi, + double a2, + double* __restrict__ Cons_fR) +{ + const double TWO = 2.0; + const double F2o3 = 2.0 / 3.0; + const double F8 = 8.0; + const double PI_V = 3.14159265358979323846; + const double SQRT3OPI_OVER4 = 0.25 * sqrt(3.0 / PI_V); + + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < d_gp.all; + i += blockDim.x * gridDim.x) + { + const double uxx = gupxx[i], uxy = gupxy[i], uxz = gupxz[i]; + const double uyy = gupyy[i], uyz = gupyz[i], uzz = gupzz[i]; + const double c1 = chin1[i]; + + const double AijAij = + uxx * (uxx * Axx[i] * Axx[i] + uyy * Axy[i] * Axy[i] + uzz * Axz[i] * Axz[i] + + TWO * (uxy * Axx[i] * Axy[i] + uxz * Axx[i] * Axz[i] + uyz * Axy[i] * Axz[i])) + + uyy * (uxx * Axy[i] * Axy[i] + uyy * Ayy[i] * Ayy[i] + uzz * Ayz[i] * Ayz[i] + + TWO * (uxy * Axy[i] * Ayy[i] + uxz * Axy[i] * Ayz[i] + uyz * Ayy[i] * Ayz[i])) + + uzz * (uxx * Axz[i] * Axz[i] + uyy * Ayz[i] * Ayz[i] + uzz * Azz[i] * Azz[i] + + TWO * (uxy * Axz[i] * Ayz[i] + uxz * Axz[i] * Azz[i] + uyz * Ayz[i] * Azz[i])) + + TWO * (uxy * (uxx * Axx[i] * Axy[i] + uyy * Axy[i] * Ayy[i] + uzz * Axz[i] * Ayz[i] + + uxy * (Axx[i] * Ayy[i] + Axy[i] * Axy[i]) + + uxz * (Axx[i] * Ayz[i] + Axz[i] * Axy[i]) + + uyz * (Axy[i] * Ayz[i] + Axz[i] * Ayy[i])) + + uxz * (uxx * Axx[i] * Axz[i] + uyy * Axy[i] * Ayz[i] + uzz * Axz[i] * Azz[i] + + uxy * (Axx[i] * Ayz[i] + Axy[i] * Axz[i]) + + uxz * (Axx[i] * Azz[i] + Axz[i] * Axz[i]) + + uyz * (Axy[i] * Azz[i] + Axz[i] * Ayz[i])) + + uyz * (uxx * Axy[i] * Axz[i] + uyy * Ayy[i] * Ayz[i] + uzz * Ayz[i] * Azz[i] + + uxy * (Axy[i] * Ayz[i] + Ayy[i] * Axz[i]) + + uxz * (Axy[i] * Azz[i] + Ayz[i] * Axz[i]) + + uyz * (Ayy[i] * Azz[i] + Ayz[i] * Ayz[i]))); + + const double R_sc = uxx * Rxx[i] + uyy * Ryy[i] + uzz * Rzz[i] + + TWO * (uxy * Rxy[i] + uxz * Rxz[i] + uyz * Ryz[i]); + const double trS = uxx * Sxx[i] + uyy * Syy[i] + uzz * Szz[i] + + TWO * (uxy * Sxy[i] + uxz * Sxz[i] + uyz * Syz[i]); + const double RR = AijAij - F2o3 * trK[i] * trK[i] + - R_sc * c1 + - F8 * PI_V * (3.0 * rho[i] - trS * c1); + const double fprim = 1.0 + 2.0 * a2 * RR; + Cons_fR[i] = Sphi[i] - SQRT3OPI_OVER4 * log(fprim); + } +} + static void setup_grid_params(int *ex, double *X, double *Y, double *Z, int Symmetry, double eps, int co) @@ -4933,50 +5234,109 @@ static void bind_matter_slots(const StepContext &ctx) } } -static void bind_state_input_slots(const std::array &state) +static void bind_state_input_slots(const std::array &state) { for (int i = 0; i < BSSN_STATE_COUNT; ++i) { g_buf.slot[k_state_input_slots[i]] = state[i]; } } -static void bind_state_output_slots(const std::array &state) +static void bind_state_output_slots(const std::array &state) { for (int i = 0; i < BSSN_STATE_COUNT; ++i) { g_buf.slot[k_state_rhs_slots[i]] = state[i]; } } -static bool resident_key_matches(const StepContext &ctx, int bank, double **host_key) +static void bind_escalar_state_input_slots(const std::array &state) +{ + for (int i = 0; i < BSSN_ESCALAR_STATE_COUNT; ++i) { + g_buf.slot[k_escalar_state_input_slots[i]] = state[i]; + } +} + +static void bind_escalar_state_output_slots(const std::array &state) +{ + for (int i = 0; i < BSSN_ESCALAR_STATE_COUNT; ++i) { + g_buf.slot[k_escalar_state_rhs_slots[i]] = state[i]; + } +} + +static void upload_escalar_state_inputs(double **state_host, size_t all); + +static bool resident_key_matches_count(const StepContext &ctx, int bank, double **host_key, int state_count) { if (!host_key || bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT) return false; - for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) + return false; + for (int i = 0; i < state_count; ++i) { if (!host_key[i] || ctx.resident_host[bank][i] != host_key[i]) return false; } return true; } -static int find_resident_bank(const StepContext &ctx, double **host_key) +static bool resident_key_matches(const StepContext &ctx, int bank, double **host_key) +{ + return resident_key_matches_count(ctx, bank, host_key, BSSN_STATE_COUNT); +} + +static int find_resident_bank_count(const StepContext &ctx, double **host_key, int state_count) { if (!host_key) return -1; for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { - if (resident_key_matches(ctx, b, host_key)) + if (resident_key_matches_count(ctx, b, host_key, state_count)) return b; } return -1; } -static bool resident_key_usable(double **host_key) +static int find_resident_bank_subset(const StepContext &ctx, + double **host_key, + const int *state_indices, + int subset_count) +{ + if (!host_key || !state_indices || subset_count <= 0) + return -1; + for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { + bool match = true; + for (int i = 0; i < subset_count; ++i) { + const int state_index = state_indices[i]; + if (state_index < 0 || state_index >= BSSN_RESIDENT_STATE_CAPACITY || + !host_key[i] || + ctx.resident_host[b][state_index] != host_key[i]) { + match = false; + break; + } + } + if (match) + return b; + } + return -1; +} + +static int find_resident_bank(const StepContext &ctx, double **host_key) +{ + return find_resident_bank_count(ctx, host_key, BSSN_STATE_COUNT); +} + +static bool resident_key_usable_count(double **host_key, int state_count) { if (!host_key) return false; - for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) + return false; + for (int i = 0; i < state_count; ++i) { if (!host_key[i]) return false; } return true; } +static bool resident_key_usable(double **host_key) +{ + return resident_key_usable_count(host_key, BSSN_STATE_COUNT); +} + static void set_resident_host_clean(StepContext &ctx, int bank, bool clean) { if (bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT) return; @@ -4991,7 +5351,7 @@ static bool resident_host_subset_clean(const StepContext &ctx, if (bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT) return false; for (int i = 0; i < subset_count; ++i) { const int state_index = state_indices ? state_indices[i] : i; - if (state_index < 0 || state_index >= BSSN_STATE_COUNT) + if (state_index < 0 || state_index >= BSSN_RESIDENT_STATE_CAPACITY) return false; if (!ctx.resident_host_clean[bank][state_index]) return false; @@ -5008,7 +5368,7 @@ static void mark_resident_host_subset_clean(StepContext &ctx, if (bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT) return; for (int i = 0; i < subset_count; ++i) { const int state_index = state_indices ? state_indices[i] : i; - if (state_index >= 0 && state_index < BSSN_STATE_COUNT) + if (state_index >= 0 && state_index < BSSN_RESIDENT_STATE_CAPACITY) ctx.resident_host_clean[bank][state_index] = clean ? 1 : 0; } } @@ -5042,17 +5402,18 @@ static void update_state_ready(StepContext &ctx) ctx.state_ready = any_resident_bank_valid(ctx); } -static void writeback_resident_bank(StepContext &ctx, int bank, size_t all) +static void writeback_resident_bank_count(StepContext &ctx, int bank, size_t all, int state_count) { if (bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT) return; if (!ctx.resident_valid[bank]) return; - for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return; + for (int i = 0; i < state_count; ++i) { if (!ctx.resident_host[bank][i]) return; } const bool profile = cuda_aux_profile_enabled(); const double t0 = profile ? cuda_profile_now_ms() : 0.0; const size_t bytes = all * sizeof(double); - for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + for (int i = 0; i < state_count; ++i) { CUDA_CHECK(cudaMemcpyAsync(ctx.resident_host[bank][i], ctx.d_resident[bank][i], bytes, cudaMemcpyDeviceToHost)); @@ -5063,11 +5424,16 @@ static void writeback_resident_bank(StepContext &ctx, int bank, size_t all) CudaAuxProfileStats &stats = cuda_aux_profile_stats(); stats.writeback_calls++; stats.writeback_ms += cuda_profile_now_ms() - t0; - stats.writeback_gb += (double)((size_t)BSSN_STATE_COUNT * bytes) / 1.0e9; + stats.writeback_gb += (double)((size_t)state_count * bytes) / 1.0e9; cuda_aux_profile_maybe_log(); } } +static void writeback_resident_bank(StepContext &ctx, int bank, size_t all) +{ + writeback_resident_bank_count(ctx, bank, all, BSSN_STATE_COUNT); +} + static int choose_resident_bank_for_reuse(StepContext &ctx, int avoid_bank, size_t all) { for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { @@ -5099,15 +5465,51 @@ static int choose_resident_bank_for_reuse(StepContext &ctx, int avoid_bank, size return best; } -static void assign_resident_key(StepContext &ctx, int bank, double **host_key) +static int choose_escalar_resident_bank_for_reuse(StepContext &ctx, int avoid_bank, size_t all) { - for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { + if (b != avoid_bank && !ctx.resident_valid[b]) + return b; + } + + int best = -1; + unsigned long long best_age = 0; + for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { + if (b == avoid_bank) continue; + if (best < 0 || ctx.resident_age[b] < best_age) { + best = b; + best_age = ctx.resident_age[b]; + } + } + if (best < 0) best = 0; + writeback_resident_bank_count(ctx, best, all, BSSN_ESCALAR_STATE_COUNT); + ctx.resident_valid[best] = false; + ctx.resident_host[best].fill(nullptr); + ctx.resident_host_clean[best].fill(0); + ctx.resident_age[best] = 0; + if (ctx.current_bank == best) { + ctx.current_bank = -1; + ctx.d_state_curr_mem = nullptr; + ctx.d_state_curr.fill(nullptr); + } + update_state_ready(ctx); + return best; +} + +static void assign_resident_key_count(StepContext &ctx, int bank, double **host_key, int state_count) +{ + for (int i = 0; i < state_count; ++i) { ctx.resident_host[bank][i] = host_key[i]; } set_resident_host_clean(ctx, bank, false); ctx.resident_age[bank] = ++ctx.resident_clock; } +static void assign_resident_key(StepContext &ctx, int bank, double **host_key) +{ + assign_resident_key_count(ctx, bank, host_key, BSSN_STATE_COUNT); +} + static int ensure_resident_bank(StepContext &ctx, double **host_key, size_t all, @@ -5147,6 +5549,47 @@ static int ensure_resident_bank(StepContext &ctx, return bank; } +static int ensure_escalar_resident_bank(StepContext &ctx, + double **host_key, + size_t all, + bool upload_if_missing, + int avoid_bank = -1) +{ + if (!resident_key_usable_count(host_key, BSSN_ESCALAR_STATE_COUNT)) { + if (ctx.current_bank >= 0) + return ctx.current_bank; + return 0; + } + + int bank = find_resident_bank_count(ctx, host_key, BSSN_ESCALAR_STATE_COUNT); + if (bank >= 0) { + ctx.resident_age[bank] = ++ctx.resident_clock; + if (!ctx.resident_valid[bank] && upload_if_missing) { + bind_escalar_state_input_slots(ctx.d_resident[bank]); + upload_escalar_state_inputs(host_key, all); + CUDA_CHECK(cudaDeviceSynchronize()); + ctx.resident_valid[bank] = true; + set_resident_host_clean(ctx, bank, true); + } + return bank; + } + + bank = choose_escalar_resident_bank_for_reuse(ctx, avoid_bank, all); + assign_resident_key_count(ctx, bank, host_key, BSSN_ESCALAR_STATE_COUNT); + if (upload_if_missing) { + bind_escalar_state_input_slots(ctx.d_resident[bank]); + upload_escalar_state_inputs(host_key, all); + CUDA_CHECK(cudaDeviceSynchronize()); + ctx.resident_valid[bank] = true; + set_resident_host_clean(ctx, bank, true); + } else { + ctx.resident_valid[bank] = false; + set_resident_host_clean(ctx, bank, false); + } + update_state_ready(ctx); + return bank; +} + static int reserve_resident_output_bank(StepContext &ctx, double **host_key, size_t all, @@ -5167,6 +5610,62 @@ static int reserve_resident_output_bank(StepContext &ctx, return bank; } +static int reserve_escalar_resident_output_bank(StepContext &ctx, + double **host_key, + size_t all, + int input_bank) +{ + if (!resident_key_usable_count(host_key, BSSN_ESCALAR_STATE_COUNT)) + return (ctx.current_bank >= 0) ? ctx.current_bank : 0; + if (resident_key_matches_count(ctx, input_bank, host_key, BSSN_ESCALAR_STATE_COUNT)) + return input_bank; + + int bank = find_resident_bank_count(ctx, host_key, BSSN_ESCALAR_STATE_COUNT); + if (bank < 0) + bank = choose_escalar_resident_bank_for_reuse(ctx, input_bank, all); + assign_resident_key_count(ctx, bank, host_key, BSSN_ESCALAR_STATE_COUNT); + ctx.resident_valid[bank] = false; + ctx.resident_age[bank] = ++ctx.resident_clock; + update_state_ready(ctx); + return bank; +} + +static bool bank_is_avoided(int bank, int avoid_a, int avoid_b, int avoid_c); + +static int reserve_escalar_resident_output_bank_avoiding(StepContext &ctx, + double **host_key, + size_t all, + int avoid_a, + int avoid_b, + int avoid_c) +{ + if (!resident_key_usable_count(host_key, BSSN_ESCALAR_STATE_COUNT)) + return (ctx.current_bank >= 0) ? ctx.current_bank : 0; + if (resident_key_matches_count(ctx, avoid_a, host_key, BSSN_ESCALAR_STATE_COUNT)) + return avoid_a; + if (resident_key_matches_count(ctx, avoid_b, host_key, BSSN_ESCALAR_STATE_COUNT)) + return avoid_b; + if (resident_key_matches_count(ctx, avoid_c, host_key, BSSN_ESCALAR_STATE_COUNT)) + return avoid_c; + + int bank = find_resident_bank_count(ctx, host_key, BSSN_ESCALAR_STATE_COUNT); + if (bank < 0) { + for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { + if (!bank_is_avoided(b, avoid_a, avoid_b, avoid_c) && !ctx.resident_valid[b]) { + bank = b; + break; + } + } + } + if (bank < 0) + bank = choose_escalar_resident_bank_for_reuse(ctx, avoid_a, all); + assign_resident_key_count(ctx, bank, host_key, BSSN_ESCALAR_STATE_COUNT); + ctx.resident_valid[bank] = false; + ctx.resident_age[bank] = ++ctx.resident_clock; + update_state_ready(ctx); + return bank; +} + static bool bank_is_avoided(int bank, int avoid_a, int avoid_b, int avoid_c) { return bank == avoid_a || bank == avoid_b || bank == avoid_c; @@ -5256,7 +5755,7 @@ static int active_or_keyed_bank(StepContext &ctx, return 0; } -static void launch_rhs_pipeline(int all, double eps, int co) +static void launch_rhs_pipeline(int all, double eps, int co, bool compute_escalar = false) { const double SYM = 1.0; const double ANTI = -1.0; @@ -5336,6 +5835,11 @@ static void launch_rhs_pipeline(int all, double eps, int co) D(S_gupxx), D(S_gupxy), D(S_gupxz), D(S_gupyy), D(S_gupyz), D(S_gupzz)); + if (compute_escalar) { + gpu_escalar_sources(all); + gpu_fderivs(D(S_trK), D(S_trK_x), D(S_trK_y), D(S_trK_z), SYM, SYM, SYM, all); + } + if (co == 0) { kern_phase3_gamma_constraint<<>>( D(S_Gamx), D(S_Gamy), D(S_Gamz), @@ -5367,7 +5871,9 @@ static void launch_rhs_pipeline(int all, double eps, int co) D(S_gupxx), D(S_gupxy), D(S_gupxz), D(S_gupyy), D(S_gupyz), D(S_gupzz), D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz), - D(S_Kx), D(S_Ky), D(S_Kz), + compute_escalar ? D(S_trK_x) : D(S_Kx), + compute_escalar ? D(S_trK_y) : D(S_Ky), + compute_escalar ? D(S_trK_z) : D(S_Kz), D(S_Sx), D(S_Sy), D(S_Sz), D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz), D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz), @@ -5552,7 +6058,9 @@ static void launch_rhs_pipeline(int all, double eps, int co) D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz), D(S_Rxx), D(S_Rxy), D(S_Rxz), D(S_Ryy), D(S_Ryz), D(S_Rzz), D(S_rho), D(S_Sx), D(S_Sy), D(S_Sz), - D(S_Kx), D(S_Ky), D(S_Kz), + compute_escalar ? D(S_trK_x) : D(S_Kx), + compute_escalar ? D(S_trK_y) : D(S_Ky), + compute_escalar ? D(S_trK_z) : D(S_Kz), D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz), D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz), D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz), @@ -5585,6 +6093,26 @@ static void download_state_outputs(double **state_host_out, size_t all) } } +static void download_escalar_state_outputs(double **state_host_out, size_t all) +{ + const size_t bytes = all * sizeof(double); + for (int i = 0; i < BSSN_ESCALAR_STATE_COUNT; ++i) { + CUDA_CHECK(cudaMemcpyAsync(state_host_out[i], + g_buf.slot[k_escalar_state_rhs_slots[i]], + bytes, cudaMemcpyDeviceToHost)); + } + CUDA_CHECK(cudaDeviceSynchronize()); +} + +static void upload_escalar_state_inputs(double **state_host, size_t all) +{ + const size_t bytes = all * sizeof(double); + for (int i = 0; i < BSSN_ESCALAR_STATE_COUNT; ++i) { + CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[k_escalar_state_input_slots[i]], + state_host[i], bytes, cudaMemcpyHostToDevice)); + } +} + static void download_constraint_outputs(double **constraint_host_out, size_t all) { const size_t bytes = all * sizeof(double); @@ -5596,6 +6124,186 @@ static void download_constraint_outputs(double **constraint_host_out, size_t all } } +extern "C" +int bssn_escalar_cuda_rk4_substep(void *block_tag, + int *ex, double *X, double *Y, double *Z, + double **state_host_in, + double **state_host_out, + const double *propspeed, + const double *soa_flat, + const double *bbox, + double &dT, + double &T, + int &RK4, + int &apply_bam_bc, + int &Symmetry, + int &Lev, + double &eps, + int &co, + int &keep_resident_state, + int &apply_enforce_ga, + double &chitiny) +{ + (void)T; + if (RK4 < 0 || RK4 > 3) return 1; + + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + + double escalar_a2 = 3.0, escalar_phi0 = 0.0, escalar_r0 = 0.0, escalar_sigma0 = 0.0, escalar_l2 = 0.0; +#ifdef fortran1 + set_escalar_parameter(escalar_a2, escalar_phi0, escalar_r0, escalar_sigma0, escalar_l2); +#endif +#ifdef fortran2 + SET_ESCALAR_PARAMETER(escalar_a2, escalar_phi0, escalar_r0, escalar_sigma0, escalar_l2); +#endif +#ifdef fortran3 + set_escalar_parameter_(escalar_a2, escalar_phi0, escalar_r0, escalar_sigma0, escalar_l2); +#endif + if (fabs(escalar_a2 - 3.0) > 1.0e-12 && g_dispatch.my_rank == 0) { + fprintf(stderr, "CUDA BSSN-EScalar currently supports FR a2=3 for EScalar_CC=2/3; got %.17g\n", + escalar_a2); + return 1; + } + + const size_t all = (size_t)ex[0] * ex[1] * ex[2]; + const size_t bytes = all * sizeof(double); + int touch_xmin = 0, touch_xmax = 0; + int touch_ymin = 0, touch_ymax = 0; + int touch_zmin = 0, touch_zmax = 0; + + setup_grid_params(ex, X, Y, Z, Symmetry, eps, co); + if (Lev > 0) { + compute_patch_boundary_flags(ex, X, Y, Z, bbox, Symmetry, + touch_xmin, touch_xmax, + touch_ymin, touch_ymax, + touch_zmin, touch_zmax); + } + + StepContext &ctx = ensure_step_ctx(block_tag, all); + const bool use_resident_state = (keep_resident_state != 0); + int input_bank = -1; + int output_bank = -1; + if (use_resident_state) { + input_bank = ensure_escalar_resident_bank(ctx, state_host_in, all, true); + output_bank = reserve_escalar_resident_output_bank(ctx, state_host_out, all, input_bank); + mark_resident_current_bank(ctx, input_bank); + mark_resident_next_bank(ctx, output_bank); + bind_escalar_state_input_slots(ctx.d_resident[input_bank]); + bind_escalar_state_output_slots(ctx.d_resident[output_bank]); + } else { + upload_escalar_state_inputs(state_host_in, all); + } + + if (apply_enforce_ga) { + kern_enforce_ga_cuda<<>>(g_buf.slot[S_dxx], g_buf.slot[S_gxy], g_buf.slot[S_gxz], + g_buf.slot[S_dyy], g_buf.slot[S_gyz], g_buf.slot[S_dzz], + g_buf.slot[S_Axx], g_buf.slot[S_Axy], g_buf.slot[S_Axz], + g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]); + } + + if (RK4 == 0) { + if (use_resident_state) { + CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, ctx.d_resident_mem[input_bank], + (size_t)BSSN_ESCALAR_STATE_COUNT * bytes, + cudaMemcpyDeviceToDevice)); + } else { + CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi], + (size_t)BSSN_STATE_COUNT * bytes, + cudaMemcpyDeviceToDevice)); + CUDA_CHECK(cudaMemcpy(ctx.d_state0[24], g_buf.slot[S_Sphi], + bytes, cudaMemcpyDeviceToDevice)); + CUDA_CHECK(cudaMemcpy(ctx.d_state0[25], g_buf.slot[S_Spi], + bytes, cudaMemcpyDeviceToDevice)); + } + } + + launch_rhs_pipeline((int)all, eps, co, true); + + if (apply_bam_bc) { + for (int i = 0; i < BSSN_ESCALAR_STATE_COUNT; ++i) { + gpu_sommerfeld_routbam(g_buf.slot[k_escalar_state_input_slots[i]], + g_buf.slot[k_escalar_state_rhs_slots[i]], + propspeed[i], + soa_flat[3 * i + 0], + soa_flat[3 * i + 1], + soa_flat[3 * i + 2], + X, Y, Z, bbox, Symmetry); + } + } + + gpu_escalar_rk4_finalize_batch(ctx, all, dT, RK4, chitiny); + if (Lev > 0) { + gpu_escalar_copy_patch_boundary_batch((int)all, + touch_xmin, touch_xmax, + touch_ymin, touch_ymax, + touch_zmin, touch_zmax); + } + if (use_resident_state) { + ctx.resident_valid[output_bank] = true; + ctx.resident_age[output_bank] = ++ctx.resident_clock; + set_resident_host_clean(ctx, output_bank, false); + mark_resident_current_bank(ctx, output_bank); + update_state_ready(ctx); + } else { + download_escalar_state_outputs(state_host_out, all); + } + if (RK4 == 3) + ctx.matter_ready = false; + return 0; +} + +extern "C" +int bssn_escalar_cuda_compute_constraints(int *ex, double *X, double *Y, double *Z, + double **state_host_in, + double **constraint_host_out, + int &Symmetry, + int &Lev, + double &eps) +{ + if (!state_host_in || !constraint_host_out) return 1; + + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + + double escalar_a2 = 3.0, escalar_phi0 = 0.0, escalar_r0 = 0.0, escalar_sigma0 = 0.0, escalar_l2 = 0.0; +#ifdef fortran1 + set_escalar_parameter(escalar_a2, escalar_phi0, escalar_r0, escalar_sigma0, escalar_l2); +#endif +#ifdef fortran2 + SET_ESCALAR_PARAMETER(escalar_a2, escalar_phi0, escalar_r0, escalar_sigma0, escalar_l2); +#endif +#ifdef fortran3 + set_escalar_parameter_(escalar_a2, escalar_phi0, escalar_r0, escalar_sigma0, escalar_l2); +#endif + + const size_t all = (size_t)ex[0] * ex[1] * ex[2]; + const size_t bytes = all * sizeof(double); + setup_grid_params(ex, X, Y, Z, Symmetry, eps, 0); + upload_escalar_state_inputs(state_host_in, all); + launch_rhs_pipeline((int)all, eps, 0, true); + + #define D(s) g_buf.slot[s] + kern_escalar_constraint_fr<<>>( + D(S_chin1), + D(S_gupxx), D(S_gupxy), D(S_gupxz), D(S_gupyy), D(S_gupyz), D(S_gupzz), + D(S_trK), + D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz), + D(S_Rxx), D(S_Rxy), D(S_Rxz), D(S_Ryy), D(S_Ryz), D(S_Rzz), + D(S_rho), + D(S_Sxx), D(S_Sxy), D(S_Sxz), D(S_Syy), D(S_Syz), D(S_Szz), + D(S_Sphi), + escalar_a2, + D(S_f_arr)); + #undef D + + download_constraint_outputs(constraint_host_out, all); + CUDA_CHECK(cudaMemcpy(constraint_host_out[7], g_buf.slot[S_f_arr], + bytes, cudaMemcpyDeviceToHost)); + (void)Lev; + return 0; +} + __global__ void kern_prepare_inter_time_level(const double * __restrict__ src1, const double * __restrict__ src2, const double * __restrict__ src3, @@ -6222,7 +6930,7 @@ static void copy_state_region_cuda(void *block_tag, cudaMemcpyKind kind, double **state_host_key = nullptr) { - if (state_index < 0 || state_index >= BSSN_STATE_COUNT) return; + if (state_index < 0 || state_index >= BSSN_RESIDENT_STATE_CAPACITY) return; if (sx <= 0 || sy <= 0 || sz <= 0) return; const size_t pitch = (size_t)ex[0] * sizeof(double); @@ -6264,7 +6972,7 @@ static void copy_state_region_packed_cuda(void *block_tag, cudaMemcpyKind kind, double **state_host_key = nullptr) { - if (state_index < 0 || state_index >= BSSN_STATE_COUNT) return; + if (state_index < 0 || state_index >= BSSN_RESIDENT_STATE_CAPACITY) return; if (sx <= 0 || sy <= 0 || sz <= 0) return; const size_t src_pitch = (size_t)ex[0] * sizeof(double); @@ -6310,7 +7018,7 @@ static void copy_state_region_packed_batch_cuda(void *block_tag, cudaMemcpyKind kind, double **state_host_key = nullptr) { - if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return; + if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return; if (sx <= 0 || sy <= 0 || sz <= 0) return; StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); @@ -6353,12 +7061,12 @@ static void copy_state_region_packed_batch_cuda(void *block_tag, } } -static void download_resident_state(void *block_tag, int *ex, double **state_host_out) +static void download_resident_state_count(void *block_tag, int *ex, double **state_host_out, int state_count) { const size_t all = (size_t)ex[0] * ex[1] * ex[2]; const size_t bytes = all * sizeof(double); StepContext &ctx = ensure_step_ctx(block_tag, all); - int bank = find_resident_bank(ctx, state_host_out); + int bank = find_resident_bank_count(ctx, state_host_out, state_count); if (bank < 0) { bank = (ctx.current_bank >= 0) ? ctx.current_bank : active_or_keyed_bank(ctx, nullptr, all, false); } @@ -6371,9 +7079,9 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos direct_download = env ? ((atoi(env) != 0) ? 1 : 0) : 1; } if (direct_download) { - if (resident_host_subset_clean(ctx, bank, BSSN_STATE_COUNT, nullptr)) + if (resident_host_subset_clean(ctx, bank, state_count, nullptr)) return; - for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + for (int i = 0; i < state_count; ++i) { CUDA_CHECK(cudaMemcpyAsync(state_host_out[i], ctx.d_resident[bank][i], bytes, cudaMemcpyDeviceToHost)); } @@ -6383,16 +7091,16 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos CudaProfileStats &stats = cuda_profile_stats(); stats.resident_download_calls++; stats.resident_download_ms += cuda_profile_now_ms() - t0; - stats.resident_download_gb += (double)((size_t)BSSN_STATE_COUNT * bytes) / 1.0e9; + stats.resident_download_gb += (double)((size_t)state_count * bytes) / 1.0e9; } return; } - if (resident_host_subset_clean(ctx, bank, BSSN_STATE_COUNT, nullptr)) + if (resident_host_subset_clean(ctx, bank, state_count, nullptr)) return; CUDA_CHECK(cudaMemcpy(g_buf.h_stage, ctx.d_resident_mem[bank], - (size_t)BSSN_STATE_COUNT * bytes, + (size_t)state_count * bytes, cudaMemcpyDeviceToHost)); - for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + for (int i = 0; i < state_count; ++i) { std::memcpy(state_host_out[i], g_buf.h_stage + (size_t)i * all, bytes); } set_resident_host_clean(ctx, bank, true); @@ -6400,24 +7108,44 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos CudaProfileStats &stats = cuda_profile_stats(); stats.resident_download_calls++; stats.resident_download_ms += cuda_profile_now_ms() - t0; - stats.resident_download_gb += (double)((size_t)BSSN_STATE_COUNT * bytes) / 1.0e9; + stats.resident_download_gb += (double)((size_t)state_count * bytes) / 1.0e9; } } +static void download_resident_state(void *block_tag, int *ex, double **state_host_out) +{ + download_resident_state_count(block_tag, ex, state_host_out, BSSN_STATE_COUNT); +} + +static bool download_resident_state_count_if_present(void *block_tag, + int *ex, + double **state_host_out, + int state_count); + static bool download_resident_state_if_present(void *block_tag, int *ex, double **state_host_out) { + return download_resident_state_count_if_present(block_tag, ex, state_host_out, BSSN_STATE_COUNT); +} + +static bool download_resident_state_count_if_present(void *block_tag, + int *ex, + double **state_host_out, + int state_count) +{ + if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) + return false; auto it = g_step_ctx.find(block_tag); if (it == g_step_ctx.end()) return false; StepContext &ctx = it->second; - const int bank = find_resident_bank(ctx, state_host_out); + const int bank = find_resident_bank_count(ctx, state_host_out, state_count); if (bank < 0 || !ctx.resident_valid[bank]) return false; const size_t all = (size_t)ex[0] * ex[1] * ex[2]; const size_t bytes = all * sizeof(double); mark_resident_current_bank(ctx, bank); - if (resident_host_subset_clean(ctx, bank, BSSN_STATE_COUNT, nullptr)) + if (resident_host_subset_clean(ctx, bank, state_count, nullptr)) return true; static int direct_download = -1; @@ -6426,16 +7154,16 @@ static bool download_resident_state_if_present(void *block_tag, int *ex, double direct_download = env ? ((atoi(env) != 0) ? 1 : 0) : 1; } if (direct_download) { - for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + for (int i = 0; i < state_count; ++i) { CUDA_CHECK(cudaMemcpyAsync(state_host_out[i], ctx.d_resident[bank][i], bytes, cudaMemcpyDeviceToHost)); } CUDA_CHECK(cudaDeviceSynchronize()); } else { CUDA_CHECK(cudaMemcpy(g_buf.h_stage, ctx.d_resident_mem[bank], - (size_t)BSSN_STATE_COUNT * bytes, + (size_t)state_count * bytes, cudaMemcpyDeviceToHost)); - for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + for (int i = 0; i < state_count; ++i) { std::memcpy(state_host_out[i], g_buf.h_stage + (size_t)i * all, bytes); } } @@ -6454,17 +7182,17 @@ static void copy_state_subset(void *block_tag, const size_t all = (size_t)ex[0] * ex[1] * ex[2]; const size_t bytes = all * sizeof(double); StepContext &ctx = ensure_step_ctx(block_tag, all); - double **full_key = (subset_count == BSSN_STATE_COUNT) ? state_host : nullptr; + double **full_key = (subset_count == BSSN_RESIDENT_STATE_CAPACITY) ? state_host : nullptr; const int bank = active_or_keyed_bank(ctx, full_key, all, kind == cudaMemcpyHostToDevice); double *base_mem = ctx.d_resident_mem[bank]; - int active_state_indices[BSSN_STATE_COUNT]; - double *active_state_host[BSSN_STATE_COUNT]; + int active_state_indices[BSSN_RESIDENT_STATE_CAPACITY]; + double *active_state_host[BSSN_RESIDENT_STATE_CAPACITY]; int active_count = 0; for (int i = 0; i < subset_count; ++i) { const int state_index = state_indices[i]; - if (state_index < 0 || state_index >= BSSN_STATE_COUNT) continue; + if (state_index < 0 || state_index >= BSSN_RESIDENT_STATE_CAPACITY) continue; if (kind == cudaMemcpyDeviceToHost && ctx.resident_host_clean[bank][state_index]) continue; @@ -7095,6 +7823,31 @@ int bssn_cuda_download_resident_state(void *block_tag, return 0; } +extern "C" +int bssn_escalar_cuda_download_resident_state(void *block_tag, + int *ex, + double **state_host_out) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + download_resident_state_count(block_tag, ex, state_host_out, BSSN_ESCALAR_STATE_COUNT); + return 0; +} + +extern "C" +int bssn_cuda_download_resident_state_count_if_present(void *block_tag, + int *ex, + double **state_host_out, + int state_count) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + if (state_count != BSSN_STATE_COUNT && state_count != BSSN_ESCALAR_STATE_COUNT) + return 1; + download_resident_state_count_if_present(block_tag, ex, state_host_out, state_count); + return 0; +} + extern "C" int bssn_cuda_download_resident_state_if_present(void *block_tag, int *ex, @@ -7150,6 +7903,7 @@ int bssn_cuda_interp_state_point3(void *block_tag, double pz, int ordn, int symmetry, + double **state_host_key, const double *soa3, double *out3) { @@ -7157,9 +7911,9 @@ int bssn_cuda_interp_state_point3(void *block_tag, CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); if (!block_tag || !ex || !out3 || !soa3) return 1; - if (state0 < 0 || state0 >= BSSN_STATE_COUNT || - state1 < 0 || state1 >= BSSN_STATE_COUNT || - state2 < 0 || state2 >= BSSN_STATE_COUNT) + if (state0 < 0 || state0 >= BSSN_RESIDENT_STATE_CAPACITY || + state1 < 0 || state1 >= BSSN_RESIDENT_STATE_CAPACITY || + state2 < 0 || state2 >= BSSN_RESIDENT_STATE_CAPACITY) return 1; if (ex[0] <= 0 || ex[1] <= 0 || ex[2] <= 0 || ordn <= 0 || ordn > 8 || @@ -7168,7 +7922,8 @@ int bssn_cuda_interp_state_point3(void *block_tag, StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); const size_t all = (size_t)ex[0] * ex[1] * ex[2]; - const int bank = active_or_keyed_bank(ctx, nullptr, all, false); + const int interp_states[3] = {state0, state1, state2}; + const int bank = find_resident_bank_subset(ctx, state_host_key, interp_states, 3); if (bank < 0 || !ctx.resident_valid[bank]) return 1; @@ -7354,7 +8109,7 @@ static void copy_state_device_batch(void *block_tag, int pack_not_unpack, double **state_host_key = nullptr) { - if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return; + if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return; if (sx <= 0 || sy <= 0 || sz <= 0) return; StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); @@ -7395,7 +8150,7 @@ static void copy_state_device_segments(void *block_tag, int pack_not_unpack, double **state_host_key = nullptr) { - if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return; + if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return; if (segment_count <= 0 || !segment_meta) return; int max_region_all = 0; @@ -7445,7 +8200,7 @@ static void restrict_state_device_segments(void *block_tag, double **state_host_key = nullptr, const double *state_soa = nullptr) { - if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return; + if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return; if (segment_count <= 0 || !segment_meta || !device_buffer) return; int max_region_all = 0; @@ -7484,7 +8239,7 @@ static void prolong_state_device_segments(void *block_tag, double **state_host_key = nullptr, const double *state_soa = nullptr) { - if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return; + if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return; if (segment_count <= 0 || !segment_meta || !device_buffer) return; int max_region_all = 0; @@ -7712,7 +8467,7 @@ int bssn_cuda_restrict_state_batch_to_device_buffer(void *block_tag, { init_gpu_dispatch(); CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); - if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1; + if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return 1; if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1; StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); const int region_all = sx * sy * sz; @@ -7739,7 +8494,7 @@ int bssn_cuda_restrict_state_batch_to_device_buffer_for_host_views(void *block_t { init_gpu_dispatch(); CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); - if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1; + if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return 1; if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1; StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); const size_t all = (size_t)ex[0] * ex[1] * ex[2]; @@ -7767,7 +8522,7 @@ int bssn_cuda_prolong_state_batch_to_device_buffer(void *block_tag, { init_gpu_dispatch(); CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); - if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1; + if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return 1; if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1; StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); const int region_all = sx * sy * sz; @@ -7796,7 +8551,7 @@ int bssn_cuda_prolong_state_batch_to_device_buffer_for_host_views(void *block_ta { init_gpu_dispatch(); CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); - if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1; + if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return 1; if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1; StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); const size_t all = (size_t)ex[0] * ex[1] * ex[2]; @@ -7845,6 +8600,7 @@ int bssn_cuda_upload_state_subset(void *block_tag, extern "C" int bssn_cuda_prepare_inter_time_level(void *block_tag, int *ex, + int state_count, double **src1_host_key, double **src2_host_key, double **src3_host_key, @@ -7856,12 +8612,14 @@ int bssn_cuda_prepare_inter_time_level(void *block_tag, CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); const bool profile = cuda_aux_profile_enabled(); const double t0 = profile ? cuda_profile_now_ms() : 0.0; - if (source_count != 2 && source_count != 3) return 1; - if (!resident_key_usable(src1_host_key) || - !resident_key_usable(src2_host_key) || - !resident_key_usable(dst_host_key)) + if (state_count != BSSN_STATE_COUNT && state_count != BSSN_ESCALAR_STATE_COUNT) return 1; - if (source_count == 3 && !resident_key_usable(src3_host_key)) + if (source_count != 2 && source_count != 3) return 1; + if (!resident_key_usable_count(src1_host_key, state_count) || + !resident_key_usable_count(src2_host_key, state_count) || + !resident_key_usable_count(dst_host_key, state_count)) + return 1; + if (source_count == 3 && !resident_key_usable_count(src3_host_key, state_count)) return 1; double c1 = 0.0, c2 = 0.0, c3 = 0.0; @@ -7887,21 +8645,32 @@ int bssn_cuda_prepare_inter_time_level(void *block_tag, const size_t all = (size_t)ex[0] * ex[1] * ex[2]; StepContext &ctx = ensure_step_ctx(block_tag, all); - const int src1_bank = ensure_resident_bank(ctx, src1_host_key, all, true); - const int src2_bank = ensure_resident_bank(ctx, src2_host_key, all, true, src1_bank); - const int src3_bank = (source_count == 3) - ? ensure_resident_bank(ctx, src3_host_key, all, true, src1_bank) - : -1; - const int dst_bank = reserve_resident_output_bank_avoiding(ctx, dst_host_key, all, - src1_bank, src2_bank, src3_bank); + int src1_bank, src2_bank, src3_bank, dst_bank; + if (state_count == BSSN_ESCALAR_STATE_COUNT) { + src1_bank = ensure_escalar_resident_bank(ctx, src1_host_key, all, true); + src2_bank = ensure_escalar_resident_bank(ctx, src2_host_key, all, true, src1_bank); + src3_bank = (source_count == 3) + ? ensure_escalar_resident_bank(ctx, src3_host_key, all, true, src1_bank) + : -1; + dst_bank = reserve_escalar_resident_output_bank_avoiding(ctx, dst_host_key, all, + src1_bank, src2_bank, src3_bank); + } else { + src1_bank = ensure_resident_bank(ctx, src1_host_key, all, true); + src2_bank = ensure_resident_bank(ctx, src2_host_key, all, true, src1_bank); + src3_bank = (source_count == 3) + ? ensure_resident_bank(ctx, src3_host_key, all, true, src1_bank) + : -1; + dst_bank = reserve_resident_output_bank_avoiding(ctx, dst_host_key, all, + src1_bank, src2_bank, src3_bank); + } - dim3 launch_grid((unsigned int)grid(all), (unsigned int)BSSN_STATE_COUNT); + dim3 launch_grid((unsigned int)grid(all), (unsigned int)state_count); kern_prepare_inter_time_level<<>>( ctx.d_resident_mem[src1_bank], ctx.d_resident_mem[src2_bank], (source_count == 3) ? ctx.d_resident_mem[src3_bank] : nullptr, ctx.d_resident_mem[dst_bank], - c1, c2, c3, BSSN_STATE_COUNT, (int)all); + c1, c2, c3, state_count, (int)all); if (profile) cuda_profile_sync(); ctx.resident_valid[dst_bank] = true; diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.h b/AMSS_NCKU_source/bssn_rhs_cuda.h index 41e92f8..38f945a 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.h +++ b/AMSS_NCKU_source/bssn_rhs_cuda.h @@ -7,6 +7,7 @@ extern "C" { enum { BSSN_CUDA_STATE_COUNT = 24, + BSSN_ESCALAR_CUDA_STATE_COUNT = 26, BSSN_CUDA_MATTER_COUNT = 10 }; @@ -55,6 +56,32 @@ int bssn_cuda_rk4_substep(void *block_tag, int &apply_enforce_ga, double &chitiny); +int bssn_escalar_cuda_rk4_substep(void *block_tag, + int *ex, double *X, double *Y, double *Z, + double **state_host_in, + double **state_host_out, + const double *propspeed, + const double *soa_flat, + const double *bbox, + double &dT, + double &T, + int &RK4, + int &apply_bam_bc, + int &Symmetry, + int &Lev, + double &eps, + int &co, + int &keep_resident_state, + int &apply_enforce_ga, + double &chitiny); + +int bssn_escalar_cuda_compute_constraints(int *ex, double *X, double *Y, double *Z, + double **state_host_in, + double **constraint_host_out, + int &Symmetry, + int &Lev, + double &eps); + int bssn_cuda_copy_state_region_to_host(void *block_tag, int state_index, double *host_state, @@ -73,6 +100,15 @@ int bssn_cuda_download_resident_state(void *block_tag, int *ex, double **state_host_out); +int bssn_escalar_cuda_download_resident_state(void *block_tag, + int *ex, + double **state_host_out); + +int bssn_cuda_download_resident_state_count_if_present(void *block_tag, + int *ex, + double **state_host_out, + int state_count); + int bssn_cuda_download_resident_state_if_present(void *block_tag, int *ex, double **state_host_out); @@ -103,6 +139,7 @@ int bssn_cuda_interp_state_point3(void *block_tag, double pz, int ordn, int symmetry, + double **state_host_key, const double *soa3, double *out3); @@ -302,6 +339,7 @@ int bssn_cuda_upload_state_subset(void *block_tag, int bssn_cuda_prepare_inter_time_level(void *block_tag, int *ex, + int state_count, double **src1_host_key, double **src2_host_key, double **src3_host_key, diff --git a/AMSS_NCKU_source/makefile b/AMSS_NCKU_source/makefile index 84fd4c0..93cf81c 100644 --- a/AMSS_NCKU_source/makefile +++ b/AMSS_NCKU_source/makefile @@ -35,7 +35,6 @@ f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \ endif TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ - -fprofile-instr-use=$(TP_PROFDATA) \ -Dfortran3 -Dnewc $(MKL_INC) else ## NVHPC defaults: mpicc/mpicxx/mpifort wrappers diff --git a/makefile_and_run.py b/makefile_and_run.py index d4ce33b..f539764 100755 --- a/makefile_and_run.py +++ b/makefile_and_run.py @@ -146,6 +146,7 @@ def _gpu_runtime_env(): "AMSS_CUDA_AWARE_MPI": "1", "AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP": "1", "AMSS_CUDA_KEEP_ALL_LEVELS": "1", + "AMSS_CUDA_AMR_HOST_STAGED": "1", "AMSS_CUDA_AMR_RESTRICT_DEVICE": "1", "AMSS_CUDA_AMR_RESTRICT_BATCH": "0", "AMSS_CUDA_DEVICE_SEGMENT_BATCH": "0", @@ -277,6 +278,7 @@ def run_ABE(): print(f" AMSS_CUDA_AWARE_MPI={mpi_env.get('AMSS_CUDA_AWARE_MPI', '')}") print(f" AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP={mpi_env.get('AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP', '')}") print(f" AMSS_CUDA_KEEP_ALL_LEVELS={mpi_env.get('AMSS_CUDA_KEEP_ALL_LEVELS', '')}") + print(f" AMSS_CUDA_AMR_HOST_STAGED={mpi_env.get('AMSS_CUDA_AMR_HOST_STAGED', '')}") print(f" AMSS_CUDA_AMR_RESTRICT_DEVICE={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_DEVICE', '')}") print(f" AMSS_CUDA_AMR_RESTRICT_BATCH={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_BATCH', '')}") print(f" AMSS_CUDA_DEVICE_SEGMENT_BATCH={mpi_env.get('AMSS_CUDA_DEVICE_SEGMENT_BATCH', '')}")