From 83afaf19ceaac103bc79dc3800ba6fdf1f256047 Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Thu, 7 May 2026 13:04:46 +0800 Subject: [PATCH] Skip zero EM resident downloads --- AMSS_NCKU_source/bssnEM_class.C | 124 ++++++++++++++++++++------------ makefile_and_run.py | 2 + 2 files changed, 81 insertions(+), 45 deletions(-) diff --git a/AMSS_NCKU_source/bssnEM_class.C b/AMSS_NCKU_source/bssnEM_class.C index 763cf0f..1adb720 100644 --- a/AMSS_NCKU_source/bssnEM_class.C +++ b/AMSS_NCKU_source/bssnEM_class.C @@ -87,7 +87,18 @@ bool bssn_em_zero_analysis_fastpath_enabled() return enabled != 0; } -bool bssn_em_analysis_zero_fastpath_ready(MyList *PatL, +bool bssn_em_zero_resident_download_fastpath_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_EM_ZERO_RESIDENT_DOWNLOAD_FASTPATH"); + enabled = (!env || atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +bool bssn_em_resident_zero_fastpath_ready(MyList *PatL, #ifdef WithShell ShellPatch *shell, #else @@ -95,8 +106,6 @@ bool bssn_em_analysis_zero_fastpath_ready(MyList *PatL, #endif int rank) { - if (!bssn_em_zero_analysis_fastpath_enabled()) - return false; int local_ok = 1; int local_seen = 0; MyList *Pp = PatL; @@ -149,6 +158,19 @@ bool bssn_em_analysis_zero_fastpath_ready(MyList *PatL, return global_seen && global_ok; } +bool bssn_em_analysis_zero_fastpath_ready(MyList *PatL, +#ifdef WithShell + ShellPatch *shell, +#else + ShellPatch *shell, +#endif + int rank) +{ + if (!bssn_em_zero_analysis_fastpath_enabled()) + return false; + return bssn_em_resident_zero_fastpath_ready(PatL, shell, rank); +} + void zero_em_analysis_outputs(MyList *PatL, #ifdef WithShell ShellPatch *shell, @@ -1660,29 +1682,29 @@ void bssnEM_class::Step(int lev, int YN) } #endif - if (em_step_timing) - em_t0 = MPI_Wtime(); - Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]); - if (em_step_timing) - em_t_predictor_sync += MPI_Wtime() - em_t0; - -#ifdef WithShell - if (lev == 0) - { - clock_t prev_clock, curr_clock; - if (myrank == 0) - curr_clock = clock(); - SH->Synch(SynchList_pre, Symmetry); - if (myrank == 0) - { - prev_clock = curr_clock; - curr_clock = clock(); - cout << " Shell stuff synchronization used " - << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) - << " seconds! " << endl; - } - } -#endif + if (em_step_timing) + em_t0 = MPI_Wtime(); + Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]); + if (em_step_timing) + em_t_predictor_sync += MPI_Wtime() - em_t0; + +#ifdef WithShell + if (lev == 0) + { + clock_t prev_clock, curr_clock; + if (myrank == 0) + curr_clock = clock(); + SH->Synch(SynchList_pre, Symmetry); + if (myrank == 0) + { + prev_clock = curr_clock; + curr_clock = clock(); + cout << " Shell stuff synchronization used " + << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) + << " seconds! " << endl; + } + } +#endif // for black hole position if (BH_num > 0 && lev == GH->levels - 1) @@ -2198,24 +2220,24 @@ void bssnEM_class::Step(int lev, int YN) Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]); if (em_step_timing) em_t_corrector_sync += MPI_Wtime() - em_t0; - -#ifdef WithShell - if (lev == 0) - { - clock_t prev_clock, curr_clock; - if (myrank == 0) - curr_clock = clock(); - SH->Synch(SynchList_cor, Symmetry); - if (myrank == 0) - { - prev_clock = curr_clock; - curr_clock = clock(); - cout << " Shell stuff synchronization used " - << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) - << " seconds! " << endl; - } - } -#endif + +#ifdef WithShell + if (lev == 0) + { + clock_t prev_clock, curr_clock; + if (myrank == 0) + curr_clock = clock(); + SH->Synch(SynchList_cor, Symmetry); + if (myrank == 0) + { + prev_clock = curr_clock; + curr_clock = clock(); + cout << " Shell stuff synchronization used " + << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) + << " seconds! " << endl; + } + } +#endif // for black hole position if (BH_num > 0 && lev == GH->levels - 1) { @@ -2310,7 +2332,19 @@ void bssnEM_class::Step(int lev, int YN) { if (em_step_timing) em_t0 = MPI_Wtime(); - if (!bssn_em_cuda_keep_resident_after_step(lev, trfls, a_lev)) + const bool needs_resident_download = + !bssn_em_cuda_keep_resident_after_step(lev, trfls, a_lev); + const bool skip_zero_resident_download = + needs_resident_download && + bssn_em_zero_resident_download_fastpath_enabled() && + bssn_em_resident_zero_fastpath_ready(GH->PatL[lev], +#ifdef WithShell + 0, +#else + 0, +#endif + myrank); + if (needs_resident_download && !skip_zero_resident_download) bssn_em_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, true); if (em_step_timing) em_t_resident += MPI_Wtime() - em_t0; diff --git a/makefile_and_run.py b/makefile_and_run.py index 494ea56..dfd6309 100755 --- a/makefile_and_run.py +++ b/makefile_and_run.py @@ -153,6 +153,7 @@ def _gpu_runtime_env(): "AMSS_CUDA_EM_CACHE_SOURCES": "1", "AMSS_CUDA_EM_ZERO_FASTPATH": "1", "AMSS_EM_ZERO_ANALYSIS_FASTPATH": "1", + "AMSS_EM_ZERO_RESIDENT_DOWNLOAD_FASTPATH": "1", "AMSS_CUDA_AMR_HOST_STAGED": "1", "AMSS_CUDA_AMR_RESTRICT_DEVICE": "0", "AMSS_CUDA_AMR_RESTRICT_BATCH": "0", @@ -293,6 +294,7 @@ def run_ABE(): print(f" AMSS_CUDA_EM_CACHE_SOURCES={mpi_env.get('AMSS_CUDA_EM_CACHE_SOURCES', '')}") print(f" AMSS_CUDA_EM_ZERO_FASTPATH={mpi_env.get('AMSS_CUDA_EM_ZERO_FASTPATH', '')}") print(f" AMSS_EM_ZERO_ANALYSIS_FASTPATH={mpi_env.get('AMSS_EM_ZERO_ANALYSIS_FASTPATH', '')}") + print(f" AMSS_EM_ZERO_RESIDENT_DOWNLOAD_FASTPATH={mpi_env.get('AMSS_EM_ZERO_RESIDENT_DOWNLOAD_FASTPATH', '')}") print(f" AMSS_CUDA_AMR_HOST_STAGED={mpi_env.get('AMSS_CUDA_AMR_HOST_STAGED', '')}") print(f" AMSS_CUDA_AMR_RESTRICT_DEVICE={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_DEVICE', '')}") print(f" AMSS_CUDA_AMR_RESTRICT_BATCH={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_BATCH', '')}")