From 0cf58176d91cb9118a3fa6f77cd86b38bcae9e07 Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Sat, 25 Apr 2026 01:41:55 +0800 Subject: [PATCH] Add safe BSSN-EScalar kernel and transfer toggles --- AMSS_NCKU_source/bssnEScalar_class.C | 109 ++--- AMSS_NCKU_source/bssnEScalar_class.h | 2 +- AMSS_NCKU_source/bssn_class.C | 606 ++++++++++++--------------- AMSS_NCKU_source/bssn_class.h | 45 +- AMSS_NCKU_source/makefile | 19 +- AMSS_NCKU_source/makefile.inc | 11 + 6 files changed, 379 insertions(+), 413 deletions(-) diff --git a/AMSS_NCKU_source/bssnEScalar_class.C b/AMSS_NCKU_source/bssnEScalar_class.C index ad07ae3..599bae4 100644 --- a/AMSS_NCKU_source/bssnEScalar_class.C +++ b/AMSS_NCKU_source/bssnEScalar_class.C @@ -23,8 +23,14 @@ using namespace std; #include "rungekutta4_rout.h" #include "sommerfeld_rout.h" #include "getnp4.h" -#include "shellfunctions.h" -#include "parameters.h" +#include "shellfunctions.h" +#include "parameters.h" + +#if BSSN_USE_ESCALAR_C_KERNEL +#define BSSN_ESCALAR_RHS f_compute_rhs_bssn_escalar_c +#else +#define BSSN_ESCALAR_RHS f_compute_rhs_bssn_escalar +#endif #ifdef With_AHF #include "derivatives.h" @@ -169,13 +175,7 @@ void bssnEScalar_class::Initialize() Setup_Black_Hole_position(); } - // BSSN-EScalar currently uses the uncached communication fallback paths. 
- sync_cache_pre = 0; - sync_cache_cor = 0; - sync_cache_rp_coarse = 0; - sync_cache_rp_fine = 0; - sync_cache_restrict = 0; - sync_cache_outbd = 0; + setup_transfer_caches(); } //================================================================================================ @@ -345,11 +345,13 @@ void bssnEScalar_class::Read_Ansorg() } inf.close(); } - int order = 6; - Ansorg read_ansorg("Ansorg.psid", order); - // set initial data - for (int lev = 0; lev < GH->levels; lev++) - { + int order = 6; + Ansorg read_ansorg("Ansorg.psid", order); + if (myrank == 0) + cout << "[debug] Read_Ansorg: Ansorg object ready" << endl; + // set initial data + for (int lev = 0; lev < GH->levels; lev++) + { MyList *Pp = GH->PatL[lev]; while (Pp) { @@ -381,12 +383,14 @@ void bssnEScalar_class::Read_Ansorg() if (BL == Pp->data->ble) break; BL = BL->next; - } - Pp = Pp->next; - } - } -#ifdef WithShell - // ShellPatch part + } + Pp = Pp->next; + } + if (myrank == 0) + cout << "[debug] Read_Ansorg: finished level " << lev << " patch init" << endl; + } +#ifdef WithShell + // ShellPatch part MyList *Pp = SH->PatL; while (Pp) { @@ -423,15 +427,19 @@ void bssnEScalar_class::Read_Ansorg() if (BL == Pp->data->ble) break; BL = BL->next; - } - Pp = Pp->next; - } -#endif + } + Pp = Pp->next; + } + if (myrank == 0) + cout << "[debug] Read_Ansorg: finished shell init" << endl; +#endif delete[] Porg_here; delete[] pmom_local; delete[] spin_local; delete[] mass_local; + if (myrank == 0) + cout << "[debug] Read_Ansorg: finished local cleanup" << endl; // dump read_in initial data // for(int lev=0;levlevels;lev++) Parallel::Dump_Data(GH->PatL[lev],StateList,0,PhysTime,dT); } @@ -762,7 +770,7 @@ void bssnEScalar_class::Step(int lev, int YN) cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]); #endif - if (f_compute_rhs_bssn_escalar_c(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], + if (BSSN_ESCALAR_RHS(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], cg->fgfs[phi0->sgfn], 
cg->fgfs[trK0->sgfn], cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], @@ -1016,11 +1024,12 @@ void bssnEScalar_class::Step(int lev, int YN) } #endif - Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry); + Parallel::AsyncSyncState async_pre; + sync_predictor_start(lev, SynchList_pre, async_pre); -#ifdef WithShell - if (lev == 0) - { +#ifdef WithShell + if (lev == 0) + { clock_t prev_clock, curr_clock; if (myrank == 0) curr_clock = clock(); @@ -1032,9 +1041,10 @@ void bssnEScalar_class::Step(int lev, int YN) cout << " Shell stuff synchronization used " << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) << " seconds! " << endl; - } - } -#endif + } + } +#endif + sync_predictor_finish(lev, async_pre, SynchList_pre); // for black hole position if (BH_num > 0 && lev == GH->levels - 1) @@ -1104,7 +1114,7 @@ void bssnEScalar_class::Step(int lev, int YN) cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]); #endif - if (f_compute_rhs_bssn_escalar_c(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], + if (BSSN_ESCALAR_RHS(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn], cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], @@ -1372,11 +1382,12 @@ void bssnEScalar_class::Step(int lev, int YN) } #endif - Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry); + Parallel::AsyncSyncState async_cor; + sync_corrector_start(lev, SynchList_cor, async_cor); -#ifdef WithShell - if (lev == 0) - { +#ifdef WithShell + if (lev == 0) + { clock_t prev_clock, curr_clock; if (myrank == 0) curr_clock = clock(); @@ -1388,9 +1399,10 @@ void bssnEScalar_class::Step(int lev, int YN) cout << " Shell stuff synchronization used " << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) << " seconds! 
" << endl; - } - } -#endif + } + } +#endif + sync_corrector_finish(lev, async_cor, SynchList_cor); // for black hole position if (BH_num > 0 && lev == GH->levels - 1) { @@ -1858,11 +1870,14 @@ void bssnEScalar_class::AnalysisStuff_EScalar(int lev, double dT_lev) //================================================================================================ -void bssnEScalar_class::Interp_Constraint() -{ - // we do not support a_lev != 0 yet. - if (a_lev > 0) - return; +void bssnEScalar_class::Interp_Constraint(bool infg) +{ + if (!infg) + return; + + // we do not support a_lev != 0 yet. + if (a_lev > 0) + return; for (int lev = 0; lev < GH->levels; lev++) { @@ -1881,7 +1896,7 @@ void bssnEScalar_class::Interp_Constraint() if (myrank == cg->rank) { if (lev > 0) - f_compute_rhs_bssn_escalar_c(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], + BSSN_ESCALAR_RHS(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn], cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], @@ -2101,7 +2116,7 @@ void bssnEScalar_class::Constraint_Out() if (myrank == cg->rank) { if (lev > 0) - f_compute_rhs_bssn_escalar_c(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], + BSSN_ESCALAR_RHS(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn], cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], diff --git a/AMSS_NCKU_source/bssnEScalar_class.h b/AMSS_NCKU_source/bssnEScalar_class.h index 3e26005..cabfdb1 100644 --- a/AMSS_NCKU_source/bssnEScalar_class.h +++ b/AMSS_NCKU_source/bssnEScalar_class.h @@ -51,7 +51,7 @@ public: void Compute_Psi4(int lev); void Step(int lev, int YN); void AnalysisStuff_EScalar(int lev, double dT_lev); - void Interp_Constraint(); + void Interp_Constraint(bool infg); void Constraint_Out(); protected: diff --git a/AMSS_NCKU_source/bssn_class.C 
b/AMSS_NCKU_source/bssn_class.C index 5b0645f..23a074c 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -283,7 +283,7 @@ namespace rhs_kernel_timing_report bssn_class::bssn_class(double Couranti, double StartTimei, double TotalTimei, double DumpTimei, double d2DumpTimei, double CheckTimei, double AnasTimei, int Symmetryi, int checkruni, char *checkfilenamei, - double numepssi, double numepsbi, double numepshi, + double numepssi, double numepsbi, double numepshi, int a_levi, int maxli, int decni, double maxrexi, double drexi) : Courant(Couranti), StartTime(StartTimei), TotalTime(TotalTimei), DumpTime(DumpTimei), d2DumpTime(d2DumpTimei), CheckTime(CheckTimei), AnasTime(AnasTimei), @@ -1008,21 +1008,7 @@ void bssn_class::Initialize() Setup_Black_Hole_position(); } - // BSSN-EScalar uses the uncached communication fallback paths. - sync_cache_pre = 0; - sync_cache_cor = 0; - sync_cache_rp_coarse = 0; - sync_cache_rp_fine = 0; - sync_cache_restrict = 0; - sync_cache_outbd = 0; -#if (ABEtype != 1) - sync_cache_pre = new Parallel::SyncCache[GH->levels]; - sync_cache_cor = new Parallel::SyncCache[GH->levels]; - sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels]; - sync_cache_rp_fine = new Parallel::SyncCache[GH->levels]; - sync_cache_restrict = new Parallel::SyncCache[GH->levels]; - sync_cache_outbd = new Parallel::SyncCache[GH->levels]; -#endif + setup_transfer_caches(); } //================================================================================================ @@ -1037,15 +1023,8 @@ void bssn_class::Initialize() bssn_class::~bssn_class() { -#if (ABEtype == 1) - if (myrank == 0) - { - cout << "[dtor] begin" << endl; - cout.flush(); - } -#endif #ifdef With_AHF - AHList->clearList(); + AHList->clearList(); AHDList->clearList(); GaugeList->clearList(); if (lastahdumpid) @@ -1078,13 +1057,6 @@ bssn_class::~bssn_class() ConstraintList->clearList(); delete[] ConstraintRefreshLevels; -#if (ABEtype == 1) - if (myrank == 0) - { - 
cout << "[dtor] lists cleared" << endl; - cout.flush(); - } -#endif delete phio; delete trKo; @@ -1257,20 +1229,13 @@ bssn_class::~bssn_class() delete Cons_Ham; delete Cons_Px; - delete Cons_Py; - delete Cons_Pz; - delete Cons_Gx; + delete Cons_Py; + delete Cons_Pz; + delete Cons_Gx; delete Cons_Gy; delete Cons_Gz; -#if (ABEtype == 1) - if (myrank == 0) - { - cout << "[dtor] core vars freed" << endl; - cout.flush(); - } -#endif - -#ifdef Point_Psi4 + +#ifdef Point_Psi4 delete phix; delete phiy; delete phiz; @@ -1296,78 +1261,17 @@ bssn_class::~bssn_class() delete Azzy; delete Azzz; #endif - - // Destroy sync caches before GH - if (sync_cache_pre) - { -#if (ABEtype != 1) - for (int i = 0; i < GH->levels; i++) - sync_cache_pre[i].destroy(); -#endif - delete[] sync_cache_pre; - } - if (sync_cache_cor) - { -#if (ABEtype != 1) - for (int i = 0; i < GH->levels; i++) - sync_cache_cor[i].destroy(); -#endif - delete[] sync_cache_cor; - } - if (sync_cache_rp_coarse) - { -#if (ABEtype != 1) - for (int i = 0; i < GH->levels; i++) - sync_cache_rp_coarse[i].destroy(); -#endif - delete[] sync_cache_rp_coarse; - } - if (sync_cache_rp_fine) - { -#if (ABEtype != 1) - for (int i = 0; i < GH->levels; i++) - sync_cache_rp_fine[i].destroy(); -#endif - delete[] sync_cache_rp_fine; - } - if (sync_cache_restrict) - { -#if (ABEtype != 1) - for (int i = 0; i < GH->levels; i++) - sync_cache_restrict[i].destroy(); -#endif - delete[] sync_cache_restrict; - } - if (sync_cache_outbd) - { -#if (ABEtype != 1) - for (int i = 0; i < GH->levels; i++) - sync_cache_outbd[i].destroy(); -#endif - delete[] sync_cache_outbd; - } -#if (ABEtype == 1) - if (myrank == 0) - { - cout << "[dtor] caches freed" << endl; - cout.flush(); - } -#endif + + // Destroy sync caches before GH + destroy_transfer_caches(); delete GH; #ifdef WithShell delete SH; #endif -#if (ABEtype == 1) - if (myrank == 0) + + for (int i = 0; i < BH_num; i++) { - cout << "[dtor] grids freed" << endl; - cout.flush(); - } -#endif - - for (int i 
= 0; i < BH_num; i++) - { delete[] Porg0[i]; delete[] Porgbr[i]; delete[] Porg[i]; @@ -1380,17 +1284,10 @@ bssn_class::~bssn_class() delete[] Porg; delete[] Porg1; delete[] Porg_rhs; - + delete[] Mass; delete[] Spin; delete[] Pmom; -#if (ABEtype == 1) - if (myrank == 0) - { - cout << "[dtor] puncture arrays freed" << endl; - cout.flush(); - } -#endif delete ErrorMonitor; delete Psi4Monitor; @@ -1399,22 +1296,8 @@ bssn_class::~bssn_class() delete ConVMonitor; delete TimingMonitor; delete Waveshell; -#if (ABEtype == 1) - if (myrank == 0) - { - cout << "[dtor] monitors freed" << endl; - cout.flush(); - } -#endif delete CheckPoint; -#if (ABEtype == 1) - if (myrank == 0) - { - cout << "[dtor] checkpoint freed" << endl; - cout.flush(); - } -#endif } //================================================================================================ @@ -2599,9 +2482,7 @@ void bssn_class::Evolve(int Steps) GH->Regrid(Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor); -#if (ABEtype != 1) - for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } -#endif + invalidate_transfer_caches(); STEP_TIMER_ADD(TB_REGRID, timer_regrid); #endif @@ -2842,9 +2723,7 @@ void bssn_class::RecursiveStep(int lev) { if (ConstraintRefreshLevels) ConstraintRefreshLevels[lev] = 1; -#if (ABEtype != 1) - for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } -#endif + invalidate_transfer_caches(); } STEP_TIMER_ADD(TB_REGRID, timer_regrid_onelevel); #endif @@ -3022,13 +2901,11 @@ void 
bssn_class::ParallelStep() delete[] tporg; delete[] tporgo; #if (REGLEV == 0) - if (GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0, - SynchList_cor, OldStateList, StateList, SynchList_pre, - fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) -#if (ABEtype != 1) - for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } + if (GH->Regrid_Onelevel(GH->mylev, Symmetry, BH_num, Porgbr, Porg0, + SynchList_cor, OldStateList, StateList, SynchList_pre, + fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) + invalidate_transfer_caches(); #endif -#endif } //================================================================================================ @@ -3191,12 +3068,10 @@ void bssn_class::ParallelStep() if (lev + 1 >= GH->movls) { // GH->Regrid_Onelevel_aux(lev,Symmetry,BH_num,Porgbr,Porg0, - if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0, - SynchList_cor, OldStateList, StateList, SynchList_pre, - fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor)) -#if (ABEtype != 1) - for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } -#endif + if (GH->Regrid_Onelevel(lev + 1, Symmetry, BH_num, Porgbr, Porg0, + SynchList_cor, OldStateList, StateList, SynchList_pre, + fgt(PhysTime - dT_levp1, StartTime, dT_levp1 / 2), ErrorMonitor)) + invalidate_transfer_caches(); // a_stream.clear(); // a_stream.str(""); @@ -3208,12 +3083,10 @@ void bssn_class::ParallelStep() // for this level if (YN == 1) { - if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, - SynchList_cor, OldStateList, StateList, SynchList_pre, - 
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) -#if (ABEtype != 1) - for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } -#endif + if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, + SynchList_cor, OldStateList, StateList, SynchList_pre, + fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) + invalidate_transfer_caches(); // a_stream.clear(); // a_stream.str(""); @@ -3229,12 +3102,10 @@ void bssn_class::ParallelStep() if (YN == 1) { // GH->Regrid_Onelevel_aux(lev-2,Symmetry,BH_num,Porgbr,Porg0, - if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, - SynchList_cor, OldStateList, StateList, SynchList_pre, - fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor)) -#if (ABEtype != 1) - for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } -#endif + if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, + SynchList_cor, OldStateList, StateList, SynchList_pre, + fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor)) + invalidate_transfer_caches(); // a_stream.clear(); // a_stream.str(""); @@ -3247,12 +3118,10 @@ void bssn_class::ParallelStep() if (i % 4 == 3) { // GH->Regrid_Onelevel_aux(lev-2,Symmetry,BH_num,Porgbr,Porg0, - if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, - SynchList_cor, OldStateList, StateList, SynchList_pre, - fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor)) -#if (ABEtype != 1) - for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); 
sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } -#endif + if (GH->Regrid_Onelevel(lev - 1, Symmetry, BH_num, Porgbr, Porg0, + SynchList_cor, OldStateList, StateList, SynchList_pre, + fgt(PhysTime - dT_lev, StartTime, dT_levm1 / 2), ErrorMonitor)) + invalidate_transfer_caches(); // a_stream.clear(); // a_stream.str(""); @@ -3783,11 +3652,7 @@ void bssn_class::Step(int lev, int YN) STEP_TIMER_DECL(timer_predictor_sync); Parallel::AsyncSyncState async_pre; -#if (ABEtype == 1) - Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry); -#else - Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre); -#endif + sync_predictor_start(lev, SynchList_pre, async_pre); #ifdef WithShell if (lev == 0) @@ -3806,9 +3671,7 @@ void bssn_class::Step(int lev, int YN) } } #endif -#if (ABEtype != 1) - Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry); -#endif + sync_predictor_finish(lev, async_pre, SynchList_pre); #ifdef WithShell // Complete non-blocking error reduction and check @@ -4154,11 +4017,7 @@ void bssn_class::Step(int lev, int YN) STEP_TIMER_DECL(timer_corrector_sync); Parallel::AsyncSyncState async_cor; -#if (ABEtype == 1) - Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry); -#else - Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor); -#endif + sync_corrector_start(lev, SynchList_cor, async_cor); #ifdef WithShell if (lev == 0) @@ -4177,9 +4036,7 @@ void bssn_class::Step(int lev, int YN) } } #endif -#if (ABEtype != 1) - Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry); -#endif + sync_corrector_finish(lev, async_cor, SynchList_cor); #ifdef WithShell // Complete non-blocking error reduction and check @@ -4664,15 +4521,11 @@ void bssn_class::Step(int lev, int YN) { int erh = ERROR; MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req); - } -#endif - - 
Parallel::AsyncSyncState async_pre; -#if (ABEtype == 1) - Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry); -#else - Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre); + } #endif + + Parallel::AsyncSyncState async_pre; + sync_predictor_start(lev, SynchList_pre, async_pre); #ifdef WithShell if (lev == 0) @@ -4691,9 +4544,7 @@ void bssn_class::Step(int lev, int YN) } } #endif -#if (ABEtype != 1) - Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry); -#endif + sync_predictor_finish(lev, async_pre, SynchList_pre); #ifdef WithShell // Complete non-blocking error reduction and check @@ -5021,12 +4872,8 @@ void bssn_class::Step(int lev, int YN) } #endif - Parallel::AsyncSyncState async_cor; -#if (ABEtype == 1) - Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry); -#else - Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor); -#endif + Parallel::AsyncSyncState async_cor; + sync_corrector_start(lev, SynchList_cor, async_cor); #ifdef WithShell if (lev == 0) @@ -5045,9 +4892,7 @@ void bssn_class::Step(int lev, int YN) } } #endif -#if (ABEtype != 1) - Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry); -#endif + sync_corrector_finish(lev, async_cor, SynchList_cor); #ifdef WithShell // Complete non-blocking error reduction and check @@ -5439,11 +5284,7 @@ void bssn_class::Step(int lev, int YN) // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Predictor sync"); -#if (ABEtype == 1) - Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry); -#else - Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]); -#endif + sync_evolution(lev, SynchList_pre, sync_cache_pre); // Complete non-blocking error reduction and check MPI_Wait(&err_req, MPI_STATUS_IGNORE); @@ -5644,11 +5485,7 @@ void bssn_class::Step(int lev, int YN) // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Corrector 
sync"); -#if (ABEtype == 1) - Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry); -#else - Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]); -#endif + sync_evolution(lev, SynchList_cor, sync_cache_cor); // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"after Corrector sync"); @@ -6365,15 +6202,11 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, #endif #if (RPB == 0) -#if (ABEtype == 1) - Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry); -#else - Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, Symmetry, sync_cache_restrict[lev]); -#endif + restrict_evolution(lev, SL, SynchList_pre); #elif (RPB == 1) - // Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SynchList_pre,Symmetry); - Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry); -#endif + // Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SynchList_pre,Symmetry); + Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SynchList_pre, GH->rsul[lev], Symmetry); +#endif #if (PSTR == 1 || PSTR == 2) // a_stream.clear(); @@ -6382,11 +6215,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, // misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str()); #endif -#if (ABEtype == 1) - Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry); -#else - Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]); -#endif + sync_evolution(lev - 1, SynchList_pre, sync_cache_rp_coarse); #if (PSTR == 1 || PSTR == 2) // a_stream.clear(); @@ -6397,21 +6226,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, #if (RPB == 0) #if (MIXOUTB == 0) -#if (ABEtype == 1) - Ppc = GH->PatL[lev - 1]; - while (Ppc) - { - Pp = GH->PatL[lev]; - while (Pp) - { - Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SL, Symmetry); - Pp = Pp->next; - } - Ppc = Ppc->next; - 
} -#else - Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry, sync_cache_outbd[lev]); -#endif + outbdlow2hi_evolution(lev, SynchList_pre, SL); #elif (MIXOUTB == 1) Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SL, Symmetry); #endif @@ -6438,15 +6253,11 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, #endif #if (RPB == 0) -#if (ABEtype == 1) - Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); -#else - Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_restrict[lev]); -#endif + restrict_evolution(lev, SL, SL); #elif (RPB == 1) - // Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry); - Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry); -#endif + // Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SL,SL,Symmetry); + Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->rsul[lev], Symmetry); +#endif #if (PSTR == 1 || PSTR == 2) // a_stream.clear(); @@ -6455,11 +6266,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, // misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str()); #endif -#if (ABEtype == 1) - Parallel::Sync(GH->PatL[lev - 1], SL, Symmetry); -#else - Parallel::Sync_cached(GH->PatL[lev - 1], SL, Symmetry, sync_cache_rp_coarse[lev]); -#endif + sync_evolution(lev - 1, SL, sync_cache_rp_coarse); #if (PSTR == 1 || PSTR == 2) // a_stream.clear(); @@ -6470,21 +6277,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, #if (RPB == 0) #if (MIXOUTB == 0) -#if (ABEtype == 1) - Ppc = GH->PatL[lev - 1]; - while (Ppc) - { - Pp = GH->PatL[lev]; - while (Pp) - { - Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SL, SL, Symmetry); - Pp = Pp->next; - } - Ppc = Ppc->next; - } -#else - Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry, sync_cache_outbd[lev]); -#endif + 
outbdlow2hi_evolution(lev, SL, SL); #elif (MIXOUTB == 1) Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, Symmetry); #endif @@ -6685,7 +6478,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB, #if (ABEtype == 1) Parallel::Sync(GH->PatL[lev], SL, Symmetry); #else - Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]); + sync_evolution(lev, SL, sync_cache_rp_fine); #endif } STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong); @@ -6818,39 +6611,17 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB) } #if (RPB == 0) -#if (ABEtype == 1) - Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry); -#else - Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, Symmetry, sync_cache_restrict[lev]); -#endif + restrict_evolution(lev, SynchList_cor, SynchList_pre); #elif (RPB == 1) - // Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_cor,SynchList_pre,Symmetry); - Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry); -#endif - -#if (ABEtype == 1) - Parallel::Sync(GH->PatL[lev - 1], SynchList_pre, Symmetry); -#else - Parallel::Sync_cached(GH->PatL[lev - 1], SynchList_pre, Symmetry, sync_cache_rp_coarse[lev]); + // Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_cor,SynchList_pre,Symmetry); + Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, SynchList_pre, GH->rsul[lev], Symmetry); #endif + + sync_evolution(lev - 1, SynchList_pre, sync_cache_rp_coarse); #if (RPB == 0) #if (MIXOUTB == 0) -#if (ABEtype == 1) - Ppc = GH->PatL[lev - 1]; - while (Ppc) - { - Pp = GH->PatL[lev]; - while (Pp) - { - Parallel::OutBdLow2Hi(Ppc->data, Pp->data, SynchList_pre, SynchList_cor, Symmetry); - Pp = Pp->next; - } - Ppc = Ppc->next; - } -#else - Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, 
sync_cache_outbd[lev]); -#endif + outbdlow2hi_evolution(lev, SynchList_pre, SynchList_cor); #elif (MIXOUTB == 1) Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry); #endif @@ -6864,39 +6635,17 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB) if (myrank == 0) cout << "===: " << GH->Lt[lev - 1] << "," << GH->Lt[lev] + dT_lev << endl; #if (RPB == 0) -#if (ABEtype == 1) - Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry); -#else - Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry, sync_cache_restrict[lev]); -#endif + restrict_evolution(lev, SynchList_cor, StateList); #elif (RPB == 1) - // Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_cor,StateList,Symmetry); - Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry); -#endif - -#if (ABEtype == 1) - Parallel::Sync(GH->PatL[lev - 1], StateList, Symmetry); -#else - Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]); + // Parallel::Restrict_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_cor,StateList,Symmetry); + Parallel::Restrict_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, GH->rsul[lev], Symmetry); #endif + + sync_evolution(lev - 1, StateList, sync_cache_rp_coarse); #if (RPB == 0) #if (MIXOUTB == 0) -#if (ABEtype == 1) - Ppc = GH->PatL[lev - 1]; - while (Ppc) - { - Pp = GH->PatL[lev]; - while (Pp) - { - Parallel::OutBdLow2Hi(Ppc->data, Pp->data, StateList, SynchList_cor, Symmetry); - Pp = Pp->next; - } - Ppc = Ppc->next; - } -#else - Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]); -#endif + outbdlow2hi_evolution(lev, StateList, SynchList_cor); #elif (MIXOUTB == 1) Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry); #endif @@ -6906,11 +6655,7 
@@ void bssn_class::RestrictProlong(int lev, int YN, bool BB) #endif } -#if (ABEtype == 1) - Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry); -#else - Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]); -#endif + sync_evolution(lev, SynchList_cor, sync_cache_rp_fine); } STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong); } @@ -6940,12 +6685,12 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB) Pp = Pp->next; } -#if (RPB == 0) -#if (MIXOUTB == 0) - Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry, sync_cache_outbd[lev]); -#elif (MIXOUTB == 1) - Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry); -#endif +#if (RPB == 0) +#if (MIXOUTB == 0) + outbdlow2hi_evolution(lev, SynchList_pre, SynchList_cor); +#elif (MIXOUTB == 1) + Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, Symmetry); +#endif #elif (RPB == 1) // Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],SynchList_pre,SynchList_cor,Symmetry); Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SynchList_pre, SynchList_cor, GH->bdsul[lev], Symmetry); @@ -6953,12 +6698,12 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB) } else // no time refinement levels and for all same time levels { -#if (RPB == 0) -#if (MIXOUTB == 0) - Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry, sync_cache_outbd[lev]); -#elif (MIXOUTB == 1) - Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry); -#endif +#if (RPB == 0) +#if (MIXOUTB == 0) + outbdlow2hi_evolution(lev, StateList, SynchList_cor); +#elif (MIXOUTB == 1) + Parallel::OutBdLow2Himix(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, Symmetry); +#endif #elif (RPB == 1) // 
Parallel::OutBdLow2Hi_bam(GH->PatL[lev-1],GH->PatL[lev],StateList,SynchList_cor,Symmetry); Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry); @@ -6974,12 +6719,12 @@ void bssn_class::ProlongRestrict(int lev, int YN, bool BB) #else Parallel::Restrict_after(GH->PatL[lev - 1], GH->PatL[lev], SynchList_cor, StateList, Symmetry); #endif - Parallel::Sync_cached(GH->PatL[lev - 1], StateList, Symmetry, sync_cache_rp_coarse[lev]); - } - - Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]); - } -} + sync_evolution(lev - 1, StateList, sync_cache_rp_coarse); + } + + sync_evolution(lev, SynchList_cor, sync_cache_rp_fine); + } +} #undef MIXOUTB //================================================================================================ @@ -7707,6 +7452,165 @@ void bssn_class::compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, va } } }
+
+// Reports whether the cached transfer paths are compiled in, i.e. whether
+// BSSN_USE_TRANSFER_CACHE evaluates to non-zero for this build.
+bool bssn_class::use_transfer_cache() const
+{
+#if BSSN_USE_TRANSFER_CACHE
+    return true;
+#else
+    return false;
+#endif
+}
+
+// Resets the six per-level cache arrays to null and, when the cached path is
+// enabled and the grid hierarchy GH exists, allocates GH->levels entries each.
+// NOTE(review): existing arrays are overwritten without being freed; call
+// destroy_transfer_caches() first if this can run while arrays are live.
+void bssn_class::setup_transfer_caches()
+{
+    sync_cache_pre = 0;
+    sync_cache_cor = 0;
+    sync_cache_rp_coarse = 0;
+    sync_cache_rp_fine = 0;
+    sync_cache_restrict = 0;
+    sync_cache_outbd = 0;
+
+    if (!use_transfer_cache() || !GH)
+        return;
+
+    sync_cache_pre = new Parallel::SyncCache[GH->levels];
+    sync_cache_cor = new Parallel::SyncCache[GH->levels];
+    sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
+    sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
+    sync_cache_restrict = new Parallel::SyncCache[GH->levels];
+    sync_cache_outbd = new Parallel::SyncCache[GH->levels];
+}
+
+// Calls invalidate() on every per-level cache entry. Does nothing unless the
+// cached path is enabled, GH exists and all six arrays are allocated.
+void bssn_class::invalidate_transfer_caches()
+{
+    if (!use_transfer_cache() || !GH || !sync_cache_pre || !sync_cache_cor ||
+        !sync_cache_rp_coarse || !sync_cache_rp_fine || !sync_cache_restrict || !sync_cache_outbd)
+        return;
+
+    for (int il = 0; il < GH->levels; il++)
+    {
+        sync_cache_pre[il].invalidate();
+        sync_cache_cor[il].invalidate();
+        sync_cache_rp_coarse[il].invalidate();
+        sync_cache_rp_fine[il].invalidate();
+        sync_cache_restrict[il].invalidate();
+        sync_cache_outbd[il].invalidate();
+    }
+}
+
+// Frees one per-level cache array and nulls the pointer; its entries are
+// destroy()ed first when destroy_entries is set. No-op for a null array.
+static void release_cache_array(Parallel::SyncCache *&caches, int levels, bool destroy_entries)
+{
+    if (!caches)
+        return;
+    if (destroy_entries)
+        for (int i = 0; i < levels; i++)
+            caches[i].destroy();
+    delete[] caches;
+    caches = 0;
+}
+
+// Releases all six cache arrays. Entries are destroy()ed only when the cached
+// path is enabled and GH is still available to provide the level count.
+void bssn_class::destroy_transfer_caches()
+{
+    const bool destroy_entries = use_transfer_cache() && GH;
+    const int levels = destroy_entries ? GH->levels : 0;
+
+    release_cache_array(sync_cache_pre, levels, destroy_entries);
+    release_cache_array(sync_cache_cor, levels, destroy_entries);
+    release_cache_array(sync_cache_rp_coarse, levels, destroy_entries);
+    release_cache_array(sync_cache_rp_fine, levels, destroy_entries);
+    release_cache_array(sync_cache_restrict, levels, destroy_entries);
+    release_cache_array(sync_cache_outbd, levels, destroy_entries);
+}
+
+// Predictor-stage sync, first half: Sync_start through sync_cache_pre[lev] when
+// that cache is available, otherwise a complete uncached Parallel::Sync.
+void bssn_class::sync_predictor_start(int lev, MyList *VarList, Parallel::AsyncSyncState &async_state)
+{
+    if (use_transfer_cache() && sync_cache_pre)
+        Parallel::Sync_start(GH->PatL[lev], VarList, Symmetry, sync_cache_pre[lev], async_state);
+    else
+        Parallel::Sync(GH->PatL[lev], VarList, Symmetry);
+}
+
+// Predictor-stage sync, second half. Must test the same condition as
+// sync_predictor_start(): the uncached path has nothing left to finish.
+void bssn_class::sync_predictor_finish(int lev, Parallel::AsyncSyncState &async_state, MyList *VarList)
+{
+    if (use_transfer_cache() && sync_cache_pre)
+        Parallel::Sync_finish(sync_cache_pre[lev], async_state, VarList, Symmetry);
+}
+
+// Corrector-stage counterpart of sync_predictor_start(), using sync_cache_cor.
+void bssn_class::sync_corrector_start(int lev, MyList *VarList, Parallel::AsyncSyncState &async_state)
+{
+    if (use_transfer_cache() && sync_cache_cor)
+        Parallel::Sync_start(GH->PatL[lev], VarList, Symmetry, sync_cache_cor[lev], async_state);
+    else
+        Parallel::Sync(GH->PatL[lev], VarList, Symmetry);
+}
+
+// Corrector-stage counterpart of sync_predictor_finish(), using sync_cache_cor.
+void bssn_class::sync_corrector_finish(int lev, Parallel::AsyncSyncState &async_state, MyList *VarList)
+{
+    if (use_transfer_cache() && sync_cache_cor)
+        Parallel::Sync_finish(sync_cache_cor[lev], async_state, VarList, Symmetry);
+}
+
+// Syncs VarList on level lev through cache_array[lev] when the cached path is
+// enabled and an array is supplied, otherwise through uncached Parallel::Sync.
+void bssn_class::sync_evolution(int lev, MyList *VarList, Parallel::SyncCache *cache_array)
+{
+    if (use_transfer_cache() && cache_array)
+        Parallel::Sync_cached(GH->PatL[lev], VarList, Symmetry, cache_array[lev]);
+    else
+        Parallel::Sync(GH->PatL[lev], VarList, Symmetry);
+}
+
+// Runs the restrict transfer between levels lev - 1 and lev, using the uncached
+// Parallel::Restrict whenever sync_cache_restrict is not available.
+void bssn_class::restrict_evolution(int lev, MyList *src_var_list, MyList *dst_var_list)
+{
+    if (use_transfer_cache() && sync_cache_restrict)
+        Parallel::Restrict_cached(GH->PatL[lev - 1], GH->PatL[lev], src_var_list, dst_var_list, Symmetry, sync_cache_restrict[lev]);
+    else
+        Parallel::Restrict(GH->PatL[lev - 1], GH->PatL[lev], src_var_list, dst_var_list, Symmetry);
+}
+
+// Runs the OutBdLow2Hi transfer between levels lev - 1 and lev, using the
+// uncached loop over every patch pair whenever sync_cache_outbd is unavailable.
+void bssn_class::outbdlow2hi_evolution(int lev, MyList *src_var_list, MyList *dst_var_list)
+{
+    if (use_transfer_cache() && sync_cache_outbd)
+    {
+        Parallel::OutBdLow2Hi_cached(GH->PatL[lev - 1], GH->PatL[lev], src_var_list, dst_var_list, Symmetry, sync_cache_outbd[lev]);
+        return;
+    }
+
+    MyList *Ppc = GH->PatL[lev - 1];
+    while (Ppc)
+    {
+        MyList *Pp = GH->PatL[lev];
+        while (Pp)
+        {
+            Parallel::OutBdLow2Hi(Ppc->data, Pp->data, src_var_list, dst_var_list, Symmetry);
+            Pp = Pp->next;
+        }
+        Ppc = Ppc->next;
+    }
+}
 #endif //================================================================================================ diff --git a/AMSS_NCKU_source/bssn_class.h b/AMSS_NCKU_source/bssn_class.h index a2536cb..eb78a16 100644 --- a/AMSS_NCKU_source/bssn_class.h +++ b/AMSS_NCKU_source/bssn_class.h @@ -31,11 +31,19 @@ using namespace std; #include "surface_integral.h" #include "checkpoint.h" -extern void setpbh(int iBHN, double
**iPBH, double *iMass, int rBHN); - -class bssn_class -{ -public: +extern void setpbh(int iBHN, double **iPBH, double *iMass, int rBHN); + +#ifndef BSSN_USE_TRANSFER_CACHE +#define BSSN_USE_TRANSFER_CACHE 1 +#endif + +#ifndef BSSN_USE_ESCALAR_C_KERNEL +#define BSSN_USE_ESCALAR_C_KERNEL 1 +#endif + +class bssn_class +{ +public: int ngfs; int nprocs, myrank; cgh *GH; @@ -167,14 +175,25 @@ public: void Setup_KerrSchild(); void Enforce_algcon(int lev, int fg); - void testRestrict(); - void testOutBd(); - - bool check_Stdin_Abort(); - - virtual void Setup_Initial_Data_Cao(); - virtual void Setup_Initial_Data_Lousto(); - virtual void Initialize(); + void testRestrict(); + void testOutBd(); + + bool check_Stdin_Abort(); + bool use_transfer_cache() const; + void setup_transfer_caches(); + void invalidate_transfer_caches(); + void destroy_transfer_caches(); + void sync_predictor_start(int lev, MyList *VarList, Parallel::AsyncSyncState &async_state); + void sync_predictor_finish(int lev, Parallel::AsyncSyncState &async_state, MyList *VarList); + void sync_corrector_start(int lev, MyList *VarList, Parallel::AsyncSyncState &async_state); + void sync_corrector_finish(int lev, Parallel::AsyncSyncState &async_state, MyList *VarList); + void sync_evolution(int lev, MyList *VarList, Parallel::SyncCache *cache_array = 0); + void restrict_evolution(int lev, MyList *src_var_list, MyList *dst_var_list); + void outbdlow2hi_evolution(int lev, MyList *src_var_list, MyList *dst_var_list); + + virtual void Setup_Initial_Data_Cao(); + virtual void Setup_Initial_Data_Lousto(); + virtual void Initialize(); virtual void Read_Ansorg(); virtual void Read_Pablo() {}; virtual void Compute_Psi4(int lev); diff --git a/AMSS_NCKU_source/makefile b/AMSS_NCKU_source/makefile index 8953ecd..231d31e 100644 --- a/AMSS_NCKU_source/makefile +++ b/AMSS_NCKU_source/makefile @@ -2,11 +2,19 @@ include makefile.inc +ifeq ($(USE_CXX_ESCALAR_KERNEL),1) +ifeq ($(USE_CXX_KERNELS),0) +$(error USE_CXX_ESCALAR_KERNEL=1 
requires USE_CXX_KERNELS=1 because bssn_escalar_rhs_c.C reuses the C BSSN kernel) +endif +endif + ## polint(ordn=6) kernel selector: ## 1 (default): barycentric fast path ## 0 : fallback to Neville path POLINT6_USE_BARY ?= 1 POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY) +TRANSFER_CACHE_FLAG = -DBSSN_USE_TRANSFER_CACHE=$(USE_TRANSFER_CACHE) +ESCALAR_KERNEL_FLAG = -DBSSN_USE_ESCALAR_C_KERNEL=$(USE_CXX_ESCALAR_KERNEL) ## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt) ## make -> opt (PGO-guided, maximum performance) @@ -16,7 +24,8 @@ PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata ifeq ($(PGO_MODE),instrument) ## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \ - -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) + -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) \ + $(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \ -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG) else @@ -26,7 +35,8 @@ else CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ - -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) + -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) \ + $(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \ -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG) endif @@ -87,7 +97,10 @@ ifeq ($(USE_CXX_KERNELS),0) CFILES = else # C++ mode (default): C rewrite of bssn/bssn-escalar rhs and helper kernels -CFILES = bssn_rhs_c.o bssn_escalar_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o lopsided_kodis_c.o +CFILES = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o lopsided_kodis_c.o +ifeq ($(USE_CXX_ESCALAR_KERNEL),1) +CFILES += bssn_escalar_rhs_c.o +endif endif ## RK4 kernel switch (independent from USE_CXX_KERNELS) diff --git 
a/AMSS_NCKU_source/makefile.inc b/AMSS_NCKU_source/makefile.inc index 331cff1..378b926 100755 --- a/AMSS_NCKU_source/makefile.inc +++ b/AMSS_NCKU_source/makefile.inc @@ -48,6 +48,17 @@ endif ## 0 : fall back to original Fortran kernels USE_CXX_KERNELS ?= 1 +## BSSN-EScalar RHS switch +## 1 : use BSSN-EScalar C wrapper on the normal patch path +## 0 (default) : keep the original Fortran BSSN-EScalar RHS for precision-safe runs +## Note: setting this to 1 requires USE_CXX_KERNELS=1 because the wrapper reuses the C BSSN kernel. +USE_CXX_ESCALAR_KERNEL ?= 0 + +## Cached transfer switch +## 1 : enable cached Sync/Restrict/OutBd transfer on evolution hot paths +## 0 (default) : keep the original uncached transfer path for precision-safe runs +USE_TRANSFER_CACHE ?= 0 + ## RK4 kernel implementation switch ## 1 (default) : use C/C++ rewrite of rungekutta4_rout (for optimization experiments) ## 0 : use original Fortran rungekutta4_rout.o