From 968522995b1b14f09c92a39830019e894a19f1ca Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Mon, 13 Apr 2026 14:50:55 +0800 Subject: [PATCH] Add fine-grained step timing and trim BH RHS overhead --- AMSS_NCKU_source/bssn_class.C | 674 ++++++++++++++++++++++------------ AMSS_NCKU_source/bssn_class.h | 6 +- AMSS_NCKU_source/macrodef.h | 15 + generate_macrodef.py | 37 ++ 4 files changed, 497 insertions(+), 235 deletions(-) diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index 1b07ea9..2614ac6 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -47,6 +47,157 @@ using namespace std; #define BSSN_ENABLE_MEM_USAGE_LOG 0 #endif +#ifndef BSSN_FINE_TIMING +#define BSSN_FINE_TIMING 0 +#endif + +#ifndef BSSN_FINE_TIMING_EVERY +#define BSSN_FINE_TIMING_EVERY 1 +#endif + +#ifndef BSSN_FINE_TIMING_TOPN +#define BSSN_FINE_TIMING_TOPN 8 +#endif + +#if BSSN_FINE_TIMING +namespace step_timing +{ + enum Bucket + { + TB_ANALYSIS_PSI4 = 0, + TB_ANALYSIS_SURFACE, + TB_ANALYSIS_IO, + TB_BH_PREDICTOR, + TB_PREDICTOR_RHS, + TB_PREDICTOR_SYNC, + TB_BH_CORRECTOR, + TB_CORRECTOR_RHS, + TB_CORRECTOR_SYNC, + TB_STATE_SWAP, + TB_RESTRICT_PROLONG, + TB_CONSTRAINT_OUT, + TB_DUMP_3D, + TB_DUMP_2D, + TB_CHECKPOINT, + TB_REGRID, + TB_COUNT + }; + + static double local_bucket_seconds[TB_COUNT]; + + static const char *bucket_labels[TB_COUNT] = + { + "analysis_psi4", + "analysis_surface", + "analysis_io", + "bh_predictor", + "predictor_rhs", + "predictor_sync", + "bh_corrector", + "corrector_rhs", + "corrector_sync", + "state_swap", + "restrict_prolong", + "constraint_out", + "dump_3d", + "dump_2d", + "checkpoint", + "regrid" + }; + + void reset() + { + for (int i = 0; i < TB_COUNT; i++) + local_bucket_seconds[i] = 0.0; + } + + void add(Bucket bucket, double seconds) + { + local_bucket_seconds[int(bucket)] += seconds; + } + + void report(int myrank, int nprocs, monitor *TimingMonitor, + int step_index, double phys_time, double step_wall_seconds) + { + double max_bucket_seconds[TB_COUNT]; + double avg_bucket_seconds[TB_COUNT]; + + MPI_Reduce(local_bucket_seconds, max_bucket_seconds, TB_COUNT, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + MPI_Reduce(local_bucket_seconds, avg_bucket_seconds, TB_COUNT, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + + if (myrank != 0) + return; + + for (int i = 0; i < TB_COUNT; i++) + avg_bucket_seconds[i] /= Mymax(1, nprocs); + + if (TimingMonitor) + { + double row[2 + 2 * TB_COUNT]; + row[0] = double(step_index); + row[1] = step_wall_seconds; + for (int i = 0; i < TB_COUNT; i++) + { + row[2 + i] = max_bucket_seconds[i]; + row[2 + TB_COUNT + i] = avg_bucket_seconds[i]; + } + TimingMonitor->writefile(phys_time, 2 + 2 * TB_COUNT, row); + } + + double residual = step_wall_seconds; + for (int i = 0; i < TB_COUNT; i++) + residual -= max_bucket_seconds[i]; + if (residual < 0.0) + residual = 0.0; + + int order[TB_COUNT]; + for (int i = 0; i < TB_COUNT; i++) + order[i] = i; + + for (int i = 0; i < TB_COUNT - 1; i++) + for (int j = i + 1; j < TB_COUNT; j++) + if (max_bucket_seconds[order[j]] > max_bucket_seconds[order[i]]) + { + int tmp = order[i]; + order[i] = order[j]; + order[j] = tmp; + } + + ios::fmtflags old_flags = cout.flags(); + streamsize old_precision = cout.precision(); + + cout << " Fine timing hot spots (max rank wall estimate):" << endl; + const int topn = Mymin(BSSN_FINE_TIMING_TOPN, TB_COUNT); + for (int i = 0; i < topn; i++) + { + const int ib = order[i]; + const double frac = (step_wall_seconds > 0.0) ? (100.0 * max_bucket_seconds[ib] / step_wall_seconds) : 0.0; + cout << " " + << setw(20) << left << bucket_labels[ib] + << " = " << setw(10) << right << setprecision(6) << max_bucket_seconds[ib] + << " s (" << setw(6) << setprecision(4) << frac << "%)" << endl; + } + if (residual > 1.0e-6) + { + const double frac = (step_wall_seconds > 0.0) ? (100.0 * residual / step_wall_seconds) : 0.0; + cout << " " + << setw(20) << left << "unprofiled_residual" + << " = " << setw(10) << right << setprecision(6) << residual + << " s (" << setw(6) << setprecision(4) << frac << "%)" << endl; + } + cout << endl; + cout.flags(old_flags); + cout.precision(old_precision); + } +} + +#define STEP_TIMER_DECL(var_name) const double var_name = MPI_Wtime() +#define STEP_TIMER_ADD(bucket_name, var_name) step_timing::add(step_timing::bucket_name, MPI_Wtime() - (var_name)) +#else +#define STEP_TIMER_DECL(var_name) +#define STEP_TIMER_ADD(bucket_name, var_name) +#endif + //================================================================================================ // define bssn_class @@ -104,11 +255,29 @@ bssn_class::bssn_class(double Couranti, double StartTimei, double TotalTimei, a_stream << setw(15) << "# time ADMmass ADMPx ADMPy ADMPz ADMSx ADMSy ADMSz"; MAPMonitor = new monitor("bssn_ADMQs.dat", myrank, a_stream.str()); - a_stream.clear(); - a_stream.str(""); - a_stream << setw(15) << "# time Ham Px Py Pz Gx Gy Gz"; - ConVMonitor = new monitor("bssn_constraint.dat", myrank, a_stream.str()); - } + a_stream.clear(); + a_stream.str(""); + a_stream << setw(15) << "# time Ham Px Py Pz Gx Gy Gz"; + ConVMonitor = new monitor("bssn_constraint.dat", myrank, a_stream.str()); + +#if BSSN_FINE_TIMING + a_stream.clear(); + a_stream.str(""); + a_stream << setw(8) << "# step"; + a_stream << setw(14) << "wall"; + for (int ib = 0; ib < step_timing::TB_COUNT; ib++) + a_stream << setw(18) << step_timing::bucket_labels[ib]; + for (int ib = 0; ib < step_timing::TB_COUNT; ib++) + { + char str_avg[64]; + sprintf(str_avg, "avg_%s", step_timing::bucket_labels[ib]); + a_stream << setw(18) << str_avg; + } + TimingMonitor = new monitor("bssn_step_timing.dat", myrank, a_stream.str()); +#else + TimingMonitor = 0; +#endif + } // setup sphere integration engine Waveshell = new surface_integral(Symmetry); @@ -1053,10 +1222,11 @@ bssn_class::~bssn_class() delete ErrorMonitor; delete Psi4Monitor; - delete BHMonitor; - delete MAPMonitor; - delete ConVMonitor; - delete Waveshell; + delete BHMonitor; + delete MAPMonitor; + delete ConVMonitor; + delete TimingMonitor; + delete Waveshell; delete CheckPoint; } @@ -2151,9 +2321,13 @@ void bssn_class::Evolve(int Steps) GH->settrfls(trfls); - for (int ncount = 1; ncount < Steps + 1; ncount++) - { - // special for large mass ratio consideration + for (int ncount = 1; ncount < Steps + 1; ncount++) + { +#if BSSN_FINE_TIMING + step_timing::reset(); + STEP_TIMER_DECL(step_wall_start); +#endif + // special for large mass ratio consideration // if(fabs(Porg0[0][0]-Porg0[1][0])+fabs(Porg0[0][1]-Porg0[1][1])+fabs(Porg0[0][2]-Porg0[1][2])<1e-6) // { GH->levels=GH->movls; } @@ -2177,17 +2351,19 @@ void bssn_class::Evolve(int Steps) LastCheck += dT_mon; // When LastDump >= DumpTime, output corresponding binary data - if (LastDump >= DumpTime) - { - // misc::tillherecheck("before Dump_Data"); - - for (int lev = 0; lev < GH->levels; lev++) - Parallel::Dump_Data(GH->PatL[lev], DumpList, 0, PhysTime, dT_mon); -#ifdef WithShell - SH->Dump_Data(DumpList, 0, PhysTime, dT_mon); -#endif - - LastDump = 0; + if (LastDump >= DumpTime) + { + STEP_TIMER_DECL(timer_dump3d); + // misc::tillherecheck("before Dump_Data"); + + for (int lev = 0; lev < GH->levels; lev++) + Parallel::Dump_Data(GH->PatL[lev], DumpList, 0, PhysTime, dT_mon); +#ifdef WithShell + SH->Dump_Data(DumpList, 0, PhysTime, dT_mon); +#endif + STEP_TIMER_ADD(TB_DUMP_3D, timer_dump3d); + + LastDump = 0; if (myrank == 0) { @@ -2196,14 +2372,16 @@ void bssn_class::Evolve(int Steps) } // When Last2dDump >= d2DumpTime, output corresponding 2D data - if (Last2dDump >= d2DumpTime) - { - // misc::tillherecheck("before 2dDump_Data"); - - for (int lev = 0; lev < GH->levels; lev++) - Parallel::d2Dump_Data(GH->PatL[lev], DumpList, 0, PhysTime, dT_mon); - - Last2dDump = 0; + if (Last2dDump >= d2DumpTime) + { + STEP_TIMER_DECL(timer_dump2d); + // misc::tillherecheck("before 2dDump_Data"); + + for (int lev = 0; lev < GH->levels; lev++) + Parallel::d2Dump_Data(GH->PatL[lev], DumpList, 0, PhysTime, dT_mon); + STEP_TIMER_ADD(TB_DUMP_2D, timer_dump2d); + + Last2dDump = 0; if (myrank == 0) { @@ -2225,12 +2403,14 @@ void bssn_class::Evolve(int Steps) if (PhysTime >= TotalTime) break; -#if (REGLEV == 1) - GH->Regrid(Symmetry, BH_num, Porgbr, Porg0, - SynchList_cor, OldStateList, StateList, SynchList_pre, - fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor); - for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } -#endif +#if (REGLEV == 1) + STEP_TIMER_DECL(timer_regrid); + GH->Regrid(Symmetry, BH_num, Porgbr, Porg0, + SynchList_cor, OldStateList, StateList, SynchList_pre, + fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor); + for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } + STEP_TIMER_ADD(TB_REGRID, timer_regrid); +#endif #if (REGLEV == 0 && (PSTR == 1 || PSTR == 2)) // GH->Regrid_fake(Symmetry,BH_num,Porgbr,Porg0, @@ -2302,18 +2482,25 @@ void bssn_class::Evolve(int Steps) //////////////////////////////////////////////////////////// // When LastCheck >= CheckTime, perform runtime checks and output status data - if (LastCheck >= CheckTime) - { - LastCheck = 0; - - CheckPoint->write_Black_Hole_position(BH_num_input, BH_num, Porg0, Porgbr, Mass); - CheckPoint->writecheck_cgh(PhysTime, GH); -#ifdef WithShell - CheckPoint->writecheck_sh(PhysTime, SH); -#endif - CheckPoint->write_bssn(LastDump, Last2dDump, LastAnas); - } - } + if (LastCheck >= CheckTime) + { + STEP_TIMER_DECL(timer_checkpoint); + LastCheck = 0; + + CheckPoint->write_Black_Hole_position(BH_num_input, BH_num, Porg0, Porgbr, Mass); + CheckPoint->writecheck_cgh(PhysTime, GH); +#ifdef WithShell + CheckPoint->writecheck_sh(PhysTime, SH); +#endif + CheckPoint->write_bssn(LastDump, Last2dDump, LastAnas); + STEP_TIMER_ADD(TB_CHECKPOINT, timer_checkpoint); + } + +#if BSSN_FINE_TIMING + if (ncount % BSSN_FINE_TIMING_EVERY == 0) + step_timing::report(myrank, nprocs, TimingMonitor, ncount, PhysTime, MPI_Wtime() - step_wall_start); +#endif + } /* #ifdef With_AHF // final apparent horizon finding @@ -2444,6 +2631,7 @@ void bssn_class::RecursiveStep(int lev) #endif #if (REGLEV == 0) + STEP_TIMER_DECL(timer_regrid_onelevel); if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0, SynchList_cor, OldStateList, StateList, SynchList_pre, fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor)) @@ -2452,6 +2640,7 @@ void bssn_class::RecursiveStep(int lev) ConstraintRefreshLevels[lev] = 1; for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); } } + STEP_TIMER_ADD(TB_REGRID, timer_regrid_onelevel); #endif } @@ -3042,11 +3231,12 @@ void bssn_class::Step(int lev, int YN) double dT_lev = dT * pow(0.5, Mymax(lev, trfls)); -// new code 2013-2-15, zjcao -#if (MAPBH == 1) - // for black hole position - if (BH_num > 0 && lev == GH->levels - 1) - { +// new code 2013-2-15, zjcao +#if (MAPBH == 1) + STEP_TIMER_DECL(timer_bh_predictor); + // for black hole position + if (BH_num > 0 && lev == GH->levels - 1) + { compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev); for (int ithBH = 0; ithBH < BH_num; ithBH++) { @@ -3070,12 +3260,13 @@ void bssn_class::Step(int lev, int YN) DG_List->insert(Sfy0); DG_List->insert(Sfz0); Parallel::Dump_Data(GH->PatL[lev], DG_List, 0, PhysTime, dT_lev); - DG_List->clearList(); - } - } - } - - // data analysis part + DG_List->clearList(); + } + } + } + STEP_TIMER_ADD(TB_BH_PREDICTOR, timer_bh_predictor); + + // data analysis part // Warning NOTE: the variables1 are used as temp storege room if (lev == a_lev) { @@ -3093,11 +3284,12 @@ void bssn_class::Step(int lev, int YN) double TRK4 = PhysTime; int iter_count = 0; // count RK4 substeps int pre = 0, cor = 1; - int ERROR = 0; - - MyList *sPp; - // Predictor - MyList *Pp = GH->PatL[lev]; + int ERROR = 0; + + MyList *sPp; + STEP_TIMER_DECL(timer_predictor_rhs); + // Predictor + MyList *Pp = GH->PatL[lev]; while (Pp) { MyList *BP = Pp->data->blb; @@ -3365,14 +3557,17 @@ void bssn_class::Step(int lev, int YN) // Non-blocking error reduction overlapped with Sync to hide Allreduce latency MPI_Request err_req; - { - int erh = ERROR; - MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req); - } -#endif - - Parallel::AsyncSyncState async_pre; - Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre); + { + int erh = ERROR; + MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req); + } +#endif + + STEP_TIMER_ADD(TB_PREDICTOR_RHS, timer_predictor_rhs); + + STEP_TIMER_DECL(timer_predictor_sync); + Parallel::AsyncSyncState async_pre; + Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre); #ifdef WithShell if (lev == 0) @@ -3393,9 +3588,9 @@ void bssn_class::Step(int lev, int YN) #endif Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry); -#ifdef WithShell - // Complete non-blocking error reduction and check - MPI_Wait(&err_req, MPI_STATUS_IGNORE); +#ifdef WithShell + // Complete non-blocking error reduction and check + MPI_Wait(&err_req, MPI_STATUS_IGNORE); if (ERROR) { Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev); @@ -3404,12 +3599,13 @@ void bssn_class::Step(int lev, int YN) { if (ErrorMonitor->outfile) ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime << ", lev = " << lev << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - } -#endif - -#if (MAPBH == 0) + MPI_Abort(MPI_COMM_WORLD, 1); + } + } +#endif + STEP_TIMER_ADD(TB_PREDICTOR_SYNC, timer_predictor_sync); + +#if (MAPBH == 0) // for black hole position if (BH_num > 0 && lev == GH->levels - 1) { @@ -3449,12 +3645,13 @@ void bssn_class::Step(int lev, int YN) } #endif - // corrector - for (iter_count = 1; iter_count < 4; iter_count++) - { - // for RK4: t0, t0+dt/2, t0+dt/2, t0+dt; - if (iter_count == 1 || iter_count == 3) - TRK4 += dT_lev / 2; + // corrector + for (iter_count = 1; iter_count < 4; iter_count++) + { + STEP_TIMER_DECL(timer_corrector_rhs); + // for RK4: t0, t0+dt/2, t0+dt/2, t0+dt; + if (iter_count == 1 || iter_count == 3) + TRK4 += dT_lev / 2; Pp = GH->PatL[lev]; while (Pp) { @@ -3725,14 +3922,17 @@ void bssn_class::Step(int lev, int YN) } // Non-blocking error reduction overlapped with Sync to hide Allreduce latency MPI_Request err_req_cor; - { - int erh = ERROR; - MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor); - } -#endif - - Parallel::AsyncSyncState async_cor; - Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor); + { + int erh = ERROR; + MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor); + } +#endif + + STEP_TIMER_ADD(TB_CORRECTOR_RHS, timer_corrector_rhs); + + STEP_TIMER_DECL(timer_corrector_sync); + Parallel::AsyncSyncState async_cor; + Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor); #ifdef WithShell if (lev == 0) @@ -3767,14 +3967,16 @@ void bssn_class::Step(int lev, int YN) << " variables at t = " << PhysTime << ", lev = " << lev << endl; MPI_Abort(MPI_COMM_WORLD, 1); - } - } -#endif - -#if (MAPBH == 0) - // for black hole position - if (BH_num > 0 && lev == GH->levels - 1) - { + } + } +#endif + STEP_TIMER_ADD(TB_CORRECTOR_SYNC, timer_corrector_sync); + +#if (MAPBH == 0) + STEP_TIMER_DECL(timer_bh_corrector); + // for black hole position + if (BH_num > 0 && lev == GH->levels - 1) + { compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev); for (int ithBH = 0; ithBH < BH_num; ithBH++) { @@ -3800,16 +4002,18 @@ void bssn_class::Step(int lev, int YN) DG_List->insert(Sfy0); DG_List->insert(Sfz0); Parallel::Dump_Data(GH->PatL[lev], DG_List, 0, PhysTime, dT_lev); - DG_List->clearList(); - } - } - } -#endif - - // swap time level - if (iter_count < 3) - { - Pp = GH->PatL[lev]; + DG_List->clearList(); + } + } + } + STEP_TIMER_ADD(TB_BH_CORRECTOR, timer_bh_corrector); +#endif + + // swap time level + if (iter_count < 3) + { + STEP_TIMER_DECL(timer_state_swap); + Pp = GH->PatL[lev]; while (Pp) { MyList *BP = Pp->data->blb; @@ -3839,11 +4043,11 @@ void bssn_class::Step(int lev, int YN) BP = BP->next; } sPp = sPp->next; - } - } -#endif - -#if (MAPBH == 0) + } + } +#endif + +#if (MAPBH == 0) // for black hole position if (BH_num > 0 && lev == GH->levels - 1) { @@ -3852,14 +4056,16 @@ void bssn_class::Step(int lev, int YN) Porg[ithBH][0] = Porg1[ithBH][0]; Porg[ithBH][1] = Porg1[ithBH][1]; Porg[ithBH][2] = Porg1[ithBH][2]; - } - } -#endif - } - } -#if (RPS == 0) - // mesh refinement boundary part - RestrictProlong(lev, YN, BB); + } + } +#endif + STEP_TIMER_ADD(TB_STATE_SWAP, timer_state_swap); + } + } +#if (RPS == 0) + STEP_TIMER_DECL(timer_restrict_prolong); + // mesh refinement boundary part + RestrictProlong(lev, YN, BB); #ifdef WithShell if (lev == 0) @@ -3876,17 +4082,19 @@ void bssn_class::Step(int lev, int YN) << " seconds! " << endl; } } -#endif - -#endif - // note the data structure before update +#endif + + STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong); +#endif + // note the data structure before update // SynchList_cor 1 ----------- // // StateList 0 ----------- // - // OldStateList old ----------- - // update - Pp = GH->PatL[lev]; + // OldStateList old ----------- + // update + STEP_TIMER_DECL(timer_state_commit); + Pp = GH->PatL[lev]; while (Pp) { MyList *BP = Pp->data->blb; @@ -3939,10 +4147,11 @@ void bssn_class::Step(int lev, int YN) { Porg0[ithBH][0] = Porg1[ithBH][0]; Porg0[ithBH][1] = Porg1[ithBH][1]; - Porg0[ithBH][2] = Porg1[ithBH][2]; - } - } -} + Porg0[ithBH][2] = Porg1[ithBH][2]; + } + } + STEP_TIMER_ADD(TB_STATE_SWAP, timer_state_commit); +} //================================================================================================ @@ -4266,12 +4475,14 @@ void bssn_class::Step(int lev, int YN) << ", lev = " << lev << endl; MPI_Abort(MPI_COMM_WORLD, 1); } - } -#endif - - // for black hole position - if (BH_num > 0 && lev == GH->levels - 1) - { + } +#endif + STEP_TIMER_ADD(TB_PREDICTOR_SYNC, timer_predictor_sync); + + STEP_TIMER_DECL(timer_bh_predictor); + // for black hole position + if (BH_num > 0 && lev == GH->levels - 1) + { compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev); for (int ithBH = 0; ithBH < BH_num; ithBH++) { @@ -4303,10 +4514,11 @@ void bssn_class::Step(int lev, int YN) } // data analysis part // Warning NOTE: the variables1 are used as temp storege room - if (lev == a_lev) - { - AnalysisStuff(lev, dT_lev); - } + if (lev == a_lev) + { + AnalysisStuff(lev, dT_lev); + } + STEP_TIMER_ADD(TB_BH_PREDICTOR, timer_bh_predictor); // corrector for (iter_count = 1; iter_count < 3; iter_count++) { @@ -5768,18 +5980,19 @@ void bssn_class::SHStep() // 0: do not use mixing two levels data for OutBD; 1: do use #define MIXOUTB 0 -void bssn_class::RestrictProlong(int lev, int YN, bool BB, - MyList *SL, MyList *OL, MyList *corL) -// we assume -// StateList 1 ----------- -// -// OldStateList 0 ----------- -// -// SynchList_cor old ----------- -{ -#if (PSTR == 1 || PSTR == 2) -// stringstream a_stream; -// a_stream.setf(ios::left); +void bssn_class::RestrictProlong(int lev, int YN, bool BB, + MyList *SL, MyList *OL, MyList *corL) +// we assume +// StateList 1 ----------- +// +// OldStateList 0 ----------- +// +// SynchList_cor old ----------- +{ + STEP_TIMER_DECL(timer_restrict_prolong); +#if (PSTR == 1 || PSTR == 2) +// stringstream a_stream; +// a_stream.setf(ios::left); #endif if (lev > 0) @@ -5912,14 +6125,15 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]); -#if (PSTR == 1 || PSTR == 2) -// a_stream.clear(); -// a_stream.str(""); -// a_stream<mylev<<": after Sync"; -// misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str()); -#endif - } -} +#if (PSTR == 1 || PSTR == 2) +// a_stream.clear(); +// a_stream.str(""); +// a_stream<mylev<<": after Sync"; +// misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str()); +#endif + } + STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong); +} //================================================================================================ @@ -5929,16 +6143,17 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, // auxiliary operation, input lev means original lev-1 -void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB, - MyList *SL, MyList *OL, MyList *corL) -// we assume -// StateList 1 ----------- -// -// OldStateList 0 ----------- -// -// SynchList_cor old ----------- -{ - // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"starting RestrictProlong_aux"); +void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB, + MyList *SL, MyList *OL, MyList *corL) +// we assume +// StateList 1 ----------- +// +// OldStateList 0 ----------- +// +// SynchList_cor old ----------- +{ + STEP_TIMER_DECL(timer_restrict_prolong); + // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"starting RestrictProlong_aux"); if (lev >= GH->levels - 1) return; @@ -6003,10 +6218,11 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB, Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], SL, SL, GH->bdsul[lev], Symmetry); #endif } - - Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]); - } -} + + Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]); + } + STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong); +} //================================================================================================ @@ -6014,9 +6230,10 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB, //================================================================================================ -void bssn_class::RestrictProlong(int lev, int YN, bool BB) -{ - double dT_lev = dT * pow(0.5, Mymax(lev, trfls)); +void bssn_class::RestrictProlong(int lev, int YN, bool BB) +{ + STEP_TIMER_DECL(timer_restrict_prolong); + double dT_lev = dT * pow(0.5, Mymax(lev, trfls)); // we assume for fine // SynchList_cor 1 ----------- // @@ -6092,10 +6309,11 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB) Parallel::OutBdLow2Hi_bam(GH->PatL[lev - 1], GH->PatL[lev], StateList, SynchList_cor, GH->bdsul[lev], Symmetry); #endif } - - Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]); - } -} + + Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]); + } + STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong); +} //================================================================================================ @@ -6843,60 +7061,52 @@ void bssn_class::compute_Porg_rhs(double **BH_PS,double **BH_RHS,var *forx,var * // new code considering diferent levels for different black hole -void bssn_class::compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, var *fory, var *forz, int ilev) -{ - const int InList = 3; - - MyList *DG_List = new MyList(forx); - DG_List->insert(fory); - DG_List->insert(forz); - - double *x1, *y1, *z1; - double *shellf; - shellf = new double[3]; - double *pox[3]; - for (int i = 0; i < 3; i++) - pox[i] = new double[1]; - - for (int n = 0; n < BH_num; n++) - { +void bssn_class::compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, var *fory, var *forz, int ilev) +{ + MyList DG_List_x(forx); + MyList DG_List_y(fory); + MyList DG_List_z(forz); + DG_List_x.next = &DG_List_y; + DG_List_y.next = &DG_List_z; + + double shellf[3]; + double pox_buf[3][1]; + double *pox[3] = {pox_buf[0], pox_buf[1], pox_buf[2]}; + + for (int n = 0; n < BH_num; n++) + { pox[0][0] = BH_PS[n][0]; pox[1][0] = BH_PS[n][1]; pox[2][0] = BH_PS[n][2]; int lev = ilev; - -#if (PSTR == 0) - while (!Parallel::PatList_Interp_Points(GH->PatL[lev], DG_List, 1, pox, shellf, Symmetry)) -#elif (PSTR == 1 || PSTR == 2 || PSTR == 3) - while (!Parallel::PatList_Interp_Points(GH->PatL[lev], DG_List, 1, pox, shellf, Symmetry, GH->Commlev[lev])) -#endif - { - lev--; - if (lev < 0) - { - ErrorMonitor->outfile << "fail to find black holes at t = " << PhysTime << endl; - for (n = 0; n < BH_num; n++) - ErrorMonitor->outfile << "(x,y,z) = (" - << pox[0][n] << "," << pox[1][n] << "," << pox[2][n] - << ")" << endl; - break; - } + +#if (PSTR == 0) + while (!Parallel::PatList_Interp_Points(GH->PatL[lev], &DG_List_x, 1, pox, shellf, Symmetry)) +#elif (PSTR == 1 || PSTR == 2 || PSTR == 3) + while (!Parallel::PatList_Interp_Points(GH->PatL[lev], &DG_List_x, 1, pox, shellf, Symmetry, GH->Commlev[lev])) +#endif + { + lev--; + if (lev < 0) + { + ErrorMonitor->outfile << "fail to find black holes at t = " << PhysTime << endl; + for (n = 0; n < BH_num; n++) + ErrorMonitor->outfile << "(x,y,z) = (" + << BH_PS[n][0] << "," << BH_PS[n][1] << "," << BH_PS[n][2] + << ")" << endl; + break; + } } if (lev >= 0) { BH_RHS[n][0] = -shellf[0]; BH_RHS[n][1] = -shellf[1]; - BH_RHS[n][2] = -shellf[2]; - } - } - - DG_List->clearList(); - delete[] shellf; - for (int i = 0; i < 3; i++) - delete[] pox[i]; -} + BH_RHS[n][2] = -shellf[2]; + } + } +} #endif //================================================================================================ diff --git a/AMSS_NCKU_source/bssn_class.h b/AMSS_NCKU_source/bssn_class.h index c067902..a2536cb 100644 --- a/AMSS_NCKU_source/bssn_class.h +++ b/AMSS_NCKU_source/bssn_class.h @@ -134,9 +134,9 @@ public: Parallel::SyncCache *sync_cache_restrict; // cached Restrict in RestrictProlong Parallel::SyncCache *sync_cache_outbd; // cached OutBdLow2Hi in RestrictProlong - monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor; - monitor *ConVMonitor; - surface_integral *Waveshell; + monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor; + monitor *ConVMonitor, *TimingMonitor; + surface_integral *Waveshell; checkpoint *CheckPoint; public: diff --git a/AMSS_NCKU_source/macrodef.h b/AMSS_NCKU_source/macrodef.h index 631f33b..d529b3a 100644 --- a/AMSS_NCKU_source/macrodef.h +++ b/AMSS_NCKU_source/macrodef.h @@ -29,6 +29,12 @@ #define REGLEV 0 +#define BSSN_FINE_TIMING 1 + +#define BSSN_FINE_TIMING_EVERY 1 + +#define BSSN_FINE_TIMING_TOPN 8 + //#define USE_GPU //#define CHECKDETAIL @@ -88,6 +94,15 @@ // 0: for every level; // 1: for all // +// define BSSN_FINE_TIMING +// enable fine-grained per-timestep timing monitor +// +// define BSSN_FINE_TIMING_EVERY +// report timing every N coarse timesteps +// +// define BSSN_FINE_TIMING_TOPN +// number of hottest timing buckets shown in stdout +// // define USE_GPU // use gpu or not // diff --git a/generate_macrodef.py b/generate_macrodef.py index 40c76e9..e422f8f 100755 --- a/generate_macrodef.py +++ b/generate_macrodef.py @@ -144,6 +144,34 @@ def generate_macrodef_h(): print( "#define REGLEV 0", file=file1 ) print( file=file1 ) + # Define fine-grained timestep timing macros + # These default to enabled profiling without requiring AMSS_NCKU_Input.py edits. + + fine_timing = getattr(input_data, "Fine_Timing", + getattr(input_data, "Finegrained_Timing", "yes")) + timing_report_every = max(1, int(getattr( + input_data, "Timing_Every_Steps", + getattr(input_data, "Timing_Report_Every", 1)))) + timing_top_hotspots = max(1, int(getattr( + input_data, "Timing_Top_Hotspots", 8))) + + if ( fine_timing == "yes" ): + print( "#define BSSN_FINE_TIMING 1", file=file1 ) + print( file=file1 ) + elif ( fine_timing == "no" ): + print( "#define BSSN_FINE_TIMING 0", file=file1 ) + print( file=file1 ) + else: + print( "Fine_Timing setting error!!!" ) + print() + print( "# Fine_Timing setting error!!!", file=file1 ) + print( file=file1 ) + + print( f"#define BSSN_FINE_TIMING_EVERY {timing_report_every}", file=file1 ) + print( file=file1 ) + print( f"#define BSSN_FINE_TIMING_TOPN {timing_top_hotspots}", file=file1 ) + print( file=file1 ) + # Define macro USE_GPU # use GPU or not @@ -224,6 +252,15 @@ def generate_macrodef_h(): print( "// 0: for every level;", file=file1 ) print( "// 1: for all", file=file1 ) print( "//", file=file1 ) + print( "// define BSSN_FINE_TIMING", file=file1 ) + print( "// enable fine-grained per-timestep timing monitor", file=file1 ) + print( "//", file=file1 ) + print( "// define BSSN_FINE_TIMING_EVERY", file=file1 ) + print( "// report timing every N coarse timesteps", file=file1 ) + print( "//", file=file1 ) + print( "// define BSSN_FINE_TIMING_TOPN", file=file1 ) + print( "// number of hottest timing buckets shown in stdout", file=file1 ) + print( "//", file=file1 ) print( "// define USE_GPU", file=file1 ) print( "// use gpu or not", file=file1 ) print( "//", file=file1 )