Add fine-grained step timing and trim BH RHS overhead

(cherry picked from commit 968522995b)
This commit is contained in:
2026-04-13 14:50:55 +08:00
committed by ianchb
parent 11977eb82f
commit 6410c62e3e
4 changed files with 223 additions and 3 deletions

View File

@@ -51,6 +51,112 @@ using namespace std;
#define BSSN_ENABLE_MEM_USAGE_LOG 0
#endif
// Compile-time switches for the fine-grained step timing instrumentation.
// Each value is only a fallback: a definition seen earlier (for example from
// the generated macrodef.h) takes precedence.

// Non-zero compiles in the step_timing namespace and the STEP_TIMER_* macros.
#if !defined(BSSN_FINE_TIMING)
#define BSSN_FINE_TIMING 0
#endif

// A timing report is produced whenever the coarse step counter is a multiple
// of this value.
#if !defined(BSSN_FINE_TIMING_EVERY)
#define BSSN_FINE_TIMING_EVERY 1
#endif

// Documented in macrodef.h as the number of hottest timing buckets shown in
// stdout.
#if !defined(BSSN_FINE_TIMING_TOPN)
#define BSSN_FINE_TIMING_TOPN 8
#endif
#if BSSN_FINE_TIMING
// Local fallback so this section does not depend on where the project-wide
// default for the top-N limit happens to be defined.
#ifndef BSSN_FINE_TIMING_TOPN
#define BSSN_FINE_TIMING_TOPN 8
#endif

// Per-timestep wall-clock accounting.
//
// Each MPI rank accumulates seconds into one slot per Bucket while a step
// runs (via add(), normally through the STEP_TIMER_* macros). reset() clears
// the slots at the start of a step and report() reduces them across ranks.
namespace step_timing
{
    // Phases that are timed separately. The enumerator order is also the
    // column order used by report(); TB_COUNT is a sentinel, not a bucket.
    enum Bucket
    {
        TB_ANALYSIS_PSI4 = 0,
        TB_ANALYSIS_SURFACE,
        TB_ANALYSIS_IO,
        TB_BH_PREDICTOR,
        TB_PREDICTOR_RHS,
        TB_PREDICTOR_SYNC,
        TB_BH_CORRECTOR,
        TB_CORRECTOR_RHS,
        TB_CORRECTOR_SYNC,
        TB_STATE_SWAP,
        TB_RESTRICT_PROLONG,
        TB_CONSTRAINT_OUT,
        TB_DUMP_3D,
        TB_DUMP_2D,
        TB_CHECKPOINT,
        TB_REGRID,
        TB_COUNT
    };

    // Seconds accumulated on this rank since the last reset(), indexed by Bucket.
    static double local_bucket_seconds[TB_COUNT];

    // Human-readable name of every bucket, indexed by Bucket. The array is
    // sized by its initializer so that the check below can detect a mismatch.
    static const char *const bucket_labels[] =
        {
            "analysis_psi4",
            "analysis_surface",
            "analysis_io",
            "bh_predictor",
            "predictor_rhs",
            "predictor_sync",
            "bh_corrector",
            "corrector_rhs",
            "corrector_sync",
            "state_swap",
            "restrict_prolong",
            "constraint_out",
            "dump_3d",
            "dump_2d",
            "checkpoint",
            "regrid"};

    // Compile-time guard: adding or removing an enumerator without touching
    // the label table yields a negative array size and stops the build,
    // instead of leaving a null label to be formatted at run time.
    typedef char bucket_labels_must_cover_every_bucket
        [(int(sizeof(bucket_labels) / sizeof(bucket_labels[0])) == int(TB_COUNT)) ? 1 : -1];

    // Zero every accumulator on this rank.
    void reset()
    {
        for (int slot = 0; slot < TB_COUNT; slot++)
            local_bucket_seconds[slot] = 0.0;
    }

    // Add `seconds` to the accumulator of `bucket`.
    // Values outside [0, TB_COUNT) - including the TB_COUNT sentinel itself -
    // are ignored rather than written past the end of the table.
    void add(Bucket bucket, double seconds)
    {
        const int slot = int(bucket);
        if (slot < 0 || slot >= int(TB_COUNT))
            return;
        local_bucket_seconds[slot] += seconds;
    }

    // Reduce the per-rank accumulators to rank 0 and publish them.
    //
    //   myrank            rank of the caller in MPI_COMM_WORLD
    //   nprocs            number of ranks, used to turn the sum into an average
    //   TimingMonitor     output monitor; may be null, in which case no row is written
    //   step_index        step counter stored in the first data column
    //   phys_time         time value handed to the monitor and echoed on stdout
    //   step_wall_seconds wall-clock duration of the step as measured by the caller
    //
    // Contains MPI_Reduce calls on MPI_COMM_WORLD, so every rank has to call it.
    // Only rank 0 continues past the reductions.
    void report(int myrank, int nprocs, monitor *TimingMonitor,
                int step_index, double phys_time, double step_wall_seconds)
    {
        double max_bucket_seconds[TB_COUNT];
        double avg_bucket_seconds[TB_COUNT];
        MPI_Reduce(local_bucket_seconds, max_bucket_seconds, TB_COUNT, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
        MPI_Reduce(local_bucket_seconds, avg_bucket_seconds, TB_COUNT, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        if (myrank != 0)
            return;

        // The SUM reduction becomes an average; the clamp avoids dividing by zero.
        for (int slot = 0; slot < TB_COUNT; slot++)
            avg_bucket_seconds[slot] /= Mymax(1, nprocs);

        if (TimingMonitor)
        {
            // Row layout: step, wall, TB_COUNT maxima, TB_COUNT averages.
            double row[2 + 2 * TB_COUNT];
            row[0] = double(step_index);
            row[1] = step_wall_seconds;
            for (int slot = 0; slot < TB_COUNT; slot++)
            {
                row[2 + slot] = max_bucket_seconds[slot];
                row[2 + TB_COUNT + slot] = avg_bucket_seconds[slot];
            }
            TimingMonitor->writefile(phys_time, 2 + 2 * TB_COUNT, row);
        }

        // BSSN_FINE_TIMING_TOPN is documented as the number of hottest buckets
        // shown in stdout, so print that summary here, ranked by the
        // max-over-ranks time (the slowest rank bounds the step).
        const int top_n = (BSSN_FINE_TIMING_TOPN < int(TB_COUNT)) ? BSSN_FINE_TIMING_TOPN : int(TB_COUNT);
        if (top_n > 0)
        {
            bool listed[TB_COUNT];
            for (int slot = 0; slot < TB_COUNT; slot++)
                listed[slot] = false;

            printf("[step_timing] step %d  t = %g  wall = %.6f s\n",
                   step_index, phys_time, step_wall_seconds);
            // TB_COUNT is small, so a repeated linear scan is simpler than sorting.
            for (int place = 0; place < top_n; place++)
            {
                int hottest = -1;
                for (int slot = 0; slot < TB_COUNT; slot++)
                {
                    if (listed[slot])
                        continue;
                    if (hottest < 0 || max_bucket_seconds[slot] > max_bucket_seconds[hottest])
                        hottest = slot;
                }
                listed[hottest] = true;
                printf("[step_timing]   %-18s max %12.6f s  avg %12.6f s\n",
                       bucket_labels[hottest], max_bucket_seconds[hottest], avg_bucket_seconds[hottest]);
            }
        }
    }
}
// STEP_TIMER_DECL(var_name): declare a const double local named var_name
// holding the current MPI_Wtime(). The macro supplies no trailing semicolon;
// the call site does.
#define STEP_TIMER_DECL(var_name) const double var_name = MPI_Wtime()
// STEP_TIMER_ADD(bucket_name, var_name): add the wall time elapsed since
// var_name was taken to step_timing::bucket_name. bucket_name is written
// without the step_timing:: qualifier.
#define STEP_TIMER_ADD(bucket_name, var_name) step_timing::add(step_timing::bucket_name, MPI_Wtime() - (var_name))
#else
// Timing disabled: both macros expand to nothing, so each
// "STEP_TIMER_xxx(...);" call site is left as an empty statement.
#define STEP_TIMER_DECL(var_name)
#define STEP_TIMER_ADD(bucket_name, var_name)
#endif
#if USE_CUDA_BSSN
namespace {
@@ -420,6 +526,24 @@ bssn_class::bssn_class(double Couranti, double StartTimei, double TotalTimei,
a_stream.str("");
a_stream << setw(15) << "# time Ham Px Py Pz Gx Gy Gz";
ConVMonitor = new monitor("bssn_constraint.dat", myrank, a_stream.str());
#if BSSN_FINE_TIMING
a_stream.clear();
a_stream.str("");
a_stream << setw(8) << "# step";
a_stream << setw(14) << "wall";
for (int ib = 0; ib < step_timing::TB_COUNT; ib++)
a_stream << setw(18) << step_timing::bucket_labels[ib];
for (int ib = 0; ib < step_timing::TB_COUNT; ib++)
{
char str_avg[64];
sprintf(str_avg, "avg_%s", step_timing::bucket_labels[ib]);
a_stream << setw(18) << str_avg;
}
TimingMonitor = new monitor("bssn_step_timing.dat", myrank, a_stream.str());
#else
TimingMonitor = 0;
#endif
}
// setup sphere integration engine
Waveshell = new surface_integral(Symmetry);
@@ -1368,6 +1492,7 @@ bssn_class::~bssn_class()
delete BHMonitor;
delete MAPMonitor;
delete ConVMonitor;
delete TimingMonitor;
delete Waveshell;
delete CheckPoint;
@@ -2466,6 +2591,10 @@ void bssn_class::Evolve(int Steps)
for (int ncount = 1; ncount < Steps + 1; ncount++)
{
cuda_level0_constraint_cache_valid = false;
#if BSSN_FINE_TIMING
step_timing::reset();
STEP_TIMER_DECL(step_wall_start);
#endif
// special for large mass ratio consideration
// if(fabs(Porg0[0][0]-Porg0[1][0])+fabs(Porg0[0][1]-Porg0[1][1])+fabs(Porg0[0][2]-Porg0[1][2])<1e-6)
@@ -2484,7 +2613,9 @@ void bssn_class::Evolve(int Steps)
// misc::tillherecheck("before Constraint_Out");
STEP_TIMER_DECL(timer_constraint_out);
Constraint_Out(); // this will affect the Dump_List
STEP_TIMER_ADD(TB_CONSTRAINT_OUT, timer_constraint_out);
LastDump += dT_mon;
Last2dDump += dT_mon;
@@ -2493,6 +2624,7 @@ void bssn_class::Evolve(int Steps)
// When LastDump >= DumpTime, output corresponding binary data
if (LastDump >= DumpTime)
{
STEP_TIMER_DECL(timer_dump3d);
// misc::tillherecheck("before Dump_Data");
for (int lev = 0; lev < GH->levels; lev++)
@@ -2500,6 +2632,7 @@ void bssn_class::Evolve(int Steps)
#ifdef WithShell
SH->Dump_Data(DumpList, 0, PhysTime, dT_mon);
#endif
STEP_TIMER_ADD(TB_DUMP_3D, timer_dump3d);
LastDump = 0;
@@ -2512,10 +2645,12 @@ void bssn_class::Evolve(int Steps)
// When Last2dDump >= d2DumpTime, output corresponding 2D data
if (Last2dDump >= d2DumpTime)
{
STEP_TIMER_DECL(timer_dump2d);
// misc::tillherecheck("before 2dDump_Data");
for (int lev = 0; lev < GH->levels; lev++)
Parallel::d2Dump_Data(GH->PatL[lev], DumpList, 0, PhysTime, dT_mon);
STEP_TIMER_ADD(TB_DUMP_2D, timer_dump2d);
Last2dDump = 0;
@@ -2540,10 +2675,12 @@ void bssn_class::Evolve(int Steps)
break;
#if (REGLEV == 1)
STEP_TIMER_DECL(timer_regrid);
GH->Regrid(Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_mon, StartTime, dT_mon / 2), ErrorMonitor);
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
STEP_TIMER_ADD(TB_REGRID, timer_regrid);
#endif
#if (REGLEV == 0 && (PSTR == 1 || PSTR == 2))
@@ -2618,6 +2755,7 @@ void bssn_class::Evolve(int Steps)
// When LastCheck >= CheckTime, perform runtime checks and output status data
if (LastCheck >= CheckTime)
{
STEP_TIMER_DECL(timer_checkpoint);
LastCheck = 0;
CheckPoint->write_Black_Hole_position(BH_num_input, BH_num, Porg0, Porgbr, Mass);
@@ -2626,7 +2764,13 @@ void bssn_class::Evolve(int Steps)
CheckPoint->writecheck_sh(PhysTime, SH);
#endif
CheckPoint->write_bssn(LastDump, Last2dDump, LastAnas);
STEP_TIMER_ADD(TB_CHECKPOINT, timer_checkpoint);
}
#if BSSN_FINE_TIMING
if (ncount % BSSN_FINE_TIMING_EVERY == 0)
step_timing::report(myrank, nprocs, TimingMonitor, ncount, PhysTime, MPI_Wtime() - step_wall_start);
#endif
}
/*
#ifdef With_AHF
@@ -2758,6 +2902,7 @@ void bssn_class::RecursiveStep(int lev)
#endif
#if (REGLEV == 0)
STEP_TIMER_DECL(timer_regrid_onelevel);
if (GH->Regrid_Onelevel(lev, Symmetry, BH_num, Porgbr, Porg0,
SynchList_cor, OldStateList, StateList, SynchList_pre,
fgt(PhysTime - dT_lev, StartTime, dT_lev / 2), ErrorMonitor))
@@ -2766,6 +2911,7 @@ void bssn_class::RecursiveStep(int lev)
ConstraintRefreshLevels[lev] = 1;
for (int il = 0; il < GH->levels; il++) { sync_cache_pre[il].invalidate(); sync_cache_cor[il].invalidate(); sync_cache_rp_coarse[il].invalidate(); sync_cache_rp_fine[il].invalidate(); sync_cache_restrict[il].invalidate(); sync_cache_outbd[il].invalidate(); }
}
STEP_TIMER_ADD(TB_REGRID, timer_regrid_onelevel);
#endif
}
@@ -3365,6 +3511,7 @@ void bssn_class::Step(int lev, int YN)
// new code 2013-2-15, zjcao
#if (MAPBH == 1)
STEP_TIMER_DECL(timer_bh_predictor);
// for black hole position
if (BH_num > 0 && lev == GH->levels - 1)
{
@@ -3402,6 +3549,7 @@ void bssn_class::Step(int lev, int YN)
{
AnalysisStuff(lev, dT_lev);
}
STEP_TIMER_ADD(TB_BH_PREDICTOR, timer_bh_predictor);
#endif
#ifdef With_AHF
@@ -3418,6 +3566,7 @@ void bssn_class::Step(int lev, int YN)
MyList<ss_patch> *sPp;
// Predictor
STEP_TIMER_DECL(timer_predictor_rhs);
MyList<Patch> *Pp = GH->PatL[lev];
while (Pp)
{
@@ -3759,6 +3908,7 @@ void bssn_class::Step(int lev, int YN)
}
#endif
}
STEP_TIMER_ADD(TB_PREDICTOR_RHS, timer_predictor_rhs);
// Non-blocking error reduction overlapped with Sync to hide Allreduce latency
MPI_Request err_req;
@@ -3768,6 +3918,7 @@ void bssn_class::Step(int lev, int YN)
}
#endif
STEP_TIMER_DECL(timer_predictor_sync);
Parallel::AsyncSyncState async_pre;
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
@@ -3789,6 +3940,7 @@ void bssn_class::Step(int lev, int YN)
}
#endif
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry);
STEP_TIMER_ADD(TB_PREDICTOR_SYNC, timer_predictor_sync);
#if USE_CUDA_BSSN
const bool need_analysis_state_after_predictor =
@@ -3856,6 +4008,7 @@ void bssn_class::Step(int lev, int YN)
// corrector
for (iter_count = 1; iter_count < 4; iter_count++)
{
STEP_TIMER_DECL(timer_corrector_rhs);
// for RK4: t0, t0+dt/2, t0+dt/2, t0+dt;
if (iter_count == 1 || iter_count == 3)
TRK4 += dT_lev / 2;
@@ -4198,7 +4351,9 @@ void bssn_class::Step(int lev, int YN)
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
}
#endif
STEP_TIMER_ADD(TB_CORRECTOR_RHS, timer_corrector_rhs);
STEP_TIMER_DECL(timer_corrector_sync);
Parallel::AsyncSyncState async_cor;
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
@@ -4220,6 +4375,7 @@ void bssn_class::Step(int lev, int YN)
}
#endif
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry);
STEP_TIMER_ADD(TB_CORRECTOR_SYNC, timer_corrector_sync);
#ifdef WithShell
// Complete non-blocking error reduction and check
@@ -4240,6 +4396,7 @@ void bssn_class::Step(int lev, int YN)
#endif
#if (MAPBH == 0)
STEP_TIMER_DECL(timer_bh_corrector);
// for black hole position
if (BH_num > 0 && lev == GH->levels - 1)
{
@@ -4272,11 +4429,13 @@ void bssn_class::Step(int lev, int YN)
}
}
}
STEP_TIMER_ADD(TB_BH_CORRECTOR, timer_bh_corrector);
#endif
// swap time level
if (iter_count < 3)
{
STEP_TIMER_DECL(timer_state_swap);
Pp = GH->PatL[lev];
while (Pp)
{
@@ -4323,6 +4482,7 @@ void bssn_class::Step(int lev, int YN)
}
}
#endif
STEP_TIMER_ADD(TB_STATE_SWAP, timer_state_swap);
}
}
#if USE_CUDA_BSSN
@@ -4358,6 +4518,7 @@ void bssn_class::Step(int lev, int YN)
//
// OldStateList old -----------
// update
STEP_TIMER_DECL(timer_state_commit);
Pp = GH->PatL[lev];
while (Pp)
{
@@ -4404,6 +4565,7 @@ void bssn_class::Step(int lev, int YN)
#endif
}
#endif
STEP_TIMER_ADD(TB_STATE_SWAP, timer_state_commit);
// for black hole position
if (BH_num > 0 && lev == GH->levels - 1)
{
@@ -6255,6 +6417,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
//
// SynchList_cor old -----------
{
STEP_TIMER_DECL(timer_restrict_prolong);
#if (PSTR == 1 || PSTR == 2)
// stringstream a_stream;
// a_stream.setf(ios::left);
@@ -6401,6 +6564,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB,
// misc::tillherecheck(GH->Commlev[GH->mylev],GH->start_rank[GH->mylev],a_stream.str());
#endif
}
STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong);
}
//================================================================================================
@@ -6420,6 +6584,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
//
// SynchList_cor old -----------
{
STEP_TIMER_DECL(timer_restrict_prolong);
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"starting RestrictProlong_aux");
if (lev >= GH->levels - 1)
@@ -6492,6 +6657,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
Parallel::Sync_cached(GH->PatL[lev], SL, Symmetry, sync_cache_rp_fine[lev]);
}
STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong);
}
//================================================================================================
@@ -6502,6 +6668,7 @@ void bssn_class::RestrictProlong_aux(int lev, int YN, bool BB,
void bssn_class::RestrictProlong(int lev, int YN, bool BB)
{
STEP_TIMER_DECL(timer_restrict_prolong);
double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
// we assume for fine
// SynchList_cor 1 -----------
@@ -6585,6 +6752,7 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB)
Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_rp_fine[lev]);
}
STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong);
}
//================================================================================================

View File

@@ -135,9 +135,9 @@ public:
Parallel::SyncCache *sync_cache_restrict; // cached Restrict in RestrictProlong
Parallel::SyncCache *sync_cache_outbd; // cached OutBdLow2Hi in RestrictProlong
monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
monitor *ConVMonitor;
surface_integral *Waveshell;
monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
monitor *ConVMonitor, *TimingMonitor;
surface_integral *Waveshell;
checkpoint *CheckPoint;
public:

View File

@@ -29,6 +29,12 @@
#define REGLEV 0
#define BSSN_FINE_TIMING 1
#define BSSN_FINE_TIMING_EVERY 1
#define BSSN_FINE_TIMING_TOPN 8
//#define USE_GPU
//#define CHECKDETAIL
@@ -88,6 +94,15 @@
// 0: for every level;
// 1: for all
//
// define BSSN_FINE_TIMING
// enable fine-grained per-timestep timing monitor
//
// define BSSN_FINE_TIMING_EVERY
// report timing every N coarse timesteps
//
// define BSSN_FINE_TIMING_TOPN
// number of hottest timing buckets shown in stdout
//
// define USE_GPU
// use gpu or not
//

View File

@@ -144,6 +144,34 @@ def generate_macrodef_h():
print( "#define REGLEV 0", file=file1 )
print( file=file1 )
# Define fine-grained timestep timing macros
# These default to enabled profiling without requiring AMSS_NCKU_Input.py edits.
fine_timing = getattr(input_data, "Fine_Timing",
getattr(input_data, "Finegrained_Timing", "yes"))
timing_report_every = max(1, int(getattr(
input_data, "Timing_Every_Steps",
getattr(input_data, "Timing_Report_Every", 1))))
timing_top_hotspots = max(1, int(getattr(
input_data, "Timing_Top_Hotspots", 8)))
if ( fine_timing == "yes" ):
print( "#define BSSN_FINE_TIMING 1", file=file1 )
print( file=file1 )
elif ( fine_timing == "no" ):
print( "#define BSSN_FINE_TIMING 0", file=file1 )
print( file=file1 )
else:
print( "Fine_Timing setting error!!!" )
print()
print( "# Fine_Timing setting error!!!", file=file1 )
print( file=file1 )
print( f"#define BSSN_FINE_TIMING_EVERY {timing_report_every}", file=file1 )
print( file=file1 )
print( f"#define BSSN_FINE_TIMING_TOPN {timing_top_hotspots}", file=file1 )
print( file=file1 )
# Define macro USE_GPU
# use GPU or not
@@ -224,6 +252,15 @@ def generate_macrodef_h():
print( "// 0: for every level;", file=file1 )
print( "// 1: for all", file=file1 )
print( "//", file=file1 )
print( "// define BSSN_FINE_TIMING", file=file1 )
print( "// enable fine-grained per-timestep timing monitor", file=file1 )
print( "//", file=file1 )
print( "// define BSSN_FINE_TIMING_EVERY", file=file1 )
print( "// report timing every N coarse timesteps", file=file1 )
print( "//", file=file1 )
print( "// define BSSN_FINE_TIMING_TOPN", file=file1 )
print( "// number of hottest timing buckets shown in stdout", file=file1 )
print( "//", file=file1 )
print( "// define USE_GPU", file=file1 )
print( "// use gpu or not", file=file1 )
print( "//", file=file1 )