diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index 2614ac6..ac136dd 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -59,6 +59,14 @@ using namespace std; #define BSSN_FINE_TIMING_TOPN 8 #endif +#ifndef BSSN_KERNEL_FINE_TIMING +#define BSSN_KERNEL_FINE_TIMING 0 +#endif + +#ifndef BSSN_ENABLE_STDIN_ABORT_POLL +#define BSSN_ENABLE_STDIN_ABORT_POLL 0 +#endif + #if BSSN_FINE_TIMING namespace step_timing { @@ -198,6 +206,74 @@ namespace step_timing #define STEP_TIMER_ADD(bucket_name, var_name) #endif +#if BSSN_KERNEL_FINE_TIMING +namespace rhs_kernel_timing_report +{ + void report(int myrank, int nprocs, int step_index, double step_wall_seconds) + { + const int bucket_count = f_bssn_rhs_kernel_timing_bucket_count(); + const double *local_bucket_seconds = f_bssn_rhs_kernel_timing_local_seconds(); + + if (bucket_count <= 0 || !local_bucket_seconds) + return; + + double *max_bucket_seconds = new double[bucket_count]; + double *avg_bucket_seconds = new double[bucket_count]; + int *order = new int[bucket_count]; + + MPI_Reduce((void *)local_bucket_seconds, max_bucket_seconds, bucket_count, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + MPI_Reduce((void *)local_bucket_seconds, avg_bucket_seconds, bucket_count, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + + if (myrank == 0) + { + double kernel_total = 0.0; + for (int i = 0; i < bucket_count; ++i) + { + avg_bucket_seconds[i] /= Mymax(1, nprocs); + order[i] = i; + kernel_total += max_bucket_seconds[i]; + } + + for (int i = 0; i < bucket_count - 1; ++i) + for (int j = i + 1; j < bucket_count; ++j) + if (max_bucket_seconds[order[j]] > max_bucket_seconds[order[i]]) + { + int tmp = order[i]; + order[i] = order[j]; + order[j] = tmp; + } + + ios::fmtflags old_flags = cout.flags(); + streamsize old_precision = cout.precision(); + + const double kernel_frac = (step_wall_seconds > 0.0) ? (100.0 * kernel_total / step_wall_seconds) : 0.0; + cout << " RHS kernel split (max-rank accumulated over step " << step_index << "): total " + << setprecision(6) << kernel_total << " s (" << setprecision(4) + << kernel_frac << "% of coarse step)" << endl; + + const int topn = Mymin(BSSN_FINE_TIMING_TOPN, bucket_count); + for (int i = 0; i < topn; ++i) + { + const int ib = order[i]; + const double frac = (kernel_total > 0.0) ? (100.0 * max_bucket_seconds[ib] / kernel_total) : 0.0; + cout << " " + << setw(20) << left << f_bssn_rhs_kernel_timing_label(ib) + << " = " << setw(10) << right << setprecision(6) << max_bucket_seconds[ib] + << " s (" << setw(6) << setprecision(4) << frac << "% of kernel)" << endl; + } + cout << endl; + + cout.flags(old_flags); + cout.precision(old_precision); + } + + delete[] max_bucket_seconds; + delete[] avg_bucket_seconds; + delete[] order; + } +} +#endif + //================================================================================================ // define bssn_class @@ -2325,7 +2401,12 @@ void bssn_class::Evolve(int Steps) { #if BSSN_FINE_TIMING step_timing::reset(); - STEP_TIMER_DECL(step_wall_start); +#endif +#if BSSN_KERNEL_FINE_TIMING + f_bssn_rhs_kernel_timing_reset(); +#endif +#if (BSSN_FINE_TIMING || BSSN_KERNEL_FINE_TIMING) + const double step_wall_start = MPI_Wtime(); #endif // special for large mass ratio consideration // if(fabs(Porg0[0][0]-Porg0[1][0])+fabs(Porg0[0][1]-Porg0[1][1])+fabs(Porg0[0][2]-Porg0[1][2])<1e-6) @@ -2447,15 +2528,18 @@ void bssn_class::Evolve(int Steps) << Porg0[i_count][1] << " " << Porg0[i_count][2] << ")" << endl; - } - cout << endl; - cout << " If you think the physical evolution time is enough for this simulation, please input 'stop' in the terminal to stop the MPI processes in the next evolution step ! " << endl; - // cout << endl; - } - - //////////////////////////////////////////////////////////// - // If an "abort" command is detected on stdin, terminate MPI processes - //////////////////////////////////////////////////////////// + } + cout << endl; +#if BSSN_ENABLE_STDIN_ABORT_POLL + cout << " If you think the physical evolution time is enough for this simulation, please input 'stop' in the terminal to stop the MPI processes in the next evolution step ! " << endl; +#endif + // cout << endl; + } + +#if BSSN_ENABLE_STDIN_ABORT_POLL + //////////////////////////////////////////////////////////// + // If an "abort" command is detected on stdin, terminate MPI processes + //////////////////////////////////////////////////////////// bool shouldAbort = false; @@ -2477,11 +2561,12 @@ void bssn_class::Evolve(int Steps) cout << endl; MPI_Abort(MPI_COMM_WORLD, 1); // terminate MPI processes // MPI_Finalize(); - } - - //////////////////////////////////////////////////////////// - - // When LastCheck >= CheckTime, perform runtime checks and output status data + } + + //////////////////////////////////////////////////////////// +#endif + + // When LastCheck >= CheckTime, perform runtime checks and output status data if (LastCheck >= CheckTime) { STEP_TIMER_DECL(timer_checkpoint); @@ -2496,9 +2581,16 @@ void bssn_class::Evolve(int Steps) STEP_TIMER_ADD(TB_CHECKPOINT, timer_checkpoint); } +#if (BSSN_FINE_TIMING || BSSN_KERNEL_FINE_TIMING) + const double step_wall_seconds = MPI_Wtime() - step_wall_start; +#endif #if BSSN_FINE_TIMING if (ncount % BSSN_FINE_TIMING_EVERY == 0) - step_timing::report(myrank, nprocs, TimingMonitor, ncount, PhysTime, MPI_Wtime() - step_wall_start); + step_timing::report(myrank, nprocs, TimingMonitor, ncount, PhysTime, step_wall_seconds); +#endif +#if BSSN_KERNEL_FINE_TIMING + if (ncount % BSSN_FINE_TIMING_EVERY == 0) + rhs_kernel_timing_report::report(myrank, nprocs, ncount, step_wall_seconds); #endif } /* @@ -3294,19 +3386,19 @@ void bssn_class::Step(int lev, int YN) { MyList *BP = Pp->data->blb; while (BP) - { - Block *cg = BP->data; - if (myrank == cg->rank) - { -#if (AGM == 0) - f_enforce_ga(cg->shape, - cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], - cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], - cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], - cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]); -#endif - - if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], + { + Block *cg = BP->data; + if (myrank == cg->rank) + { +#if (AGM == 0) + f_enforce_ga(cg->shape, + cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], + cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], + cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], + cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]); +#endif + + if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn], cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], @@ -3340,16 +3432,16 @@ void bssn_class::Step(int lev, int YN) cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn], cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn], Symmetry, lev, ndeps, pre)) - { - cout << "find NaN in domain: (" - << cg->bbox[0] << ":" << cg->bbox[3] << "," - << cg->bbox[1] << ":" << cg->bbox[4] << "," - << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; - ERROR = 1; - } - - // rk4 substep and boundary - { + { + cout << "find NaN in domain: (" + << cg->bbox[0] << ":" << cg->bbox[3] << "," + << cg->bbox[1] << ":" << cg->bbox[4] << "," + << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; + ERROR = 1; + } + + // rk4 substep and boundary + { MyList *varl0 = StateList, *varl = SynchList_pre, *varlrhs = RHSList; // we do not check the correspondence here while (varl0) { @@ -3402,9 +3494,9 @@ void bssn_class::Step(int lev, int YN) varl = varl->next; varlrhs = varlrhs->next; } - } - f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny); - } + } + f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny); + } if (BP == Pp->data->ble) break; BP = BP->next; @@ -3657,26 +3749,26 @@ void bssn_class::Step(int lev, int YN) { MyList *BP = Pp->data->blb; while (BP) - { - Block *cg = BP->data; - if (myrank == cg->rank) - { -#if (AGM == 0) - f_enforce_ga(cg->shape, - cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], - cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], - cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], - cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]); -#elif (AGM == 1) + { + Block *cg = BP->data; + if (myrank == cg->rank) + { +#if (AGM == 0) + f_enforce_ga(cg->shape, + cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], + cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], + cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], + cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]); +#elif (AGM == 1) if (iter_count == 3) f_enforce_ga(cg->shape, - cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], - cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], - cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], - cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]); -#endif - - if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], + cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], + cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], + cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], + cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]); +#endif + + if (f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2], cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn], cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], @@ -3710,15 +3802,15 @@ void bssn_class::Step(int lev, int YN) cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn], cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn], Symmetry, lev, ndeps, cor)) - { - cout << "find NaN in domain: (" - << cg->bbox[0] << ":" << cg->bbox[3] << "," - << cg->bbox[1] << ":" << cg->bbox[4] << "," - << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; - ERROR = 1; - } - // rk4 substep and boundary - { + { + cout << "find NaN in domain: (" + << cg->bbox[0] << ":" << cg->bbox[3] << "," + << cg->bbox[1] << ":" << cg->bbox[4] << "," + << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; + ERROR = 1; + } + // rk4 substep and boundary + { MyList *varl0 = StateList, *varl = SynchList_pre, *varl1 = SynchList_cor, *varlrhs = RHSList; // we do not check the correspondence here while (varl0) { @@ -3770,9 +3862,9 @@ void bssn_class::Step(int lev, int YN) varl1 = varl1->next; varlrhs = varlrhs->next; } - } - f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny); - } + } + f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny); + } if (BP == Pp->data->ble) break; BP = BP->next; diff --git a/AMSS_NCKU_source/bssn_rhs.h b/AMSS_NCKU_source/bssn_rhs.h index 363420d..96a545a 100644 --- a/AMSS_NCKU_source/bssn_rhs.h +++ b/AMSS_NCKU_source/bssn_rhs.h @@ -22,19 +22,32 @@ #define f_compute_rhs_Z4c_ss COMPUTE_RHS_Z4C_SS #define f_compute_constraint_fr COMPUTE_CONSTRAINT_FR #endif -#ifdef fortran3 -#define f_compute_rhs_bssn compute_rhs_bssn_ +#ifdef fortran3 +#define f_compute_rhs_bssn compute_rhs_bssn_ #define f_compute_rhs_bssn_ss compute_rhs_bssn_ss_ #define f_compute_rhs_bssn_escalar compute_rhs_bssn_escalar_ #define f_compute_rhs_bssn_escalar_ss compute_rhs_bssn_escalar_ss_ #define f_compute_rhs_Z4c compute_rhs_z4c_ #define f_compute_rhs_Z4cnot compute_rhs_z4cnot_ #define f_compute_rhs_Z4c_ss compute_rhs_z4c_ss_ -#define f_compute_constraint_fr compute_constraint_fr_ -#endif -extern "C" -{ - int f_compute_rhs_bssn(int *, double &, double *, double *, double *, // ex,T,X,Y,Z +#define f_compute_constraint_fr compute_constraint_fr_ +#endif + +#ifdef __cplusplus +extern "C" +{ +#endif + void f_bssn_rhs_kernel_timing_reset(); + int f_bssn_rhs_kernel_timing_bucket_count(); + const double *f_bssn_rhs_kernel_timing_local_seconds(); + const char *f_bssn_rhs_kernel_timing_label(int); +#ifdef __cplusplus +} +#endif + +extern "C" +{ + int f_compute_rhs_bssn(int *, double &, double *, double *, double *, // ex,T,X,Y,Z double *, double *, // chi, trK double *, double *, double *, double *, double *, double *, // gij double *, double *, double *, double *, double *, double *, // Aij diff --git a/AMSS_NCKU_source/bssn_rhs_c.C b/AMSS_NCKU_source/bssn_rhs_c.C index 837b013..949e5a0 100644 --- a/AMSS_NCKU_source/bssn_rhs_c.C +++ b/AMSS_NCKU_source/bssn_rhs_c.C @@ -2,12 +2,88 @@ #include "bssn_rhs.h" #include "share_func.h" #include "tool.h" +#include // 0-based i,j,k // #define IDX_F(i,j,k,nx,ny) ((i) + (j)*(nx) + (k)*(nx)*(ny)) // ex(1)=nx, ex(2)=ny, ex(3)=nz // 用法:a[ IDX_F(i,j,k,nx,ny) ] +#ifndef BSSN_KERNEL_FINE_TIMING +#define BSSN_KERNEL_FINE_TIMING 0 +#endif + +#if BSSN_KERNEL_FINE_TIMING +namespace rhs_kernel_timing +{ + enum Bucket + { + KB_SETUP_DERIVS = 0, + KB_GEOM_GAMMA, + KB_RICCI_METRIC, + KB_CHI_LAPSE, + KB_AIJ_TRK_GAUGE, + KB_KO_CONSTRAINT, + KB_COUNT + }; + + static double local_bucket_seconds[KB_COUNT]; + + static const char *bucket_labels[KB_COUNT] = + { + "setup_derivs", + "geom_gamma", + "ricci_metric", + "chi_lapse", + "aij_trk_gauge", + "ko_constraint" + }; + + static inline double now_seconds() + { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return double(ts.tv_sec) + 1.0e-9 * double(ts.tv_nsec); + } +} + +extern "C" void f_bssn_rhs_kernel_timing_reset() +{ + for (int i = 0; i < rhs_kernel_timing::KB_COUNT; ++i) + rhs_kernel_timing::local_bucket_seconds[i] = 0.0; +} + +extern "C" int f_bssn_rhs_kernel_timing_bucket_count() +{ + return rhs_kernel_timing::KB_COUNT; +} + +extern "C" const double *f_bssn_rhs_kernel_timing_local_seconds() +{ + return rhs_kernel_timing::local_bucket_seconds; +} + +extern "C" const char *f_bssn_rhs_kernel_timing_label(int bucket_index) +{ + if (bucket_index < 0 || bucket_index >= rhs_kernel_timing::KB_COUNT) + return "unknown"; + return rhs_kernel_timing::bucket_labels[bucket_index]; +} + +#define RHS_KERNEL_TIMER_DECL(var_name) const double var_name = rhs_kernel_timing::now_seconds() +#define RHS_KERNEL_TIMER_ADD(bucket_name, var_name) \ + rhs_kernel_timing::local_bucket_seconds[int(rhs_kernel_timing::bucket_name)] += \ + rhs_kernel_timing::now_seconds() - (var_name) +#else +extern "C" void f_bssn_rhs_kernel_timing_reset() {} +extern "C" int f_bssn_rhs_kernel_timing_bucket_count() { return 0; } +extern "C" const double *f_bssn_rhs_kernel_timing_local_seconds() { return 0; } +extern "C" const char *f_bssn_rhs_kernel_timing_label(int) { return "disabled"; } + +#define RHS_KERNEL_TIMER_DECL(var_name) +#define RHS_KERNEL_TIMER_ADD(bucket_name, var_name) +#endif + // C function that calculates the right-hand side for BSSN equations int f_compute_rhs_bssn(int *ex, double &T, double *X, double *Y, double *Z, @@ -102,6 +178,7 @@ int f_compute_rhs_bssn(int *ex, double &T, dY = Y[1] - Y[0]; dZ = Z[1] - Z[0]; + RHS_KERNEL_TIMER_DECL(timer_setup_derivs); // 1ms // for(int i=0;i 1) for(int i=0;i