diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index d134bd4..4b9cc65 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -63,6 +63,14 @@ using namespace std; #define BSSN_FINE_TIMING_TOPN 8 #endif +#ifndef BSSN_KERNEL_FINE_TIMING +#define BSSN_KERNEL_FINE_TIMING 0 +#endif + +#ifndef BSSN_ENABLE_STDIN_ABORT_POLL +#define BSSN_ENABLE_STDIN_ABORT_POLL 0 +#endif + #if BSSN_FINE_TIMING namespace step_timing { @@ -157,6 +165,74 @@ namespace step_timing #define STEP_TIMER_ADD(bucket_name, var_name) #endif +#if BSSN_KERNEL_FINE_TIMING +namespace rhs_kernel_timing_report +{ + void report(int myrank, int nprocs, int step_index, double step_wall_seconds) + { + const int bucket_count = f_bssn_rhs_kernel_timing_bucket_count(); + const double *local_bucket_seconds = f_bssn_rhs_kernel_timing_local_seconds(); + + if (bucket_count <= 0 || !local_bucket_seconds) + return; + + double *max_bucket_seconds = new double[bucket_count]; + double *avg_bucket_seconds = new double[bucket_count]; + int *order = new int[bucket_count]; + + MPI_Reduce((void *)local_bucket_seconds, max_bucket_seconds, bucket_count, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + MPI_Reduce((void *)local_bucket_seconds, avg_bucket_seconds, bucket_count, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + + if (myrank == 0) + { + double kernel_total = 0.0; + for (int i = 0; i < bucket_count; ++i) + { + avg_bucket_seconds[i] /= Mymax(1, nprocs); + order[i] = i; + kernel_total += max_bucket_seconds[i]; + } + + for (int i = 0; i < bucket_count - 1; ++i) + for (int j = i + 1; j < bucket_count; ++j) + if (max_bucket_seconds[order[j]] > max_bucket_seconds[order[i]]) + { + int tmp = order[i]; + order[i] = order[j]; + order[j] = tmp; + } + + ios::fmtflags old_flags = cout.flags(); + streamsize old_precision = cout.precision(); + + const double kernel_frac = (step_wall_seconds > 0.0) ? (100.0 * kernel_total / step_wall_seconds) : 0.0; + cout << " RHS kernel split (max-rank accumulated over step " << step_index << "): total " + << setprecision(6) << kernel_total << " s (" << setprecision(4) + << kernel_frac << "% of coarse step)" << endl; + + const int topn = Mymin(BSSN_FINE_TIMING_TOPN, bucket_count); + for (int i = 0; i < topn; ++i) + { + const int ib = order[i]; + const double frac = (kernel_total > 0.0) ? (100.0 * max_bucket_seconds[ib] / kernel_total) : 0.0; + cout << " " + << setw(20) << left << f_bssn_rhs_kernel_timing_label(ib) + << " = " << setw(10) << right << setprecision(6) << max_bucket_seconds[ib] + << " s (" << setw(6) << setprecision(4) << frac << "% of kernel)" << endl; + } + cout << endl; + + cout.flags(old_flags); + cout.precision(old_precision); + } + + delete[] max_bucket_seconds; + delete[] avg_bucket_seconds; + delete[] order; + } +} +#endif + #if USE_CUDA_BSSN namespace { @@ -2593,7 +2669,12 @@ void bssn_class::Evolve(int Steps) cuda_level0_constraint_cache_valid = false; #if BSSN_FINE_TIMING step_timing::reset(); - STEP_TIMER_DECL(step_wall_start); +#endif +#if BSSN_KERNEL_FINE_TIMING + f_bssn_rhs_kernel_timing_reset(); +#endif +#if (BSSN_FINE_TIMING || BSSN_KERNEL_FINE_TIMING) + const double step_wall_start = MPI_Wtime(); #endif // special for large mass ratio consideration @@ -2720,16 +2801,18 @@ void bssn_class::Evolve(int Steps) << endl; } cout << endl; +#if BSSN_ENABLE_STDIN_ABORT_POLL cout << " If you think the physical evolution time is enough for this simulation, please input 'stop' in the terminal to stop the MPI processes in the next evolution step ! " << endl; +#endif // cout << endl; } - + +#if BSSN_ENABLE_STDIN_ABORT_POLL //////////////////////////////////////////////////////////// // If an "abort" command is detected on stdin, terminate MPI processes //////////////////////////////////////////////////////////// - + bool shouldAbort = false; - // Only rank 0 checks stdin if (myrank == 0) { @@ -2749,8 +2832,9 @@ void bssn_class::Evolve(int Steps) MPI_Abort(MPI_COMM_WORLD, 1); // terminate MPI processes // MPI_Finalize(); } - + //////////////////////////////////////////////////////////// +#endif // When LastCheck >= CheckTime, perform runtime checks and output status data if (LastCheck >= CheckTime) @@ -2770,6 +2854,10 @@ void bssn_class::Evolve(int Steps) #if BSSN_FINE_TIMING if (ncount % BSSN_FINE_TIMING_EVERY == 0) step_timing::report(myrank, nprocs, TimingMonitor, ncount, PhysTime, MPI_Wtime() - step_wall_start); +#endif +#if BSSN_KERNEL_FINE_TIMING + if (ncount % BSSN_FINE_TIMING_EVERY == 0) + rhs_kernel_timing_report::report(myrank, nprocs, ncount, MPI_Wtime() - step_wall_start); #endif } /* diff --git a/AMSS_NCKU_source/bssn_rhs.h b/AMSS_NCKU_source/bssn_rhs.h index 363420d..96a545a 100644 --- a/AMSS_NCKU_source/bssn_rhs.h +++ b/AMSS_NCKU_source/bssn_rhs.h @@ -22,19 +22,32 @@ #define f_compute_rhs_Z4c_ss COMPUTE_RHS_Z4C_SS #define f_compute_constraint_fr COMPUTE_CONSTRAINT_FR #endif -#ifdef fortran3 -#define f_compute_rhs_bssn compute_rhs_bssn_ +#ifdef fortran3 +#define f_compute_rhs_bssn compute_rhs_bssn_ #define f_compute_rhs_bssn_ss compute_rhs_bssn_ss_ #define f_compute_rhs_bssn_escalar compute_rhs_bssn_escalar_ #define f_compute_rhs_bssn_escalar_ss compute_rhs_bssn_escalar_ss_ #define f_compute_rhs_Z4c compute_rhs_z4c_ #define f_compute_rhs_Z4cnot compute_rhs_z4cnot_ #define f_compute_rhs_Z4c_ss compute_rhs_z4c_ss_ -#define f_compute_constraint_fr compute_constraint_fr_ -#endif -extern "C" -{ - int f_compute_rhs_bssn(int *, double &, double *, double *, double *, // ex,T,X,Y,Z +#define f_compute_constraint_fr compute_constraint_fr_ +#endif + +#ifdef __cplusplus +extern "C" +{ +#endif + void f_bssn_rhs_kernel_timing_reset(); + int f_bssn_rhs_kernel_timing_bucket_count(); + const double *f_bssn_rhs_kernel_timing_local_seconds(); + const char *f_bssn_rhs_kernel_timing_label(int); +#ifdef __cplusplus +} +#endif + +extern "C" +{ + int f_compute_rhs_bssn(int *, double &, double *, double *, double *, // ex,T,X,Y,Z double *, double *, // chi, trK double *, double *, double *, double *, double *, double *, // gij double *, double *, double *, double *, double *, double *, // Aij diff --git a/AMSS_NCKU_source/bssn_rhs_c.C b/AMSS_NCKU_source/bssn_rhs_c.C index 9d8c2ee..7edd0c3 100644 --- a/AMSS_NCKU_source/bssn_rhs_c.C +++ b/AMSS_NCKU_source/bssn_rhs_c.C @@ -2,12 +2,88 @@ #include "bssn_rhs.h" #include "share_func.h" #include "tool.h" +#include // 0-based i,j,k // #define IDX_F(i,j,k,nx,ny) ((i) + (j)*(nx) + (k)*(nx)*(ny)) // ex(1)=nx, ex(2)=ny, ex(3)=nz // 用法:a[ IDX_F(i,j,k,nx,ny) ] +#ifndef BSSN_KERNEL_FINE_TIMING +#define BSSN_KERNEL_FINE_TIMING 0 +#endif + +#if BSSN_KERNEL_FINE_TIMING +namespace rhs_kernel_timing +{ + enum Bucket + { + KB_SETUP_DERIVS = 0, + KB_GEOM_GAMMA, + KB_RICCI_METRIC, + KB_CHI_LAPSE, + KB_AIJ_TRK_GAUGE, + KB_KO_CONSTRAINT, + KB_COUNT + }; + + static double local_bucket_seconds[KB_COUNT]; + + static const char *bucket_labels[KB_COUNT] = + { + "setup_derivs", + "geom_gamma", + "ricci_metric", + "chi_lapse", + "aij_trk_gauge", + "ko_constraint" + }; + + static inline double now_seconds() + { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return double(ts.tv_sec) + 1.0e-9 * double(ts.tv_nsec); + } +} + +extern "C" void f_bssn_rhs_kernel_timing_reset() +{ + for (int i = 0; i < rhs_kernel_timing::KB_COUNT; ++i) + rhs_kernel_timing::local_bucket_seconds[i] = 0.0; +} + +extern "C" int f_bssn_rhs_kernel_timing_bucket_count() +{ + return rhs_kernel_timing::KB_COUNT; +} + +extern "C" const double *f_bssn_rhs_kernel_timing_local_seconds() +{ + return rhs_kernel_timing::local_bucket_seconds; +} + +extern "C" const char *f_bssn_rhs_kernel_timing_label(int bucket_index) +{ + if (bucket_index < 0 || bucket_index >= rhs_kernel_timing::KB_COUNT) + return "unknown"; + return rhs_kernel_timing::bucket_labels[bucket_index]; +} + +#define RHS_KERNEL_TIMER_DECL(var_name) const double var_name = rhs_kernel_timing::now_seconds() +#define RHS_KERNEL_TIMER_ADD(bucket_name, var_name) \ + rhs_kernel_timing::local_bucket_seconds[int(rhs_kernel_timing::bucket_name)] += \ + rhs_kernel_timing::now_seconds() - (var_name) +#else +extern "C" void f_bssn_rhs_kernel_timing_reset() {} +extern "C" int f_bssn_rhs_kernel_timing_bucket_count() { return 0; } +extern "C" const double *f_bssn_rhs_kernel_timing_local_seconds() { return 0; } +extern "C" const char *f_bssn_rhs_kernel_timing_label(int) { return "disabled"; } + +#define RHS_KERNEL_TIMER_DECL(var_name) +#define RHS_KERNEL_TIMER_ADD(bucket_name, var_name) +#endif + // C function that calculates the right-hand side for BSSN equations int f_compute_rhs_bssn(int *ex, double &T, double *X, double *Y, double *Z, @@ -102,6 +178,7 @@ int f_compute_rhs_bssn(int *ex, double &T, dY = Y[1] - Y[0]; dZ = Z[1] - Z[0]; + RHS_KERNEL_TIMER_DECL(timer_setup_derivs); // 1ms // for(int i=0;i 1) for(int i=0;i