From da4d56ccf764008707d58eefc26502d79e8f012d Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Thu, 30 Apr 2026 18:25:21 +0800 Subject: [PATCH] Optimize BSSN surface interpolation fast path --- AMSS_NCKU_source/MPatch.C | 613 ++++++++++++++++++++++++---- AMSS_NCKU_source/bssn_class.C | 35 +- AMSS_NCKU_source/surface_integral.C | 54 ++- 3 files changed, 621 insertions(+), 81 deletions(-) diff --git a/AMSS_NCKU_source/MPatch.C b/AMSS_NCKU_source/MPatch.C index 956e9c8..81ff61f 100644 --- a/AMSS_NCKU_source/MPatch.C +++ b/AMSS_NCKU_source/MPatch.C @@ -154,8 +154,8 @@ void build_block_bin_index(Patch *patch, const double *DH, BlockBinIndex &index) index.valid = true; } -int find_block_index_for_point(const BlockBinIndex &index, const double *pox, const double *DH) -{ +int find_block_index_for_point(const BlockBinIndex &index, const double *pox, const double *DH) +{ if (!index.valid) return -1; @@ -175,10 +175,437 @@ int find_block_index_for_point(const BlockBinIndex &index, const double *pox, co for (size_t bi = 0; bi < index.views.size(); bi++) if (point_in_block_view(index.views[bi], pox, DH)) return int(bi); - - return -1; -} -} // namespace + + return -1; +} + +inline int fortran_idint_local(double x) +{ + return int(x); +} + +bool interp_fast_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_INTERP_FAST"); + enabled = (!env || atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +bool interp_fast_compare_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_INTERP_FAST_COMPARE"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +double interp_fast_compare_tol() +{ + static double tol = -1.0; + if (tol < 0.0) + { + const char *env = getenv("AMSS_INTERP_FAST_COMPARE_TOL"); + tol = (env && atof(env) > 0.0) ? atof(env) : 1.0e-11; + } + return tol; +} + +long long interp_fast_compare_limit() +{ + static long long limit = -1; + if (limit < 0) + { + const char *env = getenv("AMSS_INTERP_FAST_COMPARE_LIMIT"); + limit = (env && atoll(env) > 0) ? atoll(env) : 4096; + } + return limit; +} + +struct FastInterpStencil +{ + int cxB[dim]; + double cx[dim]; + double wx[8]; + double wy[8]; + double wz[8]; + int nsamples; + int loc[512]; + unsigned char sign_mask[512]; + double weight[512]; +}; + +inline void lagrange_unit_weights(double x, int ordn, double *w) +{ + for (int i = 0; i < ordn; i++) + { + double num = 1.0; + double den = 1.0; + for (int j = 0; j < ordn; j++) + { + if (j == i) + continue; + num *= (x - double(j)); + den *= double(i - j); + } + w[i] = num / den; + } +} + +inline void z_unit_weights(double x, int ordn, double *w) +{ + if (ordn == 6) + { + static const double c_uniform[6] = {-1.0, 5.0, -10.0, 10.0, -5.0, 1.0}; + for (int i = 0; i < 6; i++) + { + if (x == double(i)) + { + for (int j = 0; j < 6; j++) + w[j] = (j == i) ? 1.0 : 0.0; + return; + } + } + double den = 0.0; + for (int i = 0; i < 6; i++) + { + w[i] = c_uniform[i] / (x - double(i)); + den += w[i]; + } + for (int i = 0; i < 6; i++) + w[i] /= den; + return; + } + lagrange_unit_weights(x, ordn, w); +} + +inline bool fast_interp_map_index(int idx, int extent, int d, + int &mapped, unsigned char &mask) +{ + if (idx > 0) + mapped = idx; + else + { + mask |= (unsigned char)(1u << d); +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + mapped = 2 - idx; +#else +#ifdef Cell + mapped = 1 - idx; +#else +#error Not define Vertex nor Cell +#endif +#endif + } + return mapped >= 1 && mapped <= extent; +} + +bool prepare_fast_interp_stencil(Block *BP, const double *pox, int ordn, + int Symmetry, FastInterpStencil &st) +{ + if (!BP || ordn <= 0 || ordn > 8) + return false; + + st.nsamples = 0; + + const int NO_SYMM = 0; + const int OCTANT = 2; + int cmin[dim], cmax[dim], cxT[dim]; + for (int d = 0; d < dim; d++) + { + const double *X = BP->X[d]; + const double dX = X[1] - X[0]; + const int cxI = fortran_idint_local((pox[d] - X[0]) / dX + 0.4) + 1; + st.cxB[d] = cxI - ordn / 2 + 1; + cxT[d] = st.cxB[d] + ordn - 1; + cmin[d] = 1; + cmax[d] = BP->shape[d]; + +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + if (Symmetry == OCTANT && d < 2 && fabs(X[0]) < dX) + cmin[d] = -ordn / 2 + 2; + if (Symmetry != NO_SYMM && d == 2 && fabs(X[0]) < dX) + cmin[d] = -ordn / 2 + 2; +#else +#ifdef Cell + if (Symmetry == OCTANT && d < 2 && fabs(X[0]) < dX) + cmin[d] = -ordn / 2 + 1; + if (Symmetry != NO_SYMM && d == 2 && fabs(X[0]) < dX) + cmin[d] = -ordn / 2 + 1; +#else +#error Not define Vertex nor Cell +#endif +#endif + + if (st.cxB[d] < cmin[d]) + { + st.cxB[d] = cmin[d]; + cxT[d] = st.cxB[d] + ordn - 1; + } + if (cxT[d] > cmax[d]) + { + cxT[d] = cmax[d]; + st.cxB[d] = cxT[d] + 1 - ordn; + } + + if (st.cxB[d] > 0) + st.cx[d] = (pox[d] - X[st.cxB[d] - 1]) / dX; + else + { +#ifdef Vertex +#ifdef Cell +#error Both Cell and Vertex are defined +#endif + st.cx[d] = (pox[d] + X[1 - st.cxB[d]]) / dX; +#else +#ifdef Cell + st.cx[d] = (pox[d] + X[-st.cxB[d]]) / dX; +#else +#error Not define Vertex nor Cell +#endif +#endif + } + } + + lagrange_unit_weights(st.cx[0], ordn, st.wx); + lagrange_unit_weights(st.cx[1], ordn, st.wy); + z_unit_weights(st.cx[2], ordn, st.wz); + + for (int kk = 0; kk < ordn; kk++) + { + for (int jj = 0; jj < ordn; jj++) + { + for (int ii = 0; ii < ordn; ii++) + { + unsigned char mask = 0; + int ix, iy, iz; + if (!fast_interp_map_index(st.cxB[0] + ii, BP->shape[0], 0, ix, mask) || + !fast_interp_map_index(st.cxB[1] + jj, BP->shape[1], 1, iy, mask) || + !fast_interp_map_index(st.cxB[2] + kk, BP->shape[2], 2, iz, mask)) + return false; + const int s = st.nsamples++; + st.loc[s] = (ix - 1) + (iy - 1) * BP->shape[0] + + (iz - 1) * BP->shape[0] * BP->shape[1]; + st.sign_mask[s] = mask; + st.weight[s] = st.wx[ii] * st.wy[jj] * st.wz[kk]; + } + } + } + return true; +} + +bool interpolate_var_list_with_stencil(Block *BP, MyList *VarList, + int num_var, const double *pox, + int ordn, int Symmetry, + const FastInterpStencil &st, + double *out) +{ + if (num_var <= 0 || num_var > 128) + return false; + + double *data_ptrs[128]; + double *soa_ptrs[128]; + var *vars[128]; + MyList *varl = VarList; + int k = 0; + while (varl) + { + if (k >= num_var) + return false; + vars[k] = varl->data; + data_ptrs[k] = BP->fgfs[vars[k]->sgfn]; + soa_ptrs[k] = vars[k]->SoA; + out[k] = 0.0; + varl = varl->next; + k++; + } + + if (k != num_var) + return false; + + for (int s = 0; s < st.nsamples; s++) + { + const int loc = st.loc[s]; + const double w = st.weight[s]; + const unsigned char mask = st.sign_mask[s]; + if (mask == 0) + { + for (int v = 0; v < num_var; v++) + out[v] += w * data_ptrs[v][loc]; + } + else + { + for (int v = 0; v < num_var; v++) + { + const double *SoA = soa_ptrs[v]; + double sgn = 1.0; + if (mask & 1u) + sgn *= SoA[0]; + if (mask & 2u) + sgn *= SoA[1]; + if (mask & 4u) + sgn *= SoA[2]; + out[v] += w * sgn * data_ptrs[v][loc]; + } + } + } + + if (interp_fast_compare_enabled()) + { + static int report_count = 0; + static long long compare_calls = 0; + if (compare_calls++ >= interp_fast_compare_limit()) + return true; + const double tol = interp_fast_compare_tol(); + varl = VarList; + k = 0; + while (varl) + { + var *vp = vars[k]; + double ref = 0.0; + double x = pox[0], y = pox[1], z = pox[2]; + f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], + BP->fgfs[vp->sgfn], ref, + x, y, z, ordn, vp->SoA, Symmetry); + const double diff = fabs(ref - out[k]); + const double scale = 1.0 + fabs(ref); + if (diff > tol * scale && report_count < 32) + { + int rank = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + fprintf(stderr, + "[AMSS-INTERP-CMP][rank %d] var=%s diff=%.17e ref=%.17e fast=%.17e p=(%.17e,%.17e,%.17e)\n", + rank, vp->name, diff, ref, out[k], pox[0], pox[1], pox[2]); + report_count++; + } + varl = varl->next; + k++; + } + } + + return true; +} + +bool interpolate_var_list_fast(Block *BP, MyList *VarList, int num_var, + const double *pox, int ordn, int Symmetry, + double *out) +{ + if (!interp_fast_enabled()) + return false; + + FastInterpStencil st; + if (!prepare_fast_interp_stencil(BP, pox, ordn, Symmetry, st)) + return false; + + return interpolate_var_list_with_stencil(BP, VarList, num_var, pox, + ordn, Symmetry, st, out); +} + +struct CachedInterpPoint +{ + Block *bp; + int owner_rank; + FastInterpStencil stencil; +}; + +struct SurfaceInterpCache +{ + Patch *patch; + int NN; + int symmetry; + double key[9]; + vector points; + + SurfaceInterpCache() : patch(0), NN(0), symmetry(-1) {} +}; + +bool surface_cache_key_matches(const SurfaceInterpCache &cache, Patch *patch, + int NN, double **XX, int Symmetry) +{ + if (cache.patch != patch || cache.NN != NN || cache.symmetry != Symmetry || + int(cache.points.size()) != NN || NN <= 0) + return false; + const int mid = NN / 2; + const int last = NN - 1; + const int ids[3] = {0, mid, last}; + int p = 0; + for (int q = 0; q < 3; q++) + for (int d = 0; d < dim; d++) + if (cache.key[p++] != XX[d][ids[q]]) + return false; + return true; +} + +SurfaceInterpCache *find_surface_cache(Patch *patch, int NN, double **XX, + int Symmetry) +{ + static vector caches; + for (size_t i = 0; i < caches.size(); i++) + if (surface_cache_key_matches(caches[i], patch, NN, XX, Symmetry)) + return &caches[i]; + if (caches.size() >= 24) + caches.erase(caches.begin()); + caches.push_back(SurfaceInterpCache()); + return &caches.back(); +} + +bool build_surface_cache(SurfaceInterpCache &cache, Patch *patch, int NN, + double **XX, int Symmetry, const double *DH, + const BlockBinIndex &block_index, int ordn) +{ + int myrank = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + cache.patch = patch; + cache.NN = NN; + cache.symmetry = Symmetry; + cache.points.clear(); + cache.points.resize(NN); + const int mid = NN / 2; + const int last = NN - 1; + const int ids[3] = {0, mid, last}; + int p = 0; + for (int q = 0; q < 3; q++) + for (int d = 0; d < dim; d++) + cache.key[p++] = XX[d][ids[q]]; + + for (int j = 0; j < NN; j++) + { + double pox[dim]; + for (int d = 0; d < dim; d++) + pox[d] = XX[d][j]; + const int block_i = find_block_index_for_point(block_index, pox, DH); + if (block_i < 0) + { + cache.points[j].bp = 0; + cache.points[j].owner_rank = -1; + continue; + } + Block *BP = block_index.views[block_i].bp; + cache.points[j].bp = BP; + cache.points[j].owner_rank = BP->rank; + cache.points[j].stencil.nsamples = 0; + if (BP->rank == myrank) + { + if (!prepare_fast_interp_stencil(BP, pox, ordn, Symmetry, + cache.points[j].stencil)) + return false; + } + } + return true; +} +} // namespace Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi) { @@ -561,22 +988,26 @@ void Patch::Interp_Points(MyList *VarList, if (block_i >= 0) { Block *BP = block_index.views[block_i].bp; - owner_rank[j] = BP->rank; - if (myrank == BP->rank) - { - //---> interpolation - varl = VarList; - int k = 0; - while (varl) // run along variables - { - f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k], - pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry); - varl = varl->next; - k++; - } - } - } - } + owner_rank[j] = BP->rank; + if (myrank == BP->rank) + { + //---> interpolation + if (!interpolate_var_list_fast(BP, VarList, num_var, pox, ordn, + Symmetry, Shellf + j * num_var)) + { + varl = VarList; + int k = 0; + while (varl) // run along variables + { + f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k], + pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry); + varl = varl->next; + k++; + } + } + } + } + } // Replace MPI_Allreduce with per-owner MPI_Bcast: // Group consecutive points by owner rank and broadcast each group. @@ -659,10 +1090,8 @@ void Patch::Interp_Points(MyList *VarList, varl = varl->next; } - memset(Shellf, 0, sizeof(double) * NN * num_var); - - // owner_rank[j] records which MPI rank owns point j - int *owner_rank; + // owner_rank[j] records which MPI rank owns point j + int *owner_rank; owner_rank = new int[NN]; for (int j = 0; j < NN; j++) owner_rank[j] = -1; @@ -670,12 +1099,22 @@ void Patch::Interp_Points(MyList *VarList, double DH[dim]; for (int i = 0; i < dim; i++) DH[i] = getdX(i); - BlockBinIndex block_index; - build_block_bin_index(this, DH, block_index); - - // --- Interpolation phase (identical to original) --- - for (int j = 0; j < NN; j++) - { + BlockBinIndex block_index; + build_block_bin_index(this, DH, block_index); + SurfaceInterpCache *surface_cache = 0; + bool use_surface_cache = false; + if (interp_fast_enabled()) + { + surface_cache = find_surface_cache(this, NN, XX, Symmetry); + use_surface_cache = surface_cache_key_matches(*surface_cache, this, NN, XX, Symmetry); + if (!use_surface_cache) + use_surface_cache = build_surface_cache(*surface_cache, this, NN, XX, + Symmetry, DH, block_index, ordn); + } + + // --- Interpolation phase (identical to original) --- + for (int j = 0; j < NN; j++) + { double pox[dim]; for (int i = 0; i < dim; i++) { @@ -692,28 +1131,58 @@ void Patch::Interp_Points(MyList *VarList, cout << ") is out of current Patch." << endl; } MPI_Abort(MPI_COMM_WORLD, 1); - } - } - - const int block_i = find_block_index_for_point(block_index, pox, DH); - if (block_i >= 0) - { - Block *BP = block_index.views[block_i].bp; - owner_rank[j] = BP->rank; - if (myrank == BP->rank) - { - varl = VarList; - int k = 0; - while (varl) - { - f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k], - pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry); - varl = varl->next; - k++; - } - } - } - } + } + } + + if (use_surface_cache) + { + CachedInterpPoint &cp = surface_cache->points[j]; + Block *BP = cp.bp; + owner_rank[j] = cp.owner_rank; + if (BP && myrank == BP->rank) + { + if (!interpolate_var_list_with_stencil(BP, VarList, num_var, pox, + ordn, Symmetry, cp.stencil, + Shellf + j * num_var)) + { + MyList *varl_fallback = VarList; + int k = 0; + while (varl_fallback) + { + f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl_fallback->data->sgfn], Shellf[j * num_var + k], + pox[0], pox[1], pox[2], ordn, varl_fallback->data->SoA, Symmetry); + varl_fallback = varl_fallback->next; + k++; + } + } + } + } + else + { + const int block_i = find_block_index_for_point(block_index, pox, DH); + if (block_i >= 0) + { + Block *BP = block_index.views[block_i].bp; + owner_rank[j] = BP->rank; + if (myrank == BP->rank) + { + if (!interpolate_var_list_fast(BP, VarList, num_var, pox, ordn, + Symmetry, Shellf + j * num_var)) + { + MyList *varl_fallback = VarList; + int k = 0; + while (varl_fallback) + { + f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl_fallback->data->sgfn], Shellf[j * num_var + k], + pox[0], pox[1], pox[2], ordn, varl_fallback->data->SoA, Symmetry); + varl_fallback = varl_fallback->next; + k++; + } + } + } + } + } + } #ifdef INTERP_LB_PROFILE double t_interp_end = MPI_Wtime(); @@ -965,22 +1434,26 @@ void Patch::Interp_Points(MyList *VarList, if (block_i >= 0) { Block *BP = block_index.views[block_i].bp; - owner_rank[j] = BP->rank; - if (myrank == BP->rank) - { - //---> interpolation - varl = VarList; - int k = 0; - while (varl) // run along variables - { - f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k], - pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry); - varl = varl->next; - k++; - } - } - } - } + owner_rank[j] = BP->rank; + if (myrank == BP->rank) + { + //---> interpolation + if (!interpolate_var_list_fast(BP, VarList, num_var, pox, ordn, + Symmetry, Shellf + j * num_var)) + { + varl = VarList; + int k = 0; + while (varl) // run along variables + { + f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k], + pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry); + varl = varl->next; + k++; + } + } + } + } + } // Collect unique global owner ranks and translate to local ranks in Comm_here // Then broadcast each owner's points via MPI_Bcast on Comm_here diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index fdd6fa7..79e924a 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -45,6 +45,20 @@ using namespace std; #include "derivatives.h" #include "ricci_gamma.h" +namespace +{ +bool amss_analysis_timing_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_ANALYSIS_TIMING"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} +} + // Compile-time switch for per-timestep memory usage collection/printing. // Default is OFF to reduce overhead in production runs. #ifndef BSSN_ENABLE_MEM_USAGE_LOG @@ -7942,6 +7956,10 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev) if (LastAnas >= AnasTime) { + const bool analysis_timing = amss_analysis_timing_enabled(); + const double t_analysis_start = analysis_timing ? MPI_Wtime() : 0.0; + double t_psi4_sec = 0.0; + double t_surface_sec = 0.0; #ifdef Point_Psi4 #error "not support parallel levels yet" // Gam_ijk and R_ij have been calculated in Interp_Constraint() @@ -8134,7 +8152,12 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev) #endif } #else + { + const double t0 = analysis_timing ? MPI_Wtime() : 0.0; Compute_Psi4(lev); + if (analysis_timing) + t_psi4_sec += MPI_Wtime() - t0; + } #endif double *RP, *IP, *RoutMAP; int NN = 0; @@ -8150,7 +8173,8 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev) bool shell_mass_prepared = false; #endif for (int i = 0; i < decn; i++) - { + { + const double t_surface0 = analysis_timing ? MPI_Wtime() : 0.0; #ifdef Point_Psi4 Waveshell->surf_Wave(Rex, GH, SH, phi, trK, @@ -8243,6 +8267,8 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev) #endif #endif // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"end surface integral"); + if (analysis_timing) + t_surface_sec += MPI_Wtime() - t_surface0; #endif if (i == 0) { @@ -8275,6 +8301,13 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev) delete[] RP; delete[] IP; delete[] RoutMAP; + if (analysis_timing) + { + fprintf(stderr, + "[AMSS-ANALYSIS][rank %d] lev=%d psi4=%.6f surface=%.6f total_before_bh=%.6f detectors=%d modes=%d\n", + myrank, lev, t_psi4_sec, t_surface_sec, + MPI_Wtime() - t_analysis_start, decn, NN); + } // black hole's position { diff --git a/AMSS_NCKU_source/surface_integral.C b/AMSS_NCKU_source/surface_integral.C index 1e58dce..5de4369 100644 --- a/AMSS_NCKU_source/surface_integral.C +++ b/AMSS_NCKU_source/surface_integral.C @@ -8,10 +8,11 @@ #include #include #include -#include -#include -#include -using namespace std; +#include +#include +#include +#include +using namespace std; #else #include #include @@ -29,12 +30,26 @@ using namespace std; #include "fadmquantites_bssn.h" #include "getnpem2.h" #include "getnp4.h" -#include "parameters.h" - -#define PI M_PI -//|============================================================================ -//| Constructor -//|============================================================================ +#include "parameters.h" + +#define PI M_PI + +namespace +{ +bool amss_surface_timing_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_SURFACE_TIMING"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} +} +//|============================================================================ +//| Constructor +//|============================================================================ surface_integral::surface_integral(int iSymmetry) : Symmetry(iSymmetry), wave_cache_spinw(-1), @@ -3281,6 +3296,8 @@ void surface_integral::surf_WaveMassPAng(double rex, int lev, cgh *GH, var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs, double *Rout, monitor *Monitor, bool refresh_mass_fields) { + const bool timing = amss_surface_timing_enabled(); + const double t_start = timing ? MPI_Wtime() : 0.0; if (Symmetry != 0 && Symmetry != 1) { surf_Wave(rex, lev, GH, Rpsi4, Ipsi4, spinw, maxl, NN, RP, IP, Monitor); @@ -3325,6 +3342,7 @@ void surface_integral::surf_WaveMassPAng(double rex, int lev, cgh *GH, Pp = Pp->next; } } + const double t_refresh_done = timing ? MPI_Wtime() : 0.0; const int InList = 19; const int idx_rpsi4 = 0, idx_ipsi4 = 1; @@ -3380,6 +3398,7 @@ void surface_integral::surf_WaveMassPAng(double rex, int lev, cgh *GH, double *shellf = new double[n_tot * InList]; GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax); + const double t_interp_done = timing ? MPI_Wtime() : 0.0; double *RP_out = new double[NN]; double *IP_out = new double[NN]; @@ -3496,6 +3515,7 @@ void surface_integral::surf_WaveMassPAng(double rex, int lev, cgh *GH, if (Symmetry == 0) p_outz += f1o8 * Psi * (nx_g[n] * axz + ny_g[n] * ayz + nz_g[n] * azz) * theta_weight; } + const double t_integral_done = timing ? MPI_Wtime() : 0.0; for (int ii = 0; ii < NN; ii++) { @@ -3534,6 +3554,7 @@ void surface_integral::surf_WaveMassPAng(double rex, int lev, cgh *GH, delete[] reduce_out; delete[] reduce_in; } + const double t_reduce_done = timing ? MPI_Wtime() : 0.0; #ifdef GaussInt mass = mass * rex * rex * dphi * factor; @@ -3565,6 +3586,19 @@ void surface_integral::surf_WaveMassPAng(double rex, int lev, cgh *GH, Rout[5] = sy; Rout[6] = sz; + if (timing) + { + fprintf(stderr, + "[AMSS-SURFACE][rank %d] rex=%.6g lev=%d refresh=%.6f interp=%.6f integral=%.6f reduce=%.6f total=%.6f nlocal=%d ntotal=%d modes=%d\n", + myrank, rex, lev, + t_refresh_done - t_start, + t_interp_done - t_refresh_done, + t_integral_done - t_interp_done, + t_reduce_done - t_integral_done, + t_reduce_done - t_start, + Nmax - Nmin + 1, n_tot, NN); + } + delete[] pox[0]; delete[] pox[1]; delete[] pox[2];