Optimize BSSN surface interpolation fast path

This commit is contained in:
2026-04-30 18:25:21 +08:00
parent a6483d013d
commit da4d56ccf7
3 changed files with 621 additions and 81 deletions

View File

@@ -178,6 +178,433 @@ int find_block_index_for_point(const BlockBinIndex &index, const double *pox, co
return -1;
}
inline int fortran_idint_local(double x)
{
return int(x);
}
bool interp_fast_enabled()
{
static int enabled = -1;
if (enabled < 0)
{
const char *env = getenv("AMSS_INTERP_FAST");
enabled = (!env || atoi(env) != 0) ? 1 : 0;
}
return enabled != 0;
}
bool interp_fast_compare_enabled()
{
static int enabled = -1;
if (enabled < 0)
{
const char *env = getenv("AMSS_INTERP_FAST_COMPARE");
enabled = (env && atoi(env) != 0) ? 1 : 0;
}
return enabled != 0;
}
double interp_fast_compare_tol()
{
static double tol = -1.0;
if (tol < 0.0)
{
const char *env = getenv("AMSS_INTERP_FAST_COMPARE_TOL");
tol = (env && atof(env) > 0.0) ? atof(env) : 1.0e-11;
}
return tol;
}
long long interp_fast_compare_limit()
{
static long long limit = -1;
if (limit < 0)
{
const char *env = getenv("AMSS_INTERP_FAST_COMPARE_LIMIT");
limit = (env && atoll(env) > 0) ? atoll(env) : 4096;
}
return limit;
}
struct FastInterpStencil
{
int cxB[dim];
double cx[dim];
double wx[8];
double wy[8];
double wz[8];
int nsamples;
int loc[512];
unsigned char sign_mask[512];
double weight[512];
};
inline void lagrange_unit_weights(double x, int ordn, double *w)
{
for (int i = 0; i < ordn; i++)
{
double num = 1.0;
double den = 1.0;
for (int j = 0; j < ordn; j++)
{
if (j == i)
continue;
num *= (x - double(j));
den *= double(i - j);
}
w[i] = num / den;
}
}
inline void z_unit_weights(double x, int ordn, double *w)
{
if (ordn == 6)
{
static const double c_uniform[6] = {-1.0, 5.0, -10.0, 10.0, -5.0, 1.0};
for (int i = 0; i < 6; i++)
{
if (x == double(i))
{
for (int j = 0; j < 6; j++)
w[j] = (j == i) ? 1.0 : 0.0;
return;
}
}
double den = 0.0;
for (int i = 0; i < 6; i++)
{
w[i] = c_uniform[i] / (x - double(i));
den += w[i];
}
for (int i = 0; i < 6; i++)
w[i] /= den;
return;
}
lagrange_unit_weights(x, ordn, w);
}
inline bool fast_interp_map_index(int idx, int extent, int d,
int &mapped, unsigned char &mask)
{
if (idx > 0)
mapped = idx;
else
{
mask |= (unsigned char)(1u << d);
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
mapped = 2 - idx;
#else
#ifdef Cell
mapped = 1 - idx;
#else
#error Not define Vertex nor Cell
#endif
#endif
}
return mapped >= 1 && mapped <= extent;
}
bool prepare_fast_interp_stencil(Block *BP, const double *pox, int ordn,
int Symmetry, FastInterpStencil &st)
{
if (!BP || ordn <= 0 || ordn > 8)
return false;
st.nsamples = 0;
const int NO_SYMM = 0;
const int OCTANT = 2;
int cmin[dim], cmax[dim], cxT[dim];
for (int d = 0; d < dim; d++)
{
const double *X = BP->X[d];
const double dX = X[1] - X[0];
const int cxI = fortran_idint_local((pox[d] - X[0]) / dX + 0.4) + 1;
st.cxB[d] = cxI - ordn / 2 + 1;
cxT[d] = st.cxB[d] + ordn - 1;
cmin[d] = 1;
cmax[d] = BP->shape[d];
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
if (Symmetry == OCTANT && d < 2 && fabs(X[0]) < dX)
cmin[d] = -ordn / 2 + 2;
if (Symmetry != NO_SYMM && d == 2 && fabs(X[0]) < dX)
cmin[d] = -ordn / 2 + 2;
#else
#ifdef Cell
if (Symmetry == OCTANT && d < 2 && fabs(X[0]) < dX)
cmin[d] = -ordn / 2 + 1;
if (Symmetry != NO_SYMM && d == 2 && fabs(X[0]) < dX)
cmin[d] = -ordn / 2 + 1;
#else
#error Not define Vertex nor Cell
#endif
#endif
if (st.cxB[d] < cmin[d])
{
st.cxB[d] = cmin[d];
cxT[d] = st.cxB[d] + ordn - 1;
}
if (cxT[d] > cmax[d])
{
cxT[d] = cmax[d];
st.cxB[d] = cxT[d] + 1 - ordn;
}
if (st.cxB[d] > 0)
st.cx[d] = (pox[d] - X[st.cxB[d] - 1]) / dX;
else
{
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
st.cx[d] = (pox[d] + X[1 - st.cxB[d]]) / dX;
#else
#ifdef Cell
st.cx[d] = (pox[d] + X[-st.cxB[d]]) / dX;
#else
#error Not define Vertex nor Cell
#endif
#endif
}
}
lagrange_unit_weights(st.cx[0], ordn, st.wx);
lagrange_unit_weights(st.cx[1], ordn, st.wy);
z_unit_weights(st.cx[2], ordn, st.wz);
for (int kk = 0; kk < ordn; kk++)
{
for (int jj = 0; jj < ordn; jj++)
{
for (int ii = 0; ii < ordn; ii++)
{
unsigned char mask = 0;
int ix, iy, iz;
if (!fast_interp_map_index(st.cxB[0] + ii, BP->shape[0], 0, ix, mask) ||
!fast_interp_map_index(st.cxB[1] + jj, BP->shape[1], 1, iy, mask) ||
!fast_interp_map_index(st.cxB[2] + kk, BP->shape[2], 2, iz, mask))
return false;
const int s = st.nsamples++;
st.loc[s] = (ix - 1) + (iy - 1) * BP->shape[0] +
(iz - 1) * BP->shape[0] * BP->shape[1];
st.sign_mask[s] = mask;
st.weight[s] = st.wx[ii] * st.wy[jj] * st.wz[kk];
}
}
}
return true;
}
bool interpolate_var_list_with_stencil(Block *BP, MyList<var> *VarList,
int num_var, const double *pox,
int ordn, int Symmetry,
const FastInterpStencil &st,
double *out)
{
if (num_var <= 0 || num_var > 128)
return false;
double *data_ptrs[128];
double *soa_ptrs[128];
var *vars[128];
MyList<var> *varl = VarList;
int k = 0;
while (varl)
{
if (k >= num_var)
return false;
vars[k] = varl->data;
data_ptrs[k] = BP->fgfs[vars[k]->sgfn];
soa_ptrs[k] = vars[k]->SoA;
out[k] = 0.0;
varl = varl->next;
k++;
}
if (k != num_var)
return false;
for (int s = 0; s < st.nsamples; s++)
{
const int loc = st.loc[s];
const double w = st.weight[s];
const unsigned char mask = st.sign_mask[s];
if (mask == 0)
{
for (int v = 0; v < num_var; v++)
out[v] += w * data_ptrs[v][loc];
}
else
{
for (int v = 0; v < num_var; v++)
{
const double *SoA = soa_ptrs[v];
double sgn = 1.0;
if (mask & 1u)
sgn *= SoA[0];
if (mask & 2u)
sgn *= SoA[1];
if (mask & 4u)
sgn *= SoA[2];
out[v] += w * sgn * data_ptrs[v][loc];
}
}
}
if (interp_fast_compare_enabled())
{
static int report_count = 0;
static long long compare_calls = 0;
if (compare_calls++ >= interp_fast_compare_limit())
return true;
const double tol = interp_fast_compare_tol();
varl = VarList;
k = 0;
while (varl)
{
var *vp = vars[k];
double ref = 0.0;
double x = pox[0], y = pox[1], z = pox[2];
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2],
BP->fgfs[vp->sgfn], ref,
x, y, z, ordn, vp->SoA, Symmetry);
const double diff = fabs(ref - out[k]);
const double scale = 1.0 + fabs(ref);
if (diff > tol * scale && report_count < 32)
{
int rank = 0;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
fprintf(stderr,
"[AMSS-INTERP-CMP][rank %d] var=%s diff=%.17e ref=%.17e fast=%.17e p=(%.17e,%.17e,%.17e)\n",
rank, vp->name, diff, ref, out[k], pox[0], pox[1], pox[2]);
report_count++;
}
varl = varl->next;
k++;
}
}
return true;
}
bool interpolate_var_list_fast(Block *BP, MyList<var> *VarList, int num_var,
const double *pox, int ordn, int Symmetry,
double *out)
{
if (!interp_fast_enabled())
return false;
FastInterpStencil st;
if (!prepare_fast_interp_stencil(BP, pox, ordn, Symmetry, st))
return false;
return interpolate_var_list_with_stencil(BP, VarList, num_var, pox,
ordn, Symmetry, st, out);
}
struct CachedInterpPoint
{
Block *bp;
int owner_rank;
FastInterpStencil stencil;
};
struct SurfaceInterpCache
{
Patch *patch;
int NN;
int symmetry;
double key[9];
vector<CachedInterpPoint> points;
SurfaceInterpCache() : patch(0), NN(0), symmetry(-1) {}
};
bool surface_cache_key_matches(const SurfaceInterpCache &cache, Patch *patch,
int NN, double **XX, int Symmetry)
{
if (cache.patch != patch || cache.NN != NN || cache.symmetry != Symmetry ||
int(cache.points.size()) != NN || NN <= 0)
return false;
const int mid = NN / 2;
const int last = NN - 1;
const int ids[3] = {0, mid, last};
int p = 0;
for (int q = 0; q < 3; q++)
for (int d = 0; d < dim; d++)
if (cache.key[p++] != XX[d][ids[q]])
return false;
return true;
}
SurfaceInterpCache *find_surface_cache(Patch *patch, int NN, double **XX,
int Symmetry)
{
static vector<SurfaceInterpCache> caches;
for (size_t i = 0; i < caches.size(); i++)
if (surface_cache_key_matches(caches[i], patch, NN, XX, Symmetry))
return &caches[i];
if (caches.size() >= 24)
caches.erase(caches.begin());
caches.push_back(SurfaceInterpCache());
return &caches.back();
}
bool build_surface_cache(SurfaceInterpCache &cache, Patch *patch, int NN,
double **XX, int Symmetry, const double *DH,
const BlockBinIndex &block_index, int ordn)
{
int myrank = 0;
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
cache.patch = patch;
cache.NN = NN;
cache.symmetry = Symmetry;
cache.points.clear();
cache.points.resize(NN);
const int mid = NN / 2;
const int last = NN - 1;
const int ids[3] = {0, mid, last};
int p = 0;
for (int q = 0; q < 3; q++)
for (int d = 0; d < dim; d++)
cache.key[p++] = XX[d][ids[q]];
for (int j = 0; j < NN; j++)
{
double pox[dim];
for (int d = 0; d < dim; d++)
pox[d] = XX[d][j];
const int block_i = find_block_index_for_point(block_index, pox, DH);
if (block_i < 0)
{
cache.points[j].bp = 0;
cache.points[j].owner_rank = -1;
continue;
}
Block *BP = block_index.views[block_i].bp;
cache.points[j].bp = BP;
cache.points[j].owner_rank = BP->rank;
cache.points[j].stencil.nsamples = 0;
if (BP->rank == myrank)
{
if (!prepare_fast_interp_stencil(BP, pox, ordn, Symmetry,
cache.points[j].stencil))
return false;
}
}
return true;
}
} // namespace
Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
@@ -565,6 +992,9 @@ void Patch::Interp_Points(MyList<var> *VarList,
if (myrank == BP->rank)
{
//---> interpolation
if (!interpolate_var_list_fast(BP, VarList, num_var, pox, ordn,
Symmetry, Shellf + j * num_var))
{
varl = VarList;
int k = 0;
while (varl) // run along variables
@@ -577,6 +1007,7 @@ void Patch::Interp_Points(MyList<var> *VarList,
}
}
}
}
// Replace MPI_Allreduce with per-owner MPI_Bcast:
// Group consecutive points by owner rank and broadcast each group.
@@ -659,8 +1090,6 @@ void Patch::Interp_Points(MyList<var> *VarList,
varl = varl->next;
}
memset(Shellf, 0, sizeof(double) * NN * num_var);
// owner_rank[j] records which MPI rank owns point j
int *owner_rank;
owner_rank = new int[NN];
@@ -672,6 +1101,16 @@ void Patch::Interp_Points(MyList<var> *VarList,
DH[i] = getdX(i);
BlockBinIndex block_index;
build_block_bin_index(this, DH, block_index);
SurfaceInterpCache *surface_cache = 0;
bool use_surface_cache = false;
if (interp_fast_enabled())
{
surface_cache = find_surface_cache(this, NN, XX, Symmetry);
use_surface_cache = surface_cache_key_matches(*surface_cache, this, NN, XX, Symmetry);
if (!use_surface_cache)
use_surface_cache = build_surface_cache(*surface_cache, this, NN, XX,
Symmetry, DH, block_index, ordn);
}
// --- Interpolation phase (identical to original) ---
for (int j = 0; j < NN; j++)
@@ -695,6 +1134,31 @@ void Patch::Interp_Points(MyList<var> *VarList,
}
}
if (use_surface_cache)
{
CachedInterpPoint &cp = surface_cache->points[j];
Block *BP = cp.bp;
owner_rank[j] = cp.owner_rank;
if (BP && myrank == BP->rank)
{
if (!interpolate_var_list_with_stencil(BP, VarList, num_var, pox,
ordn, Symmetry, cp.stencil,
Shellf + j * num_var))
{
MyList<var> *varl_fallback = VarList;
int k = 0;
while (varl_fallback)
{
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl_fallback->data->sgfn], Shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl_fallback->data->SoA, Symmetry);
varl_fallback = varl_fallback->next;
k++;
}
}
}
}
else
{
const int block_i = find_block_index_for_point(block_index, pox, DH);
if (block_i >= 0)
{
@@ -702,18 +1166,23 @@ void Patch::Interp_Points(MyList<var> *VarList,
owner_rank[j] = BP->rank;
if (myrank == BP->rank)
{
varl = VarList;
int k = 0;
while (varl)
if (!interpolate_var_list_fast(BP, VarList, num_var, pox, ordn,
Symmetry, Shellf + j * num_var))
{
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
varl = varl->next;
MyList<var> *varl_fallback = VarList;
int k = 0;
while (varl_fallback)
{
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl_fallback->data->sgfn], Shellf[j * num_var + k],
pox[0], pox[1], pox[2], ordn, varl_fallback->data->SoA, Symmetry);
varl_fallback = varl_fallback->next;
k++;
}
}
}
}
}
}
#ifdef INTERP_LB_PROFILE
double t_interp_end = MPI_Wtime();
@@ -969,6 +1438,9 @@ void Patch::Interp_Points(MyList<var> *VarList,
if (myrank == BP->rank)
{
//---> interpolation
if (!interpolate_var_list_fast(BP, VarList, num_var, pox, ordn,
Symmetry, Shellf + j * num_var))
{
varl = VarList;
int k = 0;
while (varl) // run along variables
@@ -981,6 +1453,7 @@ void Patch::Interp_Points(MyList<var> *VarList,
}
}
}
}
// Collect unique global owner ranks and translate to local ranks in Comm_here
// Then broadcast each owner's points via MPI_Bcast on Comm_here

View File

@@ -45,6 +45,20 @@ using namespace std;
#include "derivatives.h"
#include "ricci_gamma.h"
namespace
{
bool amss_analysis_timing_enabled()
{
static int enabled = -1;
if (enabled < 0)
{
const char *env = getenv("AMSS_ANALYSIS_TIMING");
enabled = (env && atoi(env) != 0) ? 1 : 0;
}
return enabled != 0;
}
}
// Compile-time switch for per-timestep memory usage collection/printing.
// Default is OFF to reduce overhead in production runs.
#ifndef BSSN_ENABLE_MEM_USAGE_LOG
@@ -7942,6 +7956,10 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)
if (LastAnas >= AnasTime)
{
const bool analysis_timing = amss_analysis_timing_enabled();
const double t_analysis_start = analysis_timing ? MPI_Wtime() : 0.0;
double t_psi4_sec = 0.0;
double t_surface_sec = 0.0;
#ifdef Point_Psi4
#error "not support parallel levels yet"
// Gam_ijk and R_ij have been calculated in Interp_Constraint()
@@ -8134,7 +8152,12 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)
#endif
}
#else
{
const double t0 = analysis_timing ? MPI_Wtime() : 0.0;
Compute_Psi4(lev);
if (analysis_timing)
t_psi4_sec += MPI_Wtime() - t0;
}
#endif
double *RP, *IP, *RoutMAP;
int NN = 0;
@@ -8151,6 +8174,7 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)
#endif
for (int i = 0; i < decn; i++)
{
const double t_surface0 = analysis_timing ? MPI_Wtime() : 0.0;
#ifdef Point_Psi4
Waveshell->surf_Wave(Rex, GH, SH,
phi, trK,
@@ -8243,6 +8267,8 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)
#endif
#endif
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"end surface integral");
if (analysis_timing)
t_surface_sec += MPI_Wtime() - t_surface0;
#endif
if (i == 0)
{
@@ -8275,6 +8301,13 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)
delete[] RP;
delete[] IP;
delete[] RoutMAP;
if (analysis_timing)
{
fprintf(stderr,
"[AMSS-ANALYSIS][rank %d] lev=%d psi4=%.6f surface=%.6f total_before_bh=%.6f detectors=%d modes=%d\n",
myrank, lev, t_psi4_sec, t_surface_sec,
MPI_Wtime() - t_analysis_start, decn, NN);
}
// black hole's position
{

View File

@@ -11,6 +11,7 @@
#include <strstream>
#include <cmath>
#include <map>
#include <cstdlib>
using namespace std;
#else
#include <iostream.h>
@@ -32,6 +33,20 @@ using namespace std;
#include "parameters.h"
#define PI M_PI
namespace
{
bool amss_surface_timing_enabled()
{
static int enabled = -1;
if (enabled < 0)
{
const char *env = getenv("AMSS_SURFACE_TIMING");
enabled = (env && atoi(env) != 0) ? 1 : 0;
}
return enabled != 0;
}
}
//|============================================================================
//| Constructor
//|============================================================================
@@ -3281,6 +3296,8 @@ void surface_integral::surf_WaveMassPAng(double rex, int lev, cgh *GH,
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
double *Rout, monitor *Monitor, bool refresh_mass_fields)
{
const bool timing = amss_surface_timing_enabled();
const double t_start = timing ? MPI_Wtime() : 0.0;
if (Symmetry != 0 && Symmetry != 1)
{
surf_Wave(rex, lev, GH, Rpsi4, Ipsi4, spinw, maxl, NN, RP, IP, Monitor);
@@ -3325,6 +3342,7 @@ void surface_integral::surf_WaveMassPAng(double rex, int lev, cgh *GH,
Pp = Pp->next;
}
}
const double t_refresh_done = timing ? MPI_Wtime() : 0.0;
const int InList = 19;
const int idx_rpsi4 = 0, idx_ipsi4 = 1;
@@ -3380,6 +3398,7 @@ void surface_integral::surf_WaveMassPAng(double rex, int lev, cgh *GH,
double *shellf = new double[n_tot * InList];
GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);
const double t_interp_done = timing ? MPI_Wtime() : 0.0;
double *RP_out = new double[NN];
double *IP_out = new double[NN];
@@ -3496,6 +3515,7 @@ void surface_integral::surf_WaveMassPAng(double rex, int lev, cgh *GH,
if (Symmetry == 0)
p_outz += f1o8 * Psi * (nx_g[n] * axz + ny_g[n] * ayz + nz_g[n] * azz) * theta_weight;
}
const double t_integral_done = timing ? MPI_Wtime() : 0.0;
for (int ii = 0; ii < NN; ii++)
{
@@ -3534,6 +3554,7 @@ void surface_integral::surf_WaveMassPAng(double rex, int lev, cgh *GH,
delete[] reduce_out;
delete[] reduce_in;
}
const double t_reduce_done = timing ? MPI_Wtime() : 0.0;
#ifdef GaussInt
mass = mass * rex * rex * dphi * factor;
@@ -3565,6 +3586,19 @@ void surface_integral::surf_WaveMassPAng(double rex, int lev, cgh *GH,
Rout[5] = sy;
Rout[6] = sz;
if (timing)
{
fprintf(stderr,
"[AMSS-SURFACE][rank %d] rex=%.6g lev=%d refresh=%.6f interp=%.6f integral=%.6f reduce=%.6f total=%.6f nlocal=%d ntotal=%d modes=%d\n",
myrank, rex, lev,
t_refresh_done - t_start,
t_interp_done - t_refresh_done,
t_integral_done - t_interp_done,
t_reduce_done - t_integral_done,
t_reduce_done - t_start,
Nmax - Nmin + 1, n_tot, NN);
}
delete[] pox[0];
delete[] pox[1];
delete[] pox[2];