Optimize BSSN surface interpolation fast path
This commit is contained in:
@@ -154,8 +154,8 @@ void build_block_bin_index(Patch *patch, const double *DH, BlockBinIndex &index)
|
||||
index.valid = true;
|
||||
}
|
||||
|
||||
int find_block_index_for_point(const BlockBinIndex &index, const double *pox, const double *DH)
|
||||
{
|
||||
int find_block_index_for_point(const BlockBinIndex &index, const double *pox, const double *DH)
|
||||
{
|
||||
if (!index.valid)
|
||||
return -1;
|
||||
|
||||
@@ -175,10 +175,437 @@ int find_block_index_for_point(const BlockBinIndex &index, const double *pox, co
|
||||
for (size_t bi = 0; bi < index.views.size(); bi++)
|
||||
if (point_in_block_view(index.views[bi], pox, DH))
|
||||
return int(bi);
|
||||
|
||||
return -1;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
inline int fortran_idint_local(double x)
|
||||
{
|
||||
return int(x);
|
||||
}
|
||||
|
||||
bool interp_fast_enabled()
|
||||
{
|
||||
static int enabled = -1;
|
||||
if (enabled < 0)
|
||||
{
|
||||
const char *env = getenv("AMSS_INTERP_FAST");
|
||||
enabled = (!env || atoi(env) != 0) ? 1 : 0;
|
||||
}
|
||||
return enabled != 0;
|
||||
}
|
||||
|
||||
bool interp_fast_compare_enabled()
|
||||
{
|
||||
static int enabled = -1;
|
||||
if (enabled < 0)
|
||||
{
|
||||
const char *env = getenv("AMSS_INTERP_FAST_COMPARE");
|
||||
enabled = (env && atoi(env) != 0) ? 1 : 0;
|
||||
}
|
||||
return enabled != 0;
|
||||
}
|
||||
|
||||
double interp_fast_compare_tol()
|
||||
{
|
||||
static double tol = -1.0;
|
||||
if (tol < 0.0)
|
||||
{
|
||||
const char *env = getenv("AMSS_INTERP_FAST_COMPARE_TOL");
|
||||
tol = (env && atof(env) > 0.0) ? atof(env) : 1.0e-11;
|
||||
}
|
||||
return tol;
|
||||
}
|
||||
|
||||
long long interp_fast_compare_limit()
|
||||
{
|
||||
static long long limit = -1;
|
||||
if (limit < 0)
|
||||
{
|
||||
const char *env = getenv("AMSS_INTERP_FAST_COMPARE_LIMIT");
|
||||
limit = (env && atoll(env) > 0) ? atoll(env) : 4096;
|
||||
}
|
||||
return limit;
|
||||
}
|
||||
|
||||
struct FastInterpStencil
|
||||
{
|
||||
int cxB[dim];
|
||||
double cx[dim];
|
||||
double wx[8];
|
||||
double wy[8];
|
||||
double wz[8];
|
||||
int nsamples;
|
||||
int loc[512];
|
||||
unsigned char sign_mask[512];
|
||||
double weight[512];
|
||||
};
|
||||
|
||||
inline void lagrange_unit_weights(double x, int ordn, double *w)
|
||||
{
|
||||
for (int i = 0; i < ordn; i++)
|
||||
{
|
||||
double num = 1.0;
|
||||
double den = 1.0;
|
||||
for (int j = 0; j < ordn; j++)
|
||||
{
|
||||
if (j == i)
|
||||
continue;
|
||||
num *= (x - double(j));
|
||||
den *= double(i - j);
|
||||
}
|
||||
w[i] = num / den;
|
||||
}
|
||||
}
|
||||
|
||||
inline void z_unit_weights(double x, int ordn, double *w)
|
||||
{
|
||||
if (ordn == 6)
|
||||
{
|
||||
static const double c_uniform[6] = {-1.0, 5.0, -10.0, 10.0, -5.0, 1.0};
|
||||
for (int i = 0; i < 6; i++)
|
||||
{
|
||||
if (x == double(i))
|
||||
{
|
||||
for (int j = 0; j < 6; j++)
|
||||
w[j] = (j == i) ? 1.0 : 0.0;
|
||||
return;
|
||||
}
|
||||
}
|
||||
double den = 0.0;
|
||||
for (int i = 0; i < 6; i++)
|
||||
{
|
||||
w[i] = c_uniform[i] / (x - double(i));
|
||||
den += w[i];
|
||||
}
|
||||
for (int i = 0; i < 6; i++)
|
||||
w[i] /= den;
|
||||
return;
|
||||
}
|
||||
lagrange_unit_weights(x, ordn, w);
|
||||
}
|
||||
|
||||
inline bool fast_interp_map_index(int idx, int extent, int d,
|
||||
int &mapped, unsigned char &mask)
|
||||
{
|
||||
if (idx > 0)
|
||||
mapped = idx;
|
||||
else
|
||||
{
|
||||
mask |= (unsigned char)(1u << d);
|
||||
#ifdef Vertex
|
||||
#ifdef Cell
|
||||
#error Both Cell and Vertex are defined
|
||||
#endif
|
||||
mapped = 2 - idx;
|
||||
#else
|
||||
#ifdef Cell
|
||||
mapped = 1 - idx;
|
||||
#else
|
||||
#error Not define Vertex nor Cell
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
return mapped >= 1 && mapped <= extent;
|
||||
}
|
||||
|
||||
bool prepare_fast_interp_stencil(Block *BP, const double *pox, int ordn,
|
||||
int Symmetry, FastInterpStencil &st)
|
||||
{
|
||||
if (!BP || ordn <= 0 || ordn > 8)
|
||||
return false;
|
||||
|
||||
st.nsamples = 0;
|
||||
|
||||
const int NO_SYMM = 0;
|
||||
const int OCTANT = 2;
|
||||
int cmin[dim], cmax[dim], cxT[dim];
|
||||
for (int d = 0; d < dim; d++)
|
||||
{
|
||||
const double *X = BP->X[d];
|
||||
const double dX = X[1] - X[0];
|
||||
const int cxI = fortran_idint_local((pox[d] - X[0]) / dX + 0.4) + 1;
|
||||
st.cxB[d] = cxI - ordn / 2 + 1;
|
||||
cxT[d] = st.cxB[d] + ordn - 1;
|
||||
cmin[d] = 1;
|
||||
cmax[d] = BP->shape[d];
|
||||
|
||||
#ifdef Vertex
|
||||
#ifdef Cell
|
||||
#error Both Cell and Vertex are defined
|
||||
#endif
|
||||
if (Symmetry == OCTANT && d < 2 && fabs(X[0]) < dX)
|
||||
cmin[d] = -ordn / 2 + 2;
|
||||
if (Symmetry != NO_SYMM && d == 2 && fabs(X[0]) < dX)
|
||||
cmin[d] = -ordn / 2 + 2;
|
||||
#else
|
||||
#ifdef Cell
|
||||
if (Symmetry == OCTANT && d < 2 && fabs(X[0]) < dX)
|
||||
cmin[d] = -ordn / 2 + 1;
|
||||
if (Symmetry != NO_SYMM && d == 2 && fabs(X[0]) < dX)
|
||||
cmin[d] = -ordn / 2 + 1;
|
||||
#else
|
||||
#error Not define Vertex nor Cell
|
||||
#endif
|
||||
#endif
|
||||
|
||||
if (st.cxB[d] < cmin[d])
|
||||
{
|
||||
st.cxB[d] = cmin[d];
|
||||
cxT[d] = st.cxB[d] + ordn - 1;
|
||||
}
|
||||
if (cxT[d] > cmax[d])
|
||||
{
|
||||
cxT[d] = cmax[d];
|
||||
st.cxB[d] = cxT[d] + 1 - ordn;
|
||||
}
|
||||
|
||||
if (st.cxB[d] > 0)
|
||||
st.cx[d] = (pox[d] - X[st.cxB[d] - 1]) / dX;
|
||||
else
|
||||
{
|
||||
#ifdef Vertex
|
||||
#ifdef Cell
|
||||
#error Both Cell and Vertex are defined
|
||||
#endif
|
||||
st.cx[d] = (pox[d] + X[1 - st.cxB[d]]) / dX;
|
||||
#else
|
||||
#ifdef Cell
|
||||
st.cx[d] = (pox[d] + X[-st.cxB[d]]) / dX;
|
||||
#else
|
||||
#error Not define Vertex nor Cell
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
lagrange_unit_weights(st.cx[0], ordn, st.wx);
|
||||
lagrange_unit_weights(st.cx[1], ordn, st.wy);
|
||||
z_unit_weights(st.cx[2], ordn, st.wz);
|
||||
|
||||
for (int kk = 0; kk < ordn; kk++)
|
||||
{
|
||||
for (int jj = 0; jj < ordn; jj++)
|
||||
{
|
||||
for (int ii = 0; ii < ordn; ii++)
|
||||
{
|
||||
unsigned char mask = 0;
|
||||
int ix, iy, iz;
|
||||
if (!fast_interp_map_index(st.cxB[0] + ii, BP->shape[0], 0, ix, mask) ||
|
||||
!fast_interp_map_index(st.cxB[1] + jj, BP->shape[1], 1, iy, mask) ||
|
||||
!fast_interp_map_index(st.cxB[2] + kk, BP->shape[2], 2, iz, mask))
|
||||
return false;
|
||||
const int s = st.nsamples++;
|
||||
st.loc[s] = (ix - 1) + (iy - 1) * BP->shape[0] +
|
||||
(iz - 1) * BP->shape[0] * BP->shape[1];
|
||||
st.sign_mask[s] = mask;
|
||||
st.weight[s] = st.wx[ii] * st.wy[jj] * st.wz[kk];
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool interpolate_var_list_with_stencil(Block *BP, MyList<var> *VarList,
|
||||
int num_var, const double *pox,
|
||||
int ordn, int Symmetry,
|
||||
const FastInterpStencil &st,
|
||||
double *out)
|
||||
{
|
||||
if (num_var <= 0 || num_var > 128)
|
||||
return false;
|
||||
|
||||
double *data_ptrs[128];
|
||||
double *soa_ptrs[128];
|
||||
var *vars[128];
|
||||
MyList<var> *varl = VarList;
|
||||
int k = 0;
|
||||
while (varl)
|
||||
{
|
||||
if (k >= num_var)
|
||||
return false;
|
||||
vars[k] = varl->data;
|
||||
data_ptrs[k] = BP->fgfs[vars[k]->sgfn];
|
||||
soa_ptrs[k] = vars[k]->SoA;
|
||||
out[k] = 0.0;
|
||||
varl = varl->next;
|
||||
k++;
|
||||
}
|
||||
|
||||
if (k != num_var)
|
||||
return false;
|
||||
|
||||
for (int s = 0; s < st.nsamples; s++)
|
||||
{
|
||||
const int loc = st.loc[s];
|
||||
const double w = st.weight[s];
|
||||
const unsigned char mask = st.sign_mask[s];
|
||||
if (mask == 0)
|
||||
{
|
||||
for (int v = 0; v < num_var; v++)
|
||||
out[v] += w * data_ptrs[v][loc];
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int v = 0; v < num_var; v++)
|
||||
{
|
||||
const double *SoA = soa_ptrs[v];
|
||||
double sgn = 1.0;
|
||||
if (mask & 1u)
|
||||
sgn *= SoA[0];
|
||||
if (mask & 2u)
|
||||
sgn *= SoA[1];
|
||||
if (mask & 4u)
|
||||
sgn *= SoA[2];
|
||||
out[v] += w * sgn * data_ptrs[v][loc];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (interp_fast_compare_enabled())
|
||||
{
|
||||
static int report_count = 0;
|
||||
static long long compare_calls = 0;
|
||||
if (compare_calls++ >= interp_fast_compare_limit())
|
||||
return true;
|
||||
const double tol = interp_fast_compare_tol();
|
||||
varl = VarList;
|
||||
k = 0;
|
||||
while (varl)
|
||||
{
|
||||
var *vp = vars[k];
|
||||
double ref = 0.0;
|
||||
double x = pox[0], y = pox[1], z = pox[2];
|
||||
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2],
|
||||
BP->fgfs[vp->sgfn], ref,
|
||||
x, y, z, ordn, vp->SoA, Symmetry);
|
||||
const double diff = fabs(ref - out[k]);
|
||||
const double scale = 1.0 + fabs(ref);
|
||||
if (diff > tol * scale && report_count < 32)
|
||||
{
|
||||
int rank = 0;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
|
||||
fprintf(stderr,
|
||||
"[AMSS-INTERP-CMP][rank %d] var=%s diff=%.17e ref=%.17e fast=%.17e p=(%.17e,%.17e,%.17e)\n",
|
||||
rank, vp->name, diff, ref, out[k], pox[0], pox[1], pox[2]);
|
||||
report_count++;
|
||||
}
|
||||
varl = varl->next;
|
||||
k++;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool interpolate_var_list_fast(Block *BP, MyList<var> *VarList, int num_var,
|
||||
const double *pox, int ordn, int Symmetry,
|
||||
double *out)
|
||||
{
|
||||
if (!interp_fast_enabled())
|
||||
return false;
|
||||
|
||||
FastInterpStencil st;
|
||||
if (!prepare_fast_interp_stencil(BP, pox, ordn, Symmetry, st))
|
||||
return false;
|
||||
|
||||
return interpolate_var_list_with_stencil(BP, VarList, num_var, pox,
|
||||
ordn, Symmetry, st, out);
|
||||
}
|
||||
|
||||
struct CachedInterpPoint
|
||||
{
|
||||
Block *bp;
|
||||
int owner_rank;
|
||||
FastInterpStencil stencil;
|
||||
};
|
||||
|
||||
struct SurfaceInterpCache
|
||||
{
|
||||
Patch *patch;
|
||||
int NN;
|
||||
int symmetry;
|
||||
double key[9];
|
||||
vector<CachedInterpPoint> points;
|
||||
|
||||
SurfaceInterpCache() : patch(0), NN(0), symmetry(-1) {}
|
||||
};
|
||||
|
||||
bool surface_cache_key_matches(const SurfaceInterpCache &cache, Patch *patch,
|
||||
int NN, double **XX, int Symmetry)
|
||||
{
|
||||
if (cache.patch != patch || cache.NN != NN || cache.symmetry != Symmetry ||
|
||||
int(cache.points.size()) != NN || NN <= 0)
|
||||
return false;
|
||||
const int mid = NN / 2;
|
||||
const int last = NN - 1;
|
||||
const int ids[3] = {0, mid, last};
|
||||
int p = 0;
|
||||
for (int q = 0; q < 3; q++)
|
||||
for (int d = 0; d < dim; d++)
|
||||
if (cache.key[p++] != XX[d][ids[q]])
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
SurfaceInterpCache *find_surface_cache(Patch *patch, int NN, double **XX,
|
||||
int Symmetry)
|
||||
{
|
||||
static vector<SurfaceInterpCache> caches;
|
||||
for (size_t i = 0; i < caches.size(); i++)
|
||||
if (surface_cache_key_matches(caches[i], patch, NN, XX, Symmetry))
|
||||
return &caches[i];
|
||||
if (caches.size() >= 24)
|
||||
caches.erase(caches.begin());
|
||||
caches.push_back(SurfaceInterpCache());
|
||||
return &caches.back();
|
||||
}
|
||||
|
||||
bool build_surface_cache(SurfaceInterpCache &cache, Patch *patch, int NN,
|
||||
double **XX, int Symmetry, const double *DH,
|
||||
const BlockBinIndex &block_index, int ordn)
|
||||
{
|
||||
int myrank = 0;
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
||||
cache.patch = patch;
|
||||
cache.NN = NN;
|
||||
cache.symmetry = Symmetry;
|
||||
cache.points.clear();
|
||||
cache.points.resize(NN);
|
||||
const int mid = NN / 2;
|
||||
const int last = NN - 1;
|
||||
const int ids[3] = {0, mid, last};
|
||||
int p = 0;
|
||||
for (int q = 0; q < 3; q++)
|
||||
for (int d = 0; d < dim; d++)
|
||||
cache.key[p++] = XX[d][ids[q]];
|
||||
|
||||
for (int j = 0; j < NN; j++)
|
||||
{
|
||||
double pox[dim];
|
||||
for (int d = 0; d < dim; d++)
|
||||
pox[d] = XX[d][j];
|
||||
const int block_i = find_block_index_for_point(block_index, pox, DH);
|
||||
if (block_i < 0)
|
||||
{
|
||||
cache.points[j].bp = 0;
|
||||
cache.points[j].owner_rank = -1;
|
||||
continue;
|
||||
}
|
||||
Block *BP = block_index.views[block_i].bp;
|
||||
cache.points[j].bp = BP;
|
||||
cache.points[j].owner_rank = BP->rank;
|
||||
cache.points[j].stencil.nsamples = 0;
|
||||
if (BP->rank == myrank)
|
||||
{
|
||||
if (!prepare_fast_interp_stencil(BP, pox, ordn, Symmetry,
|
||||
cache.points[j].stencil))
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
Patch::Patch(int DIM, int *shapei, double *bboxi, int levi, bool buflog, int Symmetry) : lev(levi)
|
||||
{
|
||||
@@ -561,22 +988,26 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
if (block_i >= 0)
|
||||
{
|
||||
Block *BP = block_index.views[block_i].bp;
|
||||
owner_rank[j] = BP->rank;
|
||||
if (myrank == BP->rank)
|
||||
{
|
||||
//---> interpolation
|
||||
varl = VarList;
|
||||
int k = 0;
|
||||
while (varl) // run along variables
|
||||
{
|
||||
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
|
||||
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
|
||||
varl = varl->next;
|
||||
k++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
owner_rank[j] = BP->rank;
|
||||
if (myrank == BP->rank)
|
||||
{
|
||||
//---> interpolation
|
||||
if (!interpolate_var_list_fast(BP, VarList, num_var, pox, ordn,
|
||||
Symmetry, Shellf + j * num_var))
|
||||
{
|
||||
varl = VarList;
|
||||
int k = 0;
|
||||
while (varl) // run along variables
|
||||
{
|
||||
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
|
||||
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
|
||||
varl = varl->next;
|
||||
k++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Replace MPI_Allreduce with per-owner MPI_Bcast:
|
||||
// Group consecutive points by owner rank and broadcast each group.
|
||||
@@ -659,10 +1090,8 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
varl = varl->next;
|
||||
}
|
||||
|
||||
memset(Shellf, 0, sizeof(double) * NN * num_var);
|
||||
|
||||
// owner_rank[j] records which MPI rank owns point j
|
||||
int *owner_rank;
|
||||
// owner_rank[j] records which MPI rank owns point j
|
||||
int *owner_rank;
|
||||
owner_rank = new int[NN];
|
||||
for (int j = 0; j < NN; j++)
|
||||
owner_rank[j] = -1;
|
||||
@@ -670,12 +1099,22 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
double DH[dim];
|
||||
for (int i = 0; i < dim; i++)
|
||||
DH[i] = getdX(i);
|
||||
BlockBinIndex block_index;
|
||||
build_block_bin_index(this, DH, block_index);
|
||||
|
||||
// --- Interpolation phase (identical to original) ---
|
||||
for (int j = 0; j < NN; j++)
|
||||
{
|
||||
BlockBinIndex block_index;
|
||||
build_block_bin_index(this, DH, block_index);
|
||||
SurfaceInterpCache *surface_cache = 0;
|
||||
bool use_surface_cache = false;
|
||||
if (interp_fast_enabled())
|
||||
{
|
||||
surface_cache = find_surface_cache(this, NN, XX, Symmetry);
|
||||
use_surface_cache = surface_cache_key_matches(*surface_cache, this, NN, XX, Symmetry);
|
||||
if (!use_surface_cache)
|
||||
use_surface_cache = build_surface_cache(*surface_cache, this, NN, XX,
|
||||
Symmetry, DH, block_index, ordn);
|
||||
}
|
||||
|
||||
// --- Interpolation phase (identical to original) ---
|
||||
for (int j = 0; j < NN; j++)
|
||||
{
|
||||
double pox[dim];
|
||||
for (int i = 0; i < dim; i++)
|
||||
{
|
||||
@@ -692,28 +1131,58 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
cout << ") is out of current Patch." << endl;
|
||||
}
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
}
|
||||
|
||||
const int block_i = find_block_index_for_point(block_index, pox, DH);
|
||||
if (block_i >= 0)
|
||||
{
|
||||
Block *BP = block_index.views[block_i].bp;
|
||||
owner_rank[j] = BP->rank;
|
||||
if (myrank == BP->rank)
|
||||
{
|
||||
varl = VarList;
|
||||
int k = 0;
|
||||
while (varl)
|
||||
{
|
||||
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
|
||||
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
|
||||
varl = varl->next;
|
||||
k++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (use_surface_cache)
|
||||
{
|
||||
CachedInterpPoint &cp = surface_cache->points[j];
|
||||
Block *BP = cp.bp;
|
||||
owner_rank[j] = cp.owner_rank;
|
||||
if (BP && myrank == BP->rank)
|
||||
{
|
||||
if (!interpolate_var_list_with_stencil(BP, VarList, num_var, pox,
|
||||
ordn, Symmetry, cp.stencil,
|
||||
Shellf + j * num_var))
|
||||
{
|
||||
MyList<var> *varl_fallback = VarList;
|
||||
int k = 0;
|
||||
while (varl_fallback)
|
||||
{
|
||||
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl_fallback->data->sgfn], Shellf[j * num_var + k],
|
||||
pox[0], pox[1], pox[2], ordn, varl_fallback->data->SoA, Symmetry);
|
||||
varl_fallback = varl_fallback->next;
|
||||
k++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
const int block_i = find_block_index_for_point(block_index, pox, DH);
|
||||
if (block_i >= 0)
|
||||
{
|
||||
Block *BP = block_index.views[block_i].bp;
|
||||
owner_rank[j] = BP->rank;
|
||||
if (myrank == BP->rank)
|
||||
{
|
||||
if (!interpolate_var_list_fast(BP, VarList, num_var, pox, ordn,
|
||||
Symmetry, Shellf + j * num_var))
|
||||
{
|
||||
MyList<var> *varl_fallback = VarList;
|
||||
int k = 0;
|
||||
while (varl_fallback)
|
||||
{
|
||||
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl_fallback->data->sgfn], Shellf[j * num_var + k],
|
||||
pox[0], pox[1], pox[2], ordn, varl_fallback->data->SoA, Symmetry);
|
||||
varl_fallback = varl_fallback->next;
|
||||
k++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef INTERP_LB_PROFILE
|
||||
double t_interp_end = MPI_Wtime();
|
||||
@@ -965,22 +1434,26 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
if (block_i >= 0)
|
||||
{
|
||||
Block *BP = block_index.views[block_i].bp;
|
||||
owner_rank[j] = BP->rank;
|
||||
if (myrank == BP->rank)
|
||||
{
|
||||
//---> interpolation
|
||||
varl = VarList;
|
||||
int k = 0;
|
||||
while (varl) // run along variables
|
||||
{
|
||||
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
|
||||
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
|
||||
varl = varl->next;
|
||||
k++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
owner_rank[j] = BP->rank;
|
||||
if (myrank == BP->rank)
|
||||
{
|
||||
//---> interpolation
|
||||
if (!interpolate_var_list_fast(BP, VarList, num_var, pox, ordn,
|
||||
Symmetry, Shellf + j * num_var))
|
||||
{
|
||||
varl = VarList;
|
||||
int k = 0;
|
||||
while (varl) // run along variables
|
||||
{
|
||||
f_global_interp(BP->shape, BP->X[0], BP->X[1], BP->X[2], BP->fgfs[varl->data->sgfn], Shellf[j * num_var + k],
|
||||
pox[0], pox[1], pox[2], ordn, varl->data->SoA, Symmetry);
|
||||
varl = varl->next;
|
||||
k++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Collect unique global owner ranks and translate to local ranks in Comm_here
|
||||
// Then broadcast each owner's points via MPI_Bcast on Comm_here
|
||||
|
||||
@@ -45,6 +45,20 @@ using namespace std;
|
||||
#include "derivatives.h"
|
||||
#include "ricci_gamma.h"
|
||||
|
||||
namespace
|
||||
{
|
||||
bool amss_analysis_timing_enabled()
|
||||
{
|
||||
static int enabled = -1;
|
||||
if (enabled < 0)
|
||||
{
|
||||
const char *env = getenv("AMSS_ANALYSIS_TIMING");
|
||||
enabled = (env && atoi(env) != 0) ? 1 : 0;
|
||||
}
|
||||
return enabled != 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Compile-time switch for per-timestep memory usage collection/printing.
|
||||
// Default is OFF to reduce overhead in production runs.
|
||||
#ifndef BSSN_ENABLE_MEM_USAGE_LOG
|
||||
@@ -7942,6 +7956,10 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)
|
||||
|
||||
if (LastAnas >= AnasTime)
|
||||
{
|
||||
const bool analysis_timing = amss_analysis_timing_enabled();
|
||||
const double t_analysis_start = analysis_timing ? MPI_Wtime() : 0.0;
|
||||
double t_psi4_sec = 0.0;
|
||||
double t_surface_sec = 0.0;
|
||||
#ifdef Point_Psi4
|
||||
#error "not support parallel levels yet"
|
||||
// Gam_ijk and R_ij have been calculated in Interp_Constraint()
|
||||
@@ -8134,7 +8152,12 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)
|
||||
#endif
|
||||
}
|
||||
#else
|
||||
{
|
||||
const double t0 = analysis_timing ? MPI_Wtime() : 0.0;
|
||||
Compute_Psi4(lev);
|
||||
if (analysis_timing)
|
||||
t_psi4_sec += MPI_Wtime() - t0;
|
||||
}
|
||||
#endif
|
||||
double *RP, *IP, *RoutMAP;
|
||||
int NN = 0;
|
||||
@@ -8150,7 +8173,8 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)
|
||||
bool shell_mass_prepared = false;
|
||||
#endif
|
||||
for (int i = 0; i < decn; i++)
|
||||
{
|
||||
{
|
||||
const double t_surface0 = analysis_timing ? MPI_Wtime() : 0.0;
|
||||
#ifdef Point_Psi4
|
||||
Waveshell->surf_Wave(Rex, GH, SH,
|
||||
phi, trK,
|
||||
@@ -8243,6 +8267,8 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)
|
||||
#endif
|
||||
#endif
|
||||
// misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"end surface integral");
|
||||
if (analysis_timing)
|
||||
t_surface_sec += MPI_Wtime() - t_surface0;
|
||||
#endif
|
||||
if (i == 0)
|
||||
{
|
||||
@@ -8275,6 +8301,13 @@ void bssn_class::AnalysisStuff(int lev, double dT_lev)
|
||||
delete[] RP;
|
||||
delete[] IP;
|
||||
delete[] RoutMAP;
|
||||
if (analysis_timing)
|
||||
{
|
||||
fprintf(stderr,
|
||||
"[AMSS-ANALYSIS][rank %d] lev=%d psi4=%.6f surface=%.6f total_before_bh=%.6f detectors=%d modes=%d\n",
|
||||
myrank, lev, t_psi4_sec, t_surface_sec,
|
||||
MPI_Wtime() - t_analysis_start, decn, NN);
|
||||
}
|
||||
|
||||
// black hole's position
|
||||
{
|
||||
|
||||
@@ -8,10 +8,11 @@
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
#include <fstream>
|
||||
#include <strstream>
|
||||
#include <cmath>
|
||||
#include <map>
|
||||
using namespace std;
|
||||
#include <strstream>
|
||||
#include <cmath>
|
||||
#include <map>
|
||||
#include <cstdlib>
|
||||
using namespace std;
|
||||
#else
|
||||
#include <iostream.h>
|
||||
#include <iomanip.h>
|
||||
@@ -29,12 +30,26 @@ using namespace std;
|
||||
#include "fadmquantites_bssn.h"
|
||||
#include "getnpem2.h"
|
||||
#include "getnp4.h"
|
||||
#include "parameters.h"
|
||||
|
||||
#define PI M_PI
|
||||
//|============================================================================
|
||||
//| Constructor
|
||||
//|============================================================================
|
||||
#include "parameters.h"
|
||||
|
||||
#define PI M_PI
|
||||
|
||||
namespace
|
||||
{
|
||||
bool amss_surface_timing_enabled()
|
||||
{
|
||||
static int enabled = -1;
|
||||
if (enabled < 0)
|
||||
{
|
||||
const char *env = getenv("AMSS_SURFACE_TIMING");
|
||||
enabled = (env && atoi(env) != 0) ? 1 : 0;
|
||||
}
|
||||
return enabled != 0;
|
||||
}
|
||||
}
|
||||
//|============================================================================
|
||||
//| Constructor
|
||||
//|============================================================================
|
||||
|
||||
surface_integral::surface_integral(int iSymmetry) : Symmetry(iSymmetry),
|
||||
wave_cache_spinw(-1),
|
||||
@@ -3281,6 +3296,8 @@ void surface_integral::surf_WaveMassPAng(double rex, int lev, cgh *GH,
|
||||
var *Sfx_rhs, var *Sfy_rhs, var *Sfz_rhs,
|
||||
double *Rout, monitor *Monitor, bool refresh_mass_fields)
|
||||
{
|
||||
const bool timing = amss_surface_timing_enabled();
|
||||
const double t_start = timing ? MPI_Wtime() : 0.0;
|
||||
if (Symmetry != 0 && Symmetry != 1)
|
||||
{
|
||||
surf_Wave(rex, lev, GH, Rpsi4, Ipsi4, spinw, maxl, NN, RP, IP, Monitor);
|
||||
@@ -3325,6 +3342,7 @@ void surface_integral::surf_WaveMassPAng(double rex, int lev, cgh *GH,
|
||||
Pp = Pp->next;
|
||||
}
|
||||
}
|
||||
const double t_refresh_done = timing ? MPI_Wtime() : 0.0;
|
||||
|
||||
const int InList = 19;
|
||||
const int idx_rpsi4 = 0, idx_ipsi4 = 1;
|
||||
@@ -3380,6 +3398,7 @@ void surface_integral::surf_WaveMassPAng(double rex, int lev, cgh *GH,
|
||||
|
||||
double *shellf = new double[n_tot * InList];
|
||||
GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax);
|
||||
const double t_interp_done = timing ? MPI_Wtime() : 0.0;
|
||||
|
||||
double *RP_out = new double[NN];
|
||||
double *IP_out = new double[NN];
|
||||
@@ -3496,6 +3515,7 @@ void surface_integral::surf_WaveMassPAng(double rex, int lev, cgh *GH,
|
||||
if (Symmetry == 0)
|
||||
p_outz += f1o8 * Psi * (nx_g[n] * axz + ny_g[n] * ayz + nz_g[n] * azz) * theta_weight;
|
||||
}
|
||||
const double t_integral_done = timing ? MPI_Wtime() : 0.0;
|
||||
|
||||
for (int ii = 0; ii < NN; ii++)
|
||||
{
|
||||
@@ -3534,6 +3554,7 @@ void surface_integral::surf_WaveMassPAng(double rex, int lev, cgh *GH,
|
||||
delete[] reduce_out;
|
||||
delete[] reduce_in;
|
||||
}
|
||||
const double t_reduce_done = timing ? MPI_Wtime() : 0.0;
|
||||
|
||||
#ifdef GaussInt
|
||||
mass = mass * rex * rex * dphi * factor;
|
||||
@@ -3565,6 +3586,19 @@ void surface_integral::surf_WaveMassPAng(double rex, int lev, cgh *GH,
|
||||
Rout[5] = sy;
|
||||
Rout[6] = sz;
|
||||
|
||||
if (timing)
|
||||
{
|
||||
fprintf(stderr,
|
||||
"[AMSS-SURFACE][rank %d] rex=%.6g lev=%d refresh=%.6f interp=%.6f integral=%.6f reduce=%.6f total=%.6f nlocal=%d ntotal=%d modes=%d\n",
|
||||
myrank, rex, lev,
|
||||
t_refresh_done - t_start,
|
||||
t_interp_done - t_refresh_done,
|
||||
t_integral_done - t_interp_done,
|
||||
t_reduce_done - t_integral_done,
|
||||
t_reduce_done - t_start,
|
||||
Nmax - Nmin + 1, n_tot, NN);
|
||||
}
|
||||
|
||||
delete[] pox[0];
|
||||
delete[] pox[1];
|
||||
delete[] pox[2];
|
||||
|
||||
Reference in New Issue
Block a user