From 42e851d19a5bb1c1d513ca5f367c8a541d71833c Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Thu, 9 Apr 2026 15:21:01 +0800 Subject: [PATCH] Cache repeated interpolation plans --- AMSS_NCKU_source/MPatch.C | 438 ++++++++++++++-------------- AMSS_NCKU_source/bssn_cuda_ops.cu | 135 ++++++--- AMSS_NCKU_source/surface_integral.C | 179 ++++++------ AMSS_NCKU_source/surface_integral.h | 46 ++- 4 files changed, 428 insertions(+), 370 deletions(-) diff --git a/AMSS_NCKU_source/MPatch.C b/AMSS_NCKU_source/MPatch.C index 18fe7a4..68ac732 100644 --- a/AMSS_NCKU_source/MPatch.C +++ b/AMSS_NCKU_source/MPatch.C @@ -2,13 +2,14 @@ #include #include #include -#include -#include -#include -#include -#include -#include -using namespace std; +#include +#include +#include +#include +#include +#include +#include +using namespace std; #include "misc.h" #include "MPatch.h" @@ -40,6 +41,44 @@ struct InterpVarDesc double soa[dim]; }; +struct InterpPlanKey +{ + const Patch *patch; + const double *x; + const double *y; + const double *z; + int NN; + int Symmetry; + int myrank; +}; + +struct InterpPlanKeyLess +{ + bool operator()(const InterpPlanKey &lhs, const InterpPlanKey &rhs) const + { + if (lhs.patch != rhs.patch) return lhs.patch < rhs.patch; + if (lhs.x != rhs.x) return lhs.x < rhs.x; + if (lhs.y != rhs.y) return lhs.y < rhs.y; + if (lhs.z != rhs.z) return lhs.z < rhs.z; + if (lhs.NN != rhs.NN) return lhs.NN < rhs.NN; + if (lhs.Symmetry != rhs.Symmetry) return lhs.Symmetry < rhs.Symmetry; + return lhs.myrank < rhs.myrank; + } +}; + +struct CachedInterpPlan +{ + int nblocks; + vector owner_rank; + vector owner_block; + vector > block_points; + vector > block_px; + vector > block_py; + vector > block_pz; + + CachedInterpPlan() : nblocks(0) {} +}; + struct InterpBlockView { Block *bp; @@ -229,10 +268,124 @@ bool should_try_cuda_interp(int ordn, int num_points, int num_var) return num_points * num_var >= 256; } +CachedInterpPlan &get_cached_interp_plan(Patch *patch, + int NN, double **XX, + int Symmetry, int myrank, + const double *DH, + const BlockBinIndex &block_index, + bool report_bounds_here, + bool allow_missing_points) +{ + static map cache; + + InterpPlanKey key; + key.patch = patch; + key.x = XX[0]; + key.y = XX[1]; + key.z = XX[2]; + key.NN = NN; + key.Symmetry = Symmetry; + key.myrank = myrank; + + map::iterator it = cache.find(key); + if (it != cache.end() && it->second.nblocks == static_cast(block_index.views.size())) + return it->second; + + CachedInterpPlan &plan = cache[key]; + plan = CachedInterpPlan(); + plan.nblocks = static_cast(block_index.views.size()); + plan.owner_rank.assign(NN, -1); + plan.owner_block.assign(NN, -1); + plan.block_points.resize(plan.nblocks); + plan.block_px.resize(plan.nblocks); + plan.block_py.resize(plan.nblocks); + plan.block_pz.resize(plan.nblocks); + + for (int j = 0; j < NN; ++j) + { + double pox[dim]; + for (int i = 0; i < dim; ++i) + { + pox[i] = XX[i][j]; + if (report_bounds_here && + (XX[i][j] < patch->bbox[i] + patch->lli[i] * DH[i] || + XX[i][j] > patch->bbox[dim + i] - patch->uui[i] * DH[i])) + { + cout << "Patch::Interp_Points: point ("; + for (int k = 0; k < dim; ++k) + { + cout << XX[k][j]; + if (k < dim - 1) + cout << ","; + else + cout << ") is out of current Patch." << endl; + } + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + + const int block_i = find_block_index_for_point(block_index, pox, DH); + if (block_i >= 0) + { + Block *BP = block_index.views[block_i].bp; + plan.owner_rank[j] = BP->rank; + plan.owner_block[j] = block_i; + if (BP->rank == myrank) + { + plan.block_points[block_i].push_back(j); + plan.block_px[block_i].push_back(XX[0][j]); + plan.block_py[block_i].push_back(XX[1][j]); + plan.block_pz[block_i].push_back(XX[2][j]); + } + } + } + + if (!allow_missing_points && report_bounds_here) + { + for (int j = 0; j < NN; ++j) + { + if (plan.owner_rank[j] >= 0) + continue; + cout << "ERROR: Patch::Interp_Points fails to find point ("; + for (int d = 0; d < dim; ++d) + { + cout << XX[d][j]; + if (d < dim - 1) + cout << ","; + else + cout << ")"; + } + cout << " on Patch ("; + for (int d = 0; d < dim; ++d) + { + cout << patch->bbox[d] << "+" << patch->lli[d] * DH[d]; + if (d < dim - 1) + cout << ","; + else + cout << ")--"; + } + cout << "("; + for (int d = 0; d < dim; ++d) + { + cout << patch->bbox[dim + d] << "-" << patch->uui[d] * DH[d]; + if (d < dim - 1) + cout << ","; + else + cout << ")" << endl; + } + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + + return plan; +} + bool run_cuda_interp_for_block(Block *BP, const vector &vars, const vector &point_ids, - double **XX, + const vector &px, + const vector &py, + const vector &pz, double *Shellf, int num_var, int ordn, @@ -251,15 +404,6 @@ bool run_cuda_interp_for_block(Block *BP, } const int npts = static_cast(point_ids.size()); - vector px(npts), py(npts), pz(npts); - for (int p = 0; p < npts; ++p) - { - const int j = point_ids[p]; - px[p] = XX[0][j]; - py[p] = XX[1][j]; - pz[p] = XX[2][j]; - } - vector out(static_cast(npts) * static_cast(num_var)); if (bssn_cuda_interp_points_batch(BP->shape, BP->X[0], BP->X[1], BP->X[2], @@ -286,7 +430,9 @@ bool run_cuda_interp_for_block(Block *BP, void run_cpu_interp_for_block(Block *BP, const vector &vars, const vector &point_ids, - double **XX, + const vector &px, + const vector &py, + const vector &pz, double *Shellf, int num_var, int ordn, @@ -295,9 +441,9 @@ void run_cpu_interp_for_block(Block *BP, for (size_t p = 0; p < point_ids.size(); ++p) { const int j = point_ids[p]; - double x = XX[0][j]; - double y = XX[1][j]; - double z = XX[2][j]; + double x = px[p]; + double y = py[p]; + double z = pz[p]; int ordn_local = ordn; int symmetry_local = Symmetry; for (int v = 0; v < num_var; ++v) @@ -310,33 +456,34 @@ void run_cpu_interp_for_block(Block *BP, } void interpolate_owned_points(MyList *VarList, - int NN, double **XX, double *Shellf, int Symmetry, - int myrank, int ordn, + int ordn, const BlockBinIndex &block_index, - const int *owner_rank, - const int *owner_block) + const CachedInterpPlan &plan) { vector vars; collect_interp_vars(VarList, vars); const int num_var = static_cast(vars.size()); - vector> block_points(block_index.views.size()); - for (int j = 0; j < NN; ++j) + for (size_t bi = 0; bi < plan.block_points.size(); ++bi) { - if (owner_rank[j] == myrank && owner_block[j] >= 0) - block_points[owner_block[j]].push_back(j); - } - - for (size_t bi = 0; bi < block_points.size(); ++bi) - { - if (block_points[bi].empty()) + if (plan.block_points[bi].empty()) continue; Block *BP = block_index.views[bi].bp; - bool done = run_cuda_interp_for_block(BP, vars, block_points[bi], XX, Shellf, num_var, ordn, Symmetry); + bool done = run_cuda_interp_for_block(BP, vars, + plan.block_points[bi], + plan.block_px[bi], + plan.block_py[bi], + plan.block_pz[bi], + Shellf, num_var, ordn, Symmetry); if (!done) - run_cpu_interp_for_block(BP, vars, block_points[bi], XX, Shellf, num_var, ordn, Symmetry); + run_cpu_interp_for_block(BP, vars, + plan.block_points[bi], + plan.block_px[bi], + plan.block_py[bi], + plan.block_pz[bi], + Shellf, num_var, ordn, Symmetry); } } } // namespace @@ -684,55 +831,15 @@ void Patch::Interp_Points(MyList *VarList, memset(Shellf, 0, sizeof(double) * NN * num_var); - // owner_rank[j] records which MPI rank owns point j - // All ranks traverse the same block list so they all agree on ownership - int *owner_rank; - owner_rank = new int[NN]; - int *owner_block; - owner_block = new int[NN]; - for (int j = 0; j < NN; j++) - { - owner_rank[j] = -1; - owner_block[j] = -1; - } - - double DH[dim]; - for (int i = 0; i < dim; i++) - DH[i] = getdX(i); - BlockBinIndex block_index; - build_block_bin_index(this, DH, block_index); - - for (int j = 0; j < NN; j++) // run along points - { - double pox[dim]; - for (int i = 0; i < dim; i++) - { - pox[i] = XX[i][j]; - if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i])) - { - cout << "Patch::Interp_Points: point ("; - for (int k = 0; k < dim; k++) - { - cout << XX[k][j]; - if (k < dim - 1) - cout << ","; - else - cout << ") is out of current Patch." << endl; - } - MPI_Abort(MPI_COMM_WORLD, 1); - } - } - - const int block_i = find_block_index_for_point(block_index, pox, DH); - if (block_i >= 0) - { - Block *BP = block_index.views[block_i].bp; - owner_rank[j] = BP->rank; - owner_block[j] = block_i; - } - } + double DH[dim]; + for (int i = 0; i < dim; i++) + DH[i] = getdX(i); + BlockBinIndex block_index; + build_block_bin_index(this, DH, block_index); + CachedInterpPlan &plan = get_cached_interp_plan(this, NN, XX, Symmetry, myrank, DH, block_index, myrank == 0, false); + const int *owner_rank = plan.owner_rank.data(); - interpolate_owned_points(VarList, NN, XX, Shellf, Symmetry, myrank, ordn, block_index, owner_rank, owner_block); + interpolate_owned_points(VarList, Shellf, Symmetry, ordn, block_index, plan); // Replace MPI_Allreduce with per-owner MPI_Bcast: // Group consecutive points by owner rank and broadcast each group. @@ -788,8 +895,6 @@ void Patch::Interp_Points(MyList *VarList, } } - delete[] owner_rank; - delete[] owner_block; } void Patch::Interp_Points(MyList *VarList, int NN, double **XX, @@ -818,98 +923,22 @@ void Patch::Interp_Points(MyList *VarList, memset(Shellf, 0, sizeof(double) * NN * num_var); - // owner_rank[j] records which MPI rank owns point j - int *owner_rank; - owner_rank = new int[NN]; - int *owner_block; - owner_block = new int[NN]; - for (int j = 0; j < NN; j++) - { - owner_rank[j] = -1; - owner_block[j] = -1; - } - - double DH[dim]; - for (int i = 0; i < dim; i++) - DH[i] = getdX(i); - BlockBinIndex block_index; - build_block_bin_index(this, DH, block_index); - - // --- Interpolation phase (identical to original) --- - for (int j = 0; j < NN; j++) - { - double pox[dim]; - for (int i = 0; i < dim; i++) - { - pox[i] = XX[i][j]; - if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i])) - { - cout << "Patch::Interp_Points: point ("; - for (int k = 0; k < dim; k++) - { - cout << XX[k][j]; - if (k < dim - 1) - cout << ","; - else - cout << ") is out of current Patch." << endl; - } - MPI_Abort(MPI_COMM_WORLD, 1); - } - } - - const int block_i = find_block_index_for_point(block_index, pox, DH); - if (block_i >= 0) - { - Block *BP = block_index.views[block_i].bp; - owner_rank[j] = BP->rank; - owner_block[j] = block_i; - } - } + double DH[dim]; + for (int i = 0; i < dim; i++) + DH[i] = getdX(i); + BlockBinIndex block_index; + build_block_bin_index(this, DH, block_index); + CachedInterpPlan &plan = get_cached_interp_plan(this, NN, XX, Symmetry, myrank, DH, block_index, myrank == 0, false); + const int *owner_rank = plan.owner_rank.data(); - interpolate_owned_points(VarList, NN, XX, Shellf, Symmetry, myrank, ordn, block_index, owner_rank, owner_block); + interpolate_owned_points(VarList, Shellf, Symmetry, ordn, block_index, plan); #ifdef INTERP_LB_PROFILE double t_interp_end = MPI_Wtime(); double t_interp_local = t_interp_end - t_interp_start; #endif - // --- Error check for unfound points --- - for (int j = 0; j < NN; j++) - { - if (owner_rank[j] < 0 && myrank == 0) - { - cout << "ERROR: Patch::Interp_Points fails to find point ("; - for (int d = 0; d < dim; d++) - { - cout << XX[d][j]; - if (d < dim - 1) - cout << ","; - else - cout << ")"; - } - cout << " on Patch ("; - for (int d = 0; d < dim; d++) - { - cout << bbox[d] << "+" << lli[d] * DH[d]; - if (d < dim - 1) - cout << ","; - else - cout << ")--"; - } - cout << "("; - for (int d = 0; d < dim; d++) - { - cout << bbox[dim + d] << "-" << uui[d] * DH[d]; - if (d < dim - 1) - cout << ","; - else - cout << ")" << endl; - } - MPI_Abort(MPI_COMM_WORLD, 1); - } - } - - // --- Targeted point-to-point communication phase --- + // --- Targeted point-to-point communication phase --- // Compute consumer_rank[j] using the same deterministic formula as surface_integral int *consumer_rank = new int[NN]; { @@ -1028,8 +1057,6 @@ void Patch::Interp_Points(MyList *VarList, delete[] send_count; delete[] recv_count; delete[] consumer_rank; - delete[] owner_rank; - delete[] owner_block; #ifdef INTERP_LB_PROFILE { @@ -1077,59 +1104,20 @@ void Patch::Interp_Points(MyList *VarList, memset(Shellf, 0, sizeof(double) * NN * num_var); - // owner_rank[j] stores the global rank that owns point j - int *owner_rank; - owner_rank = new int[NN]; - int *owner_block; - owner_block = new int[NN]; - for (int j = 0; j < NN; j++) - { - owner_rank[j] = -1; - owner_block[j] = -1; - } + // Build global-to-local rank translation for Comm_here + MPI_Group world_group, local_group; + MPI_Comm_group(MPI_COMM_WORLD, &world_group); + MPI_Comm_group(Comm_here, &local_group); - // Build global-to-local rank translation for Comm_here - MPI_Group world_group, local_group; - MPI_Comm_group(MPI_COMM_WORLD, &world_group); - MPI_Comm_group(Comm_here, &local_group); - - double DH[dim]; - for (int i = 0; i < dim; i++) - DH[i] = getdX(i); - BlockBinIndex block_index; - build_block_bin_index(this, DH, block_index); - - for (int j = 0; j < NN; j++) // run along points - { - double pox[dim]; - for (int i = 0; i < dim; i++) - { - pox[i] = XX[i][j]; - if (lmyrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i])) - { - cout << "Patch::Interp_Points: point ("; - for (int k = 0; k < dim; k++) - { - cout << XX[k][j]; - if (k < dim - 1) - cout << ","; - else - cout << ") is out of current Patch." << endl; - } - MPI_Abort(MPI_COMM_WORLD, 1); - } - } - - const int block_i = find_block_index_for_point(block_index, pox, DH); - if (block_i >= 0) - { - Block *BP = block_index.views[block_i].bp; - owner_rank[j] = BP->rank; - owner_block[j] = block_i; - } - } + double DH[dim]; + for (int i = 0; i < dim; i++) + DH[i] = getdX(i); + BlockBinIndex block_index; + build_block_bin_index(this, DH, block_index); + CachedInterpPlan &plan = get_cached_interp_plan(this, NN, XX, Symmetry, myrank, DH, block_index, lmyrank == 0, true); + const int *owner_rank = plan.owner_rank.data(); - interpolate_owned_points(VarList, NN, XX, Shellf, Symmetry, myrank, ordn, block_index, owner_rank, owner_block); + interpolate_owned_points(VarList, Shellf, Symmetry, ordn, block_index, plan); // Collect unique global owner ranks and translate to local ranks in Comm_here // Then broadcast each owner's points via MPI_Bcast on Comm_here @@ -1159,8 +1147,6 @@ void Patch::Interp_Points(MyList *VarList, MPI_Group_free(&world_group); MPI_Group_free(&local_group); - delete[] owner_rank; - delete[] owner_block; } void Patch::checkBlock() { diff --git a/AMSS_NCKU_source/bssn_cuda_ops.cu b/AMSS_NCKU_source/bssn_cuda_ops.cu index e8f6aa4..6ed08ee 100644 --- a/AMSS_NCKU_source/bssn_cuda_ops.cu +++ b/AMSS_NCKU_source/bssn_cuda_ops.cu @@ -953,14 +953,36 @@ int bssn_cuda_interp_points_batch(const int *ex, struct InterpBatchCache { + struct StencilCacheEntry + { + const double *X; + const double *Y; + const double *Z; + const double *px; + const double *py; + const double *pz; + int nx; + int ny; + int nz; + int num_points; + int ordn; + int symmetry; + CachedBuffer weights; + CachedIntBuffer indices; + CachedIntBuffer reflect; + + StencilCacheEntry() + : X(nullptr), Y(nullptr), Z(nullptr), + px(nullptr), py(nullptr), pz(nullptr), + nx(0), ny(0), nz(0), num_points(0), ordn(0), symmetry(0) {} + }; + CachedBuffer out; CachedBuffer soa; CachedBuffer field_ptrs; - CachedBuffer weights; - CachedIntBuffer indices; - CachedIntBuffer reflect; CachedIntBuffer error_flag; std::vector host_field_copies; + std::vector stencil_entries; }; static thread_local InterpBatchCache cache; @@ -978,45 +1000,82 @@ int bssn_cuda_interp_points_batch(const int *ex, const size_t indices_bytes = point_stencil_ints * sizeof(int); bool ok = true; - std::vector host_weights(point_stencil_doubles); - std::vector host_indices(point_stencil_ints); - std::vector host_reflect(point_stencil_ints); - const double dx = X[1] - X[0]; - const double dy = Y[1] - Y[0]; - const double dz = Z[1] - Z[0]; - const int allow_reflect_x = (symmetry == 2 && std::fabs(X[0]) < dx); - const int allow_reflect_y = (symmetry == 2 && std::fabs(Y[0]) < dy); - const int allow_reflect_z = (symmetry != 0 && std::fabs(Z[0]) < dz); - for (int p = 0; p < num_points; ++p) + InterpBatchCache::StencilCacheEntry *stencil_cache = nullptr; + for (size_t i = 0; i < cache.stencil_entries.size(); ++i) { - int start_x = 0, start_y = 0, start_z = 0; - double cx = 0.0, cy = 0.0, cz = 0.0; - const bool ok_x = compute_interp_window_host(px[p], X, nx, ordn, allow_reflect_x, &start_x, &cx); - const bool ok_y = compute_interp_window_host(py[p], Y, ny, ordn, allow_reflect_y, &start_y, &cy); - const bool ok_z = compute_interp_window_host(pz[p], Z, nz, ordn, allow_reflect_z, &start_z, &cz); - if (!ok_x || !ok_y || !ok_z) - return 1; - - lagrange_weights_ord6_host(cx, host_weights.data() + p * 18); - lagrange_weights_ord6_host(cy, host_weights.data() + p * 18 + 6); - lagrange_weights_ord6_host(cz, host_weights.data() + p * 18 + 12); - - for (int i = 0; i < 6; ++i) + InterpBatchCache::StencilCacheEntry &entry = cache.stencil_entries[i]; + if (entry.X == X && entry.Y == Y && entry.Z == Z && + entry.px == px && entry.py == py && entry.pz == pz && + entry.nx == nx && entry.ny == ny && entry.nz == nz && + entry.num_points == num_points && entry.ordn == ordn && + entry.symmetry == symmetry) { - if (!map_interp_index_host(start_x + i, nx, &host_indices[p * 18 + i], &host_reflect[p * 18 + i])) - return 1; - if (!map_interp_index_host(start_y + i, ny, &host_indices[p * 18 + 6 + i], &host_reflect[p * 18 + 6 + i])) - return 1; - if (!map_interp_index_host(start_z + i, nz, &host_indices[p * 18 + 12 + i], &host_reflect[p * 18 + 12 + i])) - return 1; + stencil_cache = &entry; + break; } } + if (!stencil_cache) + { + cache.stencil_entries.push_back(InterpBatchCache::StencilCacheEntry()); + stencil_cache = &cache.stencil_entries.back(); + stencil_cache->X = X; + stencil_cache->Y = Y; + stencil_cache->Z = Z; + stencil_cache->px = px; + stencil_cache->py = py; + stencil_cache->pz = pz; + stencil_cache->nx = nx; + stencil_cache->ny = ny; + stencil_cache->nz = nz; + stencil_cache->num_points = num_points; + stencil_cache->ordn = ordn; + stencil_cache->symmetry = symmetry; + + std::vector host_weights(point_stencil_doubles); + std::vector host_indices(point_stencil_ints); + std::vector host_reflect(point_stencil_ints); + const double dx = X[1] - X[0]; + const double dy = Y[1] - Y[0]; + const double dz = Z[1] - Z[0]; + const int allow_reflect_x = (symmetry == 2 && std::fabs(X[0]) < dx); + const int allow_reflect_y = (symmetry == 2 && std::fabs(Y[0]) < dy); + const int allow_reflect_z = (symmetry != 0 && std::fabs(Z[0]) < dz); + for (int p = 0; p < num_points; ++p) + { + int start_x = 0, start_y = 0, start_z = 0; + double cx = 0.0, cy = 0.0, cz = 0.0; + const bool ok_x = compute_interp_window_host(px[p], X, nx, ordn, allow_reflect_x, &start_x, &cx); + const bool ok_y = compute_interp_window_host(py[p], Y, ny, ordn, allow_reflect_y, &start_y, &cy); + const bool ok_z = compute_interp_window_host(pz[p], Z, nz, ordn, allow_reflect_z, &start_z, &cz); + if (!ok_x || !ok_y || !ok_z) + return 1; + + lagrange_weights_ord6_host(cx, host_weights.data() + p * 18); + lagrange_weights_ord6_host(cy, host_weights.data() + p * 18 + 6); + lagrange_weights_ord6_host(cz, host_weights.data() + p * 18 + 12); + + for (int i = 0; i < 6; ++i) + { + if (!map_interp_index_host(start_x + i, nx, &host_indices[p * 18 + i], &host_reflect[p * 18 + i])) + return 1; + if (!map_interp_index_host(start_y + i, ny, &host_indices[p * 18 + 6 + i], &host_reflect[p * 18 + 6 + i])) + return 1; + if (!map_interp_index_host(start_z + i, nz, &host_indices[p * 18 + 12 + i], &host_reflect[p * 18 + 12 + i])) + return 1; + } + } + + ok = ok && + copy_to_device(stencil_cache->weights, host_weights.data(), weights_bytes) && + copy_to_device(stencil_cache->indices, host_indices.data(), indices_bytes) && + copy_to_device(stencil_cache->reflect, host_reflect.data(), indices_bytes); + if (!ok) + return 1; + } + ok = ok && copy_to_device(cache.soa, soa_flat, soa_bytes) && - copy_to_device(cache.weights, host_weights.data(), weights_bytes) && - copy_to_device(cache.indices, host_indices.data(), indices_bytes) && - copy_to_device(cache.reflect, host_reflect.data(), indices_bytes) && ensure_capacity(cache.out, out_bytes) && ensure_capacity(cache.field_ptrs, ptr_bytes) && ensure_capacity(cache.error_flag, sizeof(int)); @@ -1063,9 +1122,9 @@ int bssn_cuda_interp_points_batch(const int *ex, int ny_local = ny; const double *dsoa = cache.soa.ptr; const double *const *dfields = reinterpret_cast(cache.field_ptrs.ptr); - const double *dweights = cache.weights.ptr; - const int *dindices = cache.indices.ptr; - const int *dreflect = cache.reflect.ptr; + const double *dweights = stencil_cache->weights.ptr; + const int *dindices = stencil_cache->indices.ptr; + const int *dreflect = stencil_cache->reflect.ptr; double *dout = cache.out.ptr; int *derror = cache.error_flag.ptr; diff --git a/AMSS_NCKU_source/surface_integral.C b/AMSS_NCKU_source/surface_integral.C index c2b7b67..c874fc4 100644 --- a/AMSS_NCKU_source/surface_integral.C +++ b/AMSS_NCKU_source/surface_integral.C @@ -180,19 +180,64 @@ surface_integral::surface_integral(int iSymmetry) : Symmetry(iSymmetry) //|============================================================================ //| Destructor //|============================================================================ -surface_integral::~surface_integral() -{ - delete[] nx_g; - delete[] ny_g; - delete[] nz_g; - delete[] arcostheta; -#ifdef GaussInt - delete[] wtcostheta; -#endif -} -//|---------------------------------------------------------------- -// spin weighted spinw component of psi4, general routine -// l takes from spinw to maxl; m takes from -l to l +surface_integral::~surface_integral() +{ + release_cached_buffers(); + delete[] nx_g; + delete[] ny_g; + delete[] nz_g; + delete[] arcostheta; +#ifdef GaussInt + delete[] wtcostheta; +#endif +} + +void surface_integral::get_surface_points(double rex, double **pox) +{ + SpherePointCache &cache = sphere_point_cache[rex]; + if (!cache.pox[0]) + { + for (int i = 0; i < 3; ++i) + cache.pox[i] = new double[n_tot]; + for (int n = 0; n < n_tot; ++n) + { + cache.pox[0][n] = rex * nx_g[n]; + cache.pox[1][n] = rex * ny_g[n]; + cache.pox[2][n] = rex * nz_g[n]; + } + } + + pox[0] = cache.pox[0]; + pox[1] = cache.pox[1]; + pox[2] = cache.pox[2]; +} + +double *surface_integral::get_shellf_buffer(int num_var) +{ + double *&buffer = shellf_cache[num_var]; + if (!buffer) + buffer = new double[n_tot * num_var]; + return buffer; +} + +void surface_integral::release_cached_buffers() +{ + for (map::iterator it = sphere_point_cache.begin(); it != sphere_point_cache.end(); ++it) + { + delete[] it->second.pox[0]; + delete[] it->second.pox[1]; + delete[] it->second.pox[2]; + it->second.pox[0] = it->second.pox[1] = it->second.pox[2] = 0; + } + sphere_point_cache.clear(); + + for (map::iterator it = shellf_cache.begin(); it != shellf_cache.end(); ++it) + delete[] it->second; + shellf_cache.clear(); +} +//|---------------------------------------------------------------- +// spin weighted spinw component of psi4, general routine +// l takes from spinw to maxl; m takes from -l to l //|---------------------------------------------------------------- void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4, int spinw, int maxl, int NN, double *RP, double *IP, @@ -209,16 +254,9 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var * MyList *DG_List = new MyList(Rpsi4); DG_List->insert(Ipsi4); - int n; - double *pox[3]; - for (int i = 0; i < 3; i++) - pox[i] = new double[n_tot]; - for (n = 0; n < n_tot; n++) - { - pox[0][n] = rex * nx_g[n]; - pox[1][n] = rex * ny_g[n]; - pox[2][n] = rex * nz_g[n]; - } + int n; + double *pox[3]; + get_surface_points(rex, pox); int mp, Lp, Nmin, Nmax; mp = n_tot / cpusize; @@ -234,8 +272,7 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var * Nmax = Nmin + mp - 1; } - double *shellf; - shellf = new double[n_tot * InList]; + double *shellf = get_shellf_buffer(InList); GH->PatL[lev]->data->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry, Nmin, Nmax); @@ -375,14 +412,10 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var * //|------= Free memory. - delete[] pox[0]; - delete[] pox[1]; - delete[] pox[2]; - delete[] shellf; - delete[] RP_out; - delete[] IP_out; - DG_List->clearList(); -} + delete[] RP_out; + delete[] IP_out; + DG_List->clearList(); +} void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4, int spinw, int maxl, int NN, double *RP, double *IP, monitor *Monitor, MPI_Comm Comm_here) // NN is the length of RP and IP @@ -402,19 +435,11 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var * MyList *DG_List = new MyList(Rpsi4); DG_List->insert(Ipsi4); - int n; - double *pox[3]; - for (int i = 0; i < 3; i++) - pox[i] = new double[n_tot]; - for (n = 0; n < n_tot; n++) - { - pox[0][n] = rex * nx_g[n]; - pox[1][n] = rex * ny_g[n]; - pox[2][n] = rex * nz_g[n]; - } - - double *shellf; - shellf = new double[n_tot * InList]; + int n; + double *pox[3]; + get_surface_points(rex, pox); + + double *shellf = get_shellf_buffer(InList); // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"before Interp_Points"); @@ -577,14 +602,10 @@ void surface_integral::surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var * //|------= Free memory. - delete[] pox[0]; - delete[] pox[1]; - delete[] pox[2]; - delete[] shellf; - delete[] RP_out; - delete[] IP_out; - DG_List->clearList(); -} + delete[] RP_out; + delete[] IP_out; + DG_List->clearList(); +} //|---------------------------------------------------------------- // for shell patch //|---------------------------------------------------------------- @@ -597,19 +618,11 @@ void surface_integral::surf_Wave(double rex, int lev, ShellPatch *GH, var *Rpsi4 MyList *DG_List = new MyList(Rpsi4); DG_List->insert(Ipsi4); - int n; - double *pox[3]; - for (int i = 0; i < 3; i++) - pox[i] = new double[n_tot]; - for (n = 0; n < n_tot; n++) - { - pox[0][n] = rex * nx_g[n]; - pox[1][n] = rex * ny_g[n]; - pox[2][n] = rex * nz_g[n]; - } + int n; + double *pox[3]; + get_surface_points(rex, pox); - double *shellf; - shellf = new double[n_tot * InList]; + double *shellf = get_shellf_buffer(InList); GH->Interp_Points(DG_List, n_tot, pox, shellf, Symmetry); @@ -2570,12 +2583,8 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var Rout[5] = sy; Rout[6] = sz; - delete[] pox[0]; - delete[] pox[1]; - delete[] pox[2]; - delete[] shellf; - DG_List->clearList(); -} + DG_List->clearList(); +} void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var *trK, var *gxx, var *gxy, var *gxz, var *gyy, var *gyz, var *gzz, var *Axx, var *Axy, var *Axz, var *Ayy, var *Ayz, var *Azz, @@ -2637,19 +2646,11 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var DG_List->insert(Ayz); DG_List->insert(Azz); - int n; - double *pox[3]; - for (int i = 0; i < 3; i++) - pox[i] = new double[n_tot]; - for (n = 0; n < n_tot; n++) - { - pox[0][n] = rex * nx_g[n]; - pox[1][n] = rex * ny_g[n]; - pox[2][n] = rex * nz_g[n]; - } - - double *shellf; - shellf = new double[n_tot * InList]; + int n; + double *pox[3]; + get_surface_points(rex, pox); + + double *shellf = get_shellf_buffer(InList); // we have assumed there is only one box on this level, // so we do not need loop boxes @@ -2839,12 +2840,8 @@ void surface_integral::surf_MassPAng(double rex, int lev, cgh *GH, var *chi, var Rout[5] = sy; Rout[6] = sz; - delete[] pox[0]; - delete[] pox[1]; - delete[] pox[2]; - delete[] shellf; - DG_List->clearList(); -} + DG_List->clearList(); +} //|---------------------------------------------------------------- // for shell patch //|---------------------------------------------------------------- diff --git a/AMSS_NCKU_source/surface_integral.h b/AMSS_NCKU_source/surface_integral.h index c36f245..deb8f2f 100644 --- a/AMSS_NCKU_source/surface_integral.h +++ b/AMSS_NCKU_source/surface_integral.h @@ -20,25 +20,41 @@ using namespace std; #include "cgh.h" #include "ShellPatch.h" #include "NullShellPatch.h" -#include "NullShellPatch2.h" -#include "var.h" -#include "monitor.h" +#include "NullShellPatch2.h" +#include "var.h" +#include "monitor.h" +#include class surface_integral { -private: - int Symmetry, factor; - int N_theta, N_phi; // Number of points in Theta & Phi directions - double dphi, dcostheta; - double *arcostheta, *wtcostheta; - int n_tot; // size of arrays - - double *nx_g, *ny_g, *nz_g; // global list of unit normals - int myrank, cpusize; - -public: - surface_integral(int iSymmetry); +private: + struct SpherePointCache + { + double *pox[3]; + SpherePointCache() + { + pox[0] = pox[1] = pox[2] = 0; + } + }; + + int Symmetry, factor; + int N_theta, N_phi; // Number of points in Theta & Phi directions + double dphi, dcostheta; + double *arcostheta, *wtcostheta; + int n_tot; // size of arrays + + double *nx_g, *ny_g, *nz_g; // global list of unit normals + int myrank, cpusize; + map sphere_point_cache; + map shellf_cache; + + void get_surface_points(double rex, double **pox); + double *get_shellf_buffer(int num_var); + void release_cached_buffers(); + +public: + surface_integral(int iSymmetry); ~surface_integral(); void surf_Wave(double rex, int lev, cgh *GH, var *Rpsi4, var *Ipsi4,