diff --git a/AMSS_NCKU_source/ShellPatch.C b/AMSS_NCKU_source/ShellPatch.C index 2f092d5..a2f5843 100644 --- a/AMSS_NCKU_source/ShellPatch.C +++ b/AMSS_NCKU_source/ShellPatch.C @@ -5,24 +5,971 @@ #include #include #include -#include -#include -using namespace std; +#include +#include +#include +#include +#include +#include +using namespace std; #include "ShellPatch.h" #include "Parallel.h" #include "fmisc.h" #include "misc.h" #include "shellfunctions.h" -#include "parameters.h" - -#define PI M_PI +#include "parameters.h" + +#define PI M_PI + +#if USE_CUDA_BSSN +extern "C" int bssn_cuda_shell_pack_host_fields(int npoints, + int nvars, + int nblocks, + int ordn, + double **block_var_fields, + int *block_shapes, + int *point_block, + int *point_dimh, + int *point_dumyd, + int *point_sind, + double *point_coef, + double *out); +extern "C" void bssn_cuda_shell_pack_cache_begin(); +extern "C" void bssn_cuda_shell_pack_cache_end(); +#endif // x x x x x o * // * o x x x x x // each side contribute an overlap points // so we need half of that -#define overghost ((ghost_width + 1) / 2 + ghost_width) +#define overghost ((ghost_width + 1) / 2 + ghost_width) + +namespace { + +bool shell_fast_interp_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_SHELL_FAST_INTERP"); + enabled = (!env || atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +bool shell_parallel_interp_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_SHELL_PARALLEL_INTERP"); + enabled = (!env || atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +bool shell_cuda_interp_enabled() +{ +#if USE_CUDA_BSSN + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_SHELL_CUDA_INTERP"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +#else + return false; +#endif +} + +bool shell_interp_stats_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_SHELL_INTERP_STATS"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +bool shell_transfer_timing_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_SHELL_TRANSFER_TIMING"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +int shell_interp_threads() +{ + static int threads = -1; + if (threads < 0) + { + const char *env = getenv("AMSS_SHELL_INTERP_THREADS"); + int requested = env ? atoi(env) : 0; + if (requested <= 0) + { + unsigned int hw = std::thread::hardware_concurrency(); + requested = hw > 0 ? (int)hw : 8; + requested = std::min(requested, 32); + } + threads = std::max(1, requested); + } + return threads; +} + +int shell_var_count(MyList *VarList) +{ + int count = 0; + while (VarList) + { + ++count; + VarList = VarList->next; + } + return count; +} + +struct ShellInterpStats +{ + long long pack_dim3 = 0; + long long pack_dim2 = 0; + long long pack_dim1 = 0; + long long fast_dim3 = 0; + long long fast_dim2 = 0; + long long fast_dim1 = 0; + long long fallback_dim3 = 0; + long long fallback_dim2 = 0; + long long fallback_dim1 = 0; +}; + +ShellInterpStats shell_interp_stats; + +void shell_note_pack(int dimh, bool fast_done) +{ + if (!shell_interp_stats_enabled()) + return; + if (dimh == 3) + { + shell_interp_stats.pack_dim3++; + fast_done ? shell_interp_stats.fast_dim3++ : shell_interp_stats.fallback_dim3++; + } + else if (dimh == 2) + { + shell_interp_stats.pack_dim2++; + fast_done ? shell_interp_stats.fast_dim2++ : shell_interp_stats.fallback_dim2++; + } + else if (dimh == 1) + { + shell_interp_stats.pack_dim1++; + fast_done ? shell_interp_stats.fast_dim1++ : shell_interp_stats.fallback_dim1++; + } +} + +void shell_print_interp_stats(const char *label) +{ + if (!shell_interp_stats_enabled()) + return; + + int rank = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + cout << "[AMSS-SHELL-INTERP-STATS] rank=" << rank << " " << label + << " pack3=" << shell_interp_stats.pack_dim3 + << " fast3=" << shell_interp_stats.fast_dim3 + << " fallback3=" << shell_interp_stats.fallback_dim3 + << " pack2=" << shell_interp_stats.pack_dim2 + << " fast2=" << shell_interp_stats.fast_dim2 + << " fallback2=" << shell_interp_stats.fallback_dim2 + << " pack1=" << shell_interp_stats.pack_dim1 + << " fast1=" << shell_interp_stats.fast_dim1 + << " fallback1=" << shell_interp_stats.fallback_dim1 + << endl; +} + +inline int shell_idx3(const int *shape, int i, int j, int k) +{ + return i + shape[0] * (j + shape[1] * k); +} + +bool shell_reflect_index(int idx, int n, int ordn, int &mapped, bool &reflected) +{ + reflected = false; + if (idx >= 0 && idx < n) + { + mapped = idx; + return true; + } + if (idx < 0) + { + if (idx < -ordn) + return false; + mapped = -idx; + reflected = true; + return mapped >= 0 && mapped < n; + } + if (idx <= n + ordn - 1) + { + mapped = 2 * n - 2 - idx; + reflected = true; + return mapped >= 0 && mapped < n; + } + return false; +} + +void shell_symmetry_soa3(const var *vp, int sst, double soa[3]) +{ + if (sst == 2 || sst == 3) + { + soa[0] = vp->SoA[1]; + soa[1] = vp->SoA[2]; + soa[2] = 0.0; + } + else if (sst == 4 || sst == 5) + { + soa[0] = vp->SoA[0]; + soa[1] = vp->SoA[2]; + soa[2] = 0.0; + } + else + { + soa[0] = vp->SoA[0]; + soa[1] = vp->SoA[1]; + soa[2] = (sst == -1) ? vp->SoA[2] : 0.0; + } +} + +double shell_symmetry_soa1(const var *vp, int sst, int dumyd) +{ + if (dumyd == 1) + { + if (sst == 2 || sst == 3) + return vp->SoA[1]; + return vp->SoA[0]; + } + if (dumyd == 0) + { + if (sst == 0 || sst == 1) + return vp->SoA[1]; + return vp->SoA[2]; + } + return 1.0; +} + +bool shell_interp3d_fast(const ShellPatch::pointstru *pt, const var *vp, double &out, int ordn) +{ + const int *shape = pt->Bg->shape; + const int *s = pt->sind; + if (!s || !pt->coef) + return false; + if (s[0] < 0 || s[1] < 0 || s[2] < 0 || + s[0] + ordn > shape[0] || + s[1] + ordn > shape[1] || + s[2] + ordn > shape[2]) + return false; + + const double *f = pt->Bg->fgfs[vp->sgfn]; + const double *cx = pt->coef; + const double *cy = pt->coef + ordn; + const double *cz = pt->coef + 2 * ordn; + double sum = 0.0; + for (int k = 0; k < ordn; ++k) + for (int j = 0; j < ordn; ++j) + for (int i = 0; i < ordn; ++i) + sum += cx[i] * cy[j] * cz[k] * + f[shell_idx3(shape, s[0] + i, s[1] + j, s[2] + k)]; + out = sum; + return true; +} + +bool shell_interp2d_fast(const ShellPatch::pointstru *pt, const var *vp, double &out, int ordn) +{ + const int *shape = pt->Bg->shape; + const int *s = pt->sind; + if (!s || !pt->coef) + return false; + const int k0 = s[2] - 1; // Match global_interpind2d's Fortran fixed-index convention. + if (s[0] < 0 || s[1] < 0 || + s[0] + ordn > shape[0] || + s[1] + ordn > shape[1] || + k0 < 0 || k0 >= shape[2]) + return false; + + const double *f = pt->Bg->fgfs[vp->sgfn]; + const double *cx = pt->coef; + const double *cy = pt->coef + ordn; + double sum = 0.0; + for (int j = 0; j < ordn; ++j) + for (int i = 0; i < ordn; ++i) + sum += cx[i] * cy[j] * f[shell_idx3(shape, s[0] + i, s[1] + j, k0)]; + out = sum; + return true; +} + +bool shell_interp1d_fast(const ShellPatch::pointstru *pt, const var *vp, double &out, int ordn) +{ + const int *shape = pt->Bg->shape; + const int *s = pt->sind; + if (!s || !pt->coef) + return false; + + const double *f = pt->Bg->fgfs[vp->sgfn]; + double sum = 0.0; + if (pt->dumyd == 1) + { + if (s[0] < 0 || s[0] + ordn > shape[0] || + s[1] < 0 || s[1] >= shape[1] || + s[2] < 0 || s[2] >= shape[2]) + return false; + for (int i = 0; i < ordn; ++i) + sum += pt->coef[i] * f[shell_idx3(shape, s[0] + i, s[1], s[2])]; + } + else if (pt->dumyd == 0) + { + if (s[0] < 0 || s[0] + ordn > shape[1] || + s[1] < 0 || s[1] >= shape[0] || + s[2] < 0 || s[2] >= shape[2]) + return false; + for (int j = 0; j < ordn; ++j) + sum += pt->coef[j] * f[shell_idx3(shape, s[1], s[0] + j, s[2])]; + } + else + return false; + + out = sum; + return true; +} + +bool shell_interp_fast_possible(const ShellPatch::pointstru *pt, int ordn) +{ + const int DIMh = (pt->dumyd == -1) ? dim : 1; + const int *shape = pt->Bg->shape; + const int *s = pt->sind; + if (!s || !pt->coef) + return false; + + if (DIMh == 3) + { + if (s[0] < -ordn || s[0] + ordn - 1 > shape[0] + ordn - 1 || + s[1] < -ordn || s[1] + ordn - 1 > shape[1] + ordn - 1 || + s[2] < -ordn || s[2] + ordn - 1 > shape[2] + ordn - 1) + return false; + if (pt->ssst != -1 && (s[2] < 0 || s[2] + ordn > shape[2])) + return false; + return true; + } + + if (DIMh == 1) + { + if (pt->dumyd == 1) + return s[0] >= -ordn && s[0] + ordn - 1 <= shape[0] + ordn - 1 && + s[1] >= 0 && s[1] < shape[1] && + s[2] >= 0 && s[2] < shape[2]; + if (pt->dumyd == 0) + return s[0] >= -ordn && s[0] + ordn - 1 <= shape[1] + ordn - 1 && + s[1] >= 0 && s[1] < shape[0] && + s[2] >= 0 && s[2] < shape[2]; + } + + return false; +} + +bool shell_pointcopy_index(const ShellPatch::pointstru *pt, int &idx); + +bool shell_pointcopy_fast(const ShellPatch::pointstru *pt, const var *vp, double value) +{ + int idx = -1; + if (!shell_pointcopy_index(pt, idx)) + return false; + pt->Bg->fgfs[vp->sgfn][idx] = value; + return true; +} + +bool shell_pointcopy_index(const ShellPatch::pointstru *pt, int &idx) +{ + Block *bg = pt->Bg; + const int *shape = bg->shape; + if (shape[0] <= 1 || shape[1] <= 1 || shape[2] <= 1) + return false; + double h[3]; + for (int d = 0; d < dim; ++d) +#ifdef Vertex + h[d] = (bg->bbox[dim + d] - bg->bbox[d]) / (shape[d] - 1); +#else + h[d] = (bg->bbox[dim + d] - bg->bbox[d]) / shape[d]; +#endif + int i = int((pt->lpox[0] - bg->bbox[0]) / h[0] + 0.4); + int j = int((pt->lpox[1] - bg->bbox[1]) / h[1] + 0.4); + int k = int((pt->lpox[2] - bg->bbox[2]) / h[2] + 0.4); + if (i < 0 || i >= shape[0] || j < 0 || j >= shape[1] || k < 0 || k >= shape[2]) + return false; + idx = shell_idx3(shape, i, j, k); + return true; +} + +bool shell_pointcopy_possible(const ShellPatch::pointstru *pt) +{ + int idx = -1; + return shell_pointcopy_index(pt, idx); +} + +bool shell_pack3d_fast_all(double *data, int out_base, ShellPatch::pointstru *src, + const std::vector &vars, int ordn) +{ + const int *shape = src->Bg->shape; + const int *s = src->sind; + if (!s || !src->coef) + return false; + if (s[0] < -ordn || s[0] + ordn - 1 > shape[0] + ordn - 1 || + s[1] < -ordn || s[1] + ordn - 1 > shape[1] + ordn - 1 || + s[2] < -ordn || s[2] + ordn - 1 > shape[2] + ordn - 1) + return false; + if (src->ssst != -1 && (s[2] < 0 || s[2] + ordn > shape[2])) + return false; + + const int nst = ordn * ordn * ordn; + if (ordn <= 0 || nst > 1728) + return false; + double weights[1728]; + int offsets[1728]; + unsigned char reflect_mask[1728]; + const double *cx = src->coef; + const double *cy = src->coef + ordn; + const double *cz = src->coef + 2 * ordn; + int q = 0; + for (int k = 0; k < ordn; ++k) + { + int mk = 0; + bool rk = false; + if (!shell_reflect_index(s[2] + k, shape[2], ordn, mk, rk)) + return false; + const double wz = cz[k]; + const int zk = mk * shape[0] * shape[1]; + for (int j = 0; j < ordn; ++j) + { + int mj = 0; + bool rj = false; + if (!shell_reflect_index(s[1] + j, shape[1], ordn, mj, rj)) + return false; + const double wyz = cy[j] * wz; + const int yj = mj * shape[0]; + for (int i = 0; i < ordn; ++i) + { + int mi = 0; + bool ri = false; + if (!shell_reflect_index(s[0] + i, shape[0], ordn, mi, ri)) + return false; + weights[q] = cx[i] * wyz; + offsets[q] = mi + yj + zk; + reflect_mask[q] = (ri ? 1 : 0) | (rj ? 2 : 0) | (rk ? 4 : 0); + ++q; + } + } + } + + for (size_t vi = 0; vi < vars.size(); ++vi) + { + const double *f = src->Bg->fgfs[vars[vi]->sgfn]; + double soa[3]; + shell_symmetry_soa3(vars[vi], src->ssst, soa); + double sum = 0.0; + for (int p = 0; p < nst; ++p) + { + double fac = 1.0; + if (reflect_mask[p] & 1) fac *= soa[0]; + if (reflect_mask[p] & 2) fac *= soa[1]; + if (reflect_mask[p] & 4) fac *= soa[2]; + sum += weights[p] * fac * f[offsets[p]]; + } + data[out_base + (int)vi] = sum; + shell_note_pack(3, true); + } + return true; +} + +bool shell_pack2d_fast_all(double *data, int out_base, ShellPatch::pointstru *src, + const std::vector &vars, int ordn) +{ + const int *shape = src->Bg->shape; + const int *s = src->sind; + if (!s || !src->coef) + return false; + const int k0 = s[2] - 1; + if (s[0] < 0 || s[1] < 0 || + s[0] + ordn > shape[0] || + s[1] + ordn > shape[1] || + k0 < 0 || k0 >= shape[2]) + return false; + + const int nst = ordn * ordn; + if (ordn <= 0 || nst > 144) + return false; + double weights[144]; + int offsets[144]; + const double *cx = src->coef; + const double *cy = src->coef + ordn; + int q = 0; + const int zk = k0 * shape[0] * shape[1]; + for (int j = 0; j < ordn; ++j) + { + const int yj = (s[1] + j) * shape[0]; + for (int i = 0; i < ordn; ++i) + { + weights[q] = cx[i] * cy[j]; + offsets[q] = s[0] + i + yj + zk; + ++q; + } + } + + for (size_t vi = 0; vi < vars.size(); ++vi) + { + const double *f = src->Bg->fgfs[vars[vi]->sgfn]; + double sum = 0.0; + for (int p = 0; p < nst; ++p) + sum += weights[p] * f[offsets[p]]; + data[out_base + (int)vi] = sum; + shell_note_pack(2, true); + } + return true; +} + +bool shell_pack1d_fast_all(double *data, int out_base, ShellPatch::pointstru *src, + const std::vector &vars, int ordn) +{ + const int *shape = src->Bg->shape; + const int *s = src->sind; + if (!s || !src->coef) + return false; + if (ordn <= 0 || ordn > 12) + return false; + + int offsets[12]; + unsigned char reflected[12]; + if (src->dumyd == 1) + { + if (s[0] < -ordn || s[0] + ordn - 1 > shape[0] + ordn - 1 || + s[1] < 0 || s[1] >= shape[1] || + s[2] < 0 || s[2] >= shape[2]) + return false; + const int base = s[1] * shape[0] + s[2] * shape[0] * shape[1]; + for (int i = 0; i < ordn; ++i) + { + int mi = 0; + bool ri = false; + if (!shell_reflect_index(s[0] + i, shape[0], ordn, mi, ri)) + return false; + offsets[i] = mi + base; + reflected[i] = ri ? 1 : 0; + } + } + else if (src->dumyd == 0) + { + if (s[0] < -ordn || s[0] + ordn - 1 > shape[1] + ordn - 1 || + s[1] < 0 || s[1] >= shape[0] || + s[2] < 0 || s[2] >= shape[2]) + return false; + const int base = s[1] + s[2] * shape[0] * shape[1]; + for (int j = 0; j < ordn; ++j) + { + int mj = 0; + bool rj = false; + if (!shell_reflect_index(s[0] + j, shape[1], ordn, mj, rj)) + return false; + offsets[j] = base + mj * shape[0]; + reflected[j] = rj ? 1 : 0; + } + } + else + return false; + + for (size_t vi = 0; vi < vars.size(); ++vi) + { + const double *f = src->Bg->fgfs[vars[vi]->sgfn]; + const double soa = shell_symmetry_soa1(vars[vi], src->ssst, src->dumyd); + double sum = 0.0; + for (int p = 0; p < ordn; ++p) + sum += src->coef[p] * (reflected[p] ? soa : 1.0) * f[offsets[p]]; + data[out_base + (int)vi] = sum; + shell_note_pack(1, true); + } + return true; +} + +struct ShellPointPair +{ + ShellPatch::pointstru *src; + ShellPatch::pointstru *dst; +}; + +struct ShellActiveCache +{ + MyList *src = 0; + MyList *dst = 0; + MyList *var_src = 0; + MyList *var_dst = 0; + int rank_in = -1; + int dir = -1; + int myrank = -1; + bool valid = false; + int ordn = -1; + std::vector src_vars; + std::vector dst_vars; + std::vector active_points; + std::vector fast_points; + std::vector dst_indices; +}; + +std::vector shell_active_caches; + +void shell_prepare_interp_coeffs(ShellPatch::pointstru *pt, int ordn); + +bool shell_active_cache_matches(MyList *src, + MyList *dst, + int rank_in, + int dir, + MyList *var_src, + MyList *var_dst, + int myrank, + int ordn, + const ShellActiveCache &cache) +{ + return cache.valid && + cache.src == src && + cache.dst == dst && + cache.rank_in == rank_in && + cache.dir == dir && + cache.var_src == var_src && + cache.var_dst == var_dst && + cache.myrank == myrank && + cache.ordn == ordn; +} + +ShellActiveCache &shell_get_active_cache(MyList *src, + MyList *dst, + int rank_in, + int dir, + MyList *VarLists, + MyList *VarListd, + int myrank, + int ordn) +{ + for (size_t c = 0; c < shell_active_caches.size(); ++c) + if (shell_active_cache_matches(src, dst, rank_in, dir, VarLists, VarListd, myrank, ordn, shell_active_caches[c])) + return shell_active_caches[c]; + + shell_active_caches.push_back(ShellActiveCache()); + ShellActiveCache &cache = shell_active_caches.back(); + cache.src = src; + cache.dst = dst; + cache.var_src = VarLists; + cache.var_dst = VarListd; + cache.rank_in = rank_in; + cache.dir = dir; + cache.myrank = myrank; + cache.ordn = ordn; + cache.valid = true; + + for (MyList *varls = VarLists, *varld = VarListd; + varls && varld; + varls = varls->next, varld = varld->next) + { + cache.src_vars.push_back(varls->data); + cache.dst_vars.push_back(varld->data); + } + + MyList *src_scan = src; + MyList *dst_scan = dst; + while (src_scan && dst_scan) + { + if ((dir == PACK && dst_scan->data->Bg->rank == rank_in && src_scan->data->Bg->rank == myrank) || + (dir == UNPACK && src_scan->data->Bg->rank == rank_in && dst_scan->data->Bg->rank == myrank)) + { + ShellPointPair pair; + pair.src = src_scan->data; + pair.dst = dst_scan->data; + cache.active_points.push_back(pair); + } + src_scan = src_scan->next; + dst_scan = dst_scan->next; + } + + cache.fast_points.assign(cache.active_points.size(), 0); + cache.dst_indices.assign(cache.active_points.size(), -1); + if (dir == PACK && shell_fast_interp_enabled()) + { + for (size_t p = 0; p < cache.active_points.size(); ++p) + { + shell_prepare_interp_coeffs(cache.active_points[p].src, ordn); + cache.fast_points[p] = shell_interp_fast_possible(cache.active_points[p].src, ordn) ? 1 : 0; + } + } + else if (dir == UNPACK) + { + for (size_t p = 0; p < cache.active_points.size(); ++p) + { + int idx = -1; + if (shell_pointcopy_index(cache.active_points[p].dst, idx)) + { + cache.fast_points[p] = 1; + cache.dst_indices[p] = idx; + } + } + } + + return cache; +} + +bool shell_pack_fast_only(double *data, int out_base, ShellPatch::pointstru *src, + const std::vector &vars, int ordn) +{ + const int DIMh = (src->dumyd == -1) ? dim : 1; + if (DIMh == 3) + return shell_pack3d_fast_all(data, out_base, src, vars, ordn); + if (DIMh == 2) + return shell_pack2d_fast_all(data, out_base, src, vars, ordn); + if (DIMh == 1) + return shell_pack1d_fast_all(data, out_base, src, vars, ordn); + return false; +} + +bool shell_pack_cuda_fast_points(double *data, + const std::vector &active_points, + const std::vector &vars, + const std::vector &fast_points, + int ordn) +{ +#if USE_CUDA_BSSN + if (!shell_cuda_interp_enabled() || active_points.empty() || vars.empty()) + return false; + + std::vector point_to_active; + point_to_active.reserve(active_points.size()); + std::map block_index; + std::vector blocks; + + for (size_t p = 0; p < active_points.size(); ++p) + { + if (!fast_points[p]) + continue; + ShellPatch::pointstru *src = active_points[p].src; + const int DIMh = (src->dumyd == -1) ? dim : 1; + if (DIMh != 3 && DIMh != 1) + continue; + point_to_active.push_back((int)p); + if (block_index.find(src->Bg) == block_index.end()) + { + int next = (int)blocks.size(); + block_index[src->Bg] = next; + blocks.push_back(src->Bg); + } + } + + const int npoints = (int)point_to_active.size(); + const int nvars = (int)vars.size(); + const int nblocks = (int)blocks.size(); + if (npoints <= 0 || nvars <= 0 || nblocks <= 0) + return false; + + std::vector block_var_fields((size_t)nblocks * nvars, 0); + std::vector block_shapes((size_t)nblocks * 3, 0); + for (int b = 0; b < nblocks; ++b) + { + Block *bg = blocks[b]; + block_shapes[3 * b + 0] = bg->shape[0]; + block_shapes[3 * b + 1] = bg->shape[1]; + block_shapes[3 * b + 2] = bg->shape[2]; + for (int v = 0; v < nvars; ++v) + block_var_fields[(size_t)b * nvars + v] = bg->fgfs[vars[v]->sgfn]; + } + + std::vector point_block(npoints, 0); + std::vector point_dimh(npoints, 0); + std::vector point_dumyd(npoints, -1); + std::vector point_sind((size_t)npoints * 3, 0); + std::vector point_coef((size_t)npoints * 3 * ordn, 0.0); + std::vector out((size_t)npoints * nvars, 0.0); + + for (int q = 0; q < npoints; ++q) + { + const int p = point_to_active[q]; + ShellPatch::pointstru *src = active_points[p].src; + const int DIMh = (src->dumyd == -1) ? dim : 1; + point_block[q] = block_index[src->Bg]; + point_dimh[q] = DIMh; + point_dumyd[q] = src->dumyd; + point_sind[(size_t)q * 3 + 0] = src->sind[0]; + point_sind[(size_t)q * 3 + 1] = src->sind[1]; + point_sind[(size_t)q * 3 + 2] = src->sind[2]; + const int coef_count = (DIMh == 3) ? 3 * ordn : ordn; + for (int c = 0; c < coef_count; ++c) + point_coef[(size_t)q * 3 * ordn + c] = src->coef[c]; + } + + int rc = bssn_cuda_shell_pack_host_fields(npoints, nvars, nblocks, ordn, + block_var_fields.data(), + block_shapes.data(), + point_block.data(), + point_dimh.data(), + point_dumyd.data(), + point_sind.data(), + point_coef.data(), + out.data()); + if (rc != 0) + return false; + + for (int q = 0; q < npoints; ++q) + { + const int p = point_to_active[q]; + const int base = p * nvars; + for (int v = 0; v < nvars; ++v) + { + data[base + v] = out[(size_t)q * nvars + v]; + shell_note_pack(point_dimh[q], true); + } + } + return true; +#else + (void)data; + (void)active_points; + (void)vars; + (void)fast_points; + (void)ordn; + return false; +#endif +} + +void shell_prepare_interp_coeffs(ShellPatch::pointstru *pt, int ordn) +{ + if (pt->coef) + return; + + const int DIMh = (pt->dumyd == -1) ? dim : 1; + pt->coef = new double[ordn * DIMh]; + pt->sind = new int[dim]; + if (DIMh == 3) + { + for (int i = 0; i < DIMh; i++) + { + double dd = pt->Bg->getdX(i); + pt->sind[i] = int((pt->lpox[i] - pt->Bg->X[i][0]) / dd) - ordn / 2 + 1; + double h1, h2; + for (int j = 0; j < ordn; j++) + { + h1 = pt->Bg->X[i][0] + (pt->sind[i] + j) * dd; + pt->coef[i * ordn + j] = 1; + for (int k = 0; k < j; k++) + { + h2 = pt->Bg->X[i][0] + (pt->sind[i] + k) * dd; + pt->coef[i * ordn + j] *= (pt->lpox[i] - h2) / (h1 - h2); + } + for (int k = j + 1; k < ordn; k++) + { + h2 = pt->Bg->X[i][0] + (pt->sind[i] + k) * dd; + pt->coef[i * ordn + j] *= (pt->lpox[i] - h2) / (h1 - h2); + } + } + } + } + else + { + int actd = 1 - pt->dumyd; + double dd = pt->Bg->getdX(actd); + pt->sind[0] = int((pt->lpox[actd] - pt->Bg->X[actd][0]) / dd) - ordn / 2 + 1; + double h1, h2; + for (int j = 0; j < ordn; j++) + { + h1 = pt->Bg->X[actd][0] + (pt->sind[0] + j) * dd; + pt->coef[j] = 1; + for (int k = 0; k < j; k++) + { + h2 = pt->Bg->X[actd][0] + (pt->sind[0] + k) * dd; + pt->coef[j] *= (pt->lpox[actd] - h2) / (h1 - h2); + } + for (int k = j + 1; k < ordn; k++) + { + h2 = pt->Bg->X[actd][0] + (pt->sind[0] + k) * dd; + pt->coef[j] *= (pt->lpox[actd] - h2) / (h1 - h2); + } + } + pt->sind[2] = int((pt->lpox[2] - pt->Bg->X[2][0]) / pt->Bg->getdX(2) + 0.001); + if (!feq(pt->Bg->X[2][pt->sind[2]], pt->lpox[2], pt->Bg->getdX(2) / 2000)) + cout << "error in ShellPatch::interdata_packer point = " << pt->lpox[2] << " != grid " << pt->Bg->X[2][pt->sind[2]] << endl; + pt->sind[1] = int((pt->lpox[pt->dumyd] - pt->Bg->X[pt->dumyd][0]) / + pt->Bg->getdX(pt->dumyd) + + 0.001); + if (!feq(pt->Bg->X[pt->dumyd][pt->sind[1]], pt->lpox[pt->dumyd], pt->Bg->getdX(pt->dumyd) / 2000)) + cout << "error in ShellPatch::interdata_packer for dumy dimension point = " + << pt->lpox[pt->dumyd] << " != grid " << pt->Bg->X[pt->dumyd][pt->sind[1]] << endl; + } +} + +void shell_pack_one(double *data, int out_base, ShellPatch::pointstru *src, + const std::vector &vars, int ordn, int Symmetry) +{ + shell_prepare_interp_coeffs(src, ordn); + const int DIMh = (src->dumyd == -1) ? dim : 1; + for (size_t vi = 0; vi < vars.size(); ++vi) + { + double &out = data[out_base + (int)vi]; + bool fast_done = false; + if (shell_fast_interp_enabled()) + { + if (DIMh == 3) + fast_done = shell_interp3d_fast(src, vars[vi], out, ordn); + else if (DIMh == 2) + fast_done = shell_interp2d_fast(src, vars[vi], out, ordn); + else if (DIMh == 1) + fast_done = shell_interp1d_fast(src, vars[vi], out, ordn); + } + if (fast_done) + { + shell_note_pack(DIMh, true); + continue; + } + shell_note_pack(DIMh, false); + switch (DIMh) + { + case 3: + f_global_interpind(src->Bg->shape, src->Bg->X[0], src->Bg->X[1], src->Bg->X[2], + src->Bg->fgfs[vars[vi]->sgfn], out, + src->lpox[0], src->lpox[1], src->lpox[2], ordn, vars[vi]->SoA, Symmetry, + src->sind, src->coef, src->ssst); + break; + case 2: + f_global_interpind2d(src->Bg->shape, src->Bg->X[0], src->Bg->X[1], src->Bg->X[2], + src->Bg->fgfs[vars[vi]->sgfn], out, + src->lpox[0], src->lpox[1], src->lpox[2], ordn, vars[vi]->SoA, Symmetry, + src->sind, src->coef, src->ssst); + break; + case 1: + f_global_interpind1d(src->Bg->shape, src->Bg->X[0], src->Bg->X[1], src->Bg->X[2], + src->Bg->fgfs[vars[vi]->sgfn], out, + src->lpox[0], src->lpox[1], src->lpox[2], ordn, vars[vi]->SoA, Symmetry, + src->sind, src->coef, src->ssst, src->dumyd); + break; + default: + cout << "ShellPatch::interdata_packer: not recognized DIM = " << DIMh << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } +} + +void shell_unpack_one(double *data, int out_base, ShellPatch::pointstru *dst, + const std::vector &vars) +{ + int DIM = dim; + for (size_t vi = 0; vi < vars.size(); ++vi) + f_pointcopy(DIM, dst->Bg->bbox, dst->Bg->bbox + dim, dst->Bg->shape, + dst->Bg->fgfs[vars[vi]->sgfn], + dst->lpox[0], dst->lpox[1], dst->lpox[2], + data[out_base + (int)vi]); +} + +} ss_patch::ss_patch(int ingfsi, int fngfsi, int *shapei, double *bboxi, int myranki) : ingfs(ingfsi), fngfs(fngfsi), myrank(myranki), blb(0), ble(0) { @@ -2544,17 +3491,32 @@ double *ShellPatch::Collect_Data(ss_patch *PP, var *VP) return databuffer; } -void ShellPatch::intertransfer(MyList **src, MyList **dst, - MyList *VarList1 /* source */, MyList *VarList2 /*target */, - int Symmetry) -{ +void ShellPatch::intertransfer(MyList **src, MyList **dst, + MyList *VarList1 /* source */, MyList *VarList2 /*target */, + int Symmetry) +{ int myrank, cpusize; MPI_Comm_size(MPI_COMM_WORLD, &cpusize); MPI_Comm_rank(MPI_COMM_WORLD, &myrank); - int node; - - MPI_Request *reqs; + int node; + const bool transfer_timing = shell_transfer_timing_enabled(); + double t_query = 0.0; + double t_pack = 0.0; + double t_post = 0.0; + double t_wait = 0.0; + double t_unpack = 0.0; + long long send_values = 0; + long long recv_values = 0; + double tt = 0.0; + +#if USE_CUDA_BSSN + const bool use_shell_cuda_pack = shell_cuda_interp_enabled(); + if (use_shell_cuda_pack) + bssn_cuda_shell_pack_cache_begin(); +#endif + + MPI_Request *reqs; MPI_Status *stats; reqs = new MPI_Request[2 * cpusize]; stats = new MPI_Status[2 * cpusize]; @@ -2568,55 +3530,109 @@ void ShellPatch::intertransfer(MyList **src, MyList **dst, for (node = 0; node < cpusize; node++) { send_data[node] = rec_data[node] = 0; - if (node == myrank) - { - if (length = interdata_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry)) - { - rec_data[node] = new double[length]; - if (!rec_data[node]) - { - cout << "out of memory when new in short transfer, place 1" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - interdata_packer(rec_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - } - } - else - { - // send from this cpu to cpu#node - if (length = interdata_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry)) - { - send_data[node] = new double[length]; - if (!send_data[node]) - { - cout << "out of memory when new in short transfer, place 2" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - interdata_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); - MPI_Isend((void *)send_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++); - } - // receive from cpu#node to this cpu - if (length = interdata_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry)) - { - rec_data[node] = new double[length]; - if (!rec_data[node]) - { - cout << "out of memory when new in short transfer, place 3" << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - MPI_Irecv((void *)rec_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++); - } - } - } - // wait for all requests to complete - MPI_Waitall(req_no, reqs, stats); - - for (node = 0; node < cpusize; node++) - if (rec_data[node]) - interdata_packer(rec_data[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); - - for (node = 0; node < cpusize; node++) - { + if (node == myrank) + { + if (transfer_timing) tt = MPI_Wtime(); + if (length = interdata_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry)) + { + if (transfer_timing) t_query += MPI_Wtime() - tt; + rec_data[node] = new double[length]; + if (!rec_data[node]) + { + cout << "out of memory when new in short transfer, place 1" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + recv_values += length; + if (transfer_timing) tt = MPI_Wtime(); + interdata_packer(rec_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + if (transfer_timing) t_pack += MPI_Wtime() - tt; + } + else if (transfer_timing) + t_query += MPI_Wtime() - tt; + } + else + { + // send from this cpu to cpu#node + if (transfer_timing) tt = MPI_Wtime(); + if (length = interdata_packer(0, src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry)) + { + if (transfer_timing) t_query += MPI_Wtime() - tt; + send_data[node] = new double[length]; + if (!send_data[node]) + { + cout << "out of memory when new in short transfer, place 2" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + send_values += length; + if (transfer_timing) tt = MPI_Wtime(); + interdata_packer(send_data[node], src[myrank], dst[myrank], node, PACK, VarList1, VarList2, Symmetry); + if (transfer_timing) t_pack += MPI_Wtime() - tt; + if (transfer_timing) tt = MPI_Wtime(); + MPI_Isend((void *)send_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++); + if (transfer_timing) t_post += MPI_Wtime() - tt; + } + else if (transfer_timing) + t_query += MPI_Wtime() - tt; + // receive from cpu#node to this cpu + if (transfer_timing) tt = MPI_Wtime(); + if (length = interdata_packer(0, src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry)) + { + if (transfer_timing) t_query += MPI_Wtime() - tt; + rec_data[node] = new double[length]; + if (!rec_data[node]) + { + cout << "out of memory when new in short transfer, place 3" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + recv_values += length; + if (transfer_timing) tt = MPI_Wtime(); + MPI_Irecv((void *)rec_data[node], length, MPI_DOUBLE, node, 1, MPI_COMM_WORLD, reqs + req_no++); + if (transfer_timing) t_post += MPI_Wtime() - tt; + } + else if (transfer_timing) + t_query += MPI_Wtime() - tt; + } + } + // wait for all requests to complete + if (transfer_timing) tt = MPI_Wtime(); + MPI_Waitall(req_no, reqs, stats); + if (transfer_timing) t_wait += MPI_Wtime() - tt; + +#if USE_CUDA_BSSN + if (use_shell_cuda_pack) + bssn_cuda_shell_pack_cache_end(); +#endif + + for (node = 0; node < cpusize; node++) + if (rec_data[node]) + { + if (transfer_timing) tt = MPI_Wtime(); + interdata_packer(rec_data[node], src[node], dst[node], node, UNPACK, VarList1, VarList2, Symmetry); + if (transfer_timing) t_unpack += MPI_Wtime() - tt; + } + + if (transfer_timing) + { + double local[5] = {t_query, t_pack, t_post, t_wait, t_unpack}; + double maxv[5] = {}; + long long counts[2] = {send_values, recv_values}; + long long max_counts[2] = {}; + MPI_Reduce(local, maxv, 5, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + MPI_Reduce(counts, max_counts, 2, MPI_LONG_LONG, MPI_MAX, 0, MPI_COMM_WORLD); + if (myrank == 0) + cout << "[AMSS-SHELL-TRANSFER] vars=" << shell_var_count(VarList1) + << " query=" << maxv[0] + << " pack=" << maxv[1] + << " post=" << maxv[2] + << " wait=" << maxv[3] + << " unpack=" << maxv[4] + << " send_values=" << max_counts[0] + << " recv_values=" << max_counts[1] + << endl; + } + + for (node = 0; node < cpusize; node++) + { if (send_data[node]) delete[] send_data[node]; if (rec_data[node]) @@ -2660,14 +3676,77 @@ int ShellPatch::interdata_packer(double *data, MyList *src, MyListnext; } - if (varls || varld) - { - cout << "error in short data packer, var lists does not match." << endl; - MPI_Abort(MPI_COMM_WORLD, 1); - } - - while (src && dst) - { + if (varls || varld) + { + cout << "error in short data packer, var lists does not match." << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + if (!data || shell_parallel_interp_enabled() || shell_cuda_interp_enabled()) + { + ShellActiveCache &active_cache = shell_get_active_cache(src, dst, rank_in, dir, VarLists, VarListd, myrank, ordn); + const std::vector &src_vars = active_cache.src_vars; + const std::vector &dst_vars = active_cache.dst_vars; + const std::vector &active_points = active_cache.active_points; + const std::vector &fast_points = active_cache.fast_points; + const std::vector &dst_indices = active_cache.dst_indices; + + if (!data) + return (int)(active_points.size() * src_vars.size()); + + if (active_points.size() * src_vars.size() >= 2048) + { + const int nthreads = std::min(shell_interp_threads(), (int)active_points.size()); + const int nvar = (int)src_vars.size(); + + bool cuda_pack_done = false; + if (dir == PACK) + cuda_pack_done = shell_pack_cuda_fast_points(data, active_points, src_vars, fast_points, ordn); + + std::vector workers; + workers.reserve(nthreads); + if (!cuda_pack_done) + { + for (int tid = 0; tid < nthreads; ++tid) + { + const int begin = (int)((long long)active_points.size() * tid / nthreads); + const int end = (int)((long long)active_points.size() * (tid + 1) / nthreads); + workers.push_back(std::thread([&, begin, end]() { + for (int p = begin; p < end; ++p) + { + const int base = p * nvar; + if (!fast_points[p]) + continue; + if (dir == PACK) + shell_pack_fast_only(data, base, active_points[p].src, src_vars, ordn); + else + { + for (int vi = 0; vi < nvar; ++vi) + active_points[p].dst->Bg->fgfs[dst_vars[vi]->sgfn][dst_indices[p]] = data[base + vi]; + } + } + })); + } + for (size_t t = 0; t < workers.size(); ++t) + workers[t].join(); + } + + for (size_t p = 0; p < active_points.size(); ++p) + { + if (fast_points[p]) + continue; + const int base = (int)p * nvar; + if (dir == PACK) + shell_pack_one(data, base, active_points[p].src, src_vars, ordn, Symmetry); + else + shell_unpack_one(data, base, active_points[p].dst, dst_vars); + } + return (int)(active_points.size() * src_vars.size()); + } + } + + while (src && dst) + { if ((dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) || (dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank)) { @@ -2746,11 +3825,23 @@ int ShellPatch::interdata_packer(double *data, MyList *src, MyListdata->lpox[src->data->dumyd] << " != grid " << src->data->Bg->X[src->data->dumyd][src->data->sind[1]] << endl; } } - // interpolate - switch (DIMh) - { - case 3: - f_global_interpind(src->data->Bg->shape, src->data->Bg->X[0], src->data->Bg->X[1], src->data->Bg->X[2], + // interpolate + bool fast_done = false; + if (shell_fast_interp_enabled()) + { + if (DIMh == 3) + fast_done = shell_interp3d_fast(src->data, varls->data, data[size_out], ordn); + else if (DIMh == 2) + fast_done = shell_interp2d_fast(src->data, varls->data, data[size_out], ordn); + else if (DIMh == 1) + fast_done = shell_interp1d_fast(src->data, varls->data, data[size_out], ordn); + } + shell_note_pack(DIMh, fast_done); + if (!fast_done) + switch (DIMh) + { + case 3: + f_global_interpind(src->data->Bg->shape, src->data->Bg->X[0], src->data->Bg->X[1], src->data->Bg->X[2], src->data->Bg->fgfs[varls->data->sgfn], data[size_out], src->data->lpox[0], src->data->lpox[1], src->data->lpox[2], ordn, varls->data->SoA, Symmetry, src->data->sind, src->data->coef, src->data->ssst); @@ -2771,11 +3862,13 @@ int ShellPatch::interdata_packer(double *data, MyList *src, MyList *VarList, int Symmetry) +{ // fill shell first - intertransfer(csats_src, csats_dst, VarList, VarList, Symmetry); - // fill box then - intertransfer(csatc_src, csatc_dst, VarList, VarList, Symmetry); -} + intertransfer(csats_src, csats_dst, VarList, VarList, Symmetry); + // fill box then + intertransfer(csatc_src, csatc_dst, VarList, VarList, Symmetry); + shell_print_interp_stats("after CS_Inter"); +} void ShellPatch::check_pointstrul(MyList *pp, bool first_only) {