From 9e51793499eab22aa59ad05d2e11ab8b679fcb64 Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Thu, 14 May 2026 02:27:21 +0800 Subject: [PATCH] Add full FD order support (2nd/4th/6th/8th) to C derivative kernels via ghost_width dispatch Wrap each C kernel in #if (ghost_width == N) blocks matching Fortran stencil coefficients from diff_new.f90, kodiss.f90, and lopsidediff.f90. Add fast-path indexing for ord=1,4,5 in share_func.h. Co-Authored-By: Claude Opus 4.7 --- AMSS_NCKU_source/fdderivs_c.C | 1056 ++++++++++++++++++++------- AMSS_NCKU_source/fderivs_c.C | 687 ++++++++++++++--- AMSS_NCKU_source/kodiss_c.C | 364 +++++++-- AMSS_NCKU_source/lopsided_c.C | 780 ++++++++++++++------ AMSS_NCKU_source/lopsided_kodis_c.C | 554 ++++++++------ AMSS_NCKU_source/share_func.h | 48 +- 6 files changed, 2612 insertions(+), 877 deletions(-) diff --git a/AMSS_NCKU_source/fdderivs_c.C b/AMSS_NCKU_source/fdderivs_c.C index 4ae31d4..ea9d069 100644 --- a/AMSS_NCKU_source/fdderivs_c.C +++ b/AMSS_NCKU_source/fdderivs_c.C @@ -1,4 +1,13 @@ +#include "macrodef.h" #include "tool.h" + +/* + * C 版 fdderivs — second derivatives d2f/dx2, d2f/dxdy, d2f/dxdz, d2f/dy2, d2f/dydz, d2f/dz2. + * + * Finite difference order selected at compile time via ghost_width macro. + * Multi-pass skip strategy: lowest-order computes widest region while skipping + * the union of higher-order regions, then each higher pass overwrites its interior. + */ void fdderivs(const int ex[3], const double *f, double *fxx, double *fxy, double *fxz, @@ -11,83 +20,179 @@ void fdderivs(const int ex[3], const int NO_SYMM = 0, EQ_SYMM = 1; const double ZEO = 0.0, ONE = 1.0, TWO = 2.0; - const double F1o4 = 2.5e-1; // 1/4 + const double F1o4 = 2.5e-1; const double F8 = 8.0; const double F16 = 16.0; const double F30 = 30.0; const double F1o12 = ONE / 12.0; const double F1o144 = ONE / 144.0; + const double F9 = 9.0, F45 = 45.0, F60 = 60.0; + const double F27 = 27.0, F270 = 270.0, F490 = 490.0; + const double F1o180 = ONE / 180.0; + const double F1o3600 = ONE / 3600.0; + const double F32 = 32.0, F128 = 128.0, F168 = 168.0, F672 = 672.0; + const double F840 = 840.0, F1008 = 1008.0, F8064 = 8064.0, F14350 = 14350.0; + const double F1o5040 = ONE / 5040.0; + const double F1o705600 = ONE / 705600.0; const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2]; - const double dX = X[1] - X[0]; const double dY = Y[1] - Y[0]; const double dZ = Z[1] - Z[0]; - const int imaxF = ex1; - const int jmaxF = ex2; - const int kmaxF = ex3; + const int imaxF = ex1, jmaxF = ex2, kmaxF = ex3; - int iminF = 1, jminF = 1, kminF = 1; - if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1; - if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1; - if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1; - - const double SoA[3] = { SYM1, SYM2, SYM3 }; - - /* fh: (ex1+2)*(ex2+2)*(ex3+2) because ord=2 */ - const size_t nx = (size_t)ex1 + 2; - const size_t ny = (size_t)ex2 + 2; - const size_t nz = (size_t)ex3 + 2; - const size_t fh_size = nx * ny * nz; - - static double *fh = NULL; - static size_t cap = 0; - - if (fh_size > cap) { - free(fh); - fh = (double*)aligned_alloc(64, fh_size * sizeof(double)); - cap = fh_size; - } - // double *fh = (double*)malloc(fh_size * sizeof(double)); - if (!fh) return; - - symmetry_bd(2, ex, f, fh, SoA); - - /* 系数:按 Fortran 原式 */ - const double Sdxdx = ONE / (dX * dX); - const double Sdydy = ONE / (dY * dY); - const double Sdzdz = ONE / (dZ * dZ); - - const double Fdxdx = F1o12 / (dX * dX); - const double Fdydy = F1o12 / (dY * dY); - const double Fdzdz = F1o12 / (dZ * dZ); - - const double Sdxdy = F1o4 / (dX * dY); - const double Sdxdz = F1o4 / (dX * dZ); - const double Sdydz = F1o4 / (dY * dZ); - - const double Fdxdy = F1o144 / (dX * dY); - const double Fdxdz = F1o144 / (dX * dZ); - const double Fdydz = F1o144 / (dY * dZ); - - /* 只清零不被主循环覆盖的边界面 */ +#if (ghost_width == 2) + /* ---- 2nd-order ------------------------------------------------------ */ { - /* 高边界:k0=ex3-1 */ + const int ord = 1; + + int iminF = 1, jminF = 1, kminF = 1; + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = 0; + if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = 0; + if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = 0; + + const double SoA[3] = { SYM1, SYM2, SYM3 }; + + const size_t nx = (size_t)ex1 + ord; + const size_t ny = (size_t)ex2 + ord; + const size_t nz = (size_t)ex3 + ord; + const size_t fh_size = nx * ny * nz; + + static double *fh_buf = NULL; + static size_t cap = 0; + if (fh_size > cap) { + free(fh_buf); + fh_buf = (double*)aligned_alloc(64, fh_size * sizeof(double)); + cap = fh_size; + } + double *fh = fh_buf; + if (!fh) return; + + symmetry_bd(ord, ex, f, fh, SoA); + + const double Sdxdx = ONE / (dX * dX); + const double Sdydy = ONE / (dY * dY); + const double Sdzdz = ONE / (dZ * dZ); + const double Sdxdy = F1o4 / (dX * dY); + const double Sdxdz = F1o4 / (dX * dZ); + const double Sdydz = F1o4 / (dY * dZ); + + const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3; + for (size_t p = 0; p < all; ++p) { + fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO; + fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO; + } + + const int i2_lo = (iminF > 0) ? iminF : 0; + const int j2_lo = (jminF > 0) ? jminF : 0; + const int k2_lo = (kminF > 0) ? kminF : 0; + const int i2_hi = ex1 - 2; + const int j2_hi = ex2 - 2; + const int k2_hi = ex3 - 2; + + if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) { + for (int k0 = k2_lo; k0 <= k2_hi; ++k0) { + const int kF = k0; + for (int j0 = j2_lo; j0 <= j2_hi; ++j0) { + const int jF = j0; + for (int i0 = i2_lo; i0 <= i2_hi; ++i0) { + const int iF = i0; + const size_t p = idx_ex(i0, j0, k0, ex); + + fxx[p] = Sdxdx * ( + fh[idx_fh_F_ord1(iF - 1, jF, kF, ex)] - + TWO * fh[idx_fh_F_ord1(iF, jF, kF, ex)] + + fh[idx_fh_F_ord1(iF + 1, jF, kF, ex)]); + + fyy[p] = Sdydy * ( + fh[idx_fh_F_ord1(iF, jF - 1, kF, ex)] - + TWO * fh[idx_fh_F_ord1(iF, jF, kF, ex)] + + fh[idx_fh_F_ord1(iF, jF + 1, kF, ex)]); + + fzz[p] = Sdzdz * ( + fh[idx_fh_F_ord1(iF, jF, kF - 1, ex)] - + TWO * fh[idx_fh_F_ord1(iF, jF, kF, ex)] + + fh[idx_fh_F_ord1(iF, jF, kF + 1, ex)]); + + fxy[p] = Sdxdy * ( + fh[idx_fh_F_ord1(iF - 1, jF - 1, kF, ex)] - + fh[idx_fh_F_ord1(iF + 1, jF - 1, kF, ex)] - + fh[idx_fh_F_ord1(iF - 1, jF + 1, kF, ex)] + + fh[idx_fh_F_ord1(iF + 1, jF + 1, kF, ex)]); + + fxz[p] = Sdxdz * ( + fh[idx_fh_F_ord1(iF - 1, jF, kF - 1, ex)] - + fh[idx_fh_F_ord1(iF + 1, jF, kF - 1, ex)] - + fh[idx_fh_F_ord1(iF - 1, jF, kF + 1, ex)] + + fh[idx_fh_F_ord1(iF + 1, jF, kF + 1, ex)]); + + fyz[p] = Sdydz * ( + fh[idx_fh_F_ord1(iF, jF - 1, kF - 1, ex)] - + fh[idx_fh_F_ord1(iF, jF + 1, kF - 1, ex)] - + fh[idx_fh_F_ord1(iF, jF - 1, kF + 1, ex)] + + fh[idx_fh_F_ord1(iF, jF + 1, kF + 1, ex)]); + } + } + } + } + return; + } +#elif (ghost_width == 3) + /* ---- 4th-order (original code) ------------------------------------ */ + { + const int ord = 2; + + int iminF = 1, jminF = 1, kminF = 1; + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1; + if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1; + if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1; + + const double SoA[3] = { SYM1, SYM2, SYM3 }; + + const size_t nx = (size_t)ex1 + ord; + const size_t ny = (size_t)ex2 + ord; + const size_t nz = (size_t)ex3 + ord; + const size_t fh_size = nx * ny * nz; + + static double *fh_buf = NULL; + static size_t cap = 0; + if (fh_size > cap) { + free(fh_buf); + fh_buf = (double*)aligned_alloc(64, fh_size * sizeof(double)); + cap = fh_size; + } + double *fh = fh_buf; + if (!fh) return; + + symmetry_bd(ord, ex, f, fh, SoA); + + const double Sdxdx = ONE / (dX * dX); + const double Sdydy = ONE / (dY * dY); + const double Sdzdz = ONE / (dZ * dZ); + const double Fdxdx = F1o12 / (dX * dX); + const double Fdydy = F1o12 / (dY * dY); + const double Fdzdz = F1o12 / (dZ * dZ); + const double Sdxdy = F1o4 / (dX * dY); + const double Sdxdz = F1o4 / (dX * dZ); + const double Sdydz = F1o4 / (dY * dZ); + const double Fdxdy = F1o144 / (dX * dY); + const double Fdxdz = F1o144 / (dX * dZ); + const double Fdydz = F1o144 / (dY * dZ); + + /* zero high-boundary faces (points the loops below won't cover) */ for (int j0 = 0; j0 < ex2; ++j0) for (int i0 = 0; i0 < ex1; ++i0) { const size_t p = idx_ex(i0, j0, ex3 - 1, ex); fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO; fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO; } - /* 高边界:j0=ex2-1 */ for (int k0 = 0; k0 < ex3 - 1; ++k0) for (int i0 = 0; i0 < ex1; ++i0) { const size_t p = idx_ex(i0, ex2 - 1, k0, ex); fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO; fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO; } - /* 高边界:i0=ex1-1 */ for (int k0 = 0; k0 < ex3 - 1; ++k0) for (int j0 = 0; j0 < ex2 - 1; ++j0) { const size_t p = idx_ex(ex1 - 1, j0, k0, ex); @@ -95,7 +200,6 @@ void fdderivs(const int ex[3], fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO; } - /* 低边界:当二阶模板也不可用时,对应 i0/j0/k0=0 面 */ if (kminF == 1) { for (int j0 = 0; j0 < ex2; ++j0) for (int i0 = 0; i0 < ex1; ++i0) { @@ -120,213 +224,671 @@ void fdderivs(const int ex[3], fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO; } } - } - /* - * 两段式: - * 1) 二阶可用区域先计算二阶模板 - * 2) 高阶可用区域再覆盖四阶模板 - */ - const int i2_lo = (iminF > 0) ? iminF : 0; - const int j2_lo = (jminF > 0) ? jminF : 0; - const int k2_lo = (kminF > 0) ? kminF : 0; - const int i2_hi = ex1 - 2; - const int j2_hi = ex2 - 2; - const int k2_hi = ex3 - 2; + const int i2_lo = (iminF > 0) ? iminF : 0; + const int j2_lo = (jminF > 0) ? jminF : 0; + const int k2_lo = (kminF > 0) ? kminF : 0; + const int i2_hi = ex1 - 2; + const int j2_hi = ex2 - 2; + const int k2_hi = ex3 - 2; - const int i4_lo = (iminF + 1 > 0) ? (iminF + 1) : 0; - const int j4_lo = (jminF + 1 > 0) ? (jminF + 1) : 0; - const int k4_lo = (kminF + 1 > 0) ? (kminF + 1) : 0; - const int i4_hi = ex1 - 3; - const int j4_hi = ex2 - 3; - const int k4_hi = ex3 - 3; + const int i4_lo = (iminF + 1 > 0) ? (iminF + 1) : 0; + const int j4_lo = (jminF + 1 > 0) ? (jminF + 1) : 0; + const int k4_lo = (kminF + 1 > 0) ? (kminF + 1) : 0; + const int i4_hi = ex1 - 3; + const int j4_hi = ex2 - 3; + const int k4_hi = ex3 - 3; - /* - * Strategy A: - * Avoid redundant work in overlap of 2nd/4th-order regions. - * Only compute 2nd-order on shell points that are NOT overwritten by - * the 4th-order pass. - */ - const int has4 = (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi); + const int has4 = (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi); - if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) { - for (int k0 = k2_lo; k0 <= k2_hi; ++k0) { - const int kF = k0 + 1; - for (int j0 = j2_lo; j0 <= j2_hi; ++j0) { - const int jF = j0 + 1; - for (int i0 = i2_lo; i0 <= i2_hi; ++i0) { - if (has4 && - i0 >= i4_lo && i0 <= i4_hi && - j0 >= j4_lo && j0 <= j4_hi && - k0 >= k4_lo && k0 <= k4_hi) { - continue; - } - const int iF = i0 + 1; - const size_t p = idx_ex(i0, j0, k0, ex); + if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) { + for (int k0 = k2_lo; k0 <= k2_hi; ++k0) { + const int kF = k0 + 1; + for (int j0 = j2_lo; j0 <= j2_hi; ++j0) { + const int jF = j0 + 1; + for (int i0 = i2_lo; i0 <= i2_hi; ++i0) { + if (has4 && + i0 >= i4_lo && i0 <= i4_hi && + j0 >= j4_lo && j0 <= j4_hi && + k0 >= k4_lo && k0 <= k4_hi) { + continue; + } + const int iF = i0 + 1; + const size_t p = idx_ex(i0, j0, k0, ex); - fxx[p] = Sdxdx * ( - fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] - - TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] + - fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)] - ); + fxx[p] = Sdxdx * ( + fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] - + TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] + + fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]); - fyy[p] = Sdydy * ( - fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] - - TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] + - fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)] - ); + fyy[p] = Sdydy * ( + fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] - + TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] + + fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]); - fzz[p] = Sdzdz * ( - fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] - - TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] + - fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)] - ); + fzz[p] = Sdzdz * ( + fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] - + TWO * fh[idx_fh_F_ord2(iF, jF, kF, ex)] + + fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]); - fxy[p] = Sdxdy * ( - fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)] - - fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)] - - fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)] + - fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)] - ); + fxy[p] = Sdxdy * ( + fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)] - + fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)] - + fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)] + + fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)]); - fxz[p] = Sdxdz * ( - fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)] - - fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)] - - fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)] + - fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)] - ); + fxz[p] = Sdxdz * ( + fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)] - + fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)] - + fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)] + + fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)]); - fyz[p] = Sdydz * ( - fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)] - - fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)] - - fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] + - fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)] - ); - } - } - } - } - - if (has4) { - for (int k0 = k4_lo; k0 <= k4_hi; ++k0) { - const int kF = k0 + 1; - for (int j0 = j4_lo; j0 <= j4_hi; ++j0) { - const int jF = j0 + 1; - for (int i0 = i4_lo; i0 <= i4_hi; ++i0) { - const int iF = i0 + 1; - const size_t p = idx_ex(i0, j0, k0, ex); - - fxx[p] = Fdxdx * ( - -fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] + - F16 * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] - - F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] - - fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)] + - F16 * fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)] - ); - - fyy[p] = Fdydy * ( - -fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] + - F16 * fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] - - F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] - - fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)] + - F16 * fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)] - ); - - fzz[p] = Fdzdz * ( - -fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] + - F16 * fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] - - F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] - - fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)] + - F16 * fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)] - ); - - { - const double t_jm2 = - ( fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)] - -F8*fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)] - +F8*fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)] - - fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)] ); - - const double t_jm1 = - ( fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)] - -F8*fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)] - +F8*fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)] - - fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)] ); - - const double t_jp1 = - ( fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)] - -F8*fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)] - +F8*fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)] - - fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)] ); - - const double t_jp2 = - ( fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)] - -F8*fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)] - +F8*fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)] - - fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)] ); - - fxy[p] = Fdxdy * ( t_jm2 - F8 * t_jm1 + F8 * t_jp1 - t_jp2 ); - } - - { - const double t_km2 = - ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)] - -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)] - +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)] - - fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)] ); - - const double t_km1 = - ( fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)] - -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)] - +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)] - - fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)] ); - - const double t_kp1 = - ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)] - -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)] - +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)] - - fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)] ); - - const double t_kp2 = - ( fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)] - -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)] - +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)] - - fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)] ); - - fxz[p] = Fdxdz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 ); - } - - { - const double t_km2 = - ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)] - -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)] - +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)] - - fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)] ); - - const double t_km1 = - ( fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)] - -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)] - +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)] - - fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)] ); - - const double t_kp1 = - ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)] - -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] - +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)] - - fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)] ); - - const double t_kp2 = - ( fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)] - -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)] - +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)] - - fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)] ); - - fyz[p] = Fdydz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 ); + fyz[p] = Sdydz * ( + fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)] - + fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)] - + fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] + + fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)]); } } } } - } - // free(fh); + if (has4) { + for (int k0 = k4_lo; k0 <= k4_hi; ++k0) { + const int kF = k0 + 1; + for (int j0 = j4_lo; j0 <= j4_hi; ++j0) { + const int jF = j0 + 1; + for (int i0 = i4_lo; i0 <= i4_hi; ++i0) { + const int iF = i0 + 1; + const size_t p = idx_ex(i0, j0, k0, ex); + + fxx[p] = Fdxdx * ( + -fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] + + F16 * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] - + F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] - + fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)] + + F16 * fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]); + + fyy[p] = Fdydy * ( + -fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] + + F16 * fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] - + F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] - + fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)] + + F16 * fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]); + + fzz[p] = Fdzdz * ( + -fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] + + F16 * fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] - + F30 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] - + fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)] + + F16 * fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]); + + /* fxy: 5x5 outer product */ + { + const double t_jm2 = ( + fh[idx_fh_F_ord2(iF - 2, jF - 2, kF, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF - 2, kF, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF - 2, kF, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF - 2, kF, ex)]); + + const double t_jm1 = ( + fh[idx_fh_F_ord2(iF - 2, jF - 1, kF, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF - 1, kF, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF - 1, kF, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF - 1, kF, ex)]); + + const double t_jp1 = ( + fh[idx_fh_F_ord2(iF - 2, jF + 1, kF, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF + 1, kF, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF + 1, kF, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF + 1, kF, ex)]); + + const double t_jp2 = ( + fh[idx_fh_F_ord2(iF - 2, jF + 2, kF, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF + 2, kF, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF + 2, kF, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF + 2, kF, ex)]); + + fxy[p] = Fdxdy * ( t_jm2 - F8 * t_jm1 + F8 * t_jp1 - t_jp2 ); + } + + /* fxz: 5x5 outer product */ + { + const double t_km2 = ( + fh[idx_fh_F_ord2(iF - 2, jF, kF - 2, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 2, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 2, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF, kF - 2, ex)]); + + const double t_km1 = ( + fh[idx_fh_F_ord2(iF - 2, jF, kF - 1, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF - 1, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF - 1, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF, kF - 1, ex)]); + + const double t_kp1 = ( + fh[idx_fh_F_ord2(iF - 2, jF, kF + 1, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 1, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 1, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF, kF + 1, ex)]); + + const double t_kp2 = ( + fh[idx_fh_F_ord2(iF - 2, jF, kF + 2, ex)] + -F8*fh[idx_fh_F_ord2(iF - 1, jF, kF + 2, ex)] + +F8*fh[idx_fh_F_ord2(iF + 1, jF, kF + 2, ex)] + - fh[idx_fh_F_ord2(iF + 2, jF, kF + 2, ex)]); + + fxz[p] = Fdxdz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 ); + } + + /* fyz: 5x5 outer product */ + { + const double t_km2 = ( + fh[idx_fh_F_ord2(iF, jF - 2, kF - 2, ex)] + -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 2, ex)] + +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 2, ex)] + - fh[idx_fh_F_ord2(iF, jF + 2, kF - 2, ex)]); + + const double t_km1 = ( + fh[idx_fh_F_ord2(iF, jF - 2, kF - 1, ex)] + -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF - 1, ex)] + +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF - 1, ex)] + - fh[idx_fh_F_ord2(iF, jF + 2, kF - 1, ex)]); + + const double t_kp1 = ( + fh[idx_fh_F_ord2(iF, jF - 2, kF + 1, ex)] + -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 1, ex)] + +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 1, ex)] + - fh[idx_fh_F_ord2(iF, jF + 2, kF + 1, ex)]); + + const double t_kp2 = ( + fh[idx_fh_F_ord2(iF, jF - 2, kF + 2, ex)] + -F8*fh[idx_fh_F_ord2(iF, jF - 1, kF + 2, ex)] + +F8*fh[idx_fh_F_ord2(iF, jF + 1, kF + 2, ex)] + - fh[idx_fh_F_ord2(iF, jF + 2, kF + 2, ex)]); + + fyz[p] = Fdydz * ( t_km2 - F8 * t_km1 + F8 * t_kp1 - t_kp2 ); + } + } + } + } + } + return; + } +#elif (ghost_width == 4) + /* ---- 6th-order ----------------------------------------------------- */ + { + const int ord = 3; + + int iminF = 1, jminF = 1, kminF = 1; + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2; + if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2; + if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2; + + const double SoA[3] = { SYM1, SYM2, SYM3 }; + + const size_t nx = (size_t)ex1 + ord; + const size_t ny = (size_t)ex2 + ord; + const size_t nz = (size_t)ex3 + ord; + const size_t fh_size = nx * ny * nz; + static double *fh_buf = NULL; + static size_t cap = 0; + if (fh_size > cap) { + free(fh_buf); + fh_buf = (double*)aligned_alloc(64, fh_size * sizeof(double)); + cap = fh_size; + } + double *fh = fh_buf; + if (!fh) return; + + symmetry_bd(ord, ex, f, fh, SoA); + + /* Denominators */ + const double Sdxdx = ONE / (dX * dX); // 2nd + const double Sdydy = ONE / (dY * dY); + const double Sdzdz = ONE / (dZ * dZ); + const double Fdxdx = F1o12 / (dX * dX); // 4th + const double Fdydy = F1o12 / (dY * dY); + const double Fdzdz = F1o12 / (dZ * dZ); + const double Xdxdx = F1o180 / (dX * dX); // 6th + const double Xdydy = F1o180 / (dY * dY); + const double Xdzdz = F1o180 / (dZ * dZ); + const double Sdxdy = F1o4 / (dX * dY); + const double Sdxdz = F1o4 / (dX * dZ); + const double Sdydz = F1o4 / (dY * dZ); + const double Fdxdy = F1o144 / (dX * dY); + const double Fdxdz = F1o144 / (dX * dZ); + const double Fdydz = F1o144 / (dY * dZ); + const double Xdxdy = F1o3600 / (dX * dY); + const double Xdxdz = F1o3600 / (dX * dZ); + const double Xdydz = F1o3600 / (dY * dZ); + + /* zero everything first */ + const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3; + for (size_t p = 0; p < all; ++p) { + fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO; + fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO; + } + + /* loop bounds for each pass (from widest to narrowest) */ + const int i2_lo = (iminF + 1 > 0) ? (iminF + 1) : 0; + const int j2_lo = (jminF + 1 > 0) ? (jminF + 1) : 0; + const int k2_lo = (kminF + 1 > 0) ? (kminF + 1) : 0; + const int i2_hi = ex1 - 2, j2_hi = ex2 - 2, k2_hi = ex3 - 2; + + const int i4_lo = (iminF + 2 > 0) ? (iminF + 2) : 0; + const int j4_lo = (jminF + 2 > 0) ? (jminF + 2) : 0; + const int k4_lo = (kminF + 2 > 0) ? (kminF + 2) : 0; + const int i4_hi = ex1 - 3, j4_hi = ex2 - 3, k4_hi = ex3 - 3; + + const int i6_lo = (iminF + 3 > 0) ? (iminF + 3) : 0; + const int j6_lo = (jminF + 3 > 0) ? (jminF + 3) : 0; + const int k6_lo = (kminF + 3 > 0) ? (kminF + 3) : 0; + const int i6_hi = ex1 - 4, j6_hi = ex2 - 4, k6_hi = ex3 - 4; + + const int has4 = (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi); + const int has6 = (i6_lo <= i6_hi && j6_lo <= j6_hi && k6_lo <= k6_hi); + + /* 2nd-order: skip 4th+6th overlap */ + if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) { + for (int k0 = k2_lo; k0 <= k2_hi; ++k0) { + const int kF = k0 + 2; + for (int j0 = j2_lo; j0 <= j2_hi; ++j0) { + const int jF = j0 + 2; + for (int i0 = i2_lo; i0 <= i2_hi; ++i0) { + _Bool in4 = has4 && i0>=i4_lo && i0<=i4_hi && j0>=j4_lo && j0<=j4_hi && k0>=k4_lo && k0<=k4_hi; + if (in4) continue; + const int iF = i0 + 2; + const size_t p = idx_ex(i0, j0, k0, ex); + + fxx[p] = Sdxdx * (fh[idx_fh_F(iF - 1, jF, kF, ex)] - TWO*fh[idx_fh_F(iF,jF,kF,ex)] + fh[idx_fh_F(iF + 1, jF, kF, ex)]); + fyy[p] = Sdydy * (fh[idx_fh_F(iF, jF - 1, kF, ex)] - TWO*fh[idx_fh_F(iF,jF,kF,ex)] + fh[idx_fh_F(iF, jF + 1, kF, ex)]); + fzz[p] = Sdzdz * (fh[idx_fh_F(iF, jF, kF - 1, ex)] - TWO*fh[idx_fh_F(iF,jF,kF,ex)] + fh[idx_fh_F(iF, jF, kF + 1, ex)]); + + fxy[p] = Sdxdy * (fh[idx_fh_F(iF - 1, jF - 1, kF, ex)] - fh[idx_fh_F(iF + 1, jF - 1, kF, ex)] - fh[idx_fh_F(iF - 1, jF + 1, kF, ex)] + fh[idx_fh_F(iF + 1, jF + 1, kF, ex)]); + fxz[p] = Sdxdz * (fh[idx_fh_F(iF - 1, jF, kF - 1, ex)] - fh[idx_fh_F(iF + 1, jF, kF - 1, ex)] - fh[idx_fh_F(iF - 1, jF, kF + 1, ex)] + fh[idx_fh_F(iF + 1, jF, kF + 1, ex)]); + fyz[p] = Sdydz * (fh[idx_fh_F(iF, jF - 1, kF - 1, ex)] - fh[idx_fh_F(iF, jF + 1, kF - 1, ex)] - fh[idx_fh_F(iF, jF - 1, kF + 1, ex)] + fh[idx_fh_F(iF, jF + 1, kF + 1, ex)]); + } + } + } + } + + /* 4th-order: skip 6th overlap */ + if (has4) { + for (int k0 = k4_lo; k0 <= k4_hi; ++k0) { + const int kF = k0 + 2; + for (int j0 = j4_lo; j0 <= j4_hi; ++j0) { + const int jF = j0 + 2; + for (int i0 = i4_lo; i0 <= i4_hi; ++i0) { + if (has6 && i0>=i6_lo && i0<=i6_hi && j0>=j6_lo && j0<=j6_hi && k0>=k6_lo && k0<=k6_hi) continue; + const int iF = i0 + 2; + const size_t p = idx_ex(i0, j0, k0, ex); + + fxx[p] = Fdxdx * (-fh[idx_fh_F(iF - 2, jF, kF, ex)] + F16*fh[idx_fh_F(iF-1,jF,kF,ex)] - F30*fh[idx_fh_F(iF,jF,kF,ex)] - fh[idx_fh_F(iF+2,jF,kF,ex)] + F16*fh[idx_fh_F(iF+1,jF,kF,ex)]); + fyy[p] = Fdydy * (-fh[idx_fh_F(iF, jF - 2, kF, ex)] + F16*fh[idx_fh_F(iF,jF-1,kF,ex)] - F30*fh[idx_fh_F(iF,jF,kF,ex)] - fh[idx_fh_F(iF,jF+2,kF,ex)] + F16*fh[idx_fh_F(iF,jF+1,kF,ex)]); + fzz[p] = Fdzdz * (-fh[idx_fh_F(iF, jF, kF - 2, ex)] + F16*fh[idx_fh_F(iF,jF,kF-1,ex)] - F30*fh[idx_fh_F(iF,jF,kF,ex)] - fh[idx_fh_F(iF,jF,kF+2,ex)] + F16*fh[idx_fh_F(iF,jF,kF+1,ex)]); + + { + const double t_jm2 = (fh[idx_fh_F(iF-2,jF-2,kF,ex)]-F8*fh[idx_fh_F(iF-1,jF-2,kF,ex)]+F8*fh[idx_fh_F(iF+1,jF-2,kF,ex)]-fh[idx_fh_F(iF+2,jF-2,kF,ex)]); + const double t_jm1 = (fh[idx_fh_F(iF-2,jF-1,kF,ex)]-F8*fh[idx_fh_F(iF-1,jF-1,kF,ex)]+F8*fh[idx_fh_F(iF+1,jF-1,kF,ex)]-fh[idx_fh_F(iF+2,jF-1,kF,ex)]); + const double t_jp1 = (fh[idx_fh_F(iF-2,jF+1,kF,ex)]-F8*fh[idx_fh_F(iF-1,jF+1,kF,ex)]+F8*fh[idx_fh_F(iF+1,jF+1,kF,ex)]-fh[idx_fh_F(iF+2,jF+1,kF,ex)]); + const double t_jp2 = (fh[idx_fh_F(iF-2,jF+2,kF,ex)]-F8*fh[idx_fh_F(iF-1,jF+2,kF,ex)]+F8*fh[idx_fh_F(iF+1,jF+2,kF,ex)]-fh[idx_fh_F(iF+2,jF+2,kF,ex)]); + fxy[p] = Fdxdy * (t_jm2 - F8*t_jm1 + F8*t_jp1 - t_jp2); + } + { + const double t_km2 = (fh[idx_fh_F(iF-2,jF,kF-2,ex)]-F8*fh[idx_fh_F(iF-1,jF,kF-2,ex)]+F8*fh[idx_fh_F(iF+1,jF,kF-2,ex)]-fh[idx_fh_F(iF+2,jF,kF-2,ex)]); + const double t_km1 = (fh[idx_fh_F(iF-2,jF,kF-1,ex)]-F8*fh[idx_fh_F(iF-1,jF,kF-1,ex)]+F8*fh[idx_fh_F(iF+1,jF,kF-1,ex)]-fh[idx_fh_F(iF+2,jF,kF-1,ex)]); + const double t_kp1 = (fh[idx_fh_F(iF-2,jF,kF+1,ex)]-F8*fh[idx_fh_F(iF-1,jF,kF+1,ex)]+F8*fh[idx_fh_F(iF+1,jF,kF+1,ex)]-fh[idx_fh_F(iF+2,jF,kF+1,ex)]); + const double t_kp2 = (fh[idx_fh_F(iF-2,jF,kF+2,ex)]-F8*fh[idx_fh_F(iF-1,jF,kF+2,ex)]+F8*fh[idx_fh_F(iF+1,jF,kF+2,ex)]-fh[idx_fh_F(iF+2,jF,kF+2,ex)]); + fxz[p] = Fdxdz * (t_km2 - F8*t_km1 + F8*t_kp1 - t_kp2); + } + { + const double t_km2 = (fh[idx_fh_F(iF,jF-2,kF-2,ex)]-F8*fh[idx_fh_F(iF,jF-1,kF-2,ex)]+F8*fh[idx_fh_F(iF,jF+1,kF-2,ex)]-fh[idx_fh_F(iF,jF+2,kF-2,ex)]); + const double t_km1 = (fh[idx_fh_F(iF,jF-2,kF-1,ex)]-F8*fh[idx_fh_F(iF,jF-1,kF-1,ex)]+F8*fh[idx_fh_F(iF,jF+1,kF-1,ex)]-fh[idx_fh_F(iF,jF+2,kF-1,ex)]); + const double t_kp1 = (fh[idx_fh_F(iF,jF-2,kF+1,ex)]-F8*fh[idx_fh_F(iF,jF-1,kF+1,ex)]+F8*fh[idx_fh_F(iF,jF+1,kF+1,ex)]-fh[idx_fh_F(iF,jF+2,kF+1,ex)]); + const double t_kp2 = (fh[idx_fh_F(iF,jF-2,kF+2,ex)]-F8*fh[idx_fh_F(iF,jF-1,kF+2,ex)]+F8*fh[idx_fh_F(iF,jF+1,kF+2,ex)]-fh[idx_fh_F(iF,jF+2,kF+2,ex)]); + fyz[p] = Fdydz * (t_km2 - F8*t_km1 + F8*t_kp1 - t_kp2); + } + } + } + } + } + + /* 6th-order: interior only */ + if (has6) { + for (int k0 = k6_lo; k0 <= k6_hi; ++k0) { + const int kF = k0 + 2; + for (int j0 = j6_lo; j0 <= j6_hi; ++j0) { + const int jF = j0 + 2; + for (int i0 = i6_lo; i0 <= i6_hi; ++i0) { + const int iF = i0 + 2; + const size_t p = idx_ex(i0, j0, k0, ex); + + /* Diagonal: [+2,-27,+270,-490,+270,-27,+2] / (180*dx^2) */ + fxx[p] = Xdxdx * ( + TWO * fh[idx_fh_F(iF - 3, jF, kF, ex)] - + F27 * fh[idx_fh_F(iF - 2, jF, kF, ex)] + + F270 * fh[idx_fh_F(iF - 1, jF, kF, ex)] - + F490 * fh[idx_fh_F(iF, jF, kF, ex)] + + F270 * fh[idx_fh_F(iF + 1, jF, kF, ex)] - + F27 * fh[idx_fh_F(iF + 2, jF, kF, ex)] + + TWO * fh[idx_fh_F(iF + 3, jF, kF, ex)]); + + fyy[p] = Xdydy * ( + TWO * fh[idx_fh_F(iF, jF - 3, kF, ex)] - + F27 * fh[idx_fh_F(iF, jF - 2, kF, ex)] + + F270 * fh[idx_fh_F(iF, jF - 1, kF, ex)] - + F490 * fh[idx_fh_F(iF, jF, kF, ex)] + + F270 * fh[idx_fh_F(iF, jF + 1, kF, ex)] - + F27 * fh[idx_fh_F(iF, jF + 2, kF, ex)] + + TWO * fh[idx_fh_F(iF, jF + 3, kF, ex)]); + + fzz[p] = Xdzdz * ( + TWO * fh[idx_fh_F(iF, jF, kF - 3, ex)] - + F27 * fh[idx_fh_F(iF, jF, kF - 2, ex)] + + F270 * fh[idx_fh_F(iF, jF, kF - 1, ex)] - + F490 * fh[idx_fh_F(iF, jF, kF, ex)] + + F270 * fh[idx_fh_F(iF, jF, kF + 1, ex)] - + F27 * fh[idx_fh_F(iF, jF, kF + 2, ex)] + + TWO * fh[idx_fh_F(iF, jF, kF + 3, ex)]); + + /* Mixed: 7x7 outer product. Compute 1D x-stencil at each y/z offset, + then combine using 1D y/z weights [-1,+9,-45,0,+45,-9,+1] / (3600*dx*dy) */ + { + // x-stencil: -f(i-3)+9f(i-2)-45f(i-1)+45f(i+1)-9f(i+2)+f(i+3) + // Helper macro would help but explicit is safer + #define XSTEN6(JF, KF_DUMMY) \ + (-fh[idx_fh_F(iF-3,JF,KF_DUMMY,ex)] + F9*fh[idx_fh_F(iF-2,JF,KF_DUMMY,ex)] - F45*fh[idx_fh_F(iF-1,JF,KF_DUMMY,ex)] + F45*fh[idx_fh_F(iF+1,JF,KF_DUMMY,ex)] - F9*fh[idx_fh_F(iF+2,JF,KF_DUMMY,ex)] + fh[idx_fh_F(iF+3,JF,KF_DUMMY,ex)]) + + fxy[p] = Xdxdy * ( + -XSTEN6(jF-3, kF) + F9*XSTEN6(jF-2, kF) - F45*XSTEN6(jF-1, kF) + F45*XSTEN6(jF+1, kF) - F9*XSTEN6(jF+2, kF) + XSTEN6(jF+3, kF)); + + fxz[p] = Xdxdz * ( + -XSTEN6(jF, kF-3) + F9*XSTEN6(jF, kF-2) - F45*XSTEN6(jF, kF-1) + F45*XSTEN6(jF, kF+1) - F9*XSTEN6(jF, kF+2) + XSTEN6(jF, kF+3)); + + #undef XSTEN6 + } + + /* fyz: apply 1D y-stencil at each z offset */ + { + #define YSTEN6(JF, KF_DUMMY) \ + (-fh[idx_fh_F(iF,JF-3,KF_DUMMY,ex)] + F9*fh[idx_fh_F(iF,JF-2,KF_DUMMY,ex)] - F45*fh[idx_fh_F(iF,JF-1,KF_DUMMY,ex)] + F45*fh[idx_fh_F(iF,JF+1,KF_DUMMY,ex)] - F9*fh[idx_fh_F(iF,JF+2,KF_DUMMY,ex)] + fh[idx_fh_F(iF,JF+3,KF_DUMMY,ex)]) + + fyz[p] = Xdydz * ( + -YSTEN6(jF, kF-3) + F9*YSTEN6(jF, kF-2) - F45*YSTEN6(jF, kF-1) + F45*YSTEN6(jF, kF+1) - F9*YSTEN6(jF, kF+2) + YSTEN6(jF, kF+3)); + + #undef YSTEN6 + } + } + } + } + } + return; + } +#elif (ghost_width == 5) + /* ---- 8th-order ----------------------------------------------------- */ + { + const int ord = 4; + + int iminF = 1, jminF = 1, kminF = 1; + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -3; + if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -3; + if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -3; + + const double SoA[3] = { SYM1, SYM2, SYM3 }; + + const size_t nx = (size_t)ex1 + ord; + const size_t ny = (size_t)ex2 + ord; + const size_t nz = (size_t)ex3 + ord; + const size_t fh_size = nx * ny * nz; + static double *fh_buf = NULL; + static size_t cap = 0; + if (fh_size > cap) { + free(fh_buf); + fh_buf = (double*)aligned_alloc(64, fh_size * sizeof(double)); + cap = fh_size; + } + double *fh = fh_buf; + if (!fh) return; + + symmetry_bd(ord, ex, f, fh, SoA); + + const double Sdxdx = ONE / (dX * dX); + const double Sdydy = ONE / (dY * dY); + const double Sdzdz = ONE / (dZ * dZ); + const double Fdxdx = F1o12 / (dX * dX); + const double Fdydy = F1o12 / (dY * dY); + const double Fdzdz = F1o12 / (dZ * dZ); + const double Xdxdx = F1o180 / (dX * dX); + const double Xdydy = F1o180 / (dY * dY); + const double Xdzdz = F1o180 / (dZ * dZ); + const double Edxdx = F1o5040 / (dX * dX); + const double Edydy = F1o5040 / (dY * dY); + const double Edzdz = F1o5040 / (dZ * dZ); + const double Sdxdy = F1o4 / (dX * dY); + const double Sdxdz = F1o4 / (dX * dZ); + const double Sdydz = F1o4 / (dY * dZ); + const double Fdxdy = F1o144 / (dX * dY); + const double Fdxdz = F1o144 / (dX * dZ); + const double Fdydz = F1o144 / (dY * dZ); + const double Xdxdy = F1o3600 / (dX * dY); + const double Xdxdz = F1o3600 / (dX * dZ); + const double Xdydz = F1o3600 / (dY * dZ); + const double Edxdy = F1o705600 / (dX * dY); + const double Edxdz = F1o705600 / (dX * dZ); + const double Edydz = F1o705600 / (dY * dZ); + + const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3; + for (size_t p = 0; p < all; ++p) { + fxx[p]=ZEO; fyy[p]=ZEO; fzz[p]=ZEO; + fxy[p]=ZEO; fxz[p]=ZEO; fyz[p]=ZEO; + } + + /* Loop bounds for each pass */ + const int i2_lo = (iminF + 2 > 0) ? (iminF + 2) : 0; + const int j2_lo = (jminF + 2 > 0) ? (jminF + 2) : 0; + const int k2_lo = (kminF + 2 > 0) ? (kminF + 2) : 0; + const int i2_hi = ex1 - 2, j2_hi = ex2 - 2, k2_hi = ex3 - 2; + + const int i4_lo = (iminF + 3 > 0) ? (iminF + 3) : 0; + const int j4_lo = (jminF + 3 > 0) ? (jminF + 3) : 0; + const int k4_lo = (kminF + 3 > 0) ? (kminF + 3) : 0; + const int i4_hi = ex1 - 3, j4_hi = ex2 - 3, k4_hi = ex3 - 3; + + const int i6_lo = (iminF + 4 > 0) ? (iminF + 4) : 0; + const int j6_lo = (jminF + 4 > 0) ? (jminF + 4) : 0; + const int k6_lo = (kminF + 4 > 0) ? (kminF + 4) : 0; + const int i6_hi = ex1 - 4, j6_hi = ex2 - 4, k6_hi = ex3 - 4; + + const int i8_lo = (iminF + 5 > 0) ? (iminF + 5) : 0; + const int j8_lo = (jminF + 5 > 0) ? (jminF + 5) : 0; + const int k8_lo = (kminF + 5 > 0) ? (kminF + 5) : 0; + const int i8_hi = ex1 - 5, j8_hi = ex2 - 5, k8_hi = ex3 - 5; + + const int has4 = (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi); + const int has6 = (i6_lo <= i6_hi && j6_lo <= j6_hi && k6_lo <= k6_hi); + const int has8 = (i8_lo <= i8_hi && j8_lo <= j8_hi && k8_lo <= k8_hi); + + /* 2nd-order: skip 4th+6th+8th overlap */ + if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) { + for (int k0 = k2_lo; k0 <= k2_hi; ++k0) { + const int kF = k0 + 3; + for (int j0 = j2_lo; j0 <= j2_hi; ++j0) { + const int jF = j0 + 3; + for (int i0 = i2_lo; i0 <= i2_hi; ++i0) { + _Bool in4 = has4 && i0>=i4_lo && i0<=i4_hi && j0>=j4_lo && j0<=j4_hi && k0>=k4_lo && k0<=k4_hi; + if (in4) continue; + const int iF = i0 + 3; + const size_t p = idx_ex(i0, j0, k0, ex); + + fxx[p] = Sdxdx * (fh[idx_fh_F_ord4(iF-1,jF,kF,ex)] - TWO*fh[idx_fh_F_ord4(iF,jF,kF,ex)] + fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]); + fyy[p] = Sdydy * (fh[idx_fh_F_ord4(iF,jF-1,kF,ex)] - TWO*fh[idx_fh_F_ord4(iF,jF,kF,ex)] + fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]); + fzz[p] = Sdzdz * (fh[idx_fh_F_ord4(iF,jF,kF-1,ex)] - TWO*fh[idx_fh_F_ord4(iF,jF,kF,ex)] + fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]); + + fxy[p] = Sdxdy * (fh[idx_fh_F_ord4(iF-1,jF-1,kF,ex)] - fh[idx_fh_F_ord4(iF+1,jF-1,kF,ex)] - fh[idx_fh_F_ord4(iF-1,jF+1,kF,ex)] + fh[idx_fh_F_ord4(iF+1,jF+1,kF,ex)]); + fxz[p] = Sdxdz * (fh[idx_fh_F_ord4(iF-1,jF,kF-1,ex)] - fh[idx_fh_F_ord4(iF+1,jF,kF-1,ex)] - fh[idx_fh_F_ord4(iF-1,jF,kF+1,ex)] + fh[idx_fh_F_ord4(iF+1,jF,kF+1,ex)]); + fyz[p] = Sdydz * (fh[idx_fh_F_ord4(iF,jF-1,kF-1,ex)] - fh[idx_fh_F_ord4(iF,jF+1,kF-1,ex)] - fh[idx_fh_F_ord4(iF,jF-1,kF+1,ex)] + fh[idx_fh_F_ord4(iF,jF+1,kF+1,ex)]); + } + } + } + } + + /* 4th-order: skip 6th+8th overlap */ + if (has4) { + for (int k0 = k4_lo; k0 <= k4_hi; ++k0) { + const int kF = k0 + 3; + for (int j0 = j4_lo; j0 <= j4_hi; ++j0) { + const int jF = j0 + 3; + for (int i0 = i4_lo; i0 <= i4_hi; ++i0) { + _Bool in6 = has6 && i0>=i6_lo && i0<=i6_hi && j0>=j6_lo && j0<=j6_hi && k0>=k6_lo && k0<=k6_hi; + if (in6) continue; + const int iF = i0 + 3; + const size_t p = idx_ex(i0, j0, k0, ex); + + fxx[p] = Fdxdx * (-fh[idx_fh_F_ord4(iF-2,jF,kF,ex)] + F16*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)] - F30*fh[idx_fh_F_ord4(iF,jF,kF,ex)] - fh[idx_fh_F_ord4(iF+2,jF,kF,ex)] + F16*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]); + fyy[p] = Fdydy * (-fh[idx_fh_F_ord4(iF,jF-2,kF,ex)] + F16*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)] - F30*fh[idx_fh_F_ord4(iF,jF,kF,ex)] - fh[idx_fh_F_ord4(iF,jF+2,kF,ex)] + F16*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]); + fzz[p] = Fdzdz * (-fh[idx_fh_F_ord4(iF,jF,kF-2,ex)] + F16*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)] - F30*fh[idx_fh_F_ord4(iF,jF,kF,ex)] - fh[idx_fh_F_ord4(iF,jF,kF+2,ex)] + F16*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]); + + { + const double t_jm2 = (fh[idx_fh_F_ord4(iF-2,jF-2,kF,ex)]-F8*fh[idx_fh_F_ord4(iF-1,jF-2,kF,ex)]+F8*fh[idx_fh_F_ord4(iF+1,jF-2,kF,ex)]-fh[idx_fh_F_ord4(iF+2,jF-2,kF,ex)]); + const double t_jm1 = (fh[idx_fh_F_ord4(iF-2,jF-1,kF,ex)]-F8*fh[idx_fh_F_ord4(iF-1,jF-1,kF,ex)]+F8*fh[idx_fh_F_ord4(iF+1,jF-1,kF,ex)]-fh[idx_fh_F_ord4(iF+2,jF-1,kF,ex)]); + const double t_jp1 = (fh[idx_fh_F_ord4(iF-2,jF+1,kF,ex)]-F8*fh[idx_fh_F_ord4(iF-1,jF+1,kF,ex)]+F8*fh[idx_fh_F_ord4(iF+1,jF+1,kF,ex)]-fh[idx_fh_F_ord4(iF+2,jF+1,kF,ex)]); + const double t_jp2 = (fh[idx_fh_F_ord4(iF-2,jF+2,kF,ex)]-F8*fh[idx_fh_F_ord4(iF-1,jF+2,kF,ex)]+F8*fh[idx_fh_F_ord4(iF+1,jF+2,kF,ex)]-fh[idx_fh_F_ord4(iF+2,jF+2,kF,ex)]); + fxy[p] = Fdxdy * (t_jm2 - F8*t_jm1 + F8*t_jp1 - t_jp2); + } + { + const double t_km2 = (fh[idx_fh_F_ord4(iF-2,jF,kF-2,ex)]-F8*fh[idx_fh_F_ord4(iF-1,jF,kF-2,ex)]+F8*fh[idx_fh_F_ord4(iF+1,jF,kF-2,ex)]-fh[idx_fh_F_ord4(iF+2,jF,kF-2,ex)]); + const double t_km1 = (fh[idx_fh_F_ord4(iF-2,jF,kF-1,ex)]-F8*fh[idx_fh_F_ord4(iF-1,jF,kF-1,ex)]+F8*fh[idx_fh_F_ord4(iF+1,jF,kF-1,ex)]-fh[idx_fh_F_ord4(iF+2,jF,kF-1,ex)]); + const double t_kp1 = (fh[idx_fh_F_ord4(iF-2,jF,kF+1,ex)]-F8*fh[idx_fh_F_ord4(iF-1,jF,kF+1,ex)]+F8*fh[idx_fh_F_ord4(iF+1,jF,kF+1,ex)]-fh[idx_fh_F_ord4(iF+2,jF,kF+1,ex)]); + const double t_kp2 = (fh[idx_fh_F_ord4(iF-2,jF,kF+2,ex)]-F8*fh[idx_fh_F_ord4(iF-1,jF,kF+2,ex)]+F8*fh[idx_fh_F_ord4(iF+1,jF,kF+2,ex)]-fh[idx_fh_F_ord4(iF+2,jF,kF+2,ex)]); + fxz[p] = Fdxdz * (t_km2 - F8*t_km1 + F8*t_kp1 - t_kp2); + } + { + const double t_km2 = (fh[idx_fh_F_ord4(iF,jF-2,kF-2,ex)]-F8*fh[idx_fh_F_ord4(iF,jF-1,kF-2,ex)]+F8*fh[idx_fh_F_ord4(iF,jF+1,kF-2,ex)]-fh[idx_fh_F_ord4(iF,jF+2,kF-2,ex)]); + const double t_km1 = (fh[idx_fh_F_ord4(iF,jF-2,kF-1,ex)]-F8*fh[idx_fh_F_ord4(iF,jF-1,kF-1,ex)]+F8*fh[idx_fh_F_ord4(iF,jF+1,kF-1,ex)]-fh[idx_fh_F_ord4(iF,jF+2,kF-1,ex)]); + const double t_kp1 = (fh[idx_fh_F_ord4(iF,jF-2,kF+1,ex)]-F8*fh[idx_fh_F_ord4(iF,jF-1,kF+1,ex)]+F8*fh[idx_fh_F_ord4(iF,jF+1,kF+1,ex)]-fh[idx_fh_F_ord4(iF,jF+2,kF+1,ex)]); + const double t_kp2 = (fh[idx_fh_F_ord4(iF,jF-2,kF+2,ex)]-F8*fh[idx_fh_F_ord4(iF,jF-1,kF+2,ex)]+F8*fh[idx_fh_F_ord4(iF,jF+1,kF+2,ex)]-fh[idx_fh_F_ord4(iF,jF+2,kF+2,ex)]); + fyz[p] = Fdydz * (t_km2 - F8*t_km1 + F8*t_kp1 - t_kp2); + } + } + } + } + } + + /* 6th-order: skip 8th overlap */ + if (has6) { + for (int k0 = k6_lo; k0 <= k6_hi; ++k0) { + const int kF = k0 + 3; + for (int j0 = j6_lo; j0 <= j6_hi; ++j0) { + const int jF = j0 + 3; + for (int i0 = i6_lo; i0 <= i6_hi; ++i0) { + if (has8 && i0>=i8_lo && i0<=i8_hi && j0>=j8_lo && j0<=j8_hi && k0>=k8_lo && k0<=k8_hi) continue; + const int iF = i0 + 3; + const size_t p = idx_ex(i0, j0, k0, ex); + + fxx[p] = Xdxdx * ( + TWO * fh[idx_fh_F_ord4(iF-3,jF,kF,ex)] - F27*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)] + F270*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)] - F490*fh[idx_fh_F_ord4(iF,jF,kF,ex)] + F270*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)] - F27*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)] + TWO*fh[idx_fh_F_ord4(iF+3,jF,kF,ex)]); + fyy[p] = Xdydy * ( + TWO * fh[idx_fh_F_ord4(iF,jF-3,kF,ex)] - F27*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)] + F270*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)] - F490*fh[idx_fh_F_ord4(iF,jF,kF,ex)] + F270*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)] - F27*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)] + TWO*fh[idx_fh_F_ord4(iF,jF+3,kF,ex)]); + fzz[p] = Xdzdz * ( + TWO * fh[idx_fh_F_ord4(iF,jF,kF-3,ex)] - F27*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)] + F270*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)] - F490*fh[idx_fh_F_ord4(iF,jF,kF,ex)] + F270*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)] - F27*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)] + TWO*fh[idx_fh_F_ord4(iF,jF,kF+3,ex)]); + + { + #define XSTEN6_8(JF, KF_DUMMY) \ + (-fh[idx_fh_F_ord4(iF-3,JF,KF_DUMMY,ex)] + F9*fh[idx_fh_F_ord4(iF-2,JF,KF_DUMMY,ex)] - F45*fh[idx_fh_F_ord4(iF-1,JF,KF_DUMMY,ex)] + F45*fh[idx_fh_F_ord4(iF+1,JF,KF_DUMMY,ex)] - F9*fh[idx_fh_F_ord4(iF+2,JF,KF_DUMMY,ex)] + fh[idx_fh_F_ord4(iF+3,JF,KF_DUMMY,ex)]) + fxy[p] = Xdxdy * ( + -XSTEN6_8(jF-3,kF) + F9*XSTEN6_8(jF-2,kF) - F45*XSTEN6_8(jF-1,kF) + F45*XSTEN6_8(jF+1,kF) - F9*XSTEN6_8(jF+2,kF) + XSTEN6_8(jF+3,kF)); + fxz[p] = Xdxdz * ( + -XSTEN6_8(jF,kF-3) + F9*XSTEN6_8(jF,kF-2) - F45*XSTEN6_8(jF,kF-1) + F45*XSTEN6_8(jF,kF+1) - F9*XSTEN6_8(jF,kF+2) + XSTEN6_8(jF,kF+3)); + #undef XSTEN6_8 + } + { + #define YSTEN6_8(JF, KF_DUMMY) \ + (-fh[idx_fh_F_ord4(iF,JF-3,KF_DUMMY,ex)] + F9*fh[idx_fh_F_ord4(iF,JF-2,KF_DUMMY,ex)] - F45*fh[idx_fh_F_ord4(iF,JF-1,KF_DUMMY,ex)] + F45*fh[idx_fh_F_ord4(iF,JF+1,KF_DUMMY,ex)] - F9*fh[idx_fh_F_ord4(iF,JF+2,KF_DUMMY,ex)] + fh[idx_fh_F_ord4(iF,JF+3,KF_DUMMY,ex)]) + fyz[p] = Xdydz * ( + -YSTEN6_8(jF,kF-3) + F9*YSTEN6_8(jF,kF-2) - F45*YSTEN6_8(jF,kF-1) + F45*YSTEN6_8(jF,kF+1) - F9*YSTEN6_8(jF,kF+2) + YSTEN6_8(jF,kF+3)); + #undef YSTEN6_8 + } + } + } + } + } + + /* 8th-order: interior only */ + if (has8) { + for (int k0 = k8_lo; k0 <= k8_hi; ++k0) { + const int kF = k0 + 3; + for (int j0 = j8_lo; j0 <= j8_hi; ++j0) { + const int jF = j0 + 3; + for (int i0 = i8_lo; i0 <= i8_hi; ++i0) { + const int iF = i0 + 3; + const size_t p = idx_ex(i0, j0, k0, ex); + + /* Diagonal: [-9,+128,-1008,+8064,-14350,+8064,-1008,+128,-9] / (5040*dx^2) */ + fxx[p] = Edxdx * ( + -(double)9 * fh[idx_fh_F_ord4(iF - 4, jF, kF, ex)] + + F128 * fh[idx_fh_F_ord4(iF - 3, jF, kF, ex)] - + F1008 * fh[idx_fh_F_ord4(iF - 2, jF, kF, ex)] + + F8064 * fh[idx_fh_F_ord4(iF - 1, jF, kF, ex)] - + F14350* fh[idx_fh_F_ord4(iF, jF, kF, ex)] + + F8064 * fh[idx_fh_F_ord4(iF + 1, jF, kF, ex)] - + F1008 * fh[idx_fh_F_ord4(iF + 2, jF, kF, ex)] + + F128 * fh[idx_fh_F_ord4(iF + 3, jF, kF, ex)] - + (double)9 * fh[idx_fh_F_ord4(iF + 4, jF, kF, ex)]); + + fyy[p] = Edydy * ( + -(double)9 * fh[idx_fh_F_ord4(iF, jF - 4, kF, ex)] + + F128 * fh[idx_fh_F_ord4(iF, jF - 3, kF, ex)] - + F1008 * fh[idx_fh_F_ord4(iF, jF - 2, kF, ex)] + + F8064 * fh[idx_fh_F_ord4(iF, jF - 1, kF, ex)] - + F14350* fh[idx_fh_F_ord4(iF, jF, kF, ex)] + + F8064 * fh[idx_fh_F_ord4(iF, jF + 1, kF, ex)] - + F1008 * fh[idx_fh_F_ord4(iF, jF + 2, kF, ex)] + + F128 * fh[idx_fh_F_ord4(iF, jF + 3, kF, ex)] - + (double)9 * fh[idx_fh_F_ord4(iF, jF + 4, kF, ex)]); + + fzz[p] = Edzdz * ( + -(double)9 * fh[idx_fh_F_ord4(iF, jF, kF - 4, ex)] + + F128 * fh[idx_fh_F_ord4(iF, jF, kF - 3, ex)] - + F1008 * fh[idx_fh_F_ord4(iF, jF, kF - 2, ex)] + + F8064 * fh[idx_fh_F_ord4(iF, jF, kF - 1, ex)] - + F14350* fh[idx_fh_F_ord4(iF, jF, kF, ex)] + + F8064 * fh[idx_fh_F_ord4(iF, jF, kF + 1, ex)] - + F1008 * fh[idx_fh_F_ord4(iF, jF, kF + 2, ex)] + + F128 * fh[idx_fh_F_ord4(iF, jF, kF + 3, ex)] - + (double)9 * fh[idx_fh_F_ord4(iF, jF, kF + 4, ex)]); + + /* Mixed: 9x9 outer product. + x-stencil: +3*f(i-4)-32*f(i-3)+168*f(i-2)-672*f(i-1)+672*f(i+1)-168*f(i+2)+32*f(i+3)-3*f(i+4) + y/z weights: same [+3,-32,+168,-672,+672,-168,+32,-3] / 705600 */ + { + #define XSTEN8(JF, KF_DUMMY) \ + (+(double)3*fh[idx_fh_F_ord4(iF-4,JF,KF_DUMMY,ex)] - F32*fh[idx_fh_F_ord4(iF-3,JF,KF_DUMMY,ex)] + F168*fh[idx_fh_F_ord4(iF-2,JF,KF_DUMMY,ex)] - F672*fh[idx_fh_F_ord4(iF-1,JF,KF_DUMMY,ex)] + F672*fh[idx_fh_F_ord4(iF+1,JF,KF_DUMMY,ex)] - F168*fh[idx_fh_F_ord4(iF+2,JF,KF_DUMMY,ex)] + F32*fh[idx_fh_F_ord4(iF+3,JF,KF_DUMMY,ex)] - (double)3*fh[idx_fh_F_ord4(iF+4,JF,KF_DUMMY,ex)]) + + fxy[p] = Edxdy * ( + +(double)3*XSTEN8(jF-4,kF) - F32*XSTEN8(jF-3,kF) + F168*XSTEN8(jF-2,kF) - F672*XSTEN8(jF-1,kF) + F672*XSTEN8(jF+1,kF) - F168*XSTEN8(jF+2,kF) + F32*XSTEN8(jF+3,kF) - (double)3*XSTEN8(jF+4,kF)); + + fxz[p] = Edxdz * ( + +(double)3*XSTEN8(jF,kF-4) - F32*XSTEN8(jF,kF-3) + F168*XSTEN8(jF,kF-2) - F672*XSTEN8(jF,kF-1) + F672*XSTEN8(jF,kF+1) - F168*XSTEN8(jF,kF+2) + F32*XSTEN8(jF,kF+3) - (double)3*XSTEN8(jF,kF+4)); + + #undef XSTEN8 + } + { + #define YSTEN8(JF, KF_DUMMY) \ + (+(double)3*fh[idx_fh_F_ord4(iF,JF-4,KF_DUMMY,ex)] - F32*fh[idx_fh_F_ord4(iF,JF-3,KF_DUMMY,ex)] + F168*fh[idx_fh_F_ord4(iF,JF-2,KF_DUMMY,ex)] - F672*fh[idx_fh_F_ord4(iF,JF-1,KF_DUMMY,ex)] + F672*fh[idx_fh_F_ord4(iF,JF+1,KF_DUMMY,ex)] - F168*fh[idx_fh_F_ord4(iF,JF+2,KF_DUMMY,ex)] + F32*fh[idx_fh_F_ord4(iF,JF+3,KF_DUMMY,ex)] - (double)3*fh[idx_fh_F_ord4(iF,JF+4,KF_DUMMY,ex)]) + + fyz[p] = Edydz * ( + +(double)3*YSTEN8(jF,kF-4) - F32*YSTEN8(jF,kF-3) + F168*YSTEN8(jF,kF-2) - F672*YSTEN8(jF,kF-1) + F672*YSTEN8(jF,kF+1) - F168*YSTEN8(jF,kF+2) + F32*YSTEN8(jF,kF+3) - (double)3*YSTEN8(jF,kF+4)); + + #undef YSTEN8 + } + } + } + } + } + return; + } +#else +#error "fdderivs_c.C: unsupported ghost_width (must be 2, 3, 4, or 5)" +#endif } diff --git a/AMSS_NCKU_source/fderivs_c.C b/AMSS_NCKU_source/fderivs_c.C index 90f10a1..b61c7f8 100644 --- a/AMSS_NCKU_source/fderivs_c.C +++ b/AMSS_NCKU_source/fderivs_c.C @@ -1,14 +1,18 @@ +#include "macrodef.h" #include "tool.h" /* - * C 版 fderivs + * C 版 fderivs — first derivatives df/dx, df/dy, df/dz. * - * Fortran: - * subroutine fderivs(ex,f,fx,fy,fz,X,Y,Z,SYM1,SYM2,SYM3,symmetry,onoff) + * Finite difference order is selected at compile time via the ghost_width macro + * (defined in macrodef.fh): + * ghost_width = 2 → 2nd-order + * ghost_width = 3 → 4th-order + * ghost_width = 4 → 6th-order + * ghost_width = 5 → 8th-order * - * 约定: - * f, fx, fy, fz: ex1*ex2*ex3,按 idx_ex 布局 - * X: ex1, Y: ex2, Z: ex3 + * Multi-pass overwrite strategy: compute the widest (lowest-order) stencil first, + * then overwrite interior regions with progressively higher-order stencils. */ void fderivs(const int ex[3], const double *f, @@ -17,151 +21,596 @@ void fderivs(const int ex[3], double SYM1, double SYM2, double SYM3, int Symmetry, int onoff) { - (void)onoff; // Fortran 里没用到 + (void)onoff; - const double ZEO = 0.0, ONE = 1.0; - const double TWO = 2.0, EIT = 8.0; - const double F12 = 12.0; + const double ZEO = 0.0, ONE = 1.0, TWO = 2.0, EIT = 8.0; + const double F9 = 9.0, F12 = 12.0, F45 = 45.0, F60 = 60.0; + const double F32 = 32.0, F168 = 168.0, F672 = 672.0, F840 = 840.0; - const int NO_SYMM = 0, EQ_SYMM = 1; // OCTANT=2 在本子程序里不直接用 + const int NO_SYMM = 0, EQ_SYMM = 1; const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2]; - - // dX = X(2)-X(1) -> C: X[1]-X[0] const double dX = X[1] - X[0]; const double dY = Y[1] - Y[0]; const double dZ = Z[1] - Z[0]; - // Fortran 1-based bounds - const int imaxF = ex1; - const int jmaxF = ex2; - const int kmaxF = ex3; + const int imaxF = ex1, jmaxF = ex2, kmaxF = ex3; - int iminF = 1, jminF = 1, kminF = 1; - if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1; - if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1; - if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1; + const int gw = ghost_width; // compile-time constant - // SoA(1:3) = SYM1,SYM2,SYM3 - const double SoA[3] = { SYM1, SYM2, SYM3 }; +#if (ghost_width == 2) + /* ---- 2nd-order ------------------------------------------------------ */ + { + const int ord = 1; // symmetry_bd ord = ghost_width - 1 - // fh: (ex1+2)*(ex2+2)*(ex3+2) because ord=2 - const size_t nx = (size_t)ex1 + 2; - const size_t ny = (size_t)ex2 + 2; - const size_t nz = (size_t)ex3 + 2; - const size_t fh_size = nx * ny * nz; - static double *fh = NULL; - static size_t cap = 0; + int iminF = 1, jminF = 1, kminF = 1; + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = 0; + if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = 0; + if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = 0; - if (fh_size > cap) { - free(fh); - fh = (double*)aligned_alloc(64, fh_size * sizeof(double)); - cap = fh_size; - } - // double *fh = (double*)malloc(fh_size * sizeof(double)); - if (!fh) return; + const double SoA[3] = { SYM1, SYM2, SYM3 }; - // call symmetry_bd(2,ex,f,fh,SoA) - symmetry_bd(2, ex, f, fh, SoA); + const size_t nx = (size_t)ex1 + ord; + const size_t ny = (size_t)ex2 + ord; + const size_t nz = (size_t)ex3 + ord; + const size_t fh_size = nx * ny * nz; - const double d12dx = ONE / F12 / dX; - const double d12dy = ONE / F12 / dY; - const double d12dz = ONE / F12 / dZ; + static double *fh_buf = NULL; + static size_t cap = 0; + if (fh_size > cap) { + free(fh_buf); + fh_buf = (double*)aligned_alloc(64, fh_size * sizeof(double)); + cap = fh_size; + } + double *fh = fh_buf; + if (!fh) return; - const double d2dx = ONE / TWO / dX; - const double d2dy = ONE / TWO / dY; - const double d2dz = ONE / TWO / dZ; + symmetry_bd(ord, ex, f, fh, SoA); - // fx = fy = fz = 0 - const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3; - for (size_t p = 0; p < all; ++p) { - fx[p] = ZEO; - fy[p] = ZEO; - fz[p] = ZEO; - } + const double d2dx = ONE / TWO / dX; + const double d2dy = ONE / TWO / dY; + const double d2dz = ONE / TWO / dZ; - /* - * 两段式: - * 1) 先在二阶可用区域计算二阶模板 - * 2) 再在高阶可用区域覆盖为四阶模板 - * - * 与原 if/elseif 逻辑等价,但减少逐点分支判断。 - */ - const int i2_lo = (iminF > 0) ? iminF : 0; - const int j2_lo = (jminF > 0) ? jminF : 0; - const int k2_lo = (kminF > 0) ? kminF : 0; - const int i2_hi = ex1 - 2; - const int j2_hi = ex2 - 2; - const int k2_hi = ex3 - 2; + const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3; + for (size_t p = 0; p < all; ++p) { + fx[p] = ZEO; fy[p] = ZEO; fz[p] = ZEO; + } - const int i4_lo = (iminF + 1 > 0) ? (iminF + 1) : 0; - const int j4_lo = (jminF + 1 > 0) ? (jminF + 1) : 0; - const int k4_lo = (kminF + 1 > 0) ? (kminF + 1) : 0; - const int i4_hi = ex1 - 3; - const int j4_hi = ex2 - 3; - const int k4_hi = ex3 - 3; + /* 2nd-order pass: [-1, 0, +1] / (2*dx) */ + const int i2_lo = (iminF > 0) ? iminF : 0; + const int j2_lo = (jminF > 0) ? jminF : 0; + const int k2_lo = (kminF > 0) ? kminF : 0; + const int i2_hi = ex1 - 2; + const int j2_hi = ex2 - 2; + const int k2_hi = ex3 - 2; - if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) { - for (int k0 = k2_lo; k0 <= k2_hi; ++k0) { - const int kF = k0 + 1; - for (int j0 = j2_lo; j0 <= j2_hi; ++j0) { - const int jF = j0 + 1; - for (int i0 = i2_lo; i0 <= i2_hi; ++i0) { - const int iF = i0 + 1; - const size_t p = idx_ex(i0, j0, k0, ex); + if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) { + for (int k0 = k2_lo; k0 <= k2_hi; ++k0) { + const int kF = k0; // Fortran index for fh: kF = k0 (shift=0) + for (int j0 = j2_lo; j0 <= j2_hi; ++j0) { + const int jF = j0; + for (int i0 = i2_lo; i0 <= i2_hi; ++i0) { + const int iF = i0; + const size_t p = idx_ex(i0, j0, k0, ex); - fx[p] = d2dx * ( - -fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] + - fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)] - ); + fx[p] = d2dx * ( + -fh[idx_fh_F_ord1(iF - 1, jF, kF, ex)] + + fh[idx_fh_F_ord1(iF + 1, jF, kF, ex)] + ); - fy[p] = d2dy * ( - -fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] + - fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)] - ); + fy[p] = d2dy * ( + -fh[idx_fh_F_ord1(iF, jF - 1, kF, ex)] + + fh[idx_fh_F_ord1(iF, jF + 1, kF, ex)] + ); - fz[p] = d2dz * ( - -fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] + - fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)] - ); + fz[p] = d2dz * ( + -fh[idx_fh_F_ord1(iF, jF, kF - 1, ex)] + + fh[idx_fh_F_ord1(iF, jF, kF + 1, ex)] + ); + } } } } + return; } +#elif (ghost_width == 3) + /* ---- 4th-order (original code) ------------------------------------ */ + { + const int ord = 2; // symmetry_bd ord - if (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi) { - for (int k0 = k4_lo; k0 <= k4_hi; ++k0) { - const int kF = k0 + 1; - for (int j0 = j4_lo; j0 <= j4_hi; ++j0) { - const int jF = j0 + 1; - for (int i0 = i4_lo; i0 <= i4_hi; ++i0) { - const int iF = i0 + 1; - const size_t p = idx_ex(i0, j0, k0, ex); + int iminF = 1, jminF = 1, kminF = 1; + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1; + if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1; + if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1; - fx[p] = d12dx * ( - fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] - - EIT * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] + - EIT * fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)] - - fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)] - ); + const double SoA[3] = { SYM1, SYM2, SYM3 }; - fy[p] = d12dy * ( - fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] - - EIT * fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] + - EIT * fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)] - - fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)] - ); + const size_t nx = (size_t)ex1 + ord; + const size_t ny = (size_t)ex2 + ord; + const size_t nz = (size_t)ex3 + ord; + const size_t fh_size = nx * ny * nz; - fz[p] = d12dz * ( - fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] - - EIT * fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] + - EIT * fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)] - - fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)] - ); + static double *fh_buf = NULL; + static size_t cap = 0; + if (fh_size > cap) { + free(fh_buf); + fh_buf = (double*)aligned_alloc(64, fh_size * sizeof(double)); + cap = fh_size; + } + double *fh = fh_buf; + if (!fh) return; + + symmetry_bd(ord, ex, f, fh, SoA); + + const double d12dx = ONE / F12 / dX; + const double d12dy = ONE / F12 / dY; + const double d12dz = ONE / F12 / dZ; + const double d2dx = ONE / TWO / dX; + const double d2dy = ONE / TWO / dY; + const double d2dz = ONE / TWO / dZ; + + const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3; + for (size_t p = 0; p < all; ++p) { + fx[p] = ZEO; fy[p] = ZEO; fz[p] = ZEO; + } + + const int i2_lo = (iminF > 0) ? iminF : 0; + const int j2_lo = (jminF > 0) ? jminF : 0; + const int k2_lo = (kminF > 0) ? kminF : 0; + const int i2_hi = ex1 - 2; + const int j2_hi = ex2 - 2; + const int k2_hi = ex3 - 2; + + const int i4_lo = (iminF + 1 > 0) ? (iminF + 1) : 0; + const int j4_lo = (jminF + 1 > 0) ? (jminF + 1) : 0; + const int k4_lo = (kminF + 1 > 0) ? (kminF + 1) : 0; + const int i4_hi = ex1 - 3; + const int j4_hi = ex2 - 3; + const int k4_hi = ex3 - 3; + + if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) { + for (int k0 = k2_lo; k0 <= k2_hi; ++k0) { + const int kF = k0 + 1; + for (int j0 = j2_lo; j0 <= j2_hi; ++j0) { + const int jF = j0 + 1; + for (int i0 = i2_lo; i0 <= i2_hi; ++i0) { + const int iF = i0 + 1; + const size_t p = idx_ex(i0, j0, k0, ex); + + fx[p] = d2dx * ( + -fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] + + fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)] + ); + + fy[p] = d2dy * ( + -fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] + + fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)] + ); + + fz[p] = d2dz * ( + -fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] + + fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)] + ); + } } } } - } - // free(fh); + if (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi) { + for (int k0 = k4_lo; k0 <= k4_hi; ++k0) { + const int kF = k0 + 1; + for (int j0 = j4_lo; j0 <= j4_hi; ++j0) { + const int jF = j0 + 1; + for (int i0 = i4_lo; i0 <= i4_hi; ++i0) { + const int iF = i0 + 1; + const size_t p = idx_ex(i0, j0, k0, ex); + + fx[p] = d12dx * ( + fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] - + EIT * fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] + + EIT * fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)] - + fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)] + ); + + fy[p] = d12dy * ( + fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] - + EIT * fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] + + EIT * fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)] - + fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)] + ); + + fz[p] = d12dz * ( + fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] - + EIT * fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] + + EIT * fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)] - + fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)] + ); + } + } + } + } + return; + } +#elif (ghost_width == 4) + /* ---- 6th-order ----------------------------------------------------- */ + { + const int ord = 3; + + int iminF = 1, jminF = 1, kminF = 1; + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2; + if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2; + if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2; + + const double SoA[3] = { SYM1, SYM2, SYM3 }; + + const size_t nx = (size_t)ex1 + ord; + const size_t ny = (size_t)ex2 + ord; + const size_t nz = (size_t)ex3 + ord; + const size_t fh_size = nx * ny * nz; + + static double *fh_buf = NULL; + static size_t cap = 0; + if (fh_size > cap) { + free(fh_buf); + fh_buf = (double*)aligned_alloc(64, fh_size * sizeof(double)); + cap = fh_size; + } + double *fh = fh_buf; + if (!fh) return; + + symmetry_bd(ord, ex, f, fh, SoA); + + /* Denominators */ + const double d60dx = ONE / F60 / dX; + const double d60dy = ONE / F60 / dY; + const double d60dz = ONE / F60 / dZ; + const double d12dx = ONE / F12 / dX; + const double d12dy = ONE / F12 / dY; + const double d12dz = ONE / F12 / dZ; + const double d2dx = ONE / TWO / dX; + const double d2dy = ONE / TWO / dY; + const double d2dz = ONE / TWO / dZ; + + const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3; + for (size_t p = 0; p < all; ++p) { + fx[p] = ZEO; fy[p] = ZEO; fz[p] = ZEO; + } + + /* 2nd-order pass: 3pt, widest */ + const int i2_lo = (iminF + 1 > 0) ? (iminF + 1) : 0; + const int j2_lo = (jminF + 1 > 0) ? (jminF + 1) : 0; + const int k2_lo = (kminF + 1 > 0) ? (kminF + 1) : 0; + const int i2_hi = ex1 - 2; + const int j2_hi = ex2 - 2; + const int k2_hi = ex3 - 2; + + /* 4th-order pass: 5pt */ + const int i4_lo = (iminF + 2 > 0) ? (iminF + 2) : 0; + const int j4_lo = (jminF + 2 > 0) ? (jminF + 2) : 0; + const int k4_lo = (kminF + 2 > 0) ? (kminF + 2) : 0; + const int i4_hi = ex1 - 3; + const int j4_hi = ex2 - 3; + const int k4_hi = ex3 - 3; + + /* 6th-order pass: 7pt, narrowest interior */ + const int i6_lo = (iminF + 3 > 0) ? (iminF + 3) : 0; + const int j6_lo = (jminF + 3 > 0) ? (jminF + 3) : 0; + const int k6_lo = (kminF + 3 > 0) ? (kminF + 3) : 0; + const int i6_hi = ex1 - 4; + const int j6_hi = ex2 - 4; + const int k6_hi = ex3 - 4; + + /* 2nd-order */ + if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) { + for (int k0 = k2_lo; k0 <= k2_hi; ++k0) { + const int kF = k0 + 2; + for (int j0 = j2_lo; j0 <= j2_hi; ++j0) { + const int jF = j0 + 2; + for (int i0 = i2_lo; i0 <= i2_hi; ++i0) { + const int iF = i0 + 2; + const size_t p = idx_ex(i0, j0, k0, ex); + + fx[p] = d2dx * ( + -fh[idx_fh_F(iF - 1, jF, kF, ex)] + + fh[idx_fh_F(iF + 1, jF, kF, ex)]); + fy[p] = d2dy * ( + -fh[idx_fh_F(iF, jF - 1, kF, ex)] + + fh[idx_fh_F(iF, jF + 1, kF, ex)]); + fz[p] = d2dz * ( + -fh[idx_fh_F(iF, jF, kF - 1, ex)] + + fh[idx_fh_F(iF, jF, kF + 1, ex)]); + } + } + } + } + + /* 4th-order overwrite */ + if (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi) { + for (int k0 = k4_lo; k0 <= k4_hi; ++k0) { + const int kF = k0 + 2; + for (int j0 = j4_lo; j0 <= j4_hi; ++j0) { + const int jF = j0 + 2; + for (int i0 = i4_lo; i0 <= i4_hi; ++i0) { + const int iF = i0 + 2; + const size_t p = idx_ex(i0, j0, k0, ex); + + fx[p] = d12dx * ( + fh[idx_fh_F(iF - 2, jF, kF, ex)] - + EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)] + + EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)] - + fh[idx_fh_F(iF + 2, jF, kF, ex)]); + + fy[p] = d12dy * ( + fh[idx_fh_F(iF, jF - 2, kF, ex)] - + EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)] + + EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)] - + fh[idx_fh_F(iF, jF + 2, kF, ex)]); + + fz[p] = d12dz * ( + fh[idx_fh_F(iF, jF, kF - 2, ex)] - + EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)] + + EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)] - + fh[idx_fh_F(iF, jF, kF + 2, ex)]); + } + } + } + } + + /* 6th-order overwrite: [-1,+9,-45,0,+45,-9,+1] / (60*dx) */ + if (i6_lo <= i6_hi && j6_lo <= j6_hi && k6_lo <= k6_hi) { + for (int k0 = k6_lo; k0 <= k6_hi; ++k0) { + const int kF = k0 + 2; + for (int j0 = j6_lo; j0 <= j6_hi; ++j0) { + const int jF = j0 + 2; + for (int i0 = i6_lo; i0 <= i6_hi; ++i0) { + const int iF = i0 + 2; + const size_t p = idx_ex(i0, j0, k0, ex); + + fx[p] = d60dx * ( + -fh[idx_fh_F(iF - 3, jF, kF, ex)] + + F9 * fh[idx_fh_F(iF - 2, jF, kF, ex)] - + F45 * fh[idx_fh_F(iF - 1, jF, kF, ex)] + + F45 * fh[idx_fh_F(iF + 1, jF, kF, ex)] - + F9 * fh[idx_fh_F(iF + 2, jF, kF, ex)] + + fh[idx_fh_F(iF + 3, jF, kF, ex)]); + + fy[p] = d60dy * ( + -fh[idx_fh_F(iF, jF - 3, kF, ex)] + + F9 * fh[idx_fh_F(iF, jF - 2, kF, ex)] - + F45 * fh[idx_fh_F(iF, jF - 1, kF, ex)] + + F45 * fh[idx_fh_F(iF, jF + 1, kF, ex)] - + F9 * fh[idx_fh_F(iF, jF + 2, kF, ex)] + + fh[idx_fh_F(iF, jF + 3, kF, ex)]); + + fz[p] = d60dz * ( + -fh[idx_fh_F(iF, jF, kF - 3, ex)] + + F9 * fh[idx_fh_F(iF, jF, kF - 2, ex)] - + F45 * fh[idx_fh_F(iF, jF, kF - 1, ex)] + + F45 * fh[idx_fh_F(iF, jF, kF + 1, ex)] - + F9 * fh[idx_fh_F(iF, jF, kF + 2, ex)] + + fh[idx_fh_F(iF, jF, kF + 3, ex)]); + } + } + } + } + return; + } +#elif (ghost_width == 5) + /* ---- 8th-order ----------------------------------------------------- */ + { + const int ord = 4; + + int iminF = 1, jminF = 1, kminF = 1; + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -3; + if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -3; + if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -3; + + const double SoA[3] = { SYM1, SYM2, SYM3 }; + + const size_t nx = (size_t)ex1 + ord; + const size_t ny = (size_t)ex2 + ord; + const size_t nz = (size_t)ex3 + ord; + const size_t fh_size = nx * ny * nz; + + static double *fh_buf = NULL; + static size_t cap = 0; + if (fh_size > cap) { + free(fh_buf); + fh_buf = (double*)aligned_alloc(64, fh_size * sizeof(double)); + cap = fh_size; + } + double *fh = fh_buf; + if (!fh) return; + + symmetry_bd(ord, ex, f, fh, SoA); + + const double d840dx = ONE / F840 / dX; + const double d840dy = ONE / F840 / dY; + const double d840dz = ONE / F840 / dZ; + const double d60dx = ONE / F60 / dX; + const double d60dy = ONE / F60 / dY; + const double d60dz = ONE / F60 / dZ; + const double d12dx = ONE / F12 / dX; + const double d12dy = ONE / F12 / dY; + const double d12dz = ONE / F12 / dZ; + const double d2dx = ONE / TWO / dX; + const double d2dy = ONE / TWO / dY; + const double d2dz = ONE / TWO / dZ; + + const size_t all = (size_t)ex1 * (size_t)ex2 * (size_t)ex3; + for (size_t p = 0; p < all; ++p) { + fx[p] = ZEO; fy[p] = ZEO; fz[p] = ZEO; + } + + /* 2nd: 3pt, widest */ + const int i2_lo = (iminF + 2 > 0) ? (iminF + 2) : 0; + const int j2_lo = (jminF + 2 > 0) ? (jminF + 2) : 0; + const int k2_lo = (kminF + 2 > 0) ? (kminF + 2) : 0; + const int i2_hi = ex1 - 2; + const int j2_hi = ex2 - 2; + const int k2_hi = ex3 - 2; + + /* 4th: 5pt */ + const int i4_lo = (iminF + 3 > 0) ? (iminF + 3) : 0; + const int j4_lo = (jminF + 3 > 0) ? (jminF + 3) : 0; + const int k4_lo = (kminF + 3 > 0) ? (kminF + 3) : 0; + const int i4_hi = ex1 - 3; + const int j4_hi = ex2 - 3; + const int k4_hi = ex3 - 3; + + /* 6th: 7pt */ + const int i6_lo = (iminF + 4 > 0) ? (iminF + 4) : 0; + const int j6_lo = (jminF + 4 > 0) ? (jminF + 4) : 0; + const int k6_lo = (kminF + 4 > 0) ? (kminF + 4) : 0; + const int i6_hi = ex1 - 4; + const int j6_hi = ex2 - 4; + const int k6_hi = ex3 - 4; + + /* 8th: 9pt, narrowest */ + const int i8_lo = (iminF + 5 > 0) ? (iminF + 5) : 0; + const int j8_lo = (jminF + 5 > 0) ? (jminF + 5) : 0; + const int k8_lo = (kminF + 5 > 0) ? (kminF + 5) : 0; + const int i8_hi = ex1 - 5; + const int j8_hi = ex2 - 5; + const int k8_hi = ex3 - 5; + + if (i2_lo <= i2_hi && j2_lo <= j2_hi && k2_lo <= k2_hi) { + for (int k0 = k2_lo; k0 <= k2_hi; ++k0) { + const int kF = k0 + 3; + for (int j0 = j2_lo; j0 <= j2_hi; ++j0) { + const int jF = j0 + 3; + for (int i0 = i2_lo; i0 <= i2_hi; ++i0) { + const int iF = i0 + 3; + const size_t p = idx_ex(i0, j0, k0, ex); + + fx[p] = d2dx * ( + -fh[idx_fh_F_ord4(iF - 1, jF, kF, ex)] + + fh[idx_fh_F_ord4(iF + 1, jF, kF, ex)]); + fy[p] = d2dy * ( + -fh[idx_fh_F_ord4(iF, jF - 1, kF, ex)] + + fh[idx_fh_F_ord4(iF, jF + 1, kF, ex)]); + fz[p] = d2dz * ( + -fh[idx_fh_F_ord4(iF, jF, kF - 1, ex)] + + fh[idx_fh_F_ord4(iF, jF, kF + 1, ex)]); + } + } + } + } + + if (i4_lo <= i4_hi && j4_lo <= j4_hi && k4_lo <= k4_hi) { + for (int k0 = k4_lo; k0 <= k4_hi; ++k0) { + const int kF = k0 + 3; + for (int j0 = j4_lo; j0 <= j4_hi; ++j0) { + const int jF = j0 + 3; + for (int i0 = i4_lo; i0 <= i4_hi; ++i0) { + const int iF = i0 + 3; + const size_t p = idx_ex(i0, j0, k0, ex); + + fx[p] = d12dx * ( + fh[idx_fh_F_ord4(iF - 2, jF, kF, ex)] - + EIT * fh[idx_fh_F_ord4(iF - 1, jF, kF, ex)] + + EIT * fh[idx_fh_F_ord4(iF + 1, jF, kF, ex)] - + fh[idx_fh_F_ord4(iF + 2, jF, kF, ex)]); + + fy[p] = d12dy * ( + fh[idx_fh_F_ord4(iF, jF - 2, kF, ex)] - + EIT * fh[idx_fh_F_ord4(iF, jF - 1, kF, ex)] + + EIT * fh[idx_fh_F_ord4(iF, jF + 1, kF, ex)] - + fh[idx_fh_F_ord4(iF, jF + 2, kF, ex)]); + + fz[p] = d12dz * ( + fh[idx_fh_F_ord4(iF, jF, kF - 2, ex)] - + EIT * fh[idx_fh_F_ord4(iF, jF, kF - 1, ex)] + + EIT * fh[idx_fh_F_ord4(iF, jF, kF + 1, ex)] - + fh[idx_fh_F_ord4(iF, jF, kF + 2, ex)]); + } + } + } + } + + if (i6_lo <= i6_hi && j6_lo <= j6_hi && k6_lo <= k6_hi) { + for (int k0 = k6_lo; k0 <= k6_hi; ++k0) { + const int kF = k0 + 3; + for (int j0 = j6_lo; j0 <= j6_hi; ++j0) { + const int jF = j0 + 3; + for (int i0 = i6_lo; i0 <= i6_hi; ++i0) { + const int iF = i0 + 3; + const size_t p = idx_ex(i0, j0, k0, ex); + + fx[p] = d60dx * ( + -fh[idx_fh_F_ord4(iF - 3, jF, kF, ex)] + + F9 * fh[idx_fh_F_ord4(iF - 2, jF, kF, ex)] - + F45 * fh[idx_fh_F_ord4(iF - 1, jF, kF, ex)] + + F45 * fh[idx_fh_F_ord4(iF + 1, jF, kF, ex)] - + F9 * fh[idx_fh_F_ord4(iF + 2, jF, kF, ex)] + + fh[idx_fh_F_ord4(iF + 3, jF, kF, ex)]); + + fy[p] = d60dy * ( + -fh[idx_fh_F_ord4(iF, jF - 3, kF, ex)] + + F9 * fh[idx_fh_F_ord4(iF, jF - 2, kF, ex)] - + F45 * fh[idx_fh_F_ord4(iF, jF - 1, kF, ex)] + + F45 * fh[idx_fh_F_ord4(iF, jF + 1, kF, ex)] - + F9 * fh[idx_fh_F_ord4(iF, jF + 2, kF, ex)] + + fh[idx_fh_F_ord4(iF, jF + 3, kF, ex)]); + + fz[p] = d60dz * ( + -fh[idx_fh_F_ord4(iF, jF, kF - 3, ex)] + + F9 * fh[idx_fh_F_ord4(iF, jF, kF - 2, ex)] - + F45 * fh[idx_fh_F_ord4(iF, jF, kF - 1, ex)] + + F45 * fh[idx_fh_F_ord4(iF, jF, kF + 1, ex)] - + F9 * fh[idx_fh_F_ord4(iF, jF, kF + 2, ex)] + + fh[idx_fh_F_ord4(iF, jF, kF + 3, ex)]); + } + } + } + } + + /* 8th-order overwrite: [+3,-32,+168,-672,0,+672,-168,+32,-3] / (840*dx) */ + if (i8_lo <= i8_hi && j8_lo <= j8_hi && k8_lo <= k8_hi) { + for (int k0 = k8_lo; k0 <= k8_hi; ++k0) { + const int kF = k0 + 3; + for (int j0 = j8_lo; j0 <= j8_hi; ++j0) { + const int jF = j0 + 3; + for (int i0 = i8_lo; i0 <= i8_hi; ++i0) { + const int iF = i0 + 3; + const size_t p = idx_ex(i0, j0, k0, ex); + + fx[p] = d840dx * ( + +(double)3 * fh[idx_fh_F_ord4(iF - 4, jF, kF, ex)] - + F32 * fh[idx_fh_F_ord4(iF - 3, jF, kF, ex)] + + F168 * fh[idx_fh_F_ord4(iF - 2, jF, kF, ex)] - + F672 * fh[idx_fh_F_ord4(iF - 1, jF, kF, ex)] + + F672 * fh[idx_fh_F_ord4(iF + 1, jF, kF, ex)] - + F168 * fh[idx_fh_F_ord4(iF + 2, jF, kF, ex)] + + F32 * fh[idx_fh_F_ord4(iF + 3, jF, kF, ex)] - + (double)3 * fh[idx_fh_F_ord4(iF + 4, jF, kF, ex)]); + + fy[p] = d840dy * ( + +(double)3 * fh[idx_fh_F_ord4(iF, jF - 4, kF, ex)] - + F32 * fh[idx_fh_F_ord4(iF, jF - 3, kF, ex)] + + F168 * fh[idx_fh_F_ord4(iF, jF - 2, kF, ex)] - + F672 * fh[idx_fh_F_ord4(iF, jF - 1, kF, ex)] + + F672 * fh[idx_fh_F_ord4(iF, jF + 1, kF, ex)] - + F168 * fh[idx_fh_F_ord4(iF, jF + 2, kF, ex)] + + F32 * fh[idx_fh_F_ord4(iF, jF + 3, kF, ex)] - + (double)3 * fh[idx_fh_F_ord4(iF, jF + 4, kF, ex)]); + + fz[p] = d840dz * ( + +(double)3 * fh[idx_fh_F_ord4(iF, jF, kF - 4, ex)] - + F32 * fh[idx_fh_F_ord4(iF, jF, kF - 3, ex)] + + F168 * fh[idx_fh_F_ord4(iF, jF, kF - 2, ex)] - + F672 * fh[idx_fh_F_ord4(iF, jF, kF - 1, ex)] + + F672 * fh[idx_fh_F_ord4(iF, jF, kF + 1, ex)] - + F168 * fh[idx_fh_F_ord4(iF, jF, kF + 2, ex)] + + F32 * fh[idx_fh_F_ord4(iF, jF, kF + 3, ex)] - + (double)3 * fh[idx_fh_F_ord4(iF, jF, kF + 4, ex)]); + } + } + } + } + return; + } +#else +#error "fderivs_c.C: unsupported ghost_width (must be 2, 3, 4, or 5)" +#endif } diff --git a/AMSS_NCKU_source/kodiss_c.C b/AMSS_NCKU_source/kodiss_c.C index 36f80ef..eceb56c 100644 --- a/AMSS_NCKU_source/kodiss_c.C +++ b/AMSS_NCKU_source/kodiss_c.C @@ -1,16 +1,16 @@ +#include "macrodef.h" #include "tool.h" /* - * C 版 kodis + * C 版 kodis — Kreiss-Oliger numerical dissipation (Cartesian patches). * - * Fortran signature: - * subroutine kodis(ex,X,Y,Z,f,f_rhs,SoA,Symmetry,eps) + * The KO operator is (D₊D₋)^r applied to f_rhs with alternating sign (-1)^(r-1). * - * 约定: - * X: ex1, Y: ex2, Z: ex3 - * f, f_rhs: ex1*ex2*ex3 按 idx_ex 布局 - * SoA[3] - * eps: double + * FD order → r → cof=2^(2r) mapping: + * ghost_width=2 (2nd) → r=2, cof=16, sign=- + * ghost_width=3 (4th) → r=3, cof=64, sign=+ + * ghost_width=4 (6th) → r=4, cof=256, sign=- + * ghost_width=5 (8th) → r=5, cof=1024,sign=+ */ void kodis(const int ex[3], const double *X, const double *Y, const double *Z, @@ -18,100 +18,304 @@ void kodis(const int ex[3], const double SoA[3], int Symmetry, double eps) { - const double ONE = 1.0, SIX = 6.0, FIT = 15.0, TWT = 20.0; - const double cof = 64.0; // 2^6 - const int NO_SYMM = 0, OCTANT = 2; - + const double ZEO = 0.0; const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2]; - - // Fortran: dX = X(2)-X(1) -> C: X[1]-X[0] const double dX = X[1] - X[0]; const double dY = Y[1] - Y[0]; const double dZ = Z[1] - Z[0]; - (void)ONE; // ONE 在原 Fortran 里只是参数,这里不一定用得上 - // Fortran: imax=ex(1) 等是 1-based 上界 - const int imaxF = ex1; - const int jmaxF = ex2; - const int kmaxF = ex3; + const int imaxF = ex1, jmaxF = ex2, kmaxF = ex3; - // Fortran: imin=jmin=kmin=1,某些对称情况变 -2 - int iminF = 1, jminF = 1, kminF = 1; +#if (ghost_width == 2) + /* ---- r=2, cof=16, sign=-, 5pt stencil ----------------------------- */ + { + const int ord = 2; + const int r = 2; + const double cof = 16.0; + const double F4 = 4.0, F6 = 6.0; + const int NO_SYMM = 0, EQ_SYMM = 1; - if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2; - if (Symmetry == OCTANT && fabs(X[0]) < dX) iminF = -2; - if (Symmetry == OCTANT && fabs(Y[0]) < dY) jminF = -2; + int iminF = 1, jminF = 1, kminF = 1; + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1; + if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1; + if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1; - // 分配 fh:大小 (ex1+3)*(ex2+3)*(ex3+3),对应 ord=3 - const size_t nx = (size_t)ex1 + 3; - const size_t ny = (size_t)ex2 + 3; - const size_t nz = (size_t)ex3 + 3; - const size_t fh_size = nx * ny * nz; + const size_t nx = (size_t)ex1 + ord; + const size_t ny = (size_t)ex2 + ord; + const size_t nz = (size_t)ex3 + ord; + const size_t fh_size = nx * ny * nz; - double *fh = (double*)malloc(fh_size * sizeof(double)); - if (!fh) return; + double *fh = (double*)malloc(fh_size * sizeof(double)); + if (!fh) return; - // Fortran: call symmetry_bd(3,ex,f,fh,SoA) - symmetry_bd(3, ex, f, fh, SoA); + symmetry_bd(ord, ex, f, fh, SoA); - /* - * Fortran loops: - * do k=1,ex3 - * do j=1,ex2 - * do i=1,ex1 - * - * C: k0=0..ex3-1, j0=0..ex2-1, i0=0..ex1-1 - * 并定义 Fortran index: iF=i0+1, ... - */ - // 收紧循环范围:只遍历满足 iF±3/jF±3/kF±3 条件的内部点 - // iF-3 >= iminF => iF >= iminF+3 => i0 >= iminF+2 (因为 iF=i0+1) - // iF+3 <= imaxF => iF <= imaxF-3 => i0 <= imaxF-4 - const int i0_lo = (iminF + 2 > 0) ? iminF + 2 : 0; - const int j0_lo = (jminF + 2 > 0) ? jminF + 2 : 0; - const int k0_lo = (kminF + 2 > 0) ? kminF + 2 : 0; - const int i0_hi = imaxF - 4; // inclusive - const int j0_hi = jmaxF - 4; - const int k0_hi = kmaxF - 4; + /* i±2 must be valid: i-2 >= iminF && i+2 <= imaxF + C 0-based: i0 >= iminF+1, i0 <= ex1-3 */ + const int i0_lo = (iminF + 1 > 0) ? (iminF + 1) : 0; + const int j0_lo = (jminF + 1 > 0) ? (jminF + 1) : 0; + const int k0_lo = (kminF + 1 > 0) ? (kminF + 1) : 0; + const int i0_hi = imaxF - 3; + const int j0_hi = jmaxF - 3; + const int k0_hi = kmaxF - 3; - if (i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi) { + if (!(i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi)) { + for (int k0 = k0_lo; k0 <= k0_hi; ++k0) { + const int kF = k0 + 1; + for (int j0 = j0_lo; j0 <= j0_hi; ++j0) { + const int jF = j0 + 1; + for (int i0 = i0_lo; i0 <= i0_hi; ++i0) { + const int iF = i0 + 1; + const size_t p = idx_ex(i0, j0, k0, ex); + + const double Dx = ( + (fh[idx_fh_F_ord2(iF - 2, jF, kF, ex)] + fh[idx_fh_F_ord2(iF + 2, jF, kF, ex)]) - + F4 * (fh[idx_fh_F_ord2(iF - 1, jF, kF, ex)] + fh[idx_fh_F_ord2(iF + 1, jF, kF, ex)]) + + F6 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] + ) / dX; + + const double Dy = ( + (fh[idx_fh_F_ord2(iF, jF - 2, kF, ex)] + fh[idx_fh_F_ord2(iF, jF + 2, kF, ex)]) - + F4 * (fh[idx_fh_F_ord2(iF, jF - 1, kF, ex)] + fh[idx_fh_F_ord2(iF, jF + 1, kF, ex)]) + + F6 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] + ) / dY; + + const double Dz = ( + (fh[idx_fh_F_ord2(iF, jF, kF - 2, ex)] + fh[idx_fh_F_ord2(iF, jF, kF + 2, ex)]) - + F4 * (fh[idx_fh_F_ord2(iF, jF, kF - 1, ex)] + fh[idx_fh_F_ord2(iF, jF, kF + 1, ex)]) + + F6 * fh[idx_fh_F_ord2(iF, jF, kF, ex)] + ) / dZ; + + f_rhs[p] -= (eps / cof) * (Dx + Dy + Dz); /* sign=- */ + } + } + } + } free(fh); return; } +#elif (ghost_width == 3) + /* ---- r=3, cof=64, sign=+, 7pt stencil (current default) ---------- */ + { + const int ord = 3; + const int r = 3; + const double cof = 64.0; + const double SIX = 6.0, FIT = 15.0, TWT = 20.0; + const int NO_SYMM = 0, OCTANT = 2; - for (int k0 = k0_lo; k0 <= k0_hi; ++k0) { - const int kF = k0 + 1; - for (int j0 = j0_lo; j0 <= j0_hi; ++j0) { - const int jF = j0 + 1; - for (int i0 = i0_lo; i0 <= i0_hi; ++i0) { - const int iF = i0 + 1; + int iminF = 1, jminF = 1, kminF = 1; + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2; + if (Symmetry == OCTANT && fabs(X[0]) < dX) iminF = -2; + if (Symmetry == OCTANT && fabs(Y[0]) < dY) jminF = -2; - const size_t p = idx_ex(i0, j0, k0, ex); + const size_t nx = (size_t)ex1 + ord; + const size_t ny = (size_t)ex2 + ord; + const size_t nz = (size_t)ex3 + ord; + const size_t fh_size = nx * ny * nz; - // 三个方向各一份同型的 7 点组合(实际上是对称的 6th-order dissipation/filter 核) - const double Dx_term = - ( (fh[idx_fh_F(iF - 3, jF, kF, ex)] + fh[idx_fh_F(iF + 3, jF, kF, ex)]) - - SIX * (fh[idx_fh_F(iF - 2, jF, kF, ex)] + fh[idx_fh_F(iF + 2, jF, kF, ex)]) + - FIT * (fh[idx_fh_F(iF - 1, jF, kF, ex)] + fh[idx_fh_F(iF + 1, jF, kF, ex)]) - - TWT * fh[idx_fh_F(iF , jF, kF, ex)] ) / dX; + double *fh = (double*)malloc(fh_size * sizeof(double)); + if (!fh) return; - const double Dy_term = - ( (fh[idx_fh_F(iF, jF - 3, kF, ex)] + fh[idx_fh_F(iF, jF + 3, kF, ex)]) - - SIX * (fh[idx_fh_F(iF, jF - 2, kF, ex)] + fh[idx_fh_F(iF, jF + 2, kF, ex)]) + - FIT * (fh[idx_fh_F(iF, jF - 1, kF, ex)] + fh[idx_fh_F(iF, jF + 1, kF, ex)]) - - TWT * fh[idx_fh_F(iF, jF , kF, ex)] ) / dY; + symmetry_bd(ord, ex, f, fh, SoA); - const double Dz_term = - ( (fh[idx_fh_F(iF, jF, kF - 3, ex)] + fh[idx_fh_F(iF, jF, kF + 3, ex)]) - - SIX * (fh[idx_fh_F(iF, jF, kF - 2, ex)] + fh[idx_fh_F(iF, jF, kF + 2, ex)]) + - FIT * (fh[idx_fh_F(iF, jF, kF - 1, ex)] + fh[idx_fh_F(iF, jF, kF + 1, ex)]) - - TWT * fh[idx_fh_F(iF, jF, kF , ex)] ) / dZ; + const int i0_lo = (iminF + 2 > 0) ? iminF + 2 : 0; + const int j0_lo = (jminF + 2 > 0) ? jminF + 2 : 0; + const int k0_lo = (kminF + 2 > 0) ? kminF + 2 : 0; + const int i0_hi = imaxF - 4; + const int j0_hi = jmaxF - 4; + const int k0_hi = kmaxF - 4; - // Fortran: - // f_rhs(i,j,k) = f_rhs(i,j,k) + eps/cof*(Dx_term + Dy_term + Dz_term) - f_rhs[p] += (eps / cof) * (Dx_term + Dy_term + Dz_term); + if (!(i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi)) { + for (int k0 = k0_lo; k0 <= k0_hi; ++k0) { + const int kF = k0 + 2; + for (int j0 = j0_lo; j0 <= j0_hi; ++j0) { + const int jF = j0 + 2; + for (int i0 = i0_lo; i0 <= i0_hi; ++i0) { + const int iF = i0 + 2; + const size_t p = idx_ex(i0, j0, k0, ex); + + const double Dx = ( + (fh[idx_fh_F(iF - 3, jF, kF, ex)] + fh[idx_fh_F(iF + 3, jF, kF, ex)]) - + SIX * (fh[idx_fh_F(iF - 2, jF, kF, ex)] + fh[idx_fh_F(iF + 2, jF, kF, ex)]) + + FIT * (fh[idx_fh_F(iF - 1, jF, kF, ex)] + fh[idx_fh_F(iF + 1, jF, kF, ex)]) - + TWT * fh[idx_fh_F(iF, jF, kF, ex)] + ) / dX; + + const double Dy = ( + (fh[idx_fh_F(iF, jF - 3, kF, ex)] + fh[idx_fh_F(iF, jF + 3, kF, ex)]) - + SIX * (fh[idx_fh_F(iF, jF - 2, kF, ex)] + fh[idx_fh_F(iF, jF + 2, kF, ex)]) + + FIT * (fh[idx_fh_F(iF, jF - 1, kF, ex)] + fh[idx_fh_F(iF, jF + 1, kF, ex)]) - + TWT * fh[idx_fh_F(iF, jF, kF, ex)] + ) / dY; + + const double Dz = ( + (fh[idx_fh_F(iF, jF, kF - 3, ex)] + fh[idx_fh_F(iF, jF, kF + 3, ex)]) - + SIX * (fh[idx_fh_F(iF, jF, kF - 2, ex)] + fh[idx_fh_F(iF, jF, kF + 2, ex)]) + + FIT * (fh[idx_fh_F(iF, jF, kF - 1, ex)] + fh[idx_fh_F(iF, jF, kF + 1, ex)]) - + TWT * fh[idx_fh_F(iF, jF, kF, ex)] + ) / dZ; + + f_rhs[p] += (eps / cof) * (Dx + Dy + Dz); /* sign=+ */ + } + } } } + free(fh); + return; } +#elif (ghost_width == 4) + /* ---- r=4, cof=256, sign=-, 9pt stencil ---------------------------- */ + { + const int ord = 4; + const int r = 4; + const double cof = 256.0; + const double F8 = 8.0, F28 = 28.0, F56 = 56.0, F70 = 70.0; + const int NO_SYMM = 0, EQ_SYMM = 1; - free(fh); -} \ No newline at end of file + int iminF = 1, jminF = 1, kminF = 1; + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -3; + if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -3; + if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -3; + + const size_t nx = (size_t)ex1 + ord; + const size_t ny = (size_t)ex2 + ord; + const size_t nz = (size_t)ex3 + ord; + const size_t fh_size = nx * ny * nz; + + double *fh = (double*)malloc(fh_size * sizeof(double)); + if (!fh) return; + + symmetry_bd(ord, ex, f, fh, SoA); + + /* i±4 valid: i-4>=iminF → i0>=iminF+3, i+4<=imaxF → i0<=ex1-5 */ + const int i0_lo = (iminF + 3 > 0) ? iminF + 3 : 0; + const int j0_lo = (jminF + 3 > 0) ? jminF + 3 : 0; + const int k0_lo = (kminF + 3 > 0) ? kminF + 3 : 0; + const int i0_hi = imaxF - 5; + const int j0_hi = jmaxF - 5; + const int k0_hi = kmaxF - 5; + + if (!(i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi)) { + for (int k0 = k0_lo; k0 <= k0_hi; ++k0) { + const int kF = k0 + 3; + for (int j0 = j0_lo; j0 <= j0_hi; ++j0) { + const int jF = j0 + 3; + for (int i0 = i0_lo; i0 <= i0_hi; ++i0) { + const int iF = i0 + 3; + const size_t p = idx_ex(i0, j0, k0, ex); + + /* Stencil: [1,-8,28,-56,70,-56,28,-8,1] */ + const double Dx = ( + (fh[idx_fh_F_ord4(iF - 4, jF, kF, ex)] + fh[idx_fh_F_ord4(iF + 4, jF, kF, ex)]) - + F8 * (fh[idx_fh_F_ord4(iF - 3, jF, kF, ex)] + fh[idx_fh_F_ord4(iF + 3, jF, kF, ex)]) + + F28* (fh[idx_fh_F_ord4(iF - 2, jF, kF, ex)] + fh[idx_fh_F_ord4(iF + 2, jF, kF, ex)]) - + F56* (fh[idx_fh_F_ord4(iF - 1, jF, kF, ex)] + fh[idx_fh_F_ord4(iF + 1, jF, kF, ex)]) + + F70* fh[idx_fh_F_ord4(iF, jF, kF, ex)] + ) / dX; + + const double Dy = ( + (fh[idx_fh_F_ord4(iF, jF - 4, kF, ex)] + fh[idx_fh_F_ord4(iF, jF + 4, kF, ex)]) - + F8 * (fh[idx_fh_F_ord4(iF, jF - 3, kF, ex)] + fh[idx_fh_F_ord4(iF, jF + 3, kF, ex)]) + + F28* (fh[idx_fh_F_ord4(iF, jF - 2, kF, ex)] + fh[idx_fh_F_ord4(iF, jF + 2, kF, ex)]) - + F56* (fh[idx_fh_F_ord4(iF, jF - 1, kF, ex)] + fh[idx_fh_F_ord4(iF, jF + 1, kF, ex)]) + + F70* fh[idx_fh_F_ord4(iF, jF, kF, ex)] + ) / dY; + + const double Dz = ( + (fh[idx_fh_F_ord4(iF, jF, kF - 4, ex)] + fh[idx_fh_F_ord4(iF, jF, kF + 4, ex)]) - + F8 * (fh[idx_fh_F_ord4(iF, jF, kF - 3, ex)] + fh[idx_fh_F_ord4(iF, jF, kF + 3, ex)]) + + F28* (fh[idx_fh_F_ord4(iF, jF, kF - 2, ex)] + fh[idx_fh_F_ord4(iF, jF, kF + 2, ex)]) - + F56* (fh[idx_fh_F_ord4(iF, jF, kF - 1, ex)] + fh[idx_fh_F_ord4(iF, jF, kF + 1, ex)]) + + F70* fh[idx_fh_F_ord4(iF, jF, kF, ex)] + ) / dZ; + + f_rhs[p] -= (eps / cof) * (Dx + Dy + Dz); /* sign=- */ + } + } + } + } + free(fh); + return; + } +#elif (ghost_width == 5) + /* ---- r=5, cof=1024, sign=+, 11pt stencil ------------------------- */ + { + const int ord = 5; + const int r = 5; + const double cof = 1024.0; + const double F10 = 10.0, F45 = 45.0, F120 = 120.0; + const double F210 = 210.0, F252 = 252.0; + const int NO_SYMM = 0, EQ_SYMM = 1; + + int iminF = 1, jminF = 1, kminF = 1; + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -4; + if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -4; + if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -4; + + const size_t nx = (size_t)ex1 + ord; + const size_t ny = (size_t)ex2 + ord; + const size_t nz = (size_t)ex3 + ord; + const size_t fh_size = nx * ny * nz; + + double *fh = (double*)malloc(fh_size * sizeof(double)); + if (!fh) return; + + symmetry_bd(ord, ex, f, fh, SoA); + + /* i±5 valid: i0>=iminF+4, i0<=ex1-6 */ + const int i0_lo = (iminF + 4 > 0) ? iminF + 4 : 0; + const int j0_lo = (jminF + 4 > 0) ? jminF + 4 : 0; + const int k0_lo = (kminF + 4 > 0) ? kminF + 4 : 0; + const int i0_hi = imaxF - 6; + const int j0_hi = jmaxF - 6; + const int k0_hi = kmaxF - 6; + + if (!(i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi)) { + for (int k0 = k0_lo; k0 <= k0_hi; ++k0) { + const int kF = k0 + 4; + for (int j0 = j0_lo; j0 <= j0_hi; ++j0) { + const int jF = j0 + 4; + for (int i0 = i0_lo; i0 <= i0_hi; ++i0) { + const int iF = i0 + 4; + const size_t p = idx_ex(i0, j0, k0, ex); + + /* Stencil: [1,-10,45,-120,210,-252,210,-120,45,-10,1] */ + const double Dx = ( + (fh[idx_fh_F_ord5(iF - 5, jF, kF, ex)] + fh[idx_fh_F_ord5(iF + 5, jF, kF, ex)]) - + F10 * (fh[idx_fh_F_ord5(iF - 4, jF, kF, ex)] + fh[idx_fh_F_ord5(iF + 4, jF, kF, ex)]) + + F45 * (fh[idx_fh_F_ord5(iF - 3, jF, kF, ex)] + fh[idx_fh_F_ord5(iF + 3, jF, kF, ex)]) - + F120* (fh[idx_fh_F_ord5(iF - 2, jF, kF, ex)] + fh[idx_fh_F_ord5(iF + 2, jF, kF, ex)]) + + F210* (fh[idx_fh_F_ord5(iF - 1, jF, kF, ex)] + fh[idx_fh_F_ord5(iF + 1, jF, kF, ex)]) - + F252* fh[idx_fh_F_ord5(iF, jF, kF, ex)] + ) / dX; + + const double Dy = ( + (fh[idx_fh_F_ord5(iF, jF - 5, kF, ex)] + fh[idx_fh_F_ord5(iF, jF + 5, kF, ex)]) - + F10 * (fh[idx_fh_F_ord5(iF, jF - 4, kF, ex)] + fh[idx_fh_F_ord5(iF, jF + 4, kF, ex)]) + + F45 * (fh[idx_fh_F_ord5(iF, jF - 3, kF, ex)] + fh[idx_fh_F_ord5(iF, jF + 3, kF, ex)]) - + F120* (fh[idx_fh_F_ord5(iF, jF - 2, kF, ex)] + fh[idx_fh_F_ord5(iF, jF + 2, kF, ex)]) + + F210* (fh[idx_fh_F_ord5(iF, jF - 1, kF, ex)] + fh[idx_fh_F_ord5(iF, jF + 1, kF, ex)]) - + F252* fh[idx_fh_F_ord5(iF, jF, kF, ex)] + ) / dY; + + const double Dz = ( + (fh[idx_fh_F_ord5(iF, jF, kF - 5, ex)] + fh[idx_fh_F_ord5(iF, jF, kF + 5, ex)]) - + F10 * (fh[idx_fh_F_ord5(iF, jF, kF - 4, ex)] + fh[idx_fh_F_ord5(iF, jF, kF + 4, ex)]) + + F45 * (fh[idx_fh_F_ord5(iF, jF, kF - 3, ex)] + fh[idx_fh_F_ord5(iF, jF, kF + 3, ex)]) - + F120* (fh[idx_fh_F_ord5(iF, jF, kF - 2, ex)] + fh[idx_fh_F_ord5(iF, jF, kF + 2, ex)]) + + F210* (fh[idx_fh_F_ord5(iF, jF, kF - 1, ex)] + fh[idx_fh_F_ord5(iF, jF, kF + 1, ex)]) - + F252* fh[idx_fh_F_ord5(iF, jF, kF, ex)] + ) / dZ; + + f_rhs[p] += (eps / cof) * (Dx + Dy + Dz); /* sign=+ */ + } + } + } + } + free(fh); + return; + } +#else +#error "kodiss_c.C: unsupported ghost_width (must be 2, 3, 4, or 5)" +#endif +} diff --git a/AMSS_NCKU_source/lopsided_c.C b/AMSS_NCKU_source/lopsided_c.C index fe2c6e4..8bd9f33 100644 --- a/AMSS_NCKU_source/lopsided_c.C +++ b/AMSS_NCKU_source/lopsided_c.C @@ -1,14 +1,13 @@ +#include "macrodef.h" #include "tool.h" + /* - * 你需要提供 symmetry_bd 的 C 版本(或 Fortran 绑到 C 的接口)。 - * Fortran: call symmetry_bd(3,ex,f,fh,SoA) + * C 版 lopsided — upwind (lopsided) advection derivatives. * - * 约定: - * nghost = 3 - * ex[3] = {ex1,ex2,ex3} - * f = 原始网格 (ex1*ex2*ex3) - * fh = 扩展网格 ((ex1+3)*(ex2+3)*(ex3+3)),对应 Fortran 的 (-2:ex1, ...) - * SoA[3] = 输入参数 + * Adds advection terms to f_rhs for all three spatial directions. + * Uses sign-biased (one-sided) stencils with centered fallbacks. + * + * For lopsided, symmetry_bd ord = ghost_width (same as kodiss). */ void lopsided(const int ex[3], const double *X, const double *Y, const double *Z, @@ -16,240 +15,577 @@ void lopsided(const int ex[3], const double *Sfx, const double *Sfy, const double *Sfz, int Symmetry, const double SoA[3]) { - const double ZEO = 0.0, ONE = 1.0, F3 = 3.0; - const double TWO = 2.0, F6 = 6.0, F18 = 18.0; - const double F12 = 12.0, F10 = 10.0, EIT = 8.0; - - const int NO_SYMM = 0, EQ_SYMM = 1, OCTANT = 2; - (void)OCTANT; // 这里和 Fortran 一样只是定义了不用也没关系 + const double ZEO = 0.0, ONE = 1.0; + const double TWO = 2.0, F6 = 6.0, EIT = 8.0; + const double F3 = 3.0, F4 = 4.0, F5 = 5.0, F10 = 10.0, F12 = 12.0, F18 = 18.0; + const double F9 = 9.0, F45 = 45.0, F60 = 60.0; + const double F2 = 2.0, F15 = 15.0, F24 = 24.0, F30 = 30.0, F35 = 35.0; + const double F50 = 50.0, F77 = 77.0, F80 = 80.0, F100 = 100.0, F150 = 150.0; + const double F32 = 32.0, F168 = 168.0, F672 = 672.0, F840 = 840.0; + const double F140=140.0, F378=378.0, F420=420.0, F1050=1050.0; + const int NO_SYMM = 0, EQ_SYMM = 1; const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2]; - // 对应 Fortran: dX = X(2)-X(1) (Fortran 1-based) - // C: X[1]-X[0] const double dX = X[1] - X[0]; const double dY = Y[1] - Y[0]; const double dZ = Z[1] - Z[0]; - const double d12dx = ONE / F12 / dX; - const double d12dy = ONE / F12 / dY; - const double d12dz = ONE / F12 / dZ; +#if (ghost_width == 2) + /* ---- 2nd-order lopsided --------------------------------------------- */ + { + const int ord = 2; + int iminF = 1, jminF = 1, kminF = 1; + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1; + if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1; + if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1; - // Fortran 里算了 d2dx/d2dy/d2dz 但本 subroutine 里没用到(保持一致也算出来) - const double d2dx = ONE / TWO / dX; - const double d2dy = ONE / TWO / dY; - const double d2dz = ONE / TWO / dZ; - (void)d2dx; (void)d2dy; (void)d2dz; + const size_t nx = (size_t)ex1 + ord; + const size_t ny = (size_t)ex2 + ord; + const size_t nz = (size_t)ex3 + ord; + const size_t fh_size = nx * ny * nz; - // Fortran: - // imax = ex(1); jmax = ex(2); kmax = ex(3) - const int imaxF = ex1; - const int jmaxF = ex2; - const int kmaxF = ex3; + double *fh = (double*)malloc(fh_size * sizeof(double)); + if (!fh) return; + symmetry_bd(ord, ex, f, fh, SoA); - // Fortran: - // imin=jmin=kmin=1; 若满足对称条件则设为 -2 - int iminF = 1, jminF = 1, kminF = 1; - if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2; - if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2; - if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2; + const double d2dx = ONE / TWO / dX; + const double d2dy = ONE / TWO / dY; + const double d2dz = ONE / TWO / dZ; - // 分配 fh:大小 (ex1+3)*(ex2+3)*(ex3+3) - const size_t nx = (size_t)ex1 + 3; - const size_t ny = (size_t)ex2 + 3; - const size_t nz = (size_t)ex3 + 3; - const size_t fh_size = nx * ny * nz; + const int imaxF = ex1, jmaxF = ex2, kmaxF = ex3; - double *fh = (double*)malloc(fh_size * sizeof(double)); - if (!fh) return; // 内存不足:直接返回(你也可以改成 abort/报错) + for (int k0 = 0; k0 <= ex3 - 2; ++k0) { + const int kF = k0 + 1; + for (int j0 = 0; j0 <= ex2 - 2; ++j0) { + const int jF = j0 + 1; + for (int i0 = 0; i0 <= ex1 - 2; ++i0) { + const int iF = i0 + 1; + const size_t p = idx_ex(i0, j0, k0, ex); - // Fortran: call symmetry_bd(3,ex,f,fh,SoA) - symmetry_bd(3, ex, f, fh, SoA); - - /* - * Fortran 主循环: - * do k=1,ex(3)-1 - * do j=1,ex(2)-1 - * do i=1,ex(1)-1 - * - * 转成 C 0-based: - * k0 = 0..ex3-2, j0 = 0..ex2-2, i0 = 0..ex1-2 - * - * 并且 Fortran 里的 i/j/k 在 fh 访问时,仍然是 Fortran 索引值: - * iF=i0+1, jF=j0+1, kF=k0+1 - */ - for (int k0 = 0; k0 <= ex3 - 2; ++k0) { - const int kF = k0 + 1; - for (int j0 = 0; j0 <= ex2 - 2; ++j0) { - const int jF = j0 + 1; - for (int i0 = 0; i0 <= ex1 - 2; ++i0) { - const int iF = i0 + 1; - - const size_t p = idx_ex(i0, j0, k0, ex); - - // ---------------- x direction ---------------- - const double sfx = Sfx[p]; - if (sfx > ZEO) { - // Fortran: if(i+3 <= imax) - // iF+3 <= ex1 <=> i0+4 <= ex1 <=> i0 <= ex1-4 - if (i0 <= ex1 - 4) { - f_rhs[p] += sfx * d12dx * - (-F3 * fh[idx_fh_F(iF - 1, jF, kF, ex)] - -F10 * fh[idx_fh_F(iF , jF, kF, ex)] - +F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)] - -F6 * fh[idx_fh_F(iF + 2, jF, kF, ex)] - + fh[idx_fh_F(iF + 3, jF, kF, ex)]); + /* x-direction */ + const double sfx = Sfx[p]; + if (sfx > ZEO) { + if (i0 <= ex1 - 3) // i+2 <= imax + f_rhs[p] += sfx * d2dx * ( + -F3*fh[idx_fh_F_ord2(iF, jF, kF, ex)] + + F4*fh[idx_fh_F_ord2(iF+1, jF, kF, ex)] - + fh[idx_fh_F_ord2(iF+2, jF, kF, ex)]); + else if (i0 <= ex1 - 2) // i+1 <= imax + f_rhs[p] += sfx * d2dx * ( + -fh[idx_fh_F_ord2(iF, jF, kF, ex)] + + fh[idx_fh_F_ord2(iF+1, jF, kF, ex)]); + } else if (sfx < ZEO) { + if ((i0 - 1) >= iminF) // i-2 >= imin + f_rhs[p] -= sfx * d2dx * ( + -F3*fh[idx_fh_F_ord2(iF, jF, kF, ex)] + + F4*fh[idx_fh_F_ord2(iF-1, jF, kF, ex)] - + fh[idx_fh_F_ord2(iF-2, jF, kF, ex)]); + else if (i0 >= iminF) // i-1 >= imin + f_rhs[p] -= sfx * d2dx * ( + -fh[idx_fh_F_ord2(iF, jF, kF, ex)] + + fh[idx_fh_F_ord2(iF-1, jF, kF, ex)]); } - // elseif(i+2 <= imax) <=> i0 <= ex1-3 - else if (i0 <= ex1 - 3) { - f_rhs[p] += sfx * d12dx * - ( fh[idx_fh_F(iF - 2, jF, kF, ex)] - -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)] - +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)] - - fh[idx_fh_F(iF + 2, jF, kF, ex)]); - } - // elseif(i+1 <= imax) <=> i0 <= ex1-2(循环里总成立) - else if (i0 <= ex1 - 2) { - f_rhs[p] -= sfx * d12dx * - (-F3 * fh[idx_fh_F(iF + 1, jF, kF, ex)] - -F10 * fh[idx_fh_F(iF , jF, kF, ex)] - +F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)] - -F6 * fh[idx_fh_F(iF - 2, jF, kF, ex)] - + fh[idx_fh_F(iF - 3, jF, kF, ex)]); - } - } else if (sfx < ZEO) { - // Fortran: if(i-3 >= imin) - // (iF-3) >= iminF <=> (i0-2) >= iminF - if ((i0 - 2) >= iminF) { - f_rhs[p] -= sfx * d12dx * - (-F3 * fh[idx_fh_F(iF + 1, jF, kF, ex)] - -F10 * fh[idx_fh_F(iF , jF, kF, ex)] - +F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)] - -F6 * fh[idx_fh_F(iF - 2, jF, kF, ex)] - + fh[idx_fh_F(iF - 3, jF, kF, ex)]); - } - // elseif(i-2 >= imin) <=> (i0-1) >= iminF - else if ((i0 - 1) >= iminF) { - f_rhs[p] += sfx * d12dx * - ( fh[idx_fh_F(iF - 2, jF, kF, ex)] - -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)] - +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)] - - fh[idx_fh_F(iF + 2, jF, kF, ex)]); - } - // elseif(i-1 >= imin) <=> i0 >= iminF - else if (i0 >= iminF) { - f_rhs[p] += sfx * d12dx * - (-F3 * fh[idx_fh_F(iF - 1, jF, kF, ex)] - -F10 * fh[idx_fh_F(iF , jF, kF, ex)] - +F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)] - -F6 * fh[idx_fh_F(iF + 2, jF, kF, ex)] - + fh[idx_fh_F(iF + 3, jF, kF, ex)]); - } - } - // ---------------- y direction ---------------- - const double sfy = Sfy[p]; - if (sfy > ZEO) { - // jF+3 <= ex2 <=> j0+4 <= ex2 <=> j0 <= ex2-4 - if (j0 <= ex2 - 4) { - f_rhs[p] += sfy * d12dy * - (-F3 * fh[idx_fh_F(iF, jF - 1, kF, ex)] - -F10 * fh[idx_fh_F(iF, jF , kF, ex)] - +F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)] - -F6 * fh[idx_fh_F(iF, jF + 2, kF, ex)] - + fh[idx_fh_F(iF, jF + 3, kF, ex)]); - } else if (j0 <= ex2 - 3) { - f_rhs[p] += sfy * d12dy * - ( fh[idx_fh_F(iF, jF - 2, kF, ex)] - -EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)] - +EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)] - - fh[idx_fh_F(iF, jF + 2, kF, ex)]); - } else if (j0 <= ex2 - 2) { - f_rhs[p] -= sfy * d12dy * - (-F3 * fh[idx_fh_F(iF, jF + 1, kF, ex)] - -F10 * fh[idx_fh_F(iF, jF , kF, ex)] - +F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)] - -F6 * fh[idx_fh_F(iF, jF - 2, kF, ex)] - + fh[idx_fh_F(iF, jF - 3, kF, ex)]); + /* y-direction */ + const double sfy = Sfy[p]; + if (sfy > ZEO) { + if (j0 <= ex2-3) + f_rhs[p] += sfy * d2dy * ( + -F3*fh[idx_fh_F_ord2(iF, jF, kF, ex)] + + F4*fh[idx_fh_F_ord2(iF, jF+1, kF, ex)] - + fh[idx_fh_F_ord2(iF, jF+2, kF, ex)]); + else if (j0 <= ex2-2) + f_rhs[p] += sfy * d2dy * ( + -fh[idx_fh_F_ord2(iF, jF, kF, ex)] + + fh[idx_fh_F_ord2(iF, jF+1, kF, ex)]); + } else if (sfy < ZEO) { + if ((j0-1) >= jminF) + f_rhs[p] -= sfy * d2dy * ( + -F3*fh[idx_fh_F_ord2(iF, jF, kF, ex)] + + F4*fh[idx_fh_F_ord2(iF, jF-1, kF, ex)] - + fh[idx_fh_F_ord2(iF, jF-2, kF, ex)]); + else if (j0 >= jminF) + f_rhs[p] -= sfy * d2dy * ( + -fh[idx_fh_F_ord2(iF, jF, kF, ex)] + + fh[idx_fh_F_ord2(iF, jF-1, kF, ex)]); } - } else if (sfy < ZEO) { - if ((j0 - 2) >= jminF) { - f_rhs[p] -= sfy * d12dy * - (-F3 * fh[idx_fh_F(iF, jF + 1, kF, ex)] - -F10 * fh[idx_fh_F(iF, jF , kF, ex)] - +F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)] - -F6 * fh[idx_fh_F(iF, jF - 2, kF, ex)] - + fh[idx_fh_F(iF, jF - 3, kF, ex)]); - } else if ((j0 - 1) >= jminF) { - f_rhs[p] += sfy * d12dy * - ( fh[idx_fh_F(iF, jF - 2, kF, ex)] - -EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)] - +EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)] - - fh[idx_fh_F(iF, jF + 2, kF, ex)]); - } else if (j0 >= jminF) { - f_rhs[p] += sfy * d12dy * - (-F3 * fh[idx_fh_F(iF, jF - 1, kF, ex)] - -F10 * fh[idx_fh_F(iF, jF , kF, ex)] - +F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)] - -F6 * fh[idx_fh_F(iF, jF + 2, kF, ex)] - + fh[idx_fh_F(iF, jF + 3, kF, ex)]); - } - } - // ---------------- z direction ---------------- - const double sfz = Sfz[p]; - if (sfz > ZEO) { - if (k0 <= ex3 - 4) { - f_rhs[p] += sfz * d12dz * - (-F3 * fh[idx_fh_F(iF, jF, kF - 1, ex)] - -F10 * fh[idx_fh_F(iF, jF, kF , ex)] - +F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)] - -F6 * fh[idx_fh_F(iF, jF, kF + 2, ex)] - + fh[idx_fh_F(iF, jF, kF + 3, ex)]); - } else if (k0 <= ex3 - 3) { - f_rhs[p] += sfz * d12dz * - ( fh[idx_fh_F(iF, jF, kF - 2, ex)] - -EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)] - +EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)] - - fh[idx_fh_F(iF, jF, kF + 2, ex)]); - } else if (k0 <= ex3 - 2) { - f_rhs[p] -= sfz * d12dz * - (-F3 * fh[idx_fh_F(iF, jF, kF + 1, ex)] - -F10 * fh[idx_fh_F(iF, jF, kF , ex)] - +F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)] - -F6 * fh[idx_fh_F(iF, jF, kF - 2, ex)] - + fh[idx_fh_F(iF, jF, kF - 3, ex)]); - } - } else if (sfz < ZEO) { - if ((k0 - 2) >= kminF) { - f_rhs[p] -= sfz * d12dz * - (-F3 * fh[idx_fh_F(iF, jF, kF + 1, ex)] - -F10 * fh[idx_fh_F(iF, jF, kF , ex)] - +F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)] - -F6 * fh[idx_fh_F(iF, jF, kF - 2, ex)] - + fh[idx_fh_F(iF, jF, kF - 3, ex)]); - } else if ((k0 - 1) >= kminF) { - f_rhs[p] += sfz * d12dz * - ( fh[idx_fh_F(iF, jF, kF - 2, ex)] - -EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)] - +EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)] - - fh[idx_fh_F(iF, jF, kF + 2, ex)]); - } else if (k0 >= kminF) { - f_rhs[p] += sfz * d12dz * - (-F3 * fh[idx_fh_F(iF, jF, kF - 1, ex)] - -F10 * fh[idx_fh_F(iF, jF, kF , ex)] - +F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)] - -F6 * fh[idx_fh_F(iF, jF, kF + 2, ex)] - + fh[idx_fh_F(iF, jF, kF + 3, ex)]); + /* z-direction */ + const double sfz = Sfz[p]; + if (sfz > ZEO) { + if (k0 <= ex3-3) + f_rhs[p] += sfz * d2dz * ( + -F3*fh[idx_fh_F_ord2(iF, jF, kF, ex)] + + F4*fh[idx_fh_F_ord2(iF, jF, kF+1, ex)] - + fh[idx_fh_F_ord2(iF, jF, kF+2, ex)]); + else if (k0 <= ex3-2) + f_rhs[p] += sfz * d2dz * ( + -fh[idx_fh_F_ord2(iF, jF, kF, ex)] + + fh[idx_fh_F_ord2(iF, jF, kF+1, ex)]); + } else if (sfz < ZEO) { + if ((k0-1) >= kminF) + f_rhs[p] -= sfz * d2dz * ( + -F3*fh[idx_fh_F_ord2(iF, jF, kF, ex)] + + F4*fh[idx_fh_F_ord2(iF, jF, kF-1, ex)] - + fh[idx_fh_F_ord2(iF, jF, kF-2, ex)]); + else if (k0 >= kminF) + f_rhs[p] -= sfz * d2dz * ( + -fh[idx_fh_F_ord2(iF, jF, kF, ex)] + + fh[idx_fh_F_ord2(iF, jF, kF-1, ex)]); } } } } + free(fh); + return; } - free(fh); +#elif (ghost_width == 3) + /* ---- 4th-order lopsided (original code) ---------------------------- */ + { + const int ord = 3; + int iminF = 1, jminF = 1, kminF = 1; + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2; + if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2; + if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2; + + const size_t nx = (size_t)ex1 + ord; + const size_t ny = (size_t)ex2 + ord; + const size_t nz = (size_t)ex3 + ord; + const size_t fh_size = nx * ny * nz; + + double *fh = (double*)malloc(fh_size * sizeof(double)); + if (!fh) return; + symmetry_bd(ord, ex, f, fh, SoA); + + const double d12dx = ONE / F12 / dX; + const double d12dy = ONE / F12 / dY; + const double d12dz = ONE / F12 / dZ; + + const int imaxF = ex1, jmaxF = ex2, kmaxF = ex3; + + for (int k0 = 0; k0 <= ex3 - 2; ++k0) { + const int kF = k0 + 2; + for (int j0 = 0; j0 <= ex2 - 2; ++j0) { + const int jF = j0 + 2; + for (int i0 = 0; i0 <= ex1 - 2; ++i0) { + const int iF = i0 + 2; + const size_t p = idx_ex(i0, j0, k0, ex); + + const double sfx = Sfx[p]; + if (sfx > ZEO) { + if (i0 <= ex1 - 4) // i+3 <= imax + f_rhs[p] += sfx * d12dx * ( + -F3 *fh[idx_fh_F(iF-1, jF, kF, ex)] + -F10*fh[idx_fh_F(iF, jF, kF, ex)] + +F18*fh[idx_fh_F(iF+1, jF, kF, ex)] + -F6 *fh[idx_fh_F(iF+2, jF, kF, ex)] + + fh[idx_fh_F(iF+3, jF, kF, ex)]); + else if (i0 <= ex1 - 3) // i+2 <= imax + f_rhs[p] += sfx * d12dx * ( + fh[idx_fh_F(iF-2, jF, kF, ex)] + -EIT*fh[idx_fh_F(iF-1, jF, kF, ex)] + +EIT*fh[idx_fh_F(iF+1, jF, kF, ex)] + - fh[idx_fh_F(iF+2, jF, kF, ex)]); + else if (i0 <= ex1 - 2) // i+1 <= imax → mirrored + f_rhs[p] -= sfx * d12dx * ( + -F3 *fh[idx_fh_F(iF+1, jF, kF, ex)] + -F10*fh[idx_fh_F(iF, jF, kF, ex)] + +F18*fh[idx_fh_F(iF-1, jF, kF, ex)] + -F6 *fh[idx_fh_F(iF-2, jF, kF, ex)] + + fh[idx_fh_F(iF-3, jF, kF, ex)]); + } else if (sfx < ZEO) { + if ((i0 - 2) >= iminF) // i-3 >= imin + f_rhs[p] -= sfx * d12dx * ( + -F3 *fh[idx_fh_F(iF+1, jF, kF, ex)] + -F10*fh[idx_fh_F(iF, jF, kF, ex)] + +F18*fh[idx_fh_F(iF-1, jF, kF, ex)] + -F6 *fh[idx_fh_F(iF-2, jF, kF, ex)] + + fh[idx_fh_F(iF-3, jF, kF, ex)]); + else if ((i0 - 1) >= iminF) // i-2 >= imin + f_rhs[p] += sfx * d12dx * ( + fh[idx_fh_F(iF-2, jF, kF, ex)] + -EIT*fh[idx_fh_F(iF-1, jF, kF, ex)] + +EIT*fh[idx_fh_F(iF+1, jF, kF, ex)] + - fh[idx_fh_F(iF+2, jF, kF, ex)]); + else if (i0 >= iminF) // i-1 >= imin → mirrored + f_rhs[p] += sfx * d12dx * ( + -F3 *fh[idx_fh_F(iF-1, jF, kF, ex)] + -F10*fh[idx_fh_F(iF, jF, kF, ex)] + +F18*fh[idx_fh_F(iF+1, jF, kF, ex)] + -F6 *fh[idx_fh_F(iF+2, jF, kF, ex)] + + fh[idx_fh_F(iF+3, jF, kF, ex)]); + } + + const double sfy = Sfy[p]; + if (sfy > ZEO) { + if (j0 <= ex2-4) + f_rhs[p] += sfy * d12dy * ( + -F3*fh[idx_fh_F(iF,jF-1,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)] + +F18*fh[idx_fh_F(iF,jF+1,kF,ex)]-F6*fh[idx_fh_F(iF,jF+2,kF,ex)] + +fh[idx_fh_F(iF,jF+3,kF,ex)]); + else if (j0 <= ex2-3) + f_rhs[p] += sfy * d12dy * (fh[idx_fh_F(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F(iF,jF+1,kF,ex)]-fh[idx_fh_F(iF,jF+2,kF,ex)]); + else if (j0 <= ex2-2) + f_rhs[p] -= sfy * d12dy * ( + -F3*fh[idx_fh_F(iF,jF+1,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)] + +F18*fh[idx_fh_F(iF,jF-1,kF,ex)]-F6*fh[idx_fh_F(iF,jF-2,kF,ex)] + +fh[idx_fh_F(iF,jF-3,kF,ex)]); + } else if (sfy < ZEO) { + if ((j0-2) >= jminF) + f_rhs[p] -= sfy * d12dy * ( + -F3*fh[idx_fh_F(iF,jF+1,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)] + +F18*fh[idx_fh_F(iF,jF-1,kF,ex)]-F6*fh[idx_fh_F(iF,jF-2,kF,ex)] + +fh[idx_fh_F(iF,jF-3,kF,ex)]); + else if ((j0-1) >= jminF) + f_rhs[p] += sfy * d12dy * (fh[idx_fh_F(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F(iF,jF+1,kF,ex)]-fh[idx_fh_F(iF,jF+2,kF,ex)]); + else if (j0 >= jminF) + f_rhs[p] += sfy * d12dy * ( + -F3*fh[idx_fh_F(iF,jF-1,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)] + +F18*fh[idx_fh_F(iF,jF+1,kF,ex)]-F6*fh[idx_fh_F(iF,jF+2,kF,ex)] + +fh[idx_fh_F(iF,jF+3,kF,ex)]); + } + + const double sfz = Sfz[p]; + if (sfz > ZEO) { + if (k0 <= ex3-4) + f_rhs[p] += sfz * d12dz * ( + -F3*fh[idx_fh_F(iF,jF,kF-1,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)] + +F18*fh[idx_fh_F(iF,jF,kF+1,ex)]-F6*fh[idx_fh_F(iF,jF,kF+2,ex)] + +fh[idx_fh_F(iF,jF,kF+3,ex)]); + else if (k0 <= ex3-3) + f_rhs[p] += sfz * d12dz * (fh[idx_fh_F(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F(iF,jF,kF+1,ex)]-fh[idx_fh_F(iF,jF,kF+2,ex)]); + else if (k0 <= ex3-2) + f_rhs[p] -= sfz * d12dz * ( + -F3*fh[idx_fh_F(iF,jF,kF+1,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)] + +F18*fh[idx_fh_F(iF,jF,kF-1,ex)]-F6*fh[idx_fh_F(iF,jF,kF-2,ex)] + +fh[idx_fh_F(iF,jF,kF-3,ex)]); + } else if (sfz < ZEO) { + if ((k0-2) >= kminF) + f_rhs[p] -= sfz * d12dz * ( + -F3*fh[idx_fh_F(iF,jF,kF+1,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)] + +F18*fh[idx_fh_F(iF,jF,kF-1,ex)]-F6*fh[idx_fh_F(iF,jF,kF-2,ex)] + +fh[idx_fh_F(iF,jF,kF-3,ex)]); + else if ((k0-1) >= kminF) + f_rhs[p] += sfz * d12dz * (fh[idx_fh_F(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F(iF,jF,kF+1,ex)]-fh[idx_fh_F(iF,jF,kF+2,ex)]); + else if (k0 >= kminF) + f_rhs[p] += sfz * d12dz * ( + -F3*fh[idx_fh_F(iF,jF,kF-1,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)] + +F18*fh[idx_fh_F(iF,jF,kF+1,ex)]-F6*fh[idx_fh_F(iF,jF,kF+2,ex)] + +fh[idx_fh_F(iF,jF,kF+3,ex)]); + } + } + } + } + free(fh); + return; + } +#elif (ghost_width == 4) + /* ---- 6th-order lopsided --------------------------------------------- */ + { + const int ord = 4; + int iminF = 1, jminF = 1, kminF = 1; + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -3; + if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -3; + if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -3; + + const size_t nx = (size_t)ex1 + ord; + const size_t ny = (size_t)ex2 + ord; + const size_t nz = (size_t)ex3 + ord; + const size_t fh_size = nx * ny * nz; + + double *fh = (double*)malloc(fh_size * sizeof(double)); + if (!fh) return; + symmetry_bd(ord, ex, f, fh, SoA); + + const double d60dx = ONE / F60 / dX; + const double d60dy = ONE / F60 / dY; + const double d60dz = ONE / F60 / dZ; + const double d12dx = ONE / F12 / dX; + const double d12dy = ONE / F12 / dY; + const double d12dz = ONE / F12 / dZ; + const double d2dx = ONE / TWO / dX; + const double d2dy = ONE / TWO / dY; + const double d2dz = ONE / TWO / dZ; + + const int imaxF = ex1, jmaxF = ex2, kmaxF = ex3; + + for (int k0 = 0; k0 <= ex3 - 2; ++k0) { + const int kF = k0 + 3; + for (int j0 = 0; j0 <= ex2 - 2; ++j0) { + const int jF = j0 + 3; + for (int i0 = 0; i0 <= ex1 - 2; ++i0) { + const int iF = i0 + 3; + const size_t p = idx_ex(i0, j0, k0, ex); + + /* ---- x-direction ---- */ + const double sfx = Sfx[p]; + if (sfx > ZEO) { + /* Primary biased: 2*f(i-2)-24*f(i-1)-35*f(i)+80*f(i+1)-30*f(i+2)+8*f(i+3)-f(i+4) */ + if (i0 <= ex1-5 && (i0-1)>=iminF) // i+4<=imax && i-2>=imin + f_rhs[p] += sfx * d60dx * ( + +F2*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]-F24*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)] + -F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)] + -F30*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF+3,jF,kF,ex)] + -fh[idx_fh_F_ord4(iF+4,jF,kF,ex)]); + /* Boundary-adapted: -10*f(i-1)-77*f(i)+150*f(i+1)-100*f(i+2)+50*f(i+3)-15*f(i+4)+2*f(i+5) */ + else if (i0 <= ex1-6 && i0 >= iminF) // i+5<=imax && i-1>=imin + f_rhs[p] += sfx * d60dx * ( + -F10*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)] + +F150*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-F100*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)] + +F50*fh[idx_fh_F_ord4(iF+3,jF,kF,ex)]-F15*fh[idx_fh_F_ord4(iF+4,jF,kF,ex)] + +F2*fh[idx_fh_F_ord4(iF+5,jF,kF,ex)]); + /* Centered fallbacks */ + else if (i0 <= ex1-4 && (i0-2)>=iminF) // 6th: i+3<=imax && i-3>=imin + f_rhs[p] += sfx * d60dx * ( + -fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]+F9*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)] + -F45*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+F45*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)] + -F9*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+3,jF,kF,ex)]); + else if (i0 <= ex1-3 && (i0-1)>=iminF) // 4th + f_rhs[p] += sfx * d12dx * ( + fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)] + +EIT*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]); + else if (i0 <= ex1-2 && i0>=iminF) // 2nd + f_rhs[p] += sfx * d2dx * ( + -fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]); + } else if (sfx < ZEO) { + if ((i0-4)>=iminF && i0<=ex1-2) // i-4>=imin && i+2<=imax + f_rhs[p] -= sfx * d60dx * ( + +F2*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]-F24*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)] + -F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)] + -F30*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF-3,jF,kF,ex)] + -fh[idx_fh_F_ord4(iF-4,jF,kF,ex)]); + else if ((i0-5)>=iminF && i0<=ex1-2) // i-5>=imin && i+1<=imax + f_rhs[p] -= sfx * d60dx * ( + -F10*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)] + +F150*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]-F100*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)] + +F50*fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]-F15*fh[idx_fh_F_ord4(iF-4,jF,kF,ex)] + +F2*fh[idx_fh_F_ord4(iF-5,jF,kF,ex)]); + else if ((i0-3)>=iminF && i0<=ex1-2) // 6th centered + f_rhs[p] -= sfx * d60dx * ( + -fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]+F9*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)] + -F45*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+F45*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)] + -F9*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+3,jF,kF,ex)]); + else if ((i0-2)>=iminF && i0<=ex1-2) // 4th + f_rhs[p] -= sfx * d12dx * ( + fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)] + +EIT*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]); + else if ((i0-1)>=iminF && i0<=ex1-2) // 2nd + f_rhs[p] -= sfx * d2dx * ( + -fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]); + } + + /* ---- y-direction ---- */ + const double sfy = Sfy[p]; + if (sfy > ZEO) { + if (j0<=ex2-5 && (j0-1)>=jminF) + f_rhs[p] += sfy * d60dy*(F2*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-F24*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F30*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF+3,kF,ex)]-fh[idx_fh_F_ord4(iF,jF+4,kF,ex)]); + else if (j0<=ex2-6 && j0>=jminF) + f_rhs[p] += sfy * d60dy*(-F10*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F100*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]+F50*fh[idx_fh_F_ord4(iF,jF+3,kF,ex)]-F15*fh[idx_fh_F_ord4(iF,jF+4,kF,ex)]+F2*fh[idx_fh_F_ord4(iF,jF+5,kF,ex)]); + else if (j0<=ex2-4 && (j0-2)>=jminF) + f_rhs[p] += sfy * d60dy*(-fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+3,kF,ex)]); + else if (j0<=ex2-3 && (j0-1)>=jminF) + f_rhs[p] += sfy * d12dy*(fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]); + else if (j0<=ex2-2 && j0>=jminF) + f_rhs[p] += sfy * d2dy*(-fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]); + } else if (sfy < ZEO) { + if ((j0-4)>=jminF && j0<=ex2-2) + f_rhs[p] -= sfy * d60dy*(F2*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]-F24*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]-F30*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]-fh[idx_fh_F_ord4(iF,jF-4,kF,ex)]); + else if ((j0-5)>=jminF && j0<=ex2-2) + f_rhs[p] -= sfy * d60dy*(-F10*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]-F100*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]+F50*fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]-F15*fh[idx_fh_F_ord4(iF,jF-4,kF,ex)]+F2*fh[idx_fh_F_ord4(iF,jF-5,kF,ex)]); + else if ((j0-3)>=jminF && j0<=ex2-2) + f_rhs[p] -= sfy * d60dy*(-fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+3,kF,ex)]); + else if ((j0-2)>=jminF && j0<=ex2-2) + f_rhs[p] -= sfy * d12dy*(fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]); + else if ((j0-1)>=jminF && j0<=ex2-2) + f_rhs[p] -= sfy * d2dy*(-fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]); + } + + /* ---- z-direction ---- */ + const double sfz = Sfz[p]; + if (sfz > ZEO) { + if (k0<=ex3-5 && (k0-1)>=kminF) + f_rhs[p] += sfz * d60dz*(F2*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-F24*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F30*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF+3,ex)]-fh[idx_fh_F_ord4(iF,jF,kF+4,ex)]); + else if (k0<=ex3-6 && k0>=kminF) + f_rhs[p] += sfz * d60dz*(-F10*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F100*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]+F50*fh[idx_fh_F_ord4(iF,jF,kF+3,ex)]-F15*fh[idx_fh_F_ord4(iF,jF,kF+4,ex)]+F2*fh[idx_fh_F_ord4(iF,jF,kF+5,ex)]); + else if (k0<=ex3-4 && (k0-2)>=kminF) + f_rhs[p] += sfz * d60dz*(-fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+3,ex)]); + else if (k0<=ex3-3 && (k0-1)>=kminF) + f_rhs[p] += sfz * d12dz*(fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]); + else if (k0<=ex3-2 && k0>=kminF) + f_rhs[p] += sfz * d2dz*(-fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]); + } else if (sfz < ZEO) { + if ((k0-4)>=kminF && k0<=ex3-2) + f_rhs[p] -= sfz * d60dz*(F2*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]-F24*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]-F30*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]-fh[idx_fh_F_ord4(iF,jF,kF-4,ex)]); + else if ((k0-5)>=kminF && k0<=ex3-2) + f_rhs[p] -= sfz * d60dz*(-F10*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]-F100*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]+F50*fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]-F15*fh[idx_fh_F_ord4(iF,jF,kF-4,ex)]+F2*fh[idx_fh_F_ord4(iF,jF,kF-5,ex)]); + else if ((k0-3)>=kminF && k0<=ex3-2) + f_rhs[p] -= sfz * d60dz*(-fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+3,ex)]); + else if ((k0-2)>=kminF && k0<=ex3-2) + f_rhs[p] -= sfz * d12dz*(fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]); + else if ((k0-1)>=kminF && k0<=ex3-2) + f_rhs[p] -= sfz * d2dz*(-fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]); + } + } + } + } + free(fh); + return; + } +#elif (ghost_width == 5) + /* ---- 8th-order lopsided --------------------------------------------- */ + { + const int ord = 5; + int iminF = 1, jminF = 1, kminF = 1; + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -4; + if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -4; + if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -4; + + const size_t nx = (size_t)ex1 + ord; + const size_t ny = (size_t)ex2 + ord; + const size_t nz = (size_t)ex3 + ord; + const size_t fh_size = nx * ny * nz; + + double *fh = (double*)malloc(fh_size * sizeof(double)); + if (!fh) return; + symmetry_bd(ord, ex, f, fh, SoA); + + const double d840dx = ONE / F840 / dX; + const double d840dy = ONE / F840 / dY; + const double d840dz = ONE / F840 / dZ; + const double d60dx = ONE / F60 / dX; + const double d60dy = ONE / F60 / dY; + const double d60dz = ONE / F60 / dZ; + const double d12dx = ONE / F12 / dX; + const double d12dy = ONE / F12 / dY; + const double d12dz = ONE / F12 / dZ; + const double d2dx = ONE / TWO / dX; + const double d2dy = ONE / TWO / dY; + const double d2dz = ONE / TWO / dZ; + + const int imaxF = ex1, jmaxF = ex2, kmaxF = ex3; + + for (int k0 = 0; k0 <= ex3 - 2; ++k0) { + const int kF = k0 + 4; + for (int j0 = 0; j0 <= ex2 - 2; ++j0) { + const int jF = j0 + 4; + for (int i0 = 0; i0 <= ex1 - 2; ++i0) { + const int iF = i0 + 4; + const size_t p = idx_ex(i0, j0, k0, ex); + + const double sfx = Sfx[p]; + if (sfx > ZEO) { + /* 8th biased: -5*f(i-3)+60*f(i-2)-420*f(i-1)-378*f(i)+1050*f(i+1)-420*f(i+2)+140*f(i+3)-30*f(i+4)+3*f(i+5) */ + if (i0 <= ex1-6 && (i0-2)>=iminF) // i+5<=imax && i-3>=imin + f_rhs[p] += sfx * d840dx * ( + -F5*fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+F60*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)] + -F420*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)] + +F1050*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F420*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)] + +F140*fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]-F30*fh[idx_fh_F_ord5(iF+4,jF,kF,ex)] + +F3*fh[idx_fh_F_ord5(iF+5,jF,kF,ex)]); + /* 8th centered: +3*f(i-4)-32*f(i-3)+168*f(i-2)-672*f(i-1)+672*f(i+1)-168*f(i+2)+32*f(i+3)-3*f(i+4) */ + else if (i0 <= ex1-5 && (i0-3)>=iminF) + f_rhs[p] += sfx * d840dx * ( + +F3*fh[idx_fh_F_ord5(iF-4,jF,kF,ex)]-F32*fh[idx_fh_F_ord5(iF-3,jF,kF,ex)] + +F168*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-F672*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)] + +F672*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F168*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)] + +F32*fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]-F3*fh[idx_fh_F_ord5(iF+4,jF,kF,ex)]); + else if (i0 <= ex1-4 && (i0-2)>=iminF) // 6th centered + f_rhs[p] += sfx * d60dx * ( + -fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+F9*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)] + -F45*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+F45*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)] + -F9*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]); + else if (i0 <= ex1-3 && (i0-1)>=iminF) // 4th centered + f_rhs[p] += sfx * d12dx * ( + fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)] + +EIT*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]); + else if (i0 <= ex1-2 && i0>=iminF) // 2nd centered + f_rhs[p] += sfx * d2dx * ( + -fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]); + } else if (sfx < ZEO) { + if ((i0-5)>=iminF && i0<=ex1-2) // i-5>=imin && i+3<=imax + f_rhs[p] -= sfx * d840dx * ( + -F5*fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]+F60*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)] + -F420*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)] + +F1050*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]-F420*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)] + +F140*fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]-F30*fh[idx_fh_F_ord5(iF-4,jF,kF,ex)] + +F3*fh[idx_fh_F_ord5(iF-5,jF,kF,ex)]); + else if ((i0-4)>=iminF && i0<=ex1-2) // 8th centered + f_rhs[p] -= sfx * d840dx * ( + +F3*fh[idx_fh_F_ord5(iF-4,jF,kF,ex)]-F32*fh[idx_fh_F_ord5(iF-3,jF,kF,ex)] + +F168*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-F672*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)] + +F672*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F168*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)] + +F32*fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]-F3*fh[idx_fh_F_ord5(iF+4,jF,kF,ex)]); + else if ((i0-3)>=iminF && i0<=ex1-2) // 6th centered + f_rhs[p] -= sfx * d60dx * ( + -fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+F9*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)] + -F45*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+F45*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)] + -F9*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]); + else if ((i0-2)>=iminF && i0<=ex1-2) // 4th centered + f_rhs[p] -= sfx * d12dx * ( + fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)] + +EIT*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]); + else if ((i0-1)>=iminF && i0<=ex1-2) // 2nd centered + f_rhs[p] -= sfx * d2dx * ( + -fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]); + } + + const double sfy = Sfy[p]; + if (sfy > ZEO) { + if (j0<=ex2-6 && (j0-2)>=jminF) + f_rhs[p] += sfy * d840dy*(-F5*fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F60*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F420*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F420*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+F140*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]-F30*fh[idx_fh_F_ord5(iF,jF+4,kF,ex)]+F3*fh[idx_fh_F_ord5(iF,jF+5,kF,ex)]); + else if (j0<=ex2-5 && (j0-3)>=jminF) + f_rhs[p] += sfy * d840dy*(+F3*fh[idx_fh_F_ord5(iF,jF-4,kF,ex)]-F32*fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F168*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F672*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F672*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F168*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+F32*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]-F3*fh[idx_fh_F_ord5(iF,jF+4,kF,ex)]); + else if (j0<=ex2-4 && (j0-2)>=jminF) + f_rhs[p] += sfy * d60dy*(-fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]); + else if (j0<=ex2-3 && (j0-1)>=jminF) + f_rhs[p] += sfy * d12dy*(fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]); + else if (j0<=ex2-2 && j0>=jminF) + f_rhs[p] += sfy * d2dy*(-fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]); + } else if (sfy < ZEO) { + if ((j0-5)>=jminF && j0<=ex2-2) + f_rhs[p] -= sfy * d840dy*(-F5*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]+F60*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]-F420*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]-F420*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]+F140*fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]-F30*fh[idx_fh_F_ord5(iF,jF-4,kF,ex)]+F3*fh[idx_fh_F_ord5(iF,jF-5,kF,ex)]); + else if ((j0-4)>=jminF && j0<=ex2-2) + f_rhs[p] -= sfy * d840dy*(+F3*fh[idx_fh_F_ord5(iF,jF-4,kF,ex)]-F32*fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F168*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F672*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F672*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F168*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+F32*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]-F3*fh[idx_fh_F_ord5(iF,jF+4,kF,ex)]); + else if ((j0-3)>=jminF && j0<=ex2-2) + f_rhs[p] -= sfy * d60dy*(-fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]); + else if ((j0-2)>=jminF && j0<=ex2-2) + f_rhs[p] -= sfy * d12dy*(fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]); + else if ((j0-1)>=jminF && j0<=ex2-2) + f_rhs[p] -= sfy * d2dy*(-fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]); + } + + const double sfz = Sfz[p]; + if (sfz > ZEO) { + if (k0<=ex3-6 && (k0-2)>=kminF) + f_rhs[p] += sfz * d840dz*(-F5*fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F60*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F420*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F420*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+F140*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]-F30*fh[idx_fh_F_ord5(iF,jF,kF+4,ex)]+F3*fh[idx_fh_F_ord5(iF,jF,kF+5,ex)]); + else if (k0<=ex3-5 && (k0-3)>=kminF) + f_rhs[p] += sfz * d840dz*(+F3*fh[idx_fh_F_ord5(iF,jF,kF-4,ex)]-F32*fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F168*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F672*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F672*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F168*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+F32*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]-F3*fh[idx_fh_F_ord5(iF,jF,kF+4,ex)]); + else if (k0<=ex3-4 && (k0-2)>=kminF) + f_rhs[p] += sfz * d60dz*(-fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]); + else if (k0<=ex3-3 && (k0-1)>=kminF) + f_rhs[p] += sfz * d12dz*(fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]); + else if (k0<=ex3-2 && k0>=kminF) + f_rhs[p] += sfz * d2dz*(-fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]); + } else if (sfz < ZEO) { + if ((k0-5)>=kminF && k0<=ex3-2) + f_rhs[p] -= sfz * d840dz*(-F5*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]+F60*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]-F420*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]-F420*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]+F140*fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]-F30*fh[idx_fh_F_ord5(iF,jF,kF-4,ex)]+F3*fh[idx_fh_F_ord5(iF,jF,kF-5,ex)]); + else if ((k0-4)>=kminF && k0<=ex3-2) + f_rhs[p] -= sfz * d840dz*(+F3*fh[idx_fh_F_ord5(iF,jF,kF-4,ex)]-F32*fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F168*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F672*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F672*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F168*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+F32*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]-F3*fh[idx_fh_F_ord5(iF,jF,kF+4,ex)]); + else if ((k0-3)>=kminF && k0<=ex3-2) + f_rhs[p] -= sfz * d60dz*(-fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]); + else if ((k0-2)>=kminF && k0<=ex3-2) + f_rhs[p] -= sfz * d12dz*(fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]); + else if ((k0-1)>=kminF && k0<=ex3-2) + f_rhs[p] -= sfz * d2dz*(-fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]); + } + } + } + } + free(fh); + return; + } +#else +#error "lopsided_c.C: unsupported ghost_width (must be 2, 3, 4, or 5)" +#endif } - - - - - diff --git a/AMSS_NCKU_source/lopsided_kodis_c.C b/AMSS_NCKU_source/lopsided_kodis_c.C index 970d0e7..0d9d635 100644 --- a/AMSS_NCKU_source/lopsided_kodis_c.C +++ b/AMSS_NCKU_source/lopsided_kodis_c.C @@ -1,8 +1,15 @@ +#include "macrodef.h" #include "tool.h" /* - * Combined advection (lopsided) + KO dissipation (kodis). - * Uses one shared symmetry_bd buffer per call. + * C 版 lopsided_kodis — combined upwind advection + KO dissipation. + * Uses one shared symmetry_bd buffer (ord = ghost_width for both components). + * + * FD order selection via ghost_width: + * 2 → 2nd-order advection + r=2 KO (cof=16, sign=-) + * 3 → 4th-order advection + r=3 KO (cof=64, sign=+) + * 4 → 6th-order advection + r=4 KO (cof=256, sign=-) + * 5 → 8th-order advection + r=5 KO (cof=1024, sign=+) */ void lopsided_kodis(const int ex[3], const double *X, const double *Y, const double *Z, @@ -10,239 +17,370 @@ void lopsided_kodis(const int ex[3], const double *Sfx, const double *Sfy, const double *Sfz, int Symmetry, const double SoA[3], double eps) { - const double ZEO = 0.0, ONE = 1.0, F3 = 3.0; - const double F6 = 6.0, F18 = 18.0; - const double F12 = 12.0, F10 = 10.0, EIT = 8.0; - const double SIX = 6.0, FIT = 15.0, TWT = 20.0; - const double cof = 64.0; // 2^6 + const double ZEO = 0.0, ONE = 1.0; + const double TWO = 2.0, F6 = 6.0, EIT = 8.0; + const double F3 = 3.0, F4 = 4.0, F5 = 5.0, F10 = 10.0, F12 = 12.0, F18 = 18.0; + const double F9 = 9.0, F45 = 45.0, F60 = 60.0; + const double F2 = 2.0, F15 = 15.0, F24 = 24.0, F30 = 30.0, F35 = 35.0; + const double F50 = 50.0, F77 = 77.0, F80 = 80.0, F100 = 100.0, F150 = 150.0; + const double F32 = 32.0, F168 = 168.0, F672 = 672.0, F840 = 840.0; + const double F140=140.0, F378=378.0, F420=420.0, F1050=1050.0; const int NO_SYMM = 0, EQ_SYMM = 1; - const int ex1 = ex[0], ex2 = ex[1], ex3 = ex[2]; const double dX = X[1] - X[0]; const double dY = Y[1] - Y[0]; const double dZ = Z[1] - Z[0]; - const double d12dx = ONE / F12 / dX; - const double d12dy = ONE / F12 / dY; - const double d12dz = ONE / F12 / dZ; + const int imaxF = ex1, jmaxF = ex2, kmaxF = ex3; - const int imaxF = ex1; - const int jmaxF = ex2; - const int kmaxF = ex3; +#if (ghost_width == 2) + { + const int ord = 2; + int iminF = 1, jminF = 1, kminF = 1; + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -1; + if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -1; + if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -1; - int iminF = 1, jminF = 1, kminF = 1; - if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2; - if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2; - if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2; + const size_t nx = (size_t)ex1 + ord; + const size_t ny = (size_t)ex2 + ord; + const size_t nz = (size_t)ex3 + ord; + double *fh = (double*)malloc(nx*ny*nz*sizeof(double)); + if (!fh) return; + symmetry_bd(ord, ex, f, fh, SoA); - // fh for Fortran-style domain (-2:ex1,-2:ex2,-2:ex3) - const size_t nx = (size_t)ex1 + 3; - const size_t ny = (size_t)ex2 + 3; - const size_t nz = (size_t)ex3 + 3; - const size_t fh_size = nx * ny * nz; + const double d2dx = ONE/TWO/dX, d2dy = ONE/TWO/dY, d2dz = ONE/TWO/dZ; - double *fh = (double*)malloc(fh_size * sizeof(double)); - if (!fh) return; + /* ---- advection (2nd-order) ---- */ + for (int k0 = 0; k0 <= ex3-2; ++k0) { + const int kF = k0+1; + for (int j0 = 0; j0 <= ex2-2; ++j0) { + const int jF = j0+1; + for (int i0 = 0; i0 <= ex1-2; ++i0) { + const int iF = i0+1; + const size_t p = idx_ex(i0,j0,k0,ex); - symmetry_bd(3, ex, f, fh, SoA); - - // Advection (same stencil logic as lopsided_c.C) - for (int k0 = 0; k0 <= ex3 - 2; ++k0) { - const int kF = k0 + 1; - for (int j0 = 0; j0 <= ex2 - 2; ++j0) { - const int jF = j0 + 1; - for (int i0 = 0; i0 <= ex1 - 2; ++i0) { - const int iF = i0 + 1; - const size_t p = idx_ex(i0, j0, k0, ex); - - const double sfx = Sfx[p]; - if (sfx > ZEO) { - if (i0 <= ex1 - 4) { - f_rhs[p] += sfx * d12dx * - (-F3 * fh[idx_fh_F(iF - 1, jF, kF, ex)] - -F10 * fh[idx_fh_F(iF , jF, kF, ex)] - +F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)] - -F6 * fh[idx_fh_F(iF + 2, jF, kF, ex)] - + fh[idx_fh_F(iF + 3, jF, kF, ex)]); - } else if (i0 <= ex1 - 3) { - f_rhs[p] += sfx * d12dx * - ( fh[idx_fh_F(iF - 2, jF, kF, ex)] - -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)] - +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)] - - fh[idx_fh_F(iF + 2, jF, kF, ex)]); - } else if (i0 <= ex1 - 2) { - f_rhs[p] -= sfx * d12dx * - (-F3 * fh[idx_fh_F(iF + 1, jF, kF, ex)] - -F10 * fh[idx_fh_F(iF , jF, kF, ex)] - +F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)] - -F6 * fh[idx_fh_F(iF - 2, jF, kF, ex)] - + fh[idx_fh_F(iF - 3, jF, kF, ex)]); + const double sfx = Sfx[p]; + if (sfx > ZEO) { + if (i0<=ex1-3) f_rhs[p] += sfx*d2dx*(-F3*fh[idx_fh_F_ord2(iF,jF,kF,ex)]+F4*fh[idx_fh_F_ord2(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord2(iF+2,jF,kF,ex)]); + else if (i0<=ex1-2) f_rhs[p] += sfx*d2dx*(-fh[idx_fh_F_ord2(iF,jF,kF,ex)]+fh[idx_fh_F_ord2(iF+1,jF,kF,ex)]); + } else if (sfx < ZEO) { + if ((i0-1)>=iminF) f_rhs[p] -= sfx*d2dx*(-F3*fh[idx_fh_F_ord2(iF,jF,kF,ex)]+F4*fh[idx_fh_F_ord2(iF-1,jF,kF,ex)]-fh[idx_fh_F_ord2(iF-2,jF,kF,ex)]); + else if (i0>=iminF) f_rhs[p] -= sfx*d2dx*(-fh[idx_fh_F_ord2(iF,jF,kF,ex)]+fh[idx_fh_F_ord2(iF-1,jF,kF,ex)]); } - } else if (sfx < ZEO) { - if ((i0 - 2) >= iminF) { - f_rhs[p] -= sfx * d12dx * - (-F3 * fh[idx_fh_F(iF + 1, jF, kF, ex)] - -F10 * fh[idx_fh_F(iF , jF, kF, ex)] - +F18 * fh[idx_fh_F(iF - 1, jF, kF, ex)] - -F6 * fh[idx_fh_F(iF - 2, jF, kF, ex)] - + fh[idx_fh_F(iF - 3, jF, kF, ex)]); - } else if ((i0 - 1) >= iminF) { - f_rhs[p] += sfx * d12dx * - ( fh[idx_fh_F(iF - 2, jF, kF, ex)] - -EIT * fh[idx_fh_F(iF - 1, jF, kF, ex)] - +EIT * fh[idx_fh_F(iF + 1, jF, kF, ex)] - - fh[idx_fh_F(iF + 2, jF, kF, ex)]); - } else if (i0 >= iminF) { - f_rhs[p] += sfx * d12dx * - (-F3 * fh[idx_fh_F(iF - 1, jF, kF, ex)] - -F10 * fh[idx_fh_F(iF , jF, kF, ex)] - +F18 * fh[idx_fh_F(iF + 1, jF, kF, ex)] - -F6 * fh[idx_fh_F(iF + 2, jF, kF, ex)] - + fh[idx_fh_F(iF + 3, jF, kF, ex)]); + const double sfy = Sfy[p]; + if (sfy > ZEO) { + if (j0<=ex2-3) f_rhs[p] += sfy*d2dy*(-F3*fh[idx_fh_F_ord2(iF,jF,kF,ex)]+F4*fh[idx_fh_F_ord2(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord2(iF,jF+2,kF,ex)]); + else if (j0<=ex2-2) f_rhs[p] += sfy*d2dy*(-fh[idx_fh_F_ord2(iF,jF,kF,ex)]+fh[idx_fh_F_ord2(iF,jF+1,kF,ex)]); + } else if (sfy < ZEO) { + if ((j0-1)>=jminF) f_rhs[p] -= sfy*d2dy*(-F3*fh[idx_fh_F_ord2(iF,jF,kF,ex)]+F4*fh[idx_fh_F_ord2(iF,jF-1,kF,ex)]-fh[idx_fh_F_ord2(iF,jF-2,kF,ex)]); + else if (j0>=jminF) f_rhs[p] -= sfy*d2dy*(-fh[idx_fh_F_ord2(iF,jF,kF,ex)]+fh[idx_fh_F_ord2(iF,jF-1,kF,ex)]); } - } - - const double sfy = Sfy[p]; - if (sfy > ZEO) { - if (j0 <= ex2 - 4) { - f_rhs[p] += sfy * d12dy * - (-F3 * fh[idx_fh_F(iF, jF - 1, kF, ex)] - -F10 * fh[idx_fh_F(iF, jF , kF, ex)] - +F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)] - -F6 * fh[idx_fh_F(iF, jF + 2, kF, ex)] - + fh[idx_fh_F(iF, jF + 3, kF, ex)]); - } else if (j0 <= ex2 - 3) { - f_rhs[p] += sfy * d12dy * - ( fh[idx_fh_F(iF, jF - 2, kF, ex)] - -EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)] - +EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)] - - fh[idx_fh_F(iF, jF + 2, kF, ex)]); - } else if (j0 <= ex2 - 2) { - f_rhs[p] -= sfy * d12dy * - (-F3 * fh[idx_fh_F(iF, jF + 1, kF, ex)] - -F10 * fh[idx_fh_F(iF, jF , kF, ex)] - +F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)] - -F6 * fh[idx_fh_F(iF, jF - 2, kF, ex)] - + fh[idx_fh_F(iF, jF - 3, kF, ex)]); - } - } else if (sfy < ZEO) { - if ((j0 - 2) >= jminF) { - f_rhs[p] -= sfy * d12dy * - (-F3 * fh[idx_fh_F(iF, jF + 1, kF, ex)] - -F10 * fh[idx_fh_F(iF, jF , kF, ex)] - +F18 * fh[idx_fh_F(iF, jF - 1, kF, ex)] - -F6 * fh[idx_fh_F(iF, jF - 2, kF, ex)] - + fh[idx_fh_F(iF, jF - 3, kF, ex)]); - } else if ((j0 - 1) >= jminF) { - f_rhs[p] += sfy * d12dy * - ( fh[idx_fh_F(iF, jF - 2, kF, ex)] - -EIT * fh[idx_fh_F(iF, jF - 1, kF, ex)] - +EIT * fh[idx_fh_F(iF, jF + 1, kF, ex)] - - fh[idx_fh_F(iF, jF + 2, kF, ex)]); - } else if (j0 >= jminF) { - f_rhs[p] += sfy * d12dy * - (-F3 * fh[idx_fh_F(iF, jF - 1, kF, ex)] - -F10 * fh[idx_fh_F(iF, jF , kF, ex)] - +F18 * fh[idx_fh_F(iF, jF + 1, kF, ex)] - -F6 * fh[idx_fh_F(iF, jF + 2, kF, ex)] - + fh[idx_fh_F(iF, jF + 3, kF, ex)]); - } - } - - const double sfz = Sfz[p]; - if (sfz > ZEO) { - if (k0 <= ex3 - 4) { - f_rhs[p] += sfz * d12dz * - (-F3 * fh[idx_fh_F(iF, jF, kF - 1, ex)] - -F10 * fh[idx_fh_F(iF, jF, kF , ex)] - +F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)] - -F6 * fh[idx_fh_F(iF, jF, kF + 2, ex)] - + fh[idx_fh_F(iF, jF, kF + 3, ex)]); - } else if (k0 <= ex3 - 3) { - f_rhs[p] += sfz * d12dz * - ( fh[idx_fh_F(iF, jF, kF - 2, ex)] - -EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)] - +EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)] - - fh[idx_fh_F(iF, jF, kF + 2, ex)]); - } else if (k0 <= ex3 - 2) { - f_rhs[p] -= sfz * d12dz * - (-F3 * fh[idx_fh_F(iF, jF, kF + 1, ex)] - -F10 * fh[idx_fh_F(iF, jF, kF , ex)] - +F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)] - -F6 * fh[idx_fh_F(iF, jF, kF - 2, ex)] - + fh[idx_fh_F(iF, jF, kF - 3, ex)]); - } - } else if (sfz < ZEO) { - if ((k0 - 2) >= kminF) { - f_rhs[p] -= sfz * d12dz * - (-F3 * fh[idx_fh_F(iF, jF, kF + 1, ex)] - -F10 * fh[idx_fh_F(iF, jF, kF , ex)] - +F18 * fh[idx_fh_F(iF, jF, kF - 1, ex)] - -F6 * fh[idx_fh_F(iF, jF, kF - 2, ex)] - + fh[idx_fh_F(iF, jF, kF - 3, ex)]); - } else if ((k0 - 1) >= kminF) { - f_rhs[p] += sfz * d12dz * - ( fh[idx_fh_F(iF, jF, kF - 2, ex)] - -EIT * fh[idx_fh_F(iF, jF, kF - 1, ex)] - +EIT * fh[idx_fh_F(iF, jF, kF + 1, ex)] - - fh[idx_fh_F(iF, jF, kF + 2, ex)]); - } else if (k0 >= kminF) { - f_rhs[p] += sfz * d12dz * - (-F3 * fh[idx_fh_F(iF, jF, kF - 1, ex)] - -F10 * fh[idx_fh_F(iF, jF, kF , ex)] - +F18 * fh[idx_fh_F(iF, jF, kF + 1, ex)] - -F6 * fh[idx_fh_F(iF, jF, kF + 2, ex)] - + fh[idx_fh_F(iF, jF, kF + 3, ex)]); + const double sfz = Sfz[p]; + if (sfz > ZEO) { + if (k0<=ex3-3) f_rhs[p] += sfz*d2dz*(-F3*fh[idx_fh_F_ord2(iF,jF,kF,ex)]+F4*fh[idx_fh_F_ord2(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord2(iF,jF,kF+2,ex)]); + else if (k0<=ex3-2) f_rhs[p] += sfz*d2dz*(-fh[idx_fh_F_ord2(iF,jF,kF,ex)]+fh[idx_fh_F_ord2(iF,jF,kF+1,ex)]); + } else if (sfz < ZEO) { + if ((k0-1)>=kminF) f_rhs[p] -= sfz*d2dz*(-F3*fh[idx_fh_F_ord2(iF,jF,kF,ex)]+F4*fh[idx_fh_F_ord2(iF,jF,kF-1,ex)]-fh[idx_fh_F_ord2(iF,jF,kF-2,ex)]); + else if (k0>=kminF) f_rhs[p] -= sfz*d2dz*(-fh[idx_fh_F_ord2(iF,jF,kF,ex)]+fh[idx_fh_F_ord2(iF,jF,kF-1,ex)]); } } } } + + /* ---- KO dissipation (r=2, cof=16, sign=-) ---- */ + if (eps > ZEO) { + const double cof = 16.0; + const double F4k = 4.0, F6k = 6.0; + const int i0_lo = (iminF+1>0)?iminF+1:0, j0_lo=(jminF+1>0)?jminF+1:0, k0_lo=(kminF+1>0)?kminF+1:0; + const int i0_hi=imaxF-3, j0_hi=jmaxF-3, k0_hi=kmaxF-3; + if (!(i0_lo>i0_hi||j0_lo>j0_hi||k0_lo>k0_hi)) { + for (int k0=k0_lo;k0<=k0_hi;++k0) { const int kF=k0+1; + for (int j0=j0_lo;j0<=j0_hi;++j0) { const int jF=j0+1; + for (int i0=i0_lo;i0<=i0_hi;++i0) { const int iF=i0+1; + const size_t p=idx_ex(i0,j0,k0,ex); + const double Dx=((fh[idx_fh_F_ord2(iF-2,jF,kF,ex)]+fh[idx_fh_F_ord2(iF+2,jF,kF,ex)])-F4k*(fh[idx_fh_F_ord2(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord2(iF+1,jF,kF,ex)])+F6k*fh[idx_fh_F_ord2(iF,jF,kF,ex)])/dX; + const double Dy=((fh[idx_fh_F_ord2(iF,jF-2,kF,ex)]+fh[idx_fh_F_ord2(iF,jF+2,kF,ex)])-F4k*(fh[idx_fh_F_ord2(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord2(iF,jF+1,kF,ex)])+F6k*fh[idx_fh_F_ord2(iF,jF,kF,ex)])/dY; + const double Dz=((fh[idx_fh_F_ord2(iF,jF,kF-2,ex)]+fh[idx_fh_F_ord2(iF,jF,kF+2,ex)])-F4k*(fh[idx_fh_F_ord2(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord2(iF,jF,kF+1,ex)])+F6k*fh[idx_fh_F_ord2(iF,jF,kF,ex)])/dZ; + f_rhs[p] -= (eps/cof)*(Dx+Dy+Dz); + }}} + } + } + free(fh); + return; } +#elif (ghost_width == 3) + /* ---- 4th-order advection + r=3 KO (original code) ----------------- */ + { + const int ord = 3; + int iminF = 1, jminF = 1, kminF = 1; + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -2; + if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -2; + if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -2; - // KO dissipation (same domain restriction as kodiss_c.C) - if (eps > ZEO) { - const int i0_lo = (iminF + 2 > 0) ? iminF + 2 : 0; - const int j0_lo = (jminF + 2 > 0) ? jminF + 2 : 0; - const int k0_lo = (kminF + 2 > 0) ? kminF + 2 : 0; - const int i0_hi = imaxF - 4; // inclusive - const int j0_hi = jmaxF - 4; - const int k0_hi = kmaxF - 4; + const size_t nx = (size_t)ex1 + ord; + const size_t ny = (size_t)ex2 + ord; + const size_t nz = (size_t)ex3 + ord; + double *fh = (double*)malloc(nx*ny*nz*sizeof(double)); + if (!fh) return; + symmetry_bd(ord, ex, f, fh, SoA); - if (!(i0_lo > i0_hi || j0_lo > j0_hi || k0_lo > k0_hi)) { - for (int k0 = k0_lo; k0 <= k0_hi; ++k0) { - const int kF = k0 + 1; - for (int j0 = j0_lo; j0 <= j0_hi; ++j0) { - const int jF = j0 + 1; - for (int i0 = i0_lo; i0 <= i0_hi; ++i0) { - const int iF = i0 + 1; - const size_t p = idx_ex(i0, j0, k0, ex); + const double d12dx = ONE/F12/dX, d12dy = ONE/F12/dY, d12dz = ONE/F12/dZ; - const double Dx_term = - ((fh[idx_fh_F(iF - 3, jF, kF, ex)] + fh[idx_fh_F(iF + 3, jF, kF, ex)]) - - SIX * (fh[idx_fh_F(iF - 2, jF, kF, ex)] + fh[idx_fh_F(iF + 2, jF, kF, ex)]) + - FIT * (fh[idx_fh_F(iF - 1, jF, kF, ex)] + fh[idx_fh_F(iF + 1, jF, kF, ex)]) - - TWT * fh[idx_fh_F(iF, jF, kF, ex)]) / dX; + /* ---- advection ---- */ + for (int k0 = 0; k0 <= ex3-2; ++k0) { + const int kF = k0+2; + for (int j0 = 0; j0 <= ex2-2; ++j0) { + const int jF = j0+2; + for (int i0 = 0; i0 <= ex1-2; ++i0) { + const int iF = i0+2; + const size_t p = idx_ex(i0,j0,k0,ex); - const double Dy_term = - ((fh[idx_fh_F(iF, jF - 3, kF, ex)] + fh[idx_fh_F(iF, jF + 3, kF, ex)]) - - SIX * (fh[idx_fh_F(iF, jF - 2, kF, ex)] + fh[idx_fh_F(iF, jF + 2, kF, ex)]) + - FIT * (fh[idx_fh_F(iF, jF - 1, kF, ex)] + fh[idx_fh_F(iF, jF + 1, kF, ex)]) - - TWT * fh[idx_fh_F(iF, jF, kF, ex)]) / dY; - - const double Dz_term = - ((fh[idx_fh_F(iF, jF, kF - 3, ex)] + fh[idx_fh_F(iF, jF, kF + 3, ex)]) - - SIX * (fh[idx_fh_F(iF, jF, kF - 2, ex)] + fh[idx_fh_F(iF, jF, kF + 2, ex)]) + - FIT * (fh[idx_fh_F(iF, jF, kF - 1, ex)] + fh[idx_fh_F(iF, jF, kF + 1, ex)]) - - TWT * fh[idx_fh_F(iF, jF, kF, ex)]) / dZ; - - f_rhs[p] += (eps / cof) * (Dx_term + Dy_term + Dz_term); + const double sfx = Sfx[p]; + if (sfx > ZEO) { + if (i0 <= ex1-4) + f_rhs[p] += sfx*d12dx*(-F3*fh[idx_fh_F(iF-1,jF,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF+1,jF,kF,ex)]-F6*fh[idx_fh_F(iF+2,jF,kF,ex)]+fh[idx_fh_F(iF+3,jF,kF,ex)]); + else if (i0 <= ex1-3) + f_rhs[p] += sfx*d12dx*(fh[idx_fh_F(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F(iF-1,jF,kF,ex)]+EIT*fh[idx_fh_F(iF+1,jF,kF,ex)]-fh[idx_fh_F(iF+2,jF,kF,ex)]); + else if (i0 <= ex1-2) + f_rhs[p] -= sfx*d12dx*(-F3*fh[idx_fh_F(iF+1,jF,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF-1,jF,kF,ex)]-F6*fh[idx_fh_F(iF-2,jF,kF,ex)]+fh[idx_fh_F(iF-3,jF,kF,ex)]); + } else if (sfx < ZEO) { + if ((i0-2) >= iminF) + f_rhs[p] -= sfx*d12dx*(-F3*fh[idx_fh_F(iF+1,jF,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF-1,jF,kF,ex)]-F6*fh[idx_fh_F(iF-2,jF,kF,ex)]+fh[idx_fh_F(iF-3,jF,kF,ex)]); + else if ((i0-1) >= iminF) + f_rhs[p] += sfx*d12dx*(fh[idx_fh_F(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F(iF-1,jF,kF,ex)]+EIT*fh[idx_fh_F(iF+1,jF,kF,ex)]-fh[idx_fh_F(iF+2,jF,kF,ex)]); + else if (i0 >= iminF) + f_rhs[p] += sfx*d12dx*(-F3*fh[idx_fh_F(iF-1,jF,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF+1,jF,kF,ex)]-F6*fh[idx_fh_F(iF+2,jF,kF,ex)]+fh[idx_fh_F(iF+3,jF,kF,ex)]); + } + const double sfy = Sfy[p]; + if (sfy > ZEO) { + if (j0<=ex2-4) f_rhs[p] += sfy*d12dy*(-F3*fh[idx_fh_F(iF,jF-1,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF,jF+1,kF,ex)]-F6*fh[idx_fh_F(iF,jF+2,kF,ex)]+fh[idx_fh_F(iF,jF+3,kF,ex)]); + else if (j0<=ex2-3) f_rhs[p] += sfy*d12dy*(fh[idx_fh_F(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F(iF,jF+1,kF,ex)]-fh[idx_fh_F(iF,jF+2,kF,ex)]); + else if (j0<=ex2-2) f_rhs[p] -= sfy*d12dy*(-F3*fh[idx_fh_F(iF,jF+1,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF,jF-1,kF,ex)]-F6*fh[idx_fh_F(iF,jF-2,kF,ex)]+fh[idx_fh_F(iF,jF-3,kF,ex)]); + } else if (sfy < ZEO) { + if ((j0-2)>=jminF) f_rhs[p] -= sfy*d12dy*(-F3*fh[idx_fh_F(iF,jF+1,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF,jF-1,kF,ex)]-F6*fh[idx_fh_F(iF,jF-2,kF,ex)]+fh[idx_fh_F(iF,jF-3,kF,ex)]); + else if ((j0-1)>=jminF) f_rhs[p] += sfy*d12dy*(fh[idx_fh_F(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F(iF,jF+1,kF,ex)]-fh[idx_fh_F(iF,jF+2,kF,ex)]); + else if (j0>=jminF) f_rhs[p] += sfy*d12dy*(-F3*fh[idx_fh_F(iF,jF-1,kF,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF,jF+1,kF,ex)]-F6*fh[idx_fh_F(iF,jF+2,kF,ex)]+fh[idx_fh_F(iF,jF+3,kF,ex)]); + } + const double sfz = Sfz[p]; + if (sfz > ZEO) { + if (k0<=ex3-4) f_rhs[p] += sfz*d12dz*(-F3*fh[idx_fh_F(iF,jF,kF-1,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF,jF,kF+1,ex)]-F6*fh[idx_fh_F(iF,jF,kF+2,ex)]+fh[idx_fh_F(iF,jF,kF+3,ex)]); + else if (k0<=ex3-3) f_rhs[p] += sfz*d12dz*(fh[idx_fh_F(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F(iF,jF,kF+1,ex)]-fh[idx_fh_F(iF,jF,kF+2,ex)]); + else if (k0<=ex3-2) f_rhs[p] -= sfz*d12dz*(-F3*fh[idx_fh_F(iF,jF,kF+1,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF,jF,kF-1,ex)]-F6*fh[idx_fh_F(iF,jF,kF-2,ex)]+fh[idx_fh_F(iF,jF,kF-3,ex)]); + } else if (sfz < ZEO) { + if ((k0-2)>=kminF) f_rhs[p] -= sfz*d12dz*(-F3*fh[idx_fh_F(iF,jF,kF+1,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF,jF,kF-1,ex)]-F6*fh[idx_fh_F(iF,jF,kF-2,ex)]+fh[idx_fh_F(iF,jF,kF-3,ex)]); + else if ((k0-1)>=kminF) f_rhs[p] += sfz*d12dz*(fh[idx_fh_F(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F(iF,jF,kF+1,ex)]-fh[idx_fh_F(iF,jF,kF+2,ex)]); + else if (k0>=kminF) f_rhs[p] += sfz*d12dz*(-F3*fh[idx_fh_F(iF,jF,kF-1,ex)]-F10*fh[idx_fh_F(iF,jF,kF,ex)]+F18*fh[idx_fh_F(iF,jF,kF+1,ex)]-F6*fh[idx_fh_F(iF,jF,kF+2,ex)]+fh[idx_fh_F(iF,jF,kF+3,ex)]); } } } } - } - free(fh); + /* ---- KO dissipation (r=3, cof=64, sign=+) ---- */ + if (eps > ZEO) { + const double cof = 64.0; + const double SIX = 6.0, FIT = 15.0, TWT = 20.0; + const int i0_lo=(iminF+2>0)?iminF+2:0, j0_lo=(jminF+2>0)?jminF+2:0, k0_lo=(kminF+2>0)?kminF+2:0; + const int i0_hi=imaxF-4, j0_hi=jmaxF-4, k0_hi=kmaxF-4; + if (!(i0_lo>i0_hi||j0_lo>j0_hi||k0_lo>k0_hi)) { + for (int k0=k0_lo;k0<=k0_hi;++k0) { const int kF=k0+2; + for (int j0=j0_lo;j0<=j0_hi;++j0) { const int jF=j0+2; + for (int i0=i0_lo;i0<=i0_hi;++i0) { const int iF=i0+2; + const size_t p=idx_ex(i0,j0,k0,ex); + const double Dx=((fh[idx_fh_F(iF-3,jF,kF,ex)]+fh[idx_fh_F(iF+3,jF,kF,ex)])-SIX*(fh[idx_fh_F(iF-2,jF,kF,ex)]+fh[idx_fh_F(iF+2,jF,kF,ex)])+FIT*(fh[idx_fh_F(iF-1,jF,kF,ex)]+fh[idx_fh_F(iF+1,jF,kF,ex)])-TWT*fh[idx_fh_F(iF,jF,kF,ex)])/dX; + const double Dy=((fh[idx_fh_F(iF,jF-3,kF,ex)]+fh[idx_fh_F(iF,jF+3,kF,ex)])-SIX*(fh[idx_fh_F(iF,jF-2,kF,ex)]+fh[idx_fh_F(iF,jF+2,kF,ex)])+FIT*(fh[idx_fh_F(iF,jF-1,kF,ex)]+fh[idx_fh_F(iF,jF+1,kF,ex)])-TWT*fh[idx_fh_F(iF,jF,kF,ex)])/dY; + const double Dz=((fh[idx_fh_F(iF,jF,kF-3,ex)]+fh[idx_fh_F(iF,jF,kF+3,ex)])-SIX*(fh[idx_fh_F(iF,jF,kF-2,ex)]+fh[idx_fh_F(iF,jF,kF+2,ex)])+FIT*(fh[idx_fh_F(iF,jF,kF-1,ex)]+fh[idx_fh_F(iF,jF,kF+1,ex)])-TWT*fh[idx_fh_F(iF,jF,kF,ex)])/dZ; + f_rhs[p] += (eps/cof)*(Dx+Dy+Dz); + }}} + } + } + free(fh); + return; + } +#elif (ghost_width == 4) + { + const int ord = 4; + int iminF = 1, jminF = 1, kminF = 1; + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -3; + if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -3; + if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -3; + + const size_t nx = (size_t)ex1 + ord; + const size_t ny = (size_t)ex2 + ord; + const size_t nz = (size_t)ex3 + ord; + double *fh = (double*)malloc(nx*ny*nz*sizeof(double)); + if (!fh) return; + symmetry_bd(ord, ex, f, fh, SoA); + + const double d60dx=ONE/F60/dX, d60dy=ONE/F60/dY, d60dz=ONE/F60/dZ; + const double d12dx=ONE/F12/dX, d12dy=ONE/F12/dY, d12dz=ONE/F12/dZ; + const double d2dx=ONE/TWO/dX, d2dy=ONE/TWO/dY, d2dz=ONE/TWO/dZ; + + /* ---- advection (6th-order lopsided) ---- */ + for (int k0=0;k0<=ex3-2;++k0) { const int kF=k0+3; + for (int j0=0;j0<=ex2-2;++j0) { const int jF=j0+3; + for (int i0=0;i0<=ex1-2;++i0) { const int iF=i0+3; + const size_t p=idx_ex(i0,j0,k0,ex); + /* x */ + const double sfx=Sfx[p]; + if (sfx>ZEO) { + if (i0<=ex1-5&&(i0-1)>=iminF) f_rhs[p]+=sfx*d60dx*(+F2*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]-F24*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-F30*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF+3,jF,kF,ex)]-fh[idx_fh_F_ord4(iF+4,jF,kF,ex)]); + else if (i0<=ex1-6&&i0>=iminF) f_rhs[p]+=sfx*d60dx*(-F10*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-F100*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]+F50*fh[idx_fh_F_ord4(iF+3,jF,kF,ex)]-F15*fh[idx_fh_F_ord4(iF+4,jF,kF,ex)]+F2*fh[idx_fh_F_ord4(iF+5,jF,kF,ex)]); + else if (i0<=ex1-4&&(i0-2)>=iminF) f_rhs[p]+=sfx*d60dx*(-fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]+F9*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]-F45*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+F45*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-F9*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+3,jF,kF,ex)]); + else if (i0<=ex1-3&&(i0-1)>=iminF) f_rhs[p]+=sfx*d12dx*(fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]); + else if (i0<=ex1-2&&i0>=iminF) f_rhs[p]+=sfx*d2dx*(-fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]); + } else if (sfx=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d60dx*(+F2*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]-F24*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]-F30*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]-fh[idx_fh_F_ord4(iF-4,jF,kF,ex)]); + else if ((i0-5)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d60dx*(-F10*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]-F100*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]+F50*fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]-F15*fh[idx_fh_F_ord4(iF-4,jF,kF,ex)]+F2*fh[idx_fh_F_ord4(iF-5,jF,kF,ex)]); + else if ((i0-3)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d60dx*(-fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]+F9*fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]-F45*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+F45*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-F9*fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+3,jF,kF,ex)]); + else if ((i0-2)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d12dx*(fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord4(iF+2,jF,kF,ex)]); + else if ((i0-1)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d2dx*(-fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+1,jF,kF,ex)]); + } + /* y */ + const double sfy=Sfy[p]; + if (sfy>ZEO) { + if (j0<=ex2-5&&(j0-1)>=jminF) f_rhs[p]+=sfy*d60dy*(F2*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-F24*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F30*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF+3,kF,ex)]-fh[idx_fh_F_ord4(iF,jF+4,kF,ex)]); + else if (j0<=ex2-6&&j0>=jminF) f_rhs[p]+=sfy*d60dy*(-F10*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F100*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]+F50*fh[idx_fh_F_ord4(iF,jF+3,kF,ex)]-F15*fh[idx_fh_F_ord4(iF,jF+4,kF,ex)]+F2*fh[idx_fh_F_ord4(iF,jF+5,kF,ex)]); + else if (j0<=ex2-4&&(j0-2)>=jminF) f_rhs[p]+=sfy*d60dy*(-fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+3,kF,ex)]); + else if (j0<=ex2-3&&(j0-1)>=jminF) f_rhs[p]+=sfy*d12dy*(fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]); + else if (j0<=ex2-2&&j0>=jminF) f_rhs[p]+=sfy*d2dy*(-fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]); + } else if (sfy=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d60dy*(F2*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]-F24*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]-F30*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]-fh[idx_fh_F_ord4(iF,jF-4,kF,ex)]); + else if ((j0-5)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d60dy*(-F10*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]-F100*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]+F50*fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]-F15*fh[idx_fh_F_ord4(iF,jF-4,kF,ex)]+F2*fh[idx_fh_F_ord4(iF,jF-5,kF,ex)]); + else if ((j0-3)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d60dy*(-fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+3,kF,ex)]); + else if ((j0-2)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d12dy*(fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord4(iF,jF+2,kF,ex)]); + else if ((j0-1)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d2dy*(-fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+1,kF,ex)]); + } + /* z */ + const double sfz=Sfz[p]; + if (sfz>ZEO) { + if (k0<=ex3-5&&(k0-1)>=kminF) f_rhs[p]+=sfz*d60dz*(F2*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-F24*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F30*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF+3,ex)]-fh[idx_fh_F_ord4(iF,jF,kF+4,ex)]); + else if (k0<=ex3-6&&k0>=kminF) f_rhs[p]+=sfz*d60dz*(-F10*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F100*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]+F50*fh[idx_fh_F_ord4(iF,jF,kF+3,ex)]-F15*fh[idx_fh_F_ord4(iF,jF,kF+4,ex)]+F2*fh[idx_fh_F_ord4(iF,jF,kF+5,ex)]); + else if (k0<=ex3-4&&(k0-2)>=kminF) f_rhs[p]+=sfz*d60dz*(-fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+3,ex)]); + else if (k0<=ex3-3&&(k0-1)>=kminF) f_rhs[p]+=sfz*d12dz*(fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]); + else if (k0<=ex3-2&&k0>=kminF) f_rhs[p]+=sfz*d2dz*(-fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]); + } else if (sfz=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d60dz*(F2*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]-F24*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F35*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F80*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]-F30*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]-fh[idx_fh_F_ord4(iF,jF,kF-4,ex)]); + else if ((k0-5)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d60dz*(-F10*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F77*fh[idx_fh_F_ord4(iF,jF,kF,ex)]+F150*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]-F100*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]+F50*fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]-F15*fh[idx_fh_F_ord4(iF,jF,kF-4,ex)]+F2*fh[idx_fh_F_ord4(iF,jF,kF-5,ex)]); + else if ((k0-3)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d60dz*(-fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+3,ex)]); + else if ((k0-2)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d12dz*(fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord4(iF,jF,kF+2,ex)]); + else if ((k0-1)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d2dz*(-fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+1,ex)]); + } + }}} + + /* ---- KO dissipation (r=4, cof=256, sign=-) ---- */ + if (eps > ZEO) { + const double cof = 256.0; + const double F8k = 8.0, F28 = 28.0, F56 = 56.0, F70 = 70.0; + const int i0_lo=(iminF+3>0)?iminF+3:0, j0_lo=(jminF+3>0)?jminF+3:0, k0_lo=(kminF+3>0)?kminF+3:0; + const int i0_hi=imaxF-5, j0_hi=jmaxF-5, k0_hi=kmaxF-5; + if (!(i0_lo>i0_hi||j0_lo>j0_hi||k0_lo>k0_hi)) { + for (int k0=k0_lo;k0<=k0_hi;++k0) { const int kF=k0+3; + for (int j0=j0_lo;j0<=j0_hi;++j0) { const int jF=j0+3; + for (int i0=i0_lo;i0<=i0_hi;++i0) { const int iF=i0+3; + const size_t p=idx_ex(i0,j0,k0,ex); + const double Dx=((fh[idx_fh_F_ord4(iF-4,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+4,jF,kF,ex)])-F8k*(fh[idx_fh_F_ord4(iF-3,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+3,jF,kF,ex)])+F28*(fh[idx_fh_F_ord4(iF-2,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+2,jF,kF,ex)])-F56*(fh[idx_fh_F_ord4(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord4(iF+1,jF,kF,ex)])+F70*fh[idx_fh_F_ord4(iF,jF,kF,ex)])/dX; + const double Dy=((fh[idx_fh_F_ord4(iF,jF-4,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+4,kF,ex)])-F8k*(fh[idx_fh_F_ord4(iF,jF-3,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+3,kF,ex)])+F28*(fh[idx_fh_F_ord4(iF,jF-2,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+2,kF,ex)])-F56*(fh[idx_fh_F_ord4(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord4(iF,jF+1,kF,ex)])+F70*fh[idx_fh_F_ord4(iF,jF,kF,ex)])/dY; + const double Dz=((fh[idx_fh_F_ord4(iF,jF,kF-4,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+4,ex)])-F8k*(fh[idx_fh_F_ord4(iF,jF,kF-3,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+3,ex)])+F28*(fh[idx_fh_F_ord4(iF,jF,kF-2,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+2,ex)])-F56*(fh[idx_fh_F_ord4(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord4(iF,jF,kF+1,ex)])+F70*fh[idx_fh_F_ord4(iF,jF,kF,ex)])/dZ; + f_rhs[p] -= (eps/cof)*(Dx+Dy+Dz); + }}} + } + } + free(fh); + return; + } +#elif (ghost_width == 5) + { + const int ord = 5; + int iminF = 1, jminF = 1, kminF = 1; + if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) kminF = -4; + if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) iminF = -4; + if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) jminF = -4; + + const size_t nx = (size_t)ex1 + ord; + const size_t ny = (size_t)ex2 + ord; + const size_t nz = (size_t)ex3 + ord; + double *fh = (double*)malloc(nx*ny*nz*sizeof(double)); + if (!fh) return; + symmetry_bd(ord, ex, f, fh, SoA); + + const double d840dx=ONE/F840/dX, d840dy=ONE/F840/dY, d840dz=ONE/F840/dZ; + const double d60dx=ONE/F60/dX, d60dy=ONE/F60/dY, d60dz=ONE/F60/dZ; + const double d12dx=ONE/F12/dX, d12dy=ONE/F12/dY, d12dz=ONE/F12/dZ; + const double d2dx=ONE/TWO/dX, d2dy=ONE/TWO/dY, d2dz=ONE/TWO/dZ; + + /* ---- advection (8th-order lopsided) ---- */ + for (int k0=0;k0<=ex3-2;++k0) { const int kF=k0+4; + for (int j0=0;j0<=ex2-2;++j0) { const int jF=j0+4; + for (int i0=0;i0<=ex1-2;++i0) { const int iF=i0+4; + const size_t p=idx_ex(i0,j0,k0,ex); + const double sfx=Sfx[p]; + if (sfx>ZEO) { + if (i0<=ex1-6&&(i0-2)>=iminF) f_rhs[p]+=sfx*d840dx*(-F5*fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+F60*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-F420*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F420*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]+F140*fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]-F30*fh[idx_fh_F_ord5(iF+4,jF,kF,ex)]+F3*fh[idx_fh_F_ord5(iF+5,jF,kF,ex)]); + else if (i0<=ex1-5&&(i0-3)>=iminF) f_rhs[p]+=sfx*d840dx*(+F3*fh[idx_fh_F_ord5(iF-4,jF,kF,ex)]-F32*fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+F168*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-F672*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+F672*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F168*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]+F32*fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]-F3*fh[idx_fh_F_ord5(iF+4,jF,kF,ex)]); + else if (i0<=ex1-4&&(i0-2)>=iminF) f_rhs[p]+=sfx*d60dx*(-fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+F9*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-F45*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+F45*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F9*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]); + else if (i0<=ex1-3&&(i0-1)>=iminF) f_rhs[p]+=sfx*d12dx*(fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+EIT*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]); + else if (i0<=ex1-2&&i0>=iminF) f_rhs[p]+=sfx*d2dx*(-fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]); + } else if (sfx=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d840dx*(-F5*fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]+F60*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]-F420*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]-F420*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]+F140*fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]-F30*fh[idx_fh_F_ord5(iF-4,jF,kF,ex)]+F3*fh[idx_fh_F_ord5(iF-5,jF,kF,ex)]); + else if ((i0-4)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d840dx*(+F3*fh[idx_fh_F_ord5(iF-4,jF,kF,ex)]-F32*fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+F168*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-F672*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+F672*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F168*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]+F32*fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]-F3*fh[idx_fh_F_ord5(iF+4,jF,kF,ex)]); + else if ((i0-3)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d60dx*(-fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+F9*fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-F45*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+F45*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-F9*fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+3,jF,kF,ex)]); + else if ((i0-2)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d12dx*(fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]-EIT*fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+EIT*fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]-fh[idx_fh_F_ord5(iF+2,jF,kF,ex)]); + else if ((i0-1)>=iminF&&i0<=ex1-2) f_rhs[p]-=sfx*d2dx*(-fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+1,jF,kF,ex)]); + } + const double sfy=Sfy[p]; + if (sfy>ZEO) { + if (j0<=ex2-6&&(j0-2)>=jminF) f_rhs[p]+=sfy*d840dy*(-F5*fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F60*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F420*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F420*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+F140*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]-F30*fh[idx_fh_F_ord5(iF,jF+4,kF,ex)]+F3*fh[idx_fh_F_ord5(iF,jF+5,kF,ex)]); + else if (j0<=ex2-5&&(j0-3)>=jminF) f_rhs[p]+=sfy*d840dy*(+F3*fh[idx_fh_F_ord5(iF,jF-4,kF,ex)]-F32*fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F168*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F672*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F672*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F168*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+F32*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]-F3*fh[idx_fh_F_ord5(iF,jF+4,kF,ex)]); + else if (j0<=ex2-4&&(j0-2)>=jminF) f_rhs[p]+=sfy*d60dy*(-fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]); + else if (j0<=ex2-3&&(j0-1)>=jminF) f_rhs[p]+=sfy*d12dy*(fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]); + else if (j0<=ex2-2&&j0>=jminF) f_rhs[p]+=sfy*d2dy*(-fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]); + } else if (sfy=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d840dy*(-F5*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]+F60*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]-F420*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]-F420*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]+F140*fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]-F30*fh[idx_fh_F_ord5(iF,jF-4,kF,ex)]+F3*fh[idx_fh_F_ord5(iF,jF-5,kF,ex)]); + else if ((j0-4)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d840dy*(+F3*fh[idx_fh_F_ord5(iF,jF-4,kF,ex)]-F32*fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F168*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F672*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F672*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F168*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+F32*fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]-F3*fh[idx_fh_F_ord5(iF,jF+4,kF,ex)]); + else if ((j0-3)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d60dy*(-fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+F9*fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-F45*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+F45*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-F9*fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+3,kF,ex)]); + else if ((j0-2)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d12dy*(fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]-fh[idx_fh_F_ord5(iF,jF+2,kF,ex)]); + else if ((j0-1)>=jminF&&j0<=ex2-2) f_rhs[p]-=sfy*d2dy*(-fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+1,kF,ex)]); + } + const double sfz=Sfz[p]; + if (sfz>ZEO) { + if (k0<=ex3-6&&(k0-2)>=kminF) f_rhs[p]+=sfz*d840dz*(-F5*fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F60*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F420*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F420*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+F140*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]-F30*fh[idx_fh_F_ord5(iF,jF,kF+4,ex)]+F3*fh[idx_fh_F_ord5(iF,jF,kF+5,ex)]); + else if (k0<=ex3-5&&(k0-3)>=kminF) f_rhs[p]+=sfz*d840dz*(+F3*fh[idx_fh_F_ord5(iF,jF,kF-4,ex)]-F32*fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F168*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F672*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F672*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F168*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+F32*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]-F3*fh[idx_fh_F_ord5(iF,jF,kF+4,ex)]); + else if (k0<=ex3-4&&(k0-2)>=kminF) f_rhs[p]+=sfz*d60dz*(-fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]); + else if (k0<=ex3-3&&(k0-1)>=kminF) f_rhs[p]+=sfz*d12dz*(fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]); + else if (k0<=ex3-2&&k0>=kminF) f_rhs[p]+=sfz*d2dz*(-fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]); + } else if (sfz=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d840dz*(-F5*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]+F60*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]-F420*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F378*fh[idx_fh_F_ord5(iF,jF,kF,ex)]+F1050*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]-F420*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]+F140*fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]-F30*fh[idx_fh_F_ord5(iF,jF,kF-4,ex)]+F3*fh[idx_fh_F_ord5(iF,jF,kF-5,ex)]); + else if ((k0-4)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d840dz*(+F3*fh[idx_fh_F_ord5(iF,jF,kF-4,ex)]-F32*fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F168*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F672*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F672*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F168*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+F32*fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]-F3*fh[idx_fh_F_ord5(iF,jF,kF+4,ex)]); + else if ((k0-3)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d60dz*(-fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+F9*fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-F45*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+F45*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-F9*fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+3,ex)]); + else if ((k0-2)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d12dz*(fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]-EIT*fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+EIT*fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]-fh[idx_fh_F_ord5(iF,jF,kF+2,ex)]); + else if ((k0-1)>=kminF&&k0<=ex3-2) f_rhs[p]-=sfz*d2dz*(-fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+1,ex)]); + } + }}} + + /* ---- KO dissipation (r=5, cof=1024, sign=+) ---- */ + if (eps > ZEO) { + const double cof = 1024.0; + const double F10k=10.0, F45k=45.0, F120=120.0, F210=210.0, F252=252.0; + const int i0_lo=(iminF+4>0)?iminF+4:0, j0_lo=(jminF+4>0)?jminF+4:0, k0_lo=(kminF+4>0)?kminF+4:0; + const int i0_hi=imaxF-6, j0_hi=jmaxF-6, k0_hi=kmaxF-6; + if (!(i0_lo>i0_hi||j0_lo>j0_hi||k0_lo>k0_hi)) { + for (int k0=k0_lo;k0<=k0_hi;++k0) { const int kF=k0+4; + for (int j0=j0_lo;j0<=j0_hi;++j0) { const int jF=j0+4; + for (int i0=i0_lo;i0<=i0_hi;++i0) { const int iF=i0+4; + const size_t p=idx_ex(i0,j0,k0,ex); + const double Dx=((fh[idx_fh_F_ord5(iF-5,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+5,jF,kF,ex)])-F10k*(fh[idx_fh_F_ord5(iF-4,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+4,jF,kF,ex)])+F45k*(fh[idx_fh_F_ord5(iF-3,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+3,jF,kF,ex)])-F120*(fh[idx_fh_F_ord5(iF-2,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+2,jF,kF,ex)])+F210*(fh[idx_fh_F_ord5(iF-1,jF,kF,ex)]+fh[idx_fh_F_ord5(iF+1,jF,kF,ex)])-F252*fh[idx_fh_F_ord5(iF,jF,kF,ex)])/dX; + const double Dy=((fh[idx_fh_F_ord5(iF,jF-5,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+5,kF,ex)])-F10k*(fh[idx_fh_F_ord5(iF,jF-4,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+4,kF,ex)])+F45k*(fh[idx_fh_F_ord5(iF,jF-3,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+3,kF,ex)])-F120*(fh[idx_fh_F_ord5(iF,jF-2,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+2,kF,ex)])+F210*(fh[idx_fh_F_ord5(iF,jF-1,kF,ex)]+fh[idx_fh_F_ord5(iF,jF+1,kF,ex)])-F252*fh[idx_fh_F_ord5(iF,jF,kF,ex)])/dY; + const double Dz=((fh[idx_fh_F_ord5(iF,jF,kF-5,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+5,ex)])-F10k*(fh[idx_fh_F_ord5(iF,jF,kF-4,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+4,ex)])+F45k*(fh[idx_fh_F_ord5(iF,jF,kF-3,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+3,ex)])-F120*(fh[idx_fh_F_ord5(iF,jF,kF-2,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+2,ex)])+F210*(fh[idx_fh_F_ord5(iF,jF,kF-1,ex)]+fh[idx_fh_F_ord5(iF,jF,kF+1,ex)])-F252*fh[idx_fh_F_ord5(iF,jF,kF,ex)])/dZ; + f_rhs[p] += (eps/cof)*(Dx+Dy+Dz); + }}} + } + } + free(fh); + return; + } +#else +#error "lopsided_kodis_c.C: unsupported ghost_width (must be 2, 3, 4, or 5)" +#endif } diff --git a/AMSS_NCKU_source/share_func.h b/AMSS_NCKU_source/share_func.h index 2b509b8..0fe2630 100644 --- a/AMSS_NCKU_source/share_func.h +++ b/AMSS_NCKU_source/share_func.h @@ -46,6 +46,45 @@ static inline size_t idx_fh_F(int iF, int jF, int kF, const int ex[3]) { return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny; } +/* + * fh 对应 Fortran: fh(0:ex1, 0:ex2, 0:ex3) + * ord=1 => shift=0 + * iF/jF/kF 为 Fortran 索引 (0..ex) + */ +static inline size_t idx_fh_F_ord1(int iF, int jF, int kF, const int ex[3]) { + const int nx = ex[0] + 1; // ex1 + ord + const int ny = ex[1] + 1; + return (size_t)iF + (size_t)jF * (size_t)nx + (size_t)kF * (size_t)nx * (size_t)ny; +} + +/* + * fh 对应 Fortran: fh(-3:ex1, -3:ex2, -3:ex3) + * ord=4 => shift=3 + */ +static inline size_t idx_fh_F_ord4(int iF, int jF, int kF, const int ex[3]) { + const int shift = 3; + const int nx = ex[0] + 4; // ex1 + ord + const int ny = ex[1] + 4; + const int ii = iF + shift; // 0..ex1+3 + const int jj = jF + shift; // 0..ex2+3 + const int kk = kF + shift; // 0..ex3+3 + return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny; +} + +/* + * fh 对应 Fortran: fh(-4:ex1, -4:ex2, -4:ex3) + * ord=5 => shift=4 + */ +static inline size_t idx_fh_F_ord5(int iF, int jF, int kF, const int ex[3]) { + const int shift = 4; + const int nx = ex[0] + 5; // ex1 + ord + const int ny = ex[1] + 5; + const int ii = iF + shift; // 0..ex1+4 + const int jj = jF + shift; // 0..ex2+4 + const int kk = kF + shift; // 0..ex3+4 + return (size_t)ii + (size_t)jj * (size_t)nx + (size_t)kk * (size_t)nx * (size_t)ny; +} + /* * func: (1..extc1, 1..extc2, 1..extc3) 1-based in Fortran * funcc: (-ord+1..extc1, -ord+1..extc2, -ord+1..extc3) in Fortran @@ -231,7 +270,10 @@ static inline void symmetry_bd(int ord, { if (ord <= 0) return; - /* Fast paths used by current C kernels: ord=2 (derivs), ord=3 (lopsided/KO). */ + if (ord == 1) { + symmetry_bd_impl(1, 0, extc, func, funcc, SoA); + return; + } if (ord == 2) { symmetry_bd_impl(2, 1, extc, func, funcc, SoA); return; @@ -240,6 +282,10 @@ static inline void symmetry_bd(int ord, symmetry_bd_impl(3, 2, extc, func, funcc, SoA); return; } + if (ord == 4) { + symmetry_bd_impl(4, 3, extc, func, funcc, SoA); + return; + } symmetry_bd_impl(ord, ord - 1, extc, func, funcc, SoA); }