diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index e1ec962..a090775 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -12,6 +12,7 @@ #include #include #include +#include #include #include "macrodef.h" #include "bssn_rhs.h" @@ -196,6 +197,33 @@ __device__ __forceinline__ int idx_fh3(int iF, int jF, int kF) { return (iF + 2) + (jF + 2) * d_gp.fh3_nx + (kF + 2) * d_gp.fh3_nx * d_gp.fh3_ny; } +__device__ __forceinline__ double fetch_sym_ord3_direct(const double *src, + int iF, int jF, int kF, + int SoA0, int SoA1, int SoA2) +{ + int siF = iF; + int sjF = jF; + int skF = kF; + double sign = 1.0; + + if (iF <= 0) { + siF = 1 - iF; + sign *= (double)SoA0; + } + if (jF <= 0) { + sjF = 1 - jF; + sign *= (double)SoA1; + } + if (kF <= 0) { + skF = 1 - kF; + sign *= (double)SoA2; + } + + return sign * src[(siF - 1) + + (sjF - 1) * d_gp.ex[0] + + (skF - 1) * d_gp.ex[0] * d_gp.ex[1]]; +} + /* ------------------------------------------------------------------ */ /* GPU buffer management */ /* ------------------------------------------------------------------ */ @@ -290,6 +318,7 @@ static const int STAGE_SLOT_COUNT = static constexpr int BSSN_STATE_COUNT = 24; static constexpr int BSSN_MATTER_COUNT = 10; +static constexpr int BSSN_LK_FIELD_COUNT = 24; static const int k_state_input_slots[BSSN_STATE_COUNT] = { S_chi, S_trK, S_dxx, S_gxy, S_gxz, S_dyy, S_gyz, S_dzz, @@ -312,6 +341,52 @@ static const int k_matter_slots[BSSN_MATTER_COUNT] = { S_rho, S_Sx, S_Sy, S_Sz, S_Sxx, S_Sxy, S_Sxz, S_Syy, S_Syz, S_Szz }; +static const int k_lk_adv_slots[BSSN_LK_FIELD_COUNT] = { + S_gxx, S_Gamz, S_gxy, S_Lap, S_gxz, S_betax, S_gyy, S_betay, + S_gyz, S_betaz, S_gzz, S_dtSfx, S_Axx, S_dtSfy, S_Axy, S_dtSfz, + S_Axz, S_Ayy, S_Ayz, S_Azz, S_chi, S_trK, S_Gamx, S_Gamy +}; + +static const int k_lk_ko_slots[BSSN_LK_FIELD_COUNT] = { + S_dxx, S_Gamz, S_gxy, S_Lap, S_gxz, S_betax, S_dyy, S_betay, + S_gyz, S_betaz, S_dzz, S_dtSfx, S_Axx, S_dtSfy, S_Axy, S_dtSfz, + S_Axz, S_Ayy, S_Ayz, S_Azz, S_chi, S_trK, S_Gamx, S_Gamy +}; + +static const int k_lk_rhs_slots[BSSN_LK_FIELD_COUNT] = { + S_gxx_rhs, S_Gamz_rhs, S_gxy_rhs, S_Lap_rhs, S_gxz_rhs, S_betax_rhs, + S_gyy_rhs, S_betay_rhs, S_gyz_rhs, S_betaz_rhs, S_gzz_rhs, S_dtSfx_rhs, + S_Axx_rhs, S_dtSfy_rhs, S_Axy_rhs, S_dtSfz_rhs, S_Axz_rhs, S_Ayy_rhs, + S_Ayz_rhs, S_Azz_rhs, S_chi_rhs, S_trK_rhs, S_Gamx_rhs, S_Gamy_rhs +}; + +static const int k_lk_soa_signs[3 * BSSN_LK_FIELD_COUNT] = { + 1, 1, 1, + 1, 1, -1, + -1, -1, 1, + 1, 1, 1, + -1, 1, -1, + -1, 1, 1, + 1, 1, 1, + 1, -1, 1, + 1, -1, -1, + 1, 1, -1, + 1, 1, 1, + -1, 1, 1, + 1, 1, 1, + 1, -1, 1, + -1, -1, 1, + 1, 1, -1, + -1, 1, -1, + 1, 1, 1, + 1, -1, -1, + 1, 1, 1, + 1, 1, 1, + 1, 1, 1, + -1, 1, 1, + 1, -1, 1 +}; + struct StepContext { double *d_state0_mem; double *d_accum_mem; @@ -326,9 +401,101 @@ struct StepContext { size_t cap_all; bool matter_ready; bool state_ready; + + StepContext() + : d_state0_mem(nullptr), d_accum_mem(nullptr), + d_state_curr_mem(nullptr), d_state_next_mem(nullptr), + d_matter_mem(nullptr), cap_all(0), + matter_ready(false), state_ready(false) + { + d_state0.fill(nullptr); + d_accum.fill(nullptr); + d_state_curr.fill(nullptr); + d_state_next.fill(nullptr); + d_matter.fill(nullptr); + } +}; + +struct StepAllocation { + double *d_state0_mem; + double *d_accum_mem; + double *d_state_curr_mem; + double *d_state_next_mem; + double *d_matter_mem; + size_t cap_all; }; static std::unordered_map g_step_ctx; +static std::vector g_step_pool; + +static StepAllocation empty_step_allocation() +{ + StepAllocation alloc = {nullptr, nullptr, nullptr, nullptr, nullptr, 0}; + return alloc; +} + +static bool has_step_allocation(const StepAllocation &alloc) +{ + return alloc.cap_all != 0; +} + +static StepAllocation detach_step_allocation(StepContext &ctx) +{ + StepAllocation alloc = { + ctx.d_state0_mem, ctx.d_accum_mem, ctx.d_state_curr_mem, + ctx.d_state_next_mem, ctx.d_matter_mem, ctx.cap_all + }; + ctx.d_state0_mem = nullptr; + ctx.d_accum_mem = nullptr; + ctx.d_state_curr_mem = nullptr; + ctx.d_state_next_mem = nullptr; + ctx.d_matter_mem = nullptr; + ctx.cap_all = 0; + ctx.matter_ready = false; + ctx.state_ready = false; + ctx.d_state0.fill(nullptr); + ctx.d_accum.fill(nullptr); + ctx.d_state_curr.fill(nullptr); + ctx.d_state_next.fill(nullptr); + ctx.d_matter.fill(nullptr); + return alloc; +} + +static void attach_step_allocation(StepContext &ctx, const StepAllocation &alloc) +{ + ctx.d_state0_mem = alloc.d_state0_mem; + ctx.d_accum_mem = alloc.d_accum_mem; + ctx.d_state_curr_mem = alloc.d_state_curr_mem; + ctx.d_state_next_mem = alloc.d_state_next_mem; + ctx.d_matter_mem = alloc.d_matter_mem; + ctx.cap_all = alloc.cap_all; + ctx.matter_ready = false; + ctx.state_ready = false; +} + +static void recycle_step_allocation(StepAllocation &alloc) +{ + if (!has_step_allocation(alloc)) return; + g_step_pool.push_back(alloc); + alloc = empty_step_allocation(); +} + +static StepAllocation acquire_step_allocation(size_t all) +{ + size_t best = g_step_pool.size(); + for (size_t i = 0; i < g_step_pool.size(); ++i) { + if (g_step_pool[i].cap_all < all) continue; + if (best == g_step_pool.size() || g_step_pool[i].cap_all < g_step_pool[best].cap_all) + best = i; + } + if (best == g_step_pool.size()) + return empty_step_allocation(); + + StepAllocation alloc = g_step_pool[best]; + g_step_pool[best] = g_step_pool.back(); + g_step_pool.pop_back(); + return alloc; +} static void ensure_gpu_buffers(int nx, int ny, int nz) { size_t all = (size_t)nx * ny * nz; @@ -385,34 +552,19 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all) { StepContext &ctx = g_step_ctx[block_tag]; if (ctx.cap_all < all) { - if (ctx.d_state0_mem) { - cudaFree(ctx.d_state0_mem); - ctx.d_state0_mem = nullptr; + StepAllocation old_alloc = detach_step_allocation(ctx); + recycle_step_allocation(old_alloc); + + StepAllocation alloc = acquire_step_allocation(all); + if (!has_step_allocation(alloc)) { + CUDA_CHECK(cudaMalloc(&alloc.d_state0_mem, BSSN_STATE_COUNT * all * sizeof(double))); + CUDA_CHECK(cudaMalloc(&alloc.d_accum_mem, BSSN_STATE_COUNT * all * sizeof(double))); + CUDA_CHECK(cudaMalloc(&alloc.d_state_curr_mem, BSSN_STATE_COUNT * all * sizeof(double))); + CUDA_CHECK(cudaMalloc(&alloc.d_state_next_mem, BSSN_STATE_COUNT * all * sizeof(double))); + CUDA_CHECK(cudaMalloc(&alloc.d_matter_mem, BSSN_MATTER_COUNT * all * sizeof(double))); + alloc.cap_all = all; } - if (ctx.d_accum_mem) { - cudaFree(ctx.d_accum_mem); - ctx.d_accum_mem = nullptr; - } - if (ctx.d_state_curr_mem) { - cudaFree(ctx.d_state_curr_mem); - ctx.d_state_curr_mem = nullptr; - } - if (ctx.d_state_next_mem) { - cudaFree(ctx.d_state_next_mem); - ctx.d_state_next_mem = nullptr; - } - if (ctx.d_matter_mem) { - cudaFree(ctx.d_matter_mem); - ctx.d_matter_mem = nullptr; - } - CUDA_CHECK(cudaMalloc(&ctx.d_state0_mem, BSSN_STATE_COUNT * all * sizeof(double))); - CUDA_CHECK(cudaMalloc(&ctx.d_accum_mem, BSSN_STATE_COUNT * all * sizeof(double))); - CUDA_CHECK(cudaMalloc(&ctx.d_state_curr_mem, BSSN_STATE_COUNT * all * sizeof(double))); - CUDA_CHECK(cudaMalloc(&ctx.d_state_next_mem, BSSN_STATE_COUNT * all * sizeof(double))); - CUDA_CHECK(cudaMalloc(&ctx.d_matter_mem, BSSN_MATTER_COUNT * all * sizeof(double))); - ctx.cap_all = all; - ctx.matter_ready = false; - ctx.state_ready = false; + attach_step_allocation(ctx, alloc); } for (int i = 0; i < BSSN_STATE_COUNT; ++i) { ctx.d_state0[i] = ctx.d_state0_mem + (size_t)i * all; @@ -430,11 +582,8 @@ static void release_step_ctx(void *block_tag) { auto it = g_step_ctx.find(block_tag); if (it == g_step_ctx.end()) return; - if (it->second.d_state0_mem) cudaFree(it->second.d_state0_mem); - if (it->second.d_accum_mem) cudaFree(it->second.d_accum_mem); - if (it->second.d_state_curr_mem) cudaFree(it->second.d_state_curr_mem); - if (it->second.d_state_next_mem) cudaFree(it->second.d_state_next_mem); - if (it->second.d_matter_mem) cudaFree(it->second.d_matter_mem); + StepAllocation alloc = detach_step_allocation(it->second); + recycle_step_allocation(alloc); g_step_ctx.erase(it); } @@ -1042,6 +1191,15 @@ void kern_kodis(const double * __restrict__ fh, /* ================================================================== */ /* Host wrapper helpers */ /* ================================================================== */ +__constant__ const double *d_lk_adv_fields[BSSN_LK_FIELD_COUNT]; +__constant__ const double *d_lk_ko_fields[BSSN_LK_FIELD_COUNT]; +__constant__ double *d_lk_rhs_fields[BSSN_LK_FIELD_COUNT]; +__constant__ int d_lk_soa_signs[3 * BSSN_LK_FIELD_COUNT]; + +__constant__ const double *d_rk4_f0_fields[BSSN_STATE_COUNT]; +__constant__ double *d_rk4_rhs_fields[BSSN_STATE_COUNT]; +__constant__ double *d_rk4_accum_fields[BSSN_STATE_COUNT]; + static const int BLK = 128; static inline int grid(size_t n) { if (n == 0) return 1; @@ -1106,6 +1264,239 @@ static void gpu_lopsided_kodis(double *d_f_adv, double *d_f_ko, double *d_f_rhs, } } +__global__ __launch_bounds__(128, 4) +void kern_lopsided_kodis_batched(const double * __restrict__ Sfx, + const double * __restrict__ Sfy, + const double * __restrict__ Sfz, + double eps_val) +{ + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= d_gp.all) return; + + const int field = blockIdx.y; + const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2]; + const int iminF = d_gp.iminF3, jminF = d_gp.jminF3, kminF = d_gp.kminF3; + const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF; + const int SoA0 = d_lk_soa_signs[3 * field + 0]; + const int SoA1 = d_lk_soa_signs[3 * field + 1]; + const int SoA2 = d_lk_soa_signs[3 * field + 2]; + const double *adv_src = d_lk_adv_fields[field]; + const double *ko_src = d_lk_ko_fields[field]; + double *rhs = d_lk_rhs_fields[field]; + + const int i0 = tid % nx; + const int j0 = (tid / nx) % ny; + const int k0 = tid / (nx * ny); + const int iF = i0 + 1; + const int jF = j0 + 1; + const int kF = k0 + 1; + + if (i0 <= nx - 2 && j0 <= ny - 2 && k0 <= nz - 2) { + double val = 0.0; + + const double sfx = Sfx[tid]; + if (sfx > 0.0) { + if (i0 <= nx - 4) { + val += sfx * d_gp.d12dx * ( + -3.0 * fetch_sym_ord3_direct(adv_src, iF - 1, jF, kF, SoA0, SoA1, SoA2) + -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2) + +18.0 * fetch_sym_ord3_direct(adv_src, iF + 1, jF, kF, SoA0, SoA1, SoA2) + - 6.0 * fetch_sym_ord3_direct(adv_src, iF + 2, jF, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(adv_src, iF + 3, jF, kF, SoA0, SoA1, SoA2)); + } else if (i0 <= nx - 3) { + val += sfx * d_gp.d12dx * ( + fetch_sym_ord3_direct(adv_src, iF - 2, jF, kF, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord3_direct(adv_src, iF - 1, jF, kF, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord3_direct(adv_src, iF + 1, jF, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord3_direct(adv_src, iF + 2, jF, kF, SoA0, SoA1, SoA2)); + } else { + val -= sfx * d_gp.d12dx * ( + -3.0 * fetch_sym_ord3_direct(adv_src, iF + 1, jF, kF, SoA0, SoA1, SoA2) + -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2) + +18.0 * fetch_sym_ord3_direct(adv_src, iF - 1, jF, kF, SoA0, SoA1, SoA2) + - 6.0 * fetch_sym_ord3_direct(adv_src, iF - 2, jF, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(adv_src, iF - 3, jF, kF, SoA0, SoA1, SoA2)); + } + } else if (sfx < 0.0) { + if ((i0 - 2) >= iminF) { + val -= sfx * d_gp.d12dx * ( + -3.0 * fetch_sym_ord3_direct(adv_src, iF + 1, jF, kF, SoA0, SoA1, SoA2) + -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2) + +18.0 * fetch_sym_ord3_direct(adv_src, iF - 1, jF, kF, SoA0, SoA1, SoA2) + - 6.0 * fetch_sym_ord3_direct(adv_src, iF - 2, jF, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(adv_src, iF - 3, jF, kF, SoA0, SoA1, SoA2)); + } else if ((i0 - 1) >= iminF) { + val += sfx * d_gp.d12dx * ( + fetch_sym_ord3_direct(adv_src, iF - 2, jF, kF, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord3_direct(adv_src, iF - 1, jF, kF, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord3_direct(adv_src, iF + 1, jF, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord3_direct(adv_src, iF + 2, jF, kF, SoA0, SoA1, SoA2)); + } else if (i0 >= iminF) { + val += sfx * d_gp.d12dx * ( + -3.0 * fetch_sym_ord3_direct(adv_src, iF - 1, jF, kF, SoA0, SoA1, SoA2) + -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2) + +18.0 * fetch_sym_ord3_direct(adv_src, iF + 1, jF, kF, SoA0, SoA1, SoA2) + - 6.0 * fetch_sym_ord3_direct(adv_src, iF + 2, jF, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(adv_src, iF + 3, jF, kF, SoA0, SoA1, SoA2)); + } + } + + const double sfy = Sfy[tid]; + if (sfy > 0.0) { + if (j0 <= ny - 4) { + val += sfy * d_gp.d12dy * ( + -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 1, kF, SoA0, SoA1, SoA2) + -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2) + +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 1, kF, SoA0, SoA1, SoA2) + - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 2, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(adv_src, iF, jF + 3, kF, SoA0, SoA1, SoA2)); + } else if (j0 <= ny - 3) { + val += sfy * d_gp.d12dy * ( + fetch_sym_ord3_direct(adv_src, iF, jF - 2, kF, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 1, kF, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 1, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord3_direct(adv_src, iF, jF + 2, kF, SoA0, SoA1, SoA2)); + } else { + val -= sfy * d_gp.d12dy * ( + -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 1, kF, SoA0, SoA1, SoA2) + -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2) + +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 1, kF, SoA0, SoA1, SoA2) + - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 2, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(adv_src, iF, jF - 3, kF, SoA0, SoA1, SoA2)); + } + } else if (sfy < 0.0) { + if ((j0 - 2) >= jminF) { + val -= sfy * d_gp.d12dy * ( + -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 1, kF, SoA0, SoA1, SoA2) + -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2) + +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 1, kF, SoA0, SoA1, SoA2) + - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 2, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(adv_src, iF, jF - 3, kF, SoA0, SoA1, SoA2)); + } else if ((j0 - 1) >= jminF) { + val += sfy * d_gp.d12dy * ( + fetch_sym_ord3_direct(adv_src, iF, jF - 2, kF, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 1, kF, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 1, kF, SoA0, SoA1, SoA2) + - fetch_sym_ord3_direct(adv_src, iF, jF + 2, kF, SoA0, SoA1, SoA2)); + } else if (j0 >= jminF) { + val += sfy * d_gp.d12dy * ( + -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF - 1, kF, SoA0, SoA1, SoA2) + -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2) + +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 1, kF, SoA0, SoA1, SoA2) + - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF + 2, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(adv_src, iF, jF + 3, kF, SoA0, SoA1, SoA2)); + } + } + + const double sfz = Sfz[tid]; + if (sfz > 0.0) { + if (k0 <= nz - 4) { + val += sfz * d_gp.d12dz * ( + -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 1, SoA0, SoA1, SoA2) + -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2) + +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 1, SoA0, SoA1, SoA2) + - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 2, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(adv_src, iF, jF, kF + 3, SoA0, SoA1, SoA2)); + } else if (k0 <= nz - 3) { + val += sfz * d_gp.d12dz * ( + fetch_sym_ord3_direct(adv_src, iF, jF, kF - 2, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 1, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 1, SoA0, SoA1, SoA2) + - fetch_sym_ord3_direct(adv_src, iF, jF, kF + 2, SoA0, SoA1, SoA2)); + } else { + val -= sfz * d_gp.d12dz * ( + -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 1, SoA0, SoA1, SoA2) + -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2) + +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 1, SoA0, SoA1, SoA2) + - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 2, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(adv_src, iF, jF, kF - 3, SoA0, SoA1, SoA2)); + } + } else if (sfz < 0.0) { + if ((k0 - 2) >= kminF) { + val -= sfz * d_gp.d12dz * ( + -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 1, SoA0, SoA1, SoA2) + -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2) + +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 1, SoA0, SoA1, SoA2) + - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 2, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(adv_src, iF, jF, kF - 3, SoA0, SoA1, SoA2)); + } else if ((k0 - 1) >= kminF) { + val += sfz * d_gp.d12dz * ( + fetch_sym_ord3_direct(adv_src, iF, jF, kF - 2, SoA0, SoA1, SoA2) + - 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 1, SoA0, SoA1, SoA2) + + 8.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 1, SoA0, SoA1, SoA2) + - fetch_sym_ord3_direct(adv_src, iF, jF, kF + 2, SoA0, SoA1, SoA2)); + } else if (k0 >= kminF) { + val += sfz * d_gp.d12dz * ( + -3.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF - 1, SoA0, SoA1, SoA2) + -10.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF, SoA0, SoA1, SoA2) + +18.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 1, SoA0, SoA1, SoA2) + - 6.0 * fetch_sym_ord3_direct(adv_src, iF, jF, kF + 2, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(adv_src, iF, jF, kF + 3, SoA0, SoA1, SoA2)); + } + } + + rhs[tid] += val; + } + + if (eps_val > 0.0 && + (iF - 3) >= iminF && (iF + 3) <= imaxF && + (jF - 3) >= jminF && (jF + 3) <= jmaxF && + (kF - 3) >= kminF && (kF + 3) <= kmaxF) + { + const double cof = 64.0; + const double Dx = + (fetch_sym_ord3_direct(ko_src, iF - 3, jF, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(ko_src, iF + 3, jF, kF, SoA0, SoA1, SoA2)) + - 6.0 * (fetch_sym_ord3_direct(ko_src, iF - 2, jF, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(ko_src, iF + 2, jF, kF, SoA0, SoA1, SoA2)) + + 15.0 * (fetch_sym_ord3_direct(ko_src, iF - 1, jF, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(ko_src, iF + 1, jF, kF, SoA0, SoA1, SoA2)) + - 20.0 * fetch_sym_ord3_direct(ko_src, iF, jF, kF, SoA0, SoA1, SoA2); + + const double Dy = + (fetch_sym_ord3_direct(ko_src, iF, jF - 3, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(ko_src, iF, jF + 3, kF, SoA0, SoA1, SoA2)) + - 6.0 * (fetch_sym_ord3_direct(ko_src, iF, jF - 2, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(ko_src, iF, jF + 2, kF, SoA0, SoA1, SoA2)) + + 15.0 * (fetch_sym_ord3_direct(ko_src, iF, jF - 1, kF, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(ko_src, iF, jF + 1, kF, SoA0, SoA1, SoA2)) + - 20.0 * fetch_sym_ord3_direct(ko_src, iF, jF, kF, SoA0, SoA1, SoA2); + + const double Dz = + (fetch_sym_ord3_direct(ko_src, iF, jF, kF - 3, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(ko_src, iF, jF, kF + 3, SoA0, SoA1, SoA2)) + - 6.0 * (fetch_sym_ord3_direct(ko_src, iF, jF, kF - 2, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(ko_src, iF, jF, kF + 2, SoA0, SoA1, SoA2)) + + 15.0 * (fetch_sym_ord3_direct(ko_src, iF, jF, kF - 1, SoA0, SoA1, SoA2) + + fetch_sym_ord3_direct(ko_src, iF, jF, kF + 1, SoA0, SoA1, SoA2)) + - 20.0 * fetch_sym_ord3_direct(ko_src, iF, jF, kF, SoA0, SoA1, SoA2); + + rhs[tid] += (eps_val / cof) * (Dx / d_gp.dX + Dy / d_gp.dY + Dz / d_gp.dZ); + } +} + +static void gpu_lopsided_kodis_state_batch(double eps_val, int all) +{ + const double *adv_fields[BSSN_LK_FIELD_COUNT]; + const double *ko_fields[BSSN_LK_FIELD_COUNT]; + double *rhs_fields[BSSN_LK_FIELD_COUNT]; + + for (int i = 0; i < BSSN_LK_FIELD_COUNT; ++i) { + adv_fields[i] = g_buf.slot[k_lk_adv_slots[i]]; + ko_fields[i] = g_buf.slot[k_lk_ko_slots[i]]; + rhs_fields[i] = g_buf.slot[k_lk_rhs_slots[i]]; + } + + CUDA_CHECK(cudaMemcpyToSymbol(d_lk_adv_fields, adv_fields, sizeof(adv_fields))); + CUDA_CHECK(cudaMemcpyToSymbol(d_lk_ko_fields, ko_fields, sizeof(ko_fields))); + CUDA_CHECK(cudaMemcpyToSymbol(d_lk_rhs_fields, rhs_fields, sizeof(rhs_fields))); + CUDA_CHECK(cudaMemcpyToSymbol(d_lk_soa_signs, k_lk_soa_signs, sizeof(k_lk_soa_signs))); + + dim3 launch_grid((unsigned int)grid((size_t)all), (unsigned int)BSSN_LK_FIELD_COUNT); + kern_lopsided_kodis_batched<<>>( + g_buf.slot[S_betax], g_buf.slot[S_betay], g_buf.slot[S_betaz], eps_val); +} + __global__ void kern_rk4_finalize(const double * __restrict__ f0, double * __restrict__ frhs, double * __restrict__ accum, @@ -1137,6 +1528,63 @@ __global__ void kern_rk4_finalize(const double * __restrict__ f0, } } +__global__ __launch_bounds__(128, 4) +void kern_rk4_finalize_batched(double dT, int rk4_stage, double chitiny) +{ + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= d_gp.all) return; + + const int field = blockIdx.y; + const double *f0 = d_rk4_f0_fields[field]; + double *frhs = d_rk4_rhs_fields[field]; + double *accum = d_rk4_accum_fields[field]; + + const double rhs = frhs[tid]; + switch (rk4_stage) { + case 0: + accum[tid] = rhs; + frhs[tid] = f0[tid] + 0.5 * dT * rhs; + break; + case 1: + accum[tid] += 2.0 * rhs; + frhs[tid] = f0[tid] + 0.5 * dT * rhs; + break; + case 2: + accum[tid] += 2.0 * rhs; + frhs[tid] = f0[tid] + dT * rhs; + break; + default: + frhs[tid] = f0[tid] + (dT / 6.0) * (accum[tid] + rhs); + break; + } + + if (field == 0 && frhs[tid] < chitiny) frhs[tid] = chitiny; +} + +static void gpu_rk4_finalize_batch(const StepContext &ctx, + size_t all, + double dT, + int rk4_stage, + double chitiny) +{ + const double *f0_fields[BSSN_STATE_COUNT]; + double *rhs_fields[BSSN_STATE_COUNT]; + double *accum_fields[BSSN_STATE_COUNT]; + + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + f0_fields[i] = ctx.d_state0[i]; + rhs_fields[i] = g_buf.slot[k_state_rhs_slots[i]]; + accum_fields[i] = ctx.d_accum[i]; + } + + CUDA_CHECK(cudaMemcpyToSymbol(d_rk4_f0_fields, f0_fields, sizeof(f0_fields))); + CUDA_CHECK(cudaMemcpyToSymbol(d_rk4_rhs_fields, rhs_fields, sizeof(rhs_fields))); + CUDA_CHECK(cudaMemcpyToSymbol(d_rk4_accum_fields, accum_fields, sizeof(accum_fields))); + + dim3 launch_grid((unsigned int)grid(all), (unsigned int)BSSN_STATE_COUNT); + kern_rk4_finalize_batched<<>>(dT, rk4_stage, chitiny); +} + __global__ void kern_enforce_ga_cuda(double * __restrict__ dxx, double * __restrict__ gxy, double * __restrict__ gxz, @@ -2862,30 +3310,7 @@ static void launch_rhs_pipeline(int all, double eps, int co) D(S_Gamx_rhs), D(S_Gamy_rhs), D(S_Gamz_rhs), D(S_f_arr), D(S_S_arr)); - gpu_lopsided_kodis(D(S_gxx), D(S_dxx), D(S_gxx_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_Gamz), D(S_Gamz), D(S_Gamz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, eps, all); - gpu_lopsided_kodis(D(S_gxy), D(S_gxy), D(S_gxy_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,ANTI,SYM, eps, all); - gpu_lopsided_kodis(D(S_Lap), D(S_Lap), D(S_Lap_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_gxz), D(S_gxz), D(S_gxz_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,ANTI, eps, all); - gpu_lopsided_kodis(D(S_betax), D(S_betax), D(S_betax_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_gyy), D(S_dyy), D(S_gyy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_betay), D(S_betay), D(S_betay_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, eps, all); - gpu_lopsided_kodis(D(S_gyz), D(S_gyz), D(S_gyz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,ANTI, eps, all); - gpu_lopsided_kodis(D(S_betaz), D(S_betaz), D(S_betaz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, eps, all); - gpu_lopsided_kodis(D(S_gzz), D(S_dzz), D(S_gzz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_dtSfx), D(S_dtSfx), D(S_dtSfx_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_Axx), D(S_Axx), D(S_Axx_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_dtSfy), D(S_dtSfy), D(S_dtSfy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, eps, all); - gpu_lopsided_kodis(D(S_Axy), D(S_Axy), D(S_Axy_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,ANTI,SYM, eps, all); - gpu_lopsided_kodis(D(S_dtSfz), D(S_dtSfz), D(S_dtSfz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, eps, all); - gpu_lopsided_kodis(D(S_Axz), D(S_Axz), D(S_Axz_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,ANTI, eps, all); - gpu_lopsided_kodis(D(S_Ayy), D(S_Ayy), D(S_Ayy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_Ayz), D(S_Ayz), D(S_Ayz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,ANTI, eps, all); - gpu_lopsided_kodis(D(S_Azz), D(S_Azz), D(S_Azz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_chi), D(S_chi), D(S_chi_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_trK), D(S_trK), D(S_trK_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_Gamx), D(S_Gamx), D(S_Gamx_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_Gamy), D(S_Gamy), D(S_Gamy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, eps, all); + gpu_lopsided_kodis_state_batch(eps, all); if (co == 0) { gpu_fderivs(D(S_Axx), D(S_gxxx),D(S_gxxy),D(S_gxxz), SYM,SYM,SYM, all); @@ -3395,30 +3820,7 @@ int f_compute_rhs_bssn(int *ex, double &T, /* ============================================================ */ /* Phase 16/17: advection + KO dissipation (shared ord=3 pack) */ /* ============================================================ */ - gpu_lopsided_kodis(D(S_gxx), D(S_dxx), D(S_gxx_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_Gamz), D(S_Gamz), D(S_Gamz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, eps, all); - gpu_lopsided_kodis(D(S_gxy), D(S_gxy), D(S_gxy_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,ANTI,SYM, eps, all); - gpu_lopsided_kodis(D(S_Lap), D(S_Lap), D(S_Lap_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_gxz), D(S_gxz), D(S_gxz_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,ANTI, eps, all); - gpu_lopsided_kodis(D(S_betax), D(S_betax), D(S_betax_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_gyy), D(S_dyy), D(S_gyy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_betay), D(S_betay), D(S_betay_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, eps, all); - gpu_lopsided_kodis(D(S_gyz), D(S_gyz), D(S_gyz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,ANTI, eps, all); - gpu_lopsided_kodis(D(S_betaz), D(S_betaz), D(S_betaz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, eps, all); - gpu_lopsided_kodis(D(S_gzz), D(S_dzz), D(S_gzz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_dtSfx), D(S_dtSfx), D(S_dtSfx_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_Axx), D(S_Axx), D(S_Axx_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_dtSfy), D(S_dtSfy), D(S_dtSfy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, eps, all); - gpu_lopsided_kodis(D(S_Axy), D(S_Axy), D(S_Axy_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,ANTI,SYM, eps, all); - gpu_lopsided_kodis(D(S_dtSfz), D(S_dtSfz), D(S_dtSfz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,ANTI, eps, all); - gpu_lopsided_kodis(D(S_Axz), D(S_Axz), D(S_Axz_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,ANTI, eps, all); - gpu_lopsided_kodis(D(S_Ayy), D(S_Ayy), D(S_Ayy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_Ayz), D(S_Ayz), D(S_Ayz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,ANTI, eps, all); - gpu_lopsided_kodis(D(S_Azz), D(S_Azz), D(S_Azz_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_chi), D(S_chi), D(S_chi_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_trK), D(S_trK), D(S_trK_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_Gamx), D(S_Gamx), D(S_Gamx_rhs), D(S_betax),D(S_betay),D(S_betaz), ANTI,SYM,SYM, eps, all); - gpu_lopsided_kodis(D(S_Gamy), D(S_Gamy), D(S_Gamy_rhs), D(S_betax),D(S_betay),D(S_betaz), SYM,ANTI,SYM, eps, all); + gpu_lopsided_kodis_state_batch(eps, all); /* ============================================================ */ /* Phase 18: Hamilton & momentum constraints (co==0) */ @@ -3604,15 +4006,7 @@ int bssn_cuda_rk4_substep(void *block_tag, } t0 = profile ? cuda_profile_now_ms() : 0.0; - for (int i = 0; i < BSSN_STATE_COUNT; ++i) { - kern_rk4_finalize<<>>(ctx.d_state0[i], - g_buf.slot[k_state_rhs_slots[i]], - ctx.d_accum[i], - dT, - RK4); - } - - kern_lowerboundset_cuda<<>>(g_buf.slot[S_chi_rhs], chitiny); + gpu_rk4_finalize_batch(ctx, all, dT, RK4, chitiny); if (profile) { cuda_profile_sync(); finalize_ms += cuda_profile_now_ms() - t0;