10382 lines
464 KiB
Plaintext
10382 lines
464 KiB
Plaintext
/*
 * bssn_rhs_cuda.cu — GPU implementation of f_compute_rhs_bssn
 *
 * Drop-in replacement for bssn_rhs_c.C.
 * Compile with nvcc, link bssn_rhs_cuda.o in place of bssn_rhs_c.o.
 */
|
|
|
|
#include <array>
|
|
#include <chrono>
|
|
#include <cstdio>
|
|
#include <cstdlib>
|
|
#include <cmath>
|
|
#include <cstring>
|
|
#include <unordered_map>
|
|
#include <vector>
|
|
#include <cuda_runtime.h>
|
|
#include "macrodef.h"
|
|
#include "bssn_rhs.h"
|
|
|
|
/*
 * C-linkage prototypes for the externally defined set_escalar_parameter
 * routine. Each build-time switch (fortran1 / fortran2 / fortran3) enables one
 * spelling of the symbol: lowercase, uppercase, or lowercase with a trailing
 * underscore. Presumably these follow the Fortran compiler's name mangling --
 * confirm against macrodef.h. All variants take five doubles by reference.
 */
extern "C" {

#ifdef fortran1
void set_escalar_parameter(double &, double &, double &, double &, double &);
#endif

#ifdef fortran2
void SET_ESCALAR_PARAMETER(double &, double &, double &, double &, double &);
#endif

#ifdef fortran3
void set_escalar_parameter_(double &, double &, double &, double &, double &);
#endif

}
|
|
|
|
/* ------------------------------------------------------------------ */
|
|
/* Multi-GPU dispatch: distribute ranks across available GPUs */
|
|
/* ------------------------------------------------------------------ */
|
|
/*
 * Process-wide record of the GPU chosen for this rank.
 * Populated once by init_gpu_dispatch(); read by the profiling log helpers.
 */
static struct {
    int  num_gpus;        /* device count from cudaGetDeviceCount, forced to >= 1 */
    int  my_rank;         /* global rank taken from the launcher's environment */
    int  my_local_rank;   /* node-local rank from the environment, -1 if not found */
    int  my_device;       /* device index handed to cudaSetDevice */
    bool inited;          /* true once init_gpu_dispatch() has run to completion */
} g_dispatch = {
    /* num_gpus      */ 0,
    /* my_rank       */ -1,
    /* my_local_rank */ -1,
    /* my_device     */ -1,
    /* inited        */ false
};
|
|
|
|
/*
 * Read an integer from an environment variable.
 *
 *   name     : variable to look up.
 *   fallback : returned when the variable is unset, empty, does not begin
 *              with a parsable decimal integer, or does not fit in an int.
 *
 * A plain atoi() would turn text such as "abc" into 0, which a caller would
 * then mistake for a legitimate rank 0; here such input yields `fallback`.
 * Values that merely carry trailing characters ("3x") are still accepted and
 * parsed as their numeric prefix, as before.
 */
static int env_to_int(const char *name, int fallback = -1) {
    const char *text = getenv(name);
    if (!text || !*text) return fallback;

    char *stop = nullptr;
    const long parsed = strtol(text, &stop, 10);
    if (stop == text) return fallback;               /* no digits were consumed */

    /* Reject values that do not survive the round trip through int. */
    const int narrowed = (int)parsed;
    if ((long)narrowed != parsed) return fallback;
    return narrowed;
}
|
|
|
|
/*
 * One-time selection of the CUDA device used by this process.
 *
 * Steps:
 *   1. Query the device count; on failure (or a count below 1) assume one device.
 *   2. Read the global rank from the first of PMI_RANK, OMPI_COMM_WORLD_RANK,
 *      MV2_COMM_WORLD_RANK, SLURM_PROCID that is set (default 0).
 *   3. Read the node-local rank from the first of OMPI_COMM_WORLD_LOCAL_RANK,
 *      MV2_COMM_WORLD_LOCAL_RANK, MPI_LOCALRANKID, SLURM_LOCALID (default -1).
 *   4. Map (local rank if known, else global rank) modulo the device count to
 *      a device index and make it current.
 *
 * Failures of the CUDA calls are reported on stderr but do not abort, so the
 * previous best-effort behaviour is kept. Subsequent calls are no-ops.
 */
static void init_gpu_dispatch() {
    if (g_dispatch.inited) return;

    cudaError_t err = cudaGetDeviceCount(&g_dispatch.num_gpus);
    if (err != cudaSuccess) {
        fprintf(stderr, "[AMSS-GPU] cudaGetDeviceCount failed: %s; assuming 1 device\n",
                cudaGetErrorString(err));
        g_dispatch.num_gpus = 1;
    }
    if (g_dispatch.num_gpus < 1) g_dispatch.num_gpus = 1;

    /* Get MPI rank from environment (set by mpirun/mpiexec). */
    g_dispatch.my_rank = env_to_int("PMI_RANK",
                         env_to_int("OMPI_COMM_WORLD_RANK",
                         env_to_int("MV2_COMM_WORLD_RANK",
                         env_to_int("SLURM_PROCID", 0))));

    /* Prefer local rank for per-node GPU mapping (avoids cross-node skew). */
    g_dispatch.my_local_rank = env_to_int("OMPI_COMM_WORLD_LOCAL_RANK",
                               env_to_int("MV2_COMM_WORLD_LOCAL_RANK",
                               env_to_int("MPI_LOCALRANKID",
                               env_to_int("SLURM_LOCALID", -1))));

    const int rank_for_map = (g_dispatch.my_local_rank >= 0)
                           ? g_dispatch.my_local_rank : g_dispatch.my_rank;

    /* C++ '%' keeps the sign of the dividend; fold a negative rank back into
     * [0, num_gpus) so an invalid device index is never requested. */
    int device = rank_for_map % g_dispatch.num_gpus;
    if (device < 0) device += g_dispatch.num_gpus;
    g_dispatch.my_device = device;

    err = cudaSetDevice(g_dispatch.my_device);
    if (err != cudaSuccess) {
        fprintf(stderr, "[AMSS-GPU] rank %d: cudaSetDevice(%d) failed: %s\n",
                g_dispatch.my_rank, g_dispatch.my_device, cudaGetErrorString(err));
    }

    if (g_dispatch.my_rank == 0) {
        printf("[AMSS-GPU] %d GPU(s) detected, device map uses %s rank\n",
               g_dispatch.num_gpus,
               (g_dispatch.my_local_rank >= 0) ? "local" : "global");
    }
    g_dispatch.inited = true;
}
|
|
|
|
/*
 * Running totals behind the AMSS_PROFILE_CUDA report.
 * cuda_profile_maybe_log() divides each *_ms total by the matching call
 * counter to print per-call averages. Keep this a plain aggregate in this
 * field order: cuda_profile_stats() brace-initialises it.
 */
struct CudaProfileStats {
    long long calls;                     /* divisor for the per-phase averages */

    /* accumulated time per phase, milliseconds */
    double total_ms;
    double state_ms;
    double matter_ms;
    double rhs_ms;
    double bc_ms;
    double finalize_ms;
    double output_ms;

    /* transfer bookkeeping */
    long long upload_calls;
    long long resident_download_calls;
    double upload_ms;
    double resident_download_ms;
    double upload_gb;                    /* printed as upload_GB */
    double resident_download_gb;         /* printed as resident_download_GB */
};
|
|
|
|
/*
 * Index of each timed RHS stage inside RhsStageProfileStats::ms.
 * The trailing comment on each entry is the label that
 * rhs_stage_profile_accumulate() prints for it.
 */
enum RhsStageId {
    RHS_STAGE_PREP = 0,         /* prep           */
    RHS_STAGE_DERIV1,           /* deriv1         */
    RHS_STAGE_METRIC,           /* metric         */
    RHS_STAGE_GAUGE_DERIV,      /* gauge_deriv    */
    RHS_STAGE_GAMMA_CONTRACT,   /* gamma_contract */
    RHS_STAGE_RICCI_DIFF,       /* ricci_diff     */
    RHS_STAGE_RICCI_FUSED,      /* ricci_fused    */
    RHS_STAGE_CHI,              /* chi            */
    RHS_STAGE_GAUGE_RHS,        /* gauge_rhs      */
    RHS_STAGE_KODIS,            /* kodis          */
    RHS_STAGE_CONSTRAINTS,      /* constraints    */
    RHS_STAGE_COUNT             /* number of stages; sizes the ms[] array */
};
|
|
|
|
/*
 * Per-stage timing totals, indexed by RhsStageId.
 */
struct RhsStageProfileStats {
    long long calls;                /* number of accumulated samples */
    double    ms[RHS_STAGE_COUNT];  /* summed milliseconds per stage */
};
|
|
|
|
/*
 * Totals for the auxiliary (prepare / writeback) profile printed by
 * cuda_aux_profile_maybe_log().
 */
struct CudaAuxProfileStats {
    /* event counters */
    long long prepare_calls;
    long long writeback_calls;

    /* accumulated time, milliseconds */
    double prepare_ms;
    double writeback_ms;

    double writeback_gb;   /* printed as writeback_GB */
};
|
|
|
|
/*
 * Accessor for the function-local CudaProfileStats singleton.
 * Every counter and total starts at zero.
 */
static CudaProfileStats &cuda_profile_stats() {
    static CudaProfileStats totals = {};   /* value-initialised: all fields zero */
    return totals;
}
|
|
|
|
/*
 * Accessor for the function-local CudaAuxProfileStats singleton (zero-initialised).
 */
static CudaAuxProfileStats &cuda_aux_profile_stats() {
    static CudaAuxProfileStats totals = {};
    return totals;
}
|
|
|
|
/*
 * True when AMSS_PROFILE_CUDA_AUX is set to a non-zero integer.
 * The environment is consulted on the first call only; the answer is cached.
 */
static bool cuda_aux_profile_enabled() {
    static int cached = -1;                 /* -1 unknown, 0 off, 1 on */
    if (cached >= 0) return cached != 0;

    const char *raw = getenv("AMSS_PROFILE_CUDA_AUX");
    cached = 0;
    if (raw && atoi(raw) != 0) cached = 1;
    return cached != 0;
}
|
|
|
|
/*
 * Logging period for the auxiliary profile: the positive integer in
 * AMSS_PROFILE_CUDA_AUX_EVERY, or 100 when unset / not positive.
 * Read once and cached.
 */
static int cuda_aux_profile_every() {
    static int cached = -1;
    if (cached < 0) {
        const char *raw = getenv("AMSS_PROFILE_CUDA_AUX_EVERY");
        const int requested = raw ? atoi(raw) : 0;
        cached = (requested > 0) ? requested : 100;
    }
    return cached;
}
|
|
|
|
/*
 * Emit one auxiliary-profile line on stderr when profiling is enabled and the
 * combined prepare + writeback event count is a positive multiple of
 * cuda_aux_profile_every(). Averages are reported as 0 for an empty counter.
 */
static void cuda_aux_profile_maybe_log() {
    if (!cuda_aux_profile_enabled()) return;

    const CudaAuxProfileStats &agg = cuda_aux_profile_stats();
    const long long events = agg.prepare_calls + agg.writeback_calls;
    if (events <= 0) return;
    if (events % cuda_aux_profile_every() != 0) return;

    double avg_prepare_ms = 0.0;
    if (agg.prepare_calls) avg_prepare_ms = agg.prepare_ms / (double)agg.prepare_calls;

    double avg_writeback_ms = 0.0;
    if (agg.writeback_calls) avg_writeback_ms = agg.writeback_ms / (double)agg.writeback_calls;

    fprintf(stderr,
            "[AMSS-CUDA-AUX][rank %d][dev %d] prepare=%lld avg_prepare=%.3f ms writebacks=%lld avg_writeback=%.3f ms writeback_GB=%.3f\n",
            g_dispatch.my_rank, g_dispatch.my_device,
            agg.prepare_calls, avg_prepare_ms,
            agg.writeback_calls, avg_writeback_ms,
            agg.writeback_gb);
    fflush(stderr);
}
|
|
|
|
/*
 * Accessor for the function-local RhsStageProfileStats singleton (zero-initialised).
 */
static RhsStageProfileStats &rhs_stage_profile_stats() {
    static RhsStageProfileStats totals = {};
    return totals;
}
|
|
|
|
/*
 * True when AMSS_PROFILE_CUDA is set to a non-zero integer.
 * Evaluated on first use and cached afterwards.
 */
static bool cuda_profile_enabled() {
    static int cached = -1;                 /* -1 unknown, 0 off, 1 on */
    if (cached >= 0) return cached != 0;

    const char *raw = getenv("AMSS_PROFILE_CUDA");
    cached = 0;
    if (raw && atoi(raw) != 0) cached = 1;
    return cached != 0;
}
|
|
|
|
/*
 * Logging period for the main profile: the positive integer in
 * AMSS_PROFILE_CUDA_EVERY, or 100 when unset / not positive.
 * Read once and cached.
 */
static int cuda_profile_every() {
    static int cached = -1;
    if (cached < 0) {
        const char *raw = getenv("AMSS_PROFILE_CUDA_EVERY");
        const int requested = raw ? atoi(raw) : 0;
        cached = (requested > 0) ? requested : 100;
    }
    return cached;
}
|
|
|
|
/*
 * True when AMSS_GPU_STAGE_TIMING is set to a non-zero integer.
 * Evaluated on first use and cached afterwards.
 */
static bool rhs_stage_timing_enabled() {
    static int cached = -1;                 /* -1 unknown, 0 off, 1 on */
    if (cached >= 0) return cached != 0;

    const char *raw = getenv("AMSS_GPU_STAGE_TIMING");
    cached = 0;
    if (raw && atoi(raw) != 0) cached = 1;
    return cached != 0;
}
|
|
|
|
/*
 * Logging period for the per-stage timing report: the positive integer in
 * AMSS_GPU_STAGE_TIMING_EVERY, otherwise whatever cuda_profile_every() returns.
 * Read once and cached.
 */
static int rhs_stage_timing_every() {
    static int cached = -1;
    if (cached < 0) {
        const char *raw = getenv("AMSS_GPU_STAGE_TIMING_EVERY");
        const int requested = raw ? atoi(raw) : 0;
        /* Only consult the general period when no valid override is present. */
        if (requested > 0) cached = requested;
        else               cached = cuda_profile_every();
    }
    return cached;
}
|
|
|
|
/*
 * Current std::chrono::steady_clock reading, expressed as fractional
 * milliseconds since that clock's epoch. Intended for differences only.
 */
static double cuda_profile_now_ms() {
    typedef std::chrono::duration<double, std::milli> millis_f64;
    const std::chrono::steady_clock::time_point stamp = std::chrono::steady_clock::now();
    return millis_f64(stamp.time_since_epoch()).count();
}
|
|
|
|
/*
 * Block until the device is idle (cudaDeviceSynchronize). Any error reported
 * by the synchronisation is printed to stderr and terminates the process.
 */
static void cuda_profile_sync() {
    const cudaError_t status = cudaDeviceSynchronize();
    if (status == cudaSuccess) return;

    fprintf(stderr, "CUDA error %s:%d: %s\n",
            __FILE__, __LINE__, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
}
|
|
|
|
/*
 * Emit one main-profile line on stderr when profiling is enabled and the call
 * counter is a positive multiple of cuda_profile_every(). Phase times are
 * averaged over `calls`; transfer times over their own counters (0 if none).
 */
static void cuda_profile_maybe_log() {
    if (!cuda_profile_enabled()) return;

    const CudaProfileStats &agg = cuda_profile_stats();
    if (agg.calls <= 0) return;
    if (agg.calls % cuda_profile_every() != 0) return;

    const double n = (double)agg.calls;

    double avg_upload_ms = 0.0;
    if (agg.upload_calls) avg_upload_ms = agg.upload_ms / (double)agg.upload_calls;

    double avg_download_ms = 0.0;
    if (agg.resident_download_calls)
        avg_download_ms = agg.resident_download_ms / (double)agg.resident_download_calls;

    fprintf(stderr,
            "[AMSS-CUDA][rank %d][dev %d] calls=%lld avg_total=%.3f ms avg_state=%.3f ms avg_matter=%.3f ms avg_rhs=%.3f ms avg_bc=%.3f ms avg_finalize=%.3f ms avg_output=%.3f ms"
            " uploads=%lld avg_upload=%.3f ms upload_GB=%.3f resident_downloads=%lld avg_resident_download=%.3f ms resident_download_GB=%.3f\n",
            g_dispatch.my_rank, g_dispatch.my_device, agg.calls,
            agg.total_ms / n,
            agg.state_ms / n,
            agg.matter_ms / n,
            agg.rhs_ms / n,
            agg.bc_ms / n,
            agg.finalize_ms / n,
            agg.output_ms / n,
            agg.upload_calls, avg_upload_ms, agg.upload_gb,
            agg.resident_download_calls, avg_download_ms, agg.resident_download_gb);
    fflush(stderr);
}
|
|
|
|
/*
 * Add one sample of per-stage timings to the running totals and, every
 * rhs_stage_timing_every() samples, print the per-stage averages on stderr.
 *
 *   stage_ms : array of RHS_STAGE_COUNT millisecond values, indexed by RhsStageId.
 *
 * Does nothing unless rhs_stage_timing_enabled() is true.
 */
static void rhs_stage_profile_accumulate(const double *stage_ms) {
    if (!rhs_stage_timing_enabled()) return;

    RhsStageProfileStats &agg = rhs_stage_profile_stats();
    agg.calls++;
    for (int s = 0; s < RHS_STAGE_COUNT; ++s)
        agg.ms[s] += stage_ms[s];

    if (agg.calls <= 0) return;
    if (agg.calls % rhs_stage_timing_every() != 0) return;

    /* Per-stage mean over all samples collected so far. */
    double avg[RHS_STAGE_COUNT];
    for (int s = 0; s < RHS_STAGE_COUNT; ++s)
        avg[s] = agg.ms[s] / (double)agg.calls;

    fprintf(stderr,
            "[AMSS-CUDA-STAGE][rank %d][dev %d] calls=%lld"
            " prep=%.3f deriv1=%.3f metric=%.3f gauge_deriv=%.3f"
            " gamma_contract=%.3f ricci_diff=%.3f ricci_fused=%.3f"
            " chi=%.3f gauge_rhs=%.3f kodis=%.3f constraints=%.3f ms\n",
            g_dispatch.my_rank, g_dispatch.my_device, agg.calls,
            avg[RHS_STAGE_PREP],
            avg[RHS_STAGE_DERIV1],
            avg[RHS_STAGE_METRIC],
            avg[RHS_STAGE_GAUGE_DERIV],
            avg[RHS_STAGE_GAMMA_CONTRACT],
            avg[RHS_STAGE_RICCI_DIFF],
            avg[RHS_STAGE_RICCI_FUSED],
            avg[RHS_STAGE_CHI],
            avg[RHS_STAGE_GAUGE_RHS],
            avg[RHS_STAGE_KODIS],
            avg[RHS_STAGE_CONSTRAINTS]);
    fflush(stderr);
}
|
|
|
|
/* ------------------------------------------------------------------ */
|
|
/* Error checking */
|
|
/* ------------------------------------------------------------------ */
|
|
/*
 * CUDA_CHECK(call): evaluate a CUDA runtime expression exactly once. If the
 * result is anything other than cudaSuccess, print the file, line and the
 * runtime's error string to stderr and terminate with EXIT_FAILURE.
 *
 * The status lives in a deliberately unusual local name so that an argument
 * expression mentioning a caller variable called `err` is not captured by the
 * macro's own declaration (which would read an uninitialised value).
 */
#define CUDA_CHECK(call) do {                                              \
    const cudaError_t cuda_check_status_ = (call);                         \
    if (cuda_check_status_ != cudaSuccess) {                               \
        fprintf(stderr, "CUDA error %s:%d: %s\n",                          \
                __FILE__, __LINE__,                                        \
                cudaGetErrorString(cuda_check_status_));                   \
        exit(EXIT_FAILURE);                                                \
    }                                                                      \
} while(0)
|
|
|
|
/* ------------------------------------------------------------------ */
|
|
/* Physical / gauge constants (matching bssn_rhs_c.C) */
|
|
/* ------------------------------------------------------------------ */
|
|
/* Host-side numeric constants. FF_VAL and ETA_VAL are presumably gauge
 * parameters mirrored from bssn_rhs_c.C -- confirm against that file. */
static const double PI_VAL  = 3.14159265358979323846;   /* pi */
static const double FF_VAL  = 0.75;
static const double ETA_VAL = 2.0;
|
|
|
|
/* ------------------------------------------------------------------ */
|
|
/* Constant memory for grid parameters and stencil coefficients */
|
|
/* ------------------------------------------------------------------ */
|
|
/*
 * Grid geometry and finite-difference coefficients shared with device code
 * through the __constant__ symbol d_gp. Field order and types define the
 * memory layout of that symbol and must not be rearranged.
 */
struct GridParams {
    /* ---- extents ---- */
    int ex[3];                  /* nx, ny, nz */
    int all;                    /* nx*ny*nz */

    /* ---- grid spacing ---- */
    double dX;
    double dY;
    double dZ;

    /* ---- fderivs coefficients ---- */
    double d12dx;               /* 1/(12*dX) etc */
    double d12dy;
    double d12dz;
    double d2dx;                /* 1/(2*dX) etc */
    double d2dy;
    double d2dz;

    /* ---- fdderivs coefficients ---- */
    double Fdxdx;               /* 1/(12*dX^2) etc */
    double Fdydy;
    double Fdzdz;
    double Sdxdx;               /* 1/(dX^2) etc */
    double Sdydy;
    double Sdzdz;
    double Fdxdy;               /* 1/(144*dX*dY) etc */
    double Fdxdz;
    double Fdydz;
    double Sdxdy;               /* 1/(4*dX*dY) etc */
    double Sdxdz;
    double Sdydz;

    /* ---- symmetry bounds (Fortran 1-based) ---- */
    int iminF;
    int jminF;
    int kminF;
    int imaxF;
    int jmaxF;
    int kmaxF;

    /* ---- symmetry bounds for ord=3 (lopsided/kodis) ---- */
    int iminF3;
    int jminF3;
    int kminF3;

    int    Symmetry;
    double eps;
    int    co;

    /* ---- padded sizes ---- */
    int fh2_nx;                 /* (nx+2), (ny+2), (nz+2) for ord=2 */
    int fh2_ny;
    int fh2_nz;
    int fh3_nx;                 /* (nx+3), (ny+3), (nz+3) for ord=3 */
    int fh3_ny;
    int fh3_nz;
};
|
|
|
|
/* Device-visible grid parameters, read by the indexing helpers below. */
__constant__ GridParams d_gp;

/* Host-side GridParams copy plus a validity flag. Presumably mirrors the last
 * values uploaded to d_gp so redundant uploads can be skipped -- the code that
 * maintains it lies outside this section; confirm there. */
static GridParams g_gp_host_cache       = {};
static bool       g_gp_host_cache_valid = false;
|
|
|
|
/* ------------------------------------------------------------------ */
|
|
/* Device indexing helpers */
|
|
/* ------------------------------------------------------------------ */
|
|
/*
 * Flat offset of the 0-based grid point (i0, j0, k0) in an nx*ny*nz array
 * with i fastest, using the extents stored in d_gp.
 */
__device__ __forceinline__ int idx_ex_d(int i0, int j0, int k0) {
    const int nx = d_gp.ex[0];
    const int ny = d_gp.ex[1];
    return i0 + j0 * nx + k0 * nx * ny;
}
|
|
|
|
/*
 * Read src at the 1-based (Fortran-style) index (iF, jF, kF), reflecting any
 * index <= 0 across the lower boundary (i -> 1 - i) and multiplying the value
 * by the matching parity factor.
 *
 *   src               : unpadded nx*ny*nz field (extents taken from d_gp.ex).
 *   iF, jF, kF        : 1-based indices; values <= 0 address mirrored points.
 *   SoA0, SoA1, SoA2  : sign factors applied when the x, y, z index is mirrored.
 *
 * No upper-bound handling is performed here.
 */
__device__ __forceinline__ double fetch_sym_ord2_direct(const double *src,
                                                        int iF, int jF, int kF,
                                                        int SoA0, int SoA1, int SoA2)
{
    const bool flip_i = (iF <= 0);
    const bool flip_j = (jF <= 0);
    const bool flip_k = (kF <= 0);

    const int ri = flip_i ? (1 - iF) : iF;
    const int rj = flip_j ? (1 - jF) : jF;
    const int rk = flip_k ? (1 - kF) : kF;

    double parity = 1.0;
    if (flip_i) parity *= (double)SoA0;
    if (flip_j) parity *= (double)SoA1;
    if (flip_k) parity *= (double)SoA2;

    return parity * src[(ri - 1)
                      + (rj - 1) * d_gp.ex[0]
                      + (rk - 1) * d_gp.ex[0] * d_gp.ex[1]];
}
|
|
|
|
/*
 * ord=2 ghost-padded layout: map a Fortran index triple to the flat offset in
 * an fh2_nx * fh2_ny * fh2_nz array. Each index is shifted by +1, so
 * iF = -1 lands on offset 0 along that axis.
 */
__device__ __forceinline__ int idx_fh2(int iF, int jF, int kF) {
    const int px = d_gp.fh2_nx;
    const int py = d_gp.fh2_ny;
    return (iF + 1) + (jF + 1) * px + (kF + 1) * px * py;
}
|
|
|
|
/*
 * ord=3 ghost-padded layout: map a Fortran index triple to the flat offset in
 * an fh3_nx * fh3_ny * fh3_nz array. Each index is shifted by +2, so
 * iF = -2 lands on offset 0 along that axis.
 */
__device__ __forceinline__ int idx_fh3(int iF, int jF, int kF) {
    const int px = d_gp.fh3_nx;
    const int py = d_gp.fh3_ny;
    return (iF + 2) + (jF + 2) * px + (kF + 2) * px * py;
}
|
|
|
|
/*
 * ord=3 counterpart of the mirrored fetch: read src at the 1-based index
 * (iF, jF, kF), reflecting any index <= 0 across the lower boundary
 * (i -> 1 - i) and scaling by the matching parity factor.
 *
 *   src               : unpadded nx*ny*nz field (extents taken from d_gp.ex).
 *   iF, jF, kF        : 1-based indices; values <= 0 address mirrored points.
 *   SoA0, SoA1, SoA2  : sign factors applied when the x, y, z index is mirrored.
 *
 * No upper-bound handling is performed here.
 */
__device__ __forceinline__ double fetch_sym_ord3_direct(const double *src,
                                                        int iF, int jF, int kF,
                                                        int SoA0, int SoA1, int SoA2)
{
    const bool flip_i = (iF <= 0);
    const bool flip_j = (jF <= 0);
    const bool flip_k = (kF <= 0);

    const int ri = flip_i ? (1 - iF) : iF;
    const int rj = flip_j ? (1 - jF) : jF;
    const int rk = flip_k ? (1 - kF) : kF;

    double parity = 1.0;
    if (flip_i) parity *= (double)SoA0;
    if (flip_j) parity *= (double)SoA1;
    if (flip_k) parity *= (double)SoA2;

    return parity * src[(ri - 1)
                      + (rj - 1) * d_gp.ex[0]
                      + (rk - 1) * d_gp.ex[0] * d_gp.ex[1]];
}
|
|
|
|
#include "fd_cuda_helpers.cuh"
|
|
|
|
/* ------------------------------------------------------------------ */
|
|
/* GPU buffer management */
|
|
/* ------------------------------------------------------------------ */
|
|
/*
 * Array slot indices — all arrays live in one big cudaMalloc block.
 * INPUT arrays (H2D): 37 slots (34 field slots plus the X/Y/Z coordinate slots)
 * OUTPUT arrays (D2H): 55 slots
 * TEMPORARY arrays (GPU-only): 133 slots
 * Plus 2 extended arrays for ghost-padded stencils (fh_ord2, fh_ord3)
 */
|
|
|
|
/* Upper bound on the number of "all"-sized slots (size of GpuBuffers::slot). */
#define NUM_SLOTS 256

/*
 * Bookkeeping for the shared device work area managed by ensure_gpu_buffers().
 * Kept as a plain aggregate; field order is relied upon by brace initialisers.
 */
struct GpuBuffers {
    /* ---- allocations ---- */
    double *d_mem;              /* single big allocation */
    double *d_fh2;              /* ghost-padded ord=2: (nx+2)*(ny+2)*(nz+2) */
    double *d_fh3;              /* ghost-padded ord=3: (nx+3)*(ny+3)*(nz+3) */
    double *h_stage;            /* host staging buffer for bulk H2D/D2H */
    bool    h_stage_pinned;     /* true if allocated by cudaMallocHost */

    double *slot[NUM_SLOTS];    /* pointers into d_mem */

    /* ---- capacities of the current allocations, in doubles ---- */
    size_t cap_all;
    size_t cap_fh2_size;
    size_t cap_fh3_size;

    /* ---- grid extents seen by the most recent ensure_gpu_buffers() call ---- */
    int prev_nx;
    int prev_ny;
    int prev_nz;

    bool initialized;           /* set after the first successful allocation */
};
|
|
|
|
/* The one shared work-area record: null pointers, zero capacities and
 * extents, not pinned, not initialised. */
static GpuBuffers g_buf = {};
|
|
|
|
/*
 * Slot assignments: index of every "all"-sized array inside GpuBuffers::slot.
 * The numeric order is significant -- the staging counts are computed from
 * differences between enumerators.
 */
enum {
    /* ---------------- INPUT (H2D) ---------------- */
    S_chi = 0, S_trK,
    S_dxx, S_gxy, S_gxz, S_dyy, S_gyz, S_dzz,
    S_Axx, S_Axy, S_Axz, S_Ayy, S_Ayz, S_Azz,
    S_Gamx, S_Gamy, S_Gamz,
    S_Lap, S_betax, S_betay, S_betaz,
    S_dtSfx, S_dtSfy, S_dtSfz,
    S_rho, S_Sx, S_Sy, S_Sz,
    S_Sxx, S_Sxy, S_Sxz, S_Syy, S_Syz, S_Szz,
    S_X, S_Y, S_Z,  /* coordinate arrays — only nx/ny/nz long */
    /* 37 input slots so far; X/Y/Z are special-sized */

    /* ---------------- OUTPUT (D2H) ---------------- */
    S_chi_rhs, S_trK_rhs,
    S_gxx_rhs, S_gxy_rhs, S_gxz_rhs, S_gyy_rhs, S_gyz_rhs, S_gzz_rhs,
    S_Axx_rhs, S_Axy_rhs, S_Axz_rhs, S_Ayy_rhs, S_Ayz_rhs, S_Azz_rhs,
    S_Gamx_rhs, S_Gamy_rhs, S_Gamz_rhs,
    S_Lap_rhs, S_betax_rhs, S_betay_rhs, S_betaz_rhs,
    S_dtSfx_rhs, S_dtSfy_rhs, S_dtSfz_rhs,
    S_Gamxxx, S_Gamxxy, S_Gamxxz, S_Gamxyy, S_Gamxyz, S_Gamxzz,
    S_Gamyxx, S_Gamyxy, S_Gamyxz, S_Gamyyy, S_Gamyyz, S_Gamyzz,
    S_Gamzxx, S_Gamzxy, S_Gamzxz, S_Gamzyy, S_Gamzyz, S_Gamzzz,
    S_Rxx, S_Rxy, S_Rxz, S_Ryy, S_Ryz, S_Rzz,
    S_ham_Res, S_movx_Res, S_movy_Res, S_movz_Res,
    S_Gmx_Res, S_Gmy_Res, S_Gmz_Res,

    /* ---------------- TEMPORARY (GPU-only) ---------------- */
    S_gxx, S_gyy, S_gzz,  /* physical metric = dxx+1 etc */
    S_alpn1, S_chin1,
    S_chix, S_chiy, S_chiz,
    S_gxxx, S_gxyx, S_gxzx, S_gyyx, S_gyzx, S_gzzx,
    S_gxxy, S_gxyy, S_gxzy, S_gyyy, S_gyzy, S_gzzy,
    S_gxxz, S_gxyz, S_gxzz, S_gyyz, S_gyzz, S_gzzz,
    S_Lapx, S_Lapy, S_Lapz,
    S_betaxx, S_betaxy, S_betaxz,
    S_betayx, S_betayy, S_betayz,
    S_betazx, S_betazy, S_betazz,
    S_Gamxx, S_Gamxy, S_Gamxz,
    S_Gamyx, S_Gamyy_t, S_Gamyz_t,
    S_Gamzx, S_Gamzy, S_Gamzz_t,
    S_Kx, S_Ky, S_Kz,
    S_S_arr, S_f_arr,
    S_fxx, S_fxy, S_fxz, S_fyy, S_fyz, S_fzz,
    S_Gamxa, S_Gamya, S_Gamza,
    S_gupxx, S_gupxy, S_gupxz,
    S_gupyy, S_gupyz, S_gupzz,

    /* scalar-field slots */
    S_Sphi, S_Spi, S_Sphi_rhs, S_Spi_rhs,
    S_Sphi0, S_Spi0, S_Sphi_accum, S_Spi_accum,
    S_Sphi_next, S_Spi_next,
    S_Sphi_x, S_Sphi_y, S_Sphi_z,
    S_Sphi_xx, S_Sphi_xy, S_Sphi_xz, S_Sphi_yy, S_Sphi_yz, S_Sphi_zz,
    S_trK_x, S_trK_y, S_trK_z,

    /* EM slots */
    S_EM_Kpsi, S_EM_Kphi,
    S_EM_Ex, S_EM_Ey, S_EM_Ez, S_EM_Bx, S_EM_By, S_EM_Bz,
    S_EM_Jx, S_EM_Jy, S_EM_Jz, S_EM_qchar,
    S_EM_Kpsi_rhs, S_EM_Kphi_rhs,
    S_EM_Ex_rhs, S_EM_Ey_rhs, S_EM_Ez_rhs, S_EM_Bx_rhs, S_EM_By_rhs, S_EM_Bz_rhs,
    S_EM_Kpsix, S_EM_Kpsiy, S_EM_Kpsiz,
    S_EM_Kphix, S_EM_Kphiy, S_EM_Kphiz,
    S_EM_Exx, S_EM_Exy, S_EM_Exz,
    S_EM_Eyx, S_EM_Eyy, S_EM_Eyz,
    S_EM_Ezx, S_EM_Ezy, S_EM_Ezz,
    S_EM_Bxx, S_EM_Bxy, S_EM_Bxz,
    S_EM_Byx, S_EM_Byy, S_EM_Byz,
    S_EM_Bzx, S_EM_Bzy, S_EM_Bzz,

    NUM_USED_SLOTS  /* count of slots actually assigned above */
};
|
|
|
|
/* The slot pointer table must be able to hold every assigned slot. */
static_assert(NUM_USED_SLOTS <= NUM_SLOTS, "Increase NUM_SLOTS");

/* Contiguous slot ranges, counted from the slot enum. */
static const int H2D_INPUT_SLOT_COUNT      = (S_Szz - S_chi + 1);        /* S_chi .. S_Szz         */
static const int D2H_BASE_SLOT_COUNT       = (S_Rzz - S_chi_rhs + 1);    /* S_chi_rhs .. S_Rzz     */
static const int D2H_CONSTRAINT_SLOT_COUNT = (S_Gmz_Res - S_ham_Res + 1);/* S_ham_Res .. S_Gmz_Res */

/* Host staging buffer size in slots: the larger of the upload set and the
 * full download set (base + constraint outputs). */
static const int STAGE_SLOT_COUNT =
    (H2D_INPUT_SLOT_COUNT > (D2H_BASE_SLOT_COUNT + D2H_CONSTRAINT_SLOT_COUNT))
        ? H2D_INPUT_SLOT_COUNT
        : (D2H_BASE_SLOT_COUNT + D2H_CONSTRAINT_SLOT_COUNT);

/* Field counts for the variable sets handled by the resident-state machinery. */
static constexpr int BSSN_STATE_COUNT             = 24;
static constexpr int BSSN_MATTER_COUNT            = 10;
static constexpr int BSSN_LK_FIELD_COUNT          = 24;
static constexpr int BSSN_ESCALAR_LK_FIELD_COUNT  = 26;
static constexpr int BSSN_EM_STATE_COUNT          = 32;
static constexpr int BSSN_EM_SOURCE_COUNT         = 4;
static constexpr int BSSN_EM_LK_FIELD_COUNT       = 32;
static constexpr int BSSN_RESIDENT_BANK_COUNT     = 6;
static constexpr int BSSN_ESCALAR_STATE_COUNT     = 26;
/* Per-bank field capacity; equals the EM state count, the largest state count listed above. */
static constexpr int BSSN_RESIDENT_STATE_CAPACITY = BSSN_EM_STATE_COUNT;
|
|
|
|
/* Slot of each of the 24 BSSN state variables, in state order. */
static const int k_state_input_slots[BSSN_STATE_COUNT] = {
    S_chi,   S_trK,
    S_dxx,   S_gxy,   S_gxz,   S_dyy,   S_gyz,   S_dzz,
    S_Axx,   S_Axy,   S_Axz,   S_Ayy,   S_Ayz,   S_Azz,
    S_Gamx,  S_Gamy,  S_Gamz,
    S_Lap,   S_betax, S_betay, S_betaz,
    S_dtSfx, S_dtSfy, S_dtSfz
};

/* Slot of the right-hand side belonging to each entry of k_state_input_slots. */
static const int k_state_rhs_slots[BSSN_STATE_COUNT] = {
    S_chi_rhs,   S_trK_rhs,
    S_gxx_rhs,   S_gxy_rhs,   S_gxz_rhs,   S_gyy_rhs,   S_gyz_rhs,   S_gzz_rhs,
    S_Axx_rhs,   S_Axy_rhs,   S_Axz_rhs,   S_Ayy_rhs,   S_Ayz_rhs,   S_Azz_rhs,
    S_Gamx_rhs,  S_Gamy_rhs,  S_Gamz_rhs,
    S_Lap_rhs,   S_betax_rhs, S_betay_rhs, S_betaz_rhs,
    S_dtSfx_rhs, S_dtSfy_rhs, S_dtSfz_rhs
};
|
|
|
|
/* State slots for the scalar-field variant: the 24 BSSN variables followed by
 * S_Sphi and S_Spi. */
static const int k_escalar_state_input_slots[BSSN_ESCALAR_STATE_COUNT] = {
    S_chi,   S_trK,
    S_dxx,   S_gxy,   S_gxz,   S_dyy,   S_gyz,   S_dzz,
    S_Axx,   S_Axy,   S_Axz,   S_Ayy,   S_Ayz,   S_Azz,
    S_Gamx,  S_Gamy,  S_Gamz,
    S_Lap,   S_betax, S_betay, S_betaz,
    S_dtSfx, S_dtSfy, S_dtSfz,
    S_Sphi,  S_Spi
};

/* Right-hand-side slots matching k_escalar_state_input_slots entry by entry. */
static const int k_escalar_state_rhs_slots[BSSN_ESCALAR_STATE_COUNT] = {
    S_chi_rhs,   S_trK_rhs,
    S_gxx_rhs,   S_gxy_rhs,   S_gxz_rhs,   S_gyy_rhs,   S_gyz_rhs,   S_gzz_rhs,
    S_Axx_rhs,   S_Axy_rhs,   S_Axz_rhs,   S_Ayy_rhs,   S_Ayz_rhs,   S_Azz_rhs,
    S_Gamx_rhs,  S_Gamy_rhs,  S_Gamz_rhs,
    S_Lap_rhs,   S_betax_rhs, S_betay_rhs, S_betaz_rhs,
    S_dtSfx_rhs, S_dtSfy_rhs, S_dtSfz_rhs,
    S_Sphi_rhs,  S_Spi_rhs
};
|
|
|
|
/* State slots for the EM variant: the 24 BSSN variables followed by the eight
 * EM fields (Kpsi, Kphi, E, B). */
static const int k_em_state_input_slots[BSSN_EM_STATE_COUNT] = {
    S_chi,     S_trK,
    S_dxx,     S_gxy,     S_gxz,   S_dyy,   S_gyz,   S_dzz,
    S_Axx,     S_Axy,     S_Axz,   S_Ayy,   S_Ayz,   S_Azz,
    S_Gamx,    S_Gamy,    S_Gamz,
    S_Lap,     S_betax,   S_betay, S_betaz,
    S_dtSfx,   S_dtSfy,   S_dtSfz,
    S_EM_Kpsi, S_EM_Kphi,
    S_EM_Ex,   S_EM_Ey,   S_EM_Ez,
    S_EM_Bx,   S_EM_By,   S_EM_Bz
};

/* Right-hand-side slots matching k_em_state_input_slots entry by entry. */
static const int k_em_state_rhs_slots[BSSN_EM_STATE_COUNT] = {
    S_chi_rhs,     S_trK_rhs,
    S_gxx_rhs,     S_gxy_rhs,     S_gxz_rhs,   S_gyy_rhs,   S_gyz_rhs,   S_gzz_rhs,
    S_Axx_rhs,     S_Axy_rhs,     S_Axz_rhs,   S_Ayy_rhs,   S_Ayz_rhs,   S_Azz_rhs,
    S_Gamx_rhs,    S_Gamy_rhs,    S_Gamz_rhs,
    S_Lap_rhs,     S_betax_rhs,   S_betay_rhs, S_betaz_rhs,
    S_dtSfx_rhs,   S_dtSfy_rhs,   S_dtSfz_rhs,
    S_EM_Kpsi_rhs, S_EM_Kphi_rhs,
    S_EM_Ex_rhs,   S_EM_Ey_rhs,   S_EM_Ez_rhs,
    S_EM_Bx_rhs,   S_EM_By_rhs,   S_EM_Bz_rhs
};
|
|
|
|
/* Slots of the ten matter source arrays. */
static const int k_matter_slots[BSSN_MATTER_COUNT] = {
    S_rho,
    S_Sx,  S_Sy,  S_Sz,
    S_Sxx, S_Sxy, S_Sxz, S_Syy, S_Syz, S_Szz
};

/*
 * The three tables below share one field ordering (24 entries). They differ
 * only in the diagonal metric entries: the "adv" table uses S_gxx/S_gyy/S_gzz
 * where the "ko" table uses S_dxx/S_dyy/S_dzz. Presumably "adv" feeds the
 * advection term and "ko" the Kreiss-Oliger dissipation -- confirm at the use site.
 */
static const int k_lk_adv_slots[BSSN_LK_FIELD_COUNT] = {
    S_gxx, S_Gamz,  S_gxy, S_Lap,
    S_gxz, S_betax, S_gyy, S_betay,
    S_gyz, S_betaz, S_gzz, S_dtSfx,
    S_Axx, S_dtSfy, S_Axy, S_dtSfz,
    S_Axz, S_Ayy,   S_Ayz, S_Azz,
    S_chi, S_trK,   S_Gamx, S_Gamy
};

static const int k_lk_ko_slots[BSSN_LK_FIELD_COUNT] = {
    S_dxx, S_Gamz,  S_gxy, S_Lap,
    S_gxz, S_betax, S_dyy, S_betay,
    S_gyz, S_betaz, S_dzz, S_dtSfx,
    S_Axx, S_dtSfy, S_Axy, S_dtSfz,
    S_Axz, S_Ayy,   S_Ayz, S_Azz,
    S_chi, S_trK,   S_Gamx, S_Gamy
};

/* Right-hand-side slot for each entry of the two tables above. */
static const int k_lk_rhs_slots[BSSN_LK_FIELD_COUNT] = {
    S_gxx_rhs, S_Gamz_rhs,  S_gxy_rhs, S_Lap_rhs,
    S_gxz_rhs, S_betax_rhs, S_gyy_rhs, S_betay_rhs,
    S_gyz_rhs, S_betaz_rhs, S_gzz_rhs, S_dtSfx_rhs,
    S_Axx_rhs, S_dtSfy_rhs, S_Axy_rhs, S_dtSfz_rhs,
    S_Axz_rhs, S_Ayy_rhs,   S_Ayz_rhs, S_Azz_rhs,
    S_chi_rhs, S_trK_rhs,   S_Gamx_rhs, S_Gamy_rhs
};
|
|
|
|
/* Device constant tables sized for the largest resident state set; they are
 * filled elsewhere in the file. */
__constant__ int    d_subset_state_indices[BSSN_RESIDENT_STATE_CAPACITY];
__constant__ double d_comm_state_soa[3 * BSSN_RESIDENT_STATE_CAPACITY];

/*
 * Three sign factors (x, y, z) per LK field. The row order appears to follow
 * k_lk_adv_slots; the trailing comment names the field at that position --
 * confirm against the code that consumes this table.
 */
static const int k_lk_soa_signs[3 * BSSN_LK_FIELD_COUNT] = {
     1,  1,  1,   /*  0: gxx   */
     1,  1, -1,   /*  1: Gamz  */
    -1, -1,  1,   /*  2: gxy   */
     1,  1,  1,   /*  3: Lap   */
    -1,  1, -1,   /*  4: gxz   */
    -1,  1,  1,   /*  5: betax */
     1,  1,  1,   /*  6: gyy   */
     1, -1,  1,   /*  7: betay */
     1, -1, -1,   /*  8: gyz   */
     1,  1, -1,   /*  9: betaz */
     1,  1,  1,   /* 10: gzz   */
    -1,  1,  1,   /* 11: dtSfx */
     1,  1,  1,   /* 12: Axx   */
     1, -1,  1,   /* 13: dtSfy */
    -1, -1,  1,   /* 14: Axy   */
     1,  1, -1,   /* 15: dtSfz */
    -1,  1, -1,   /* 16: Axz   */
     1,  1,  1,   /* 17: Ayy   */
     1, -1, -1,   /* 18: Ayz   */
     1,  1,  1,   /* 19: Azz   */
     1,  1,  1,   /* 20: chi   */
     1,  1,  1,   /* 21: trK   */
    -1,  1,  1,   /* 22: Gamx  */
     1, -1,  1    /* 23: Gamy  */
};
|
|
|
|
/*
 * Per-block device/host state used by the step machinery; one instance lives
 * in g_step_ctx for every block tag. A default-constructed context owns
 * nothing: all pointers are null, capacities are zero, every flag is false,
 * current_bank is -1 and resident_clock is 0.
 */
struct StepContext {
    using FieldPtrs  = std::array<double *, BSSN_RESIDENT_STATE_CAPACITY>;
    using FieldFlags = std::array<unsigned char, BSSN_RESIDENT_STATE_CAPACITY>;

    /* ---- backing allocations (base pointers) ---- */
    double *d_state0_mem     = nullptr;
    double *d_accum_mem      = nullptr;
    double *d_state_curr_mem = nullptr;   /* alias of one resident bank, not owned */
    double *d_state_next_mem = nullptr;
    std::array<double *, BSSN_RESIDENT_BANK_COUNT> d_resident_mem;
    double *d_matter_mem     = nullptr;
    double *d_em_source_mem  = nullptr;
    double *d_comm_mem       = nullptr;
    double *h_comm_mem       = nullptr;

    /* ---- per-field pointers into the allocations above ---- */
    FieldPtrs d_state0;
    FieldPtrs d_accum;
    FieldPtrs d_state_curr;
    FieldPtrs d_state_next;
    std::array<FieldPtrs, BSSN_RESIDENT_BANK_COUNT>  d_resident;

    /* ---- resident-bank bookkeeping ---- */
    std::array<FieldPtrs, BSSN_RESIDENT_BANK_COUNT>  resident_host;
    std::array<FieldFlags, BSSN_RESIDENT_BANK_COUNT> resident_host_clean;
    std::array<unsigned long long, BSSN_RESIDENT_BANK_COUNT> resident_age;
    std::array<bool, BSSN_RESIDENT_BANK_COUNT>       resident_valid;

    /* ---- matter / EM source pointers ---- */
    std::array<double *, BSSN_MATTER_COUNT>    d_matter;
    std::array<double *, BSSN_EM_SOURCE_COUNT> d_em_source;
    std::array<double *, BSSN_EM_SOURCE_COUNT> em_source_host;

    /* ---- capacities (in doubles) and allocation kind ---- */
    size_t cap_all       = 0;
    size_t cap_comm      = 0;
    bool   h_comm_pinned = false;   /* h_comm_mem came from cudaMallocHost */
    size_t cap_h_comm    = 0;

    /* ---- status flags ---- */
    bool matter_ready       = false;
    bool em_source_ready    = false;
    bool em_zero_fast_known = false;
    bool em_zero_fast       = false;
    bool state_ready        = false;

    int                current_bank   = -1;   /* index into d_resident, -1 = none */
    unsigned long long resident_clock = 0;

    /* Scalars are set by the member initialisers above; the array members
     * are cleared here. */
    StepContext()
    {
        d_resident_mem.fill(nullptr);

        d_state0.fill(nullptr);
        d_accum.fill(nullptr);
        d_state_curr.fill(nullptr);
        d_state_next.fill(nullptr);

        for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
            d_resident[b].fill(nullptr);
            resident_host[b].fill(nullptr);
            resident_host_clean[b].fill(0);
        }
        resident_age.fill(0);
        resident_valid.fill(false);

        d_matter.fill(nullptr);
        d_em_source.fill(nullptr);
        em_source_host.fill(nullptr);
    }
};
|
|
|
|
/*
 * The transferable part of a StepContext: raw allocations plus their
 * capacities. Instances are parked in g_step_pool for reuse. Must remain a
 * plain aggregate (it is created with `= {}`).
 */
struct StepAllocation {
    /* device allocations */
    double *d_state0_mem;
    double *d_accum_mem;
    std::array<double *, BSSN_RESIDENT_BANK_COUNT> d_resident_mem;
    double *d_matter_mem;
    double *d_em_source_mem;
    double *d_comm_mem;

    /* host communication buffer */
    double *h_comm_mem;

    /* capacities, in doubles */
    size_t cap_all;         /* non-zero marks the record as holding storage */
    size_t cap_comm;
    bool   h_comm_pinned;   /* h_comm_mem came from cudaMallocHost */
    size_t cap_h_comm;
};
|
|
|
|
/* Step contexts keyed by the caller's block tag. */
static std::unordered_map<void *, StepContext> g_step_ctx;

/* Detached allocations waiting to be reused by acquire_step_allocation(). */
static std::vector<StepAllocation> g_step_pool;

/* Device buffer for communication segment metadata and its capacity in ints. */
static int   *g_comm_segment_meta     = nullptr;
static size_t g_comm_segment_meta_cap = 0;

/* Managed outside this section; presumably a device-side flag used for the
 * EM zero-source fast path -- confirm at its use site. */
static int *g_em_zero_flag = nullptr;
|
|
|
|
/*
 * Return an allocation record that owns nothing: every pointer null, every
 * capacity zero, h_comm_pinned false.
 */
static StepAllocation empty_step_allocation()
{
    /* Value-initialisation zeroes every member of the aggregate, including
     * each element of d_resident_mem. */
    StepAllocation blank = StepAllocation();
    return blank;
}
|
|
|
|
/* True when the record carries storage, i.e. its cap_all is non-zero. */
static bool has_step_allocation(const StepAllocation &alloc)
{
    const bool holds_storage = (alloc.cap_all != 0);
    return holds_storage;
}
|
|
|
|
/*
 * Move the raw allocations out of `ctx` into a StepAllocation and return it,
 * leaving `ctx` in the same state as a default-constructed StepContext
 * (null pointers, zero capacities, flags cleared, current_bank = -1,
 * resident_clock = 0).
 */
static StepAllocation detach_step_allocation(StepContext &ctx)
{
    StepAllocation taken = {};

    /* device storage */
    taken.d_state0_mem    = ctx.d_state0_mem;
    taken.d_accum_mem     = ctx.d_accum_mem;
    taken.d_resident_mem  = ctx.d_resident_mem;
    taken.d_matter_mem    = ctx.d_matter_mem;
    taken.d_em_source_mem = ctx.d_em_source_mem;
    taken.d_comm_mem      = ctx.d_comm_mem;
    taken.cap_all         = ctx.cap_all;
    taken.cap_comm        = ctx.cap_comm;

    /* host communication buffer */
    taken.h_comm_mem      = ctx.h_comm_mem;
    taken.h_comm_pinned   = ctx.h_comm_pinned;
    taken.cap_h_comm      = ctx.cap_h_comm;

    /* Reset every member of the context in one step; the default constructor
     * produces exactly the cleared state required here. */
    ctx = StepContext();

    return taken;
}
|
|
|
|
/*
 * Install the storage described by `alloc` into `ctx` and reset the context's
 * status: readiness flags cleared, no current bank, resident bookkeeping
 * wiped. The per-field device pointer arrays (d_state0, d_accum, d_resident,
 * d_matter, d_em_source) are NOT touched here; ensure_step_ctx() rebuilds them.
 */
static void attach_step_allocation(StepContext &ctx, const StepAllocation &alloc)
{
    /* storage taken from the allocation record */
    ctx.d_state0_mem    = alloc.d_state0_mem;
    ctx.d_accum_mem     = alloc.d_accum_mem;
    ctx.d_resident_mem  = alloc.d_resident_mem;
    ctx.d_matter_mem    = alloc.d_matter_mem;
    ctx.d_em_source_mem = alloc.d_em_source_mem;
    ctx.d_comm_mem      = alloc.d_comm_mem;
    ctx.h_comm_mem      = alloc.h_comm_mem;

    ctx.cap_all         = alloc.cap_all;
    ctx.cap_comm        = alloc.cap_comm;
    ctx.cap_h_comm      = alloc.cap_h_comm;
    ctx.h_comm_pinned   = alloc.h_comm_pinned;

    /* no bank is selected yet */
    ctx.d_state_curr_mem = nullptr;
    ctx.d_state_next_mem = nullptr;
    ctx.current_bank     = -1;
    ctx.resident_clock   = 0;

    /* nothing uploaded into the new storage so far */
    ctx.matter_ready       = false;
    ctx.em_source_ready    = false;
    ctx.em_zero_fast_known = false;
    ctx.em_zero_fast       = false;
    ctx.state_ready        = false;

    /* forget all host associations of the resident banks */
    for (int bank = 0; bank < BSSN_RESIDENT_BANK_COUNT; ++bank) {
        ctx.resident_host[bank].fill(nullptr);
        ctx.resident_host_clean[bank].fill(0);
    }
    ctx.resident_age.fill(0);
    ctx.resident_valid.fill(false);
    ctx.em_source_host.fill(nullptr);
}
|
|
|
|
/*
 * Park `alloc` in g_step_pool for later reuse and blank the caller's copy.
 * Records without storage (cap_all == 0) are left untouched and not pooled.
 */
static void recycle_step_allocation(StepAllocation &alloc)
{
    if (has_step_allocation(alloc)) {
        g_step_pool.push_back(alloc);
        alloc = empty_step_allocation();
    }
}
|
|
|
|
/*
 * Take the best-fitting pooled allocation for a grid of `all` points: the
 * entry with the smallest cap_all that is still >= all (first one wins on
 * ties). The entry is removed from g_step_pool by swapping in the last
 * element. Returns an empty record when no pooled entry is large enough.
 */
static StepAllocation acquire_step_allocation(size_t all)
{
    const size_t none = g_step_pool.size();
    size_t pick = none;

    for (size_t i = 0; i < none; ++i) {
        const size_t cap = g_step_pool[i].cap_all;
        if (cap < all) continue;                              /* too small */
        if (pick == none || cap < g_step_pool[pick].cap_all)  /* tighter fit */
            pick = i;
    }

    if (pick == none)
        return empty_step_allocation();

    StepAllocation chosen = g_step_pool[pick];
    g_step_pool[pick] = g_step_pool.back();
    g_step_pool.pop_back();
    return chosen;
}
|
|
|
|
/*
 * Make sure the shared work area in g_buf can hold a grid of nx*ny*nz points.
 *
 * When nothing is allocated yet, or any of the three required sizes exceeds
 * the current capacity, all buffers are released and reallocated at exactly
 * the requested sizes:
 *   d_mem   : NUM_USED_SLOTS * all doubles
 *   d_fh2   : (nx+2)*(ny+2)*(nz+2) doubles
 *   d_fh3   : (nx+3)*(ny+3)*(nz+3) doubles
 *   h_stage : STAGE_SLOT_COUNT * all doubles, pinned if possible, else malloc
 *
 * The slot pointers are then re-derived with a stride of `all` doubles (this
 * happens on every call, since the stride depends on the current grid), and
 * the extents are recorded in prev_nx/ny/nz.
 *
 * Any CUDA failure other than the pinned-host allocation terminates the
 * process through CUDA_CHECK; failure of the malloc fallback exits as well.
 */
static void ensure_gpu_buffers(int nx, int ny, int nz) {
    const size_t all      = (size_t)nx * ny * nz;
    const size_t fh2_size = (size_t)(nx+2) * (ny+2) * (nz+2);
    const size_t fh3_size = (size_t)(nx+3) * (ny+3) * (nz+3);

    const bool need_grow = (!g_buf.initialized)
                        || (all > g_buf.cap_all)
                        || (fh2_size > g_buf.cap_fh2_size)
                        || (fh3_size > g_buf.cap_fh3_size);

    if (need_grow) {
        /* Release the old buffers; errors are checked like every other
         * cudaFree in this file. */
        if (g_buf.d_mem) { CUDA_CHECK(cudaFree(g_buf.d_mem)); g_buf.d_mem = nullptr; }
        if (g_buf.d_fh2) { CUDA_CHECK(cudaFree(g_buf.d_fh2)); g_buf.d_fh2 = nullptr; }
        if (g_buf.d_fh3) { CUDA_CHECK(cudaFree(g_buf.d_fh3)); g_buf.d_fh3 = nullptr; }
        if (g_buf.h_stage) {
            if (g_buf.h_stage_pinned) {
                CUDA_CHECK(cudaFreeHost(g_buf.h_stage));
            } else {
                free(g_buf.h_stage);
            }
            g_buf.h_stage = nullptr;
            g_buf.h_stage_pinned = false;
        }

        CUDA_CHECK(cudaMalloc(&g_buf.d_mem, NUM_USED_SLOTS * all * sizeof(double)));
        CUDA_CHECK(cudaMalloc(&g_buf.d_fh2, fh2_size * sizeof(double)));
        CUDA_CHECK(cudaMalloc(&g_buf.d_fh3, fh3_size * sizeof(double)));

        const size_t stage_bytes = (size_t)STAGE_SLOT_COUNT * all * sizeof(double);
        cudaError_t stage_err = cudaMallocHost((void**)&g_buf.h_stage, stage_bytes);
        if (stage_err == cudaSuccess) {
            g_buf.h_stage_pinned = true;
        } else {
            /* The failed pinned allocation stays recorded as the runtime's
             * "last error"; clear it so a later cudaGetLastError() check does
             * not blame an unrelated call for a failure we recovered from. */
            (void)cudaGetLastError();

            g_buf.h_stage = (double *)malloc(stage_bytes);
            g_buf.h_stage_pinned = false;
            if (!g_buf.h_stage) {
                fprintf(stderr, "Host stage allocation failed (%zu bytes)\n", stage_bytes);
                exit(EXIT_FAILURE);
            }
        }

        g_buf.cap_all = all;
        g_buf.cap_fh2_size = fh2_size;
        g_buf.cap_fh3_size = fh3_size;
        g_buf.initialized = true;
    }

    /* Slots are packed back to back using the current grid size as stride. */
    for (int s = 0; s < NUM_USED_SLOTS; ++s)
        g_buf.slot[s] = g_buf.d_mem + s * all;

    g_buf.prev_nx = nx;
    g_buf.prev_ny = ny;
    g_buf.prev_nz = nz;
}
|
|
|
|
/*
 * Look up (creating if necessary) the step context for `block_tag` and make
 * sure its device storage can hold `all` grid points.
 *
 * If the context's capacity is too small, its current storage is returned to
 * the pool and replaced by a pooled allocation that is large enough or, if
 * none exists, by freshly cudaMalloc'ed buffers. Attaching new storage resets
 * the context's readiness flags and bank bookkeeping.
 *
 * On every call the per-field pointers are recomputed with a stride of `all`
 * doubles, and the "current state" aliases are refreshed when a bank is
 * selected.
 *
 * Returns a reference to the context stored in g_step_ctx.
 */
static StepContext &ensure_step_ctx(void *block_tag, size_t all)
{
    StepContext &ctx = g_step_ctx[block_tag];

    if (ctx.cap_all < all) {
        /* Hand the undersized storage back to the pool. */
        StepAllocation retired = detach_step_allocation(ctx);
        recycle_step_allocation(retired);

        StepAllocation fresh = acquire_step_allocation(all);
        if (!has_step_allocation(fresh)) {
            /* Nothing suitable pooled: allocate a complete new set. */
            const size_t state_bytes  = BSSN_RESIDENT_STATE_CAPACITY * all * sizeof(double);
            const size_t matter_bytes = BSSN_MATTER_COUNT * all * sizeof(double);
            const size_t em_bytes     = BSSN_EM_SOURCE_COUNT * all * sizeof(double);

            CUDA_CHECK(cudaMalloc(&fresh.d_state0_mem, state_bytes));
            CUDA_CHECK(cudaMalloc(&fresh.d_accum_mem, state_bytes));
            for (int bank = 0; bank < BSSN_RESIDENT_BANK_COUNT; ++bank) {
                CUDA_CHECK(cudaMalloc(&fresh.d_resident_mem[bank], state_bytes));
            }
            CUDA_CHECK(cudaMalloc(&fresh.d_matter_mem, matter_bytes));
            CUDA_CHECK(cudaMalloc(&fresh.d_em_source_mem, em_bytes));
            fresh.cap_all = all;
        }
        attach_step_allocation(ctx, fresh);
    }

    /* Field f of every state-sized buffer starts f * all doubles into it. */
    for (int f = 0; f < BSSN_RESIDENT_STATE_CAPACITY; ++f) {
        const size_t offset = (size_t)f * all;
        ctx.d_state0[f] = ctx.d_state0_mem + offset;
        ctx.d_accum[f]  = ctx.d_accum_mem + offset;
        for (int bank = 0; bank < BSSN_RESIDENT_BANK_COUNT; ++bank) {
            ctx.d_resident[bank][f] = ctx.d_resident_mem[bank] + offset;
        }
    }

    /* Keep the "current" aliases in step with the selected bank, if any. */
    if (ctx.current_bank >= 0) {
        ctx.d_state_curr_mem = ctx.d_resident_mem[ctx.current_bank];
        ctx.d_state_curr     = ctx.d_resident[ctx.current_bank];
    }

    for (int m = 0; m < BSSN_MATTER_COUNT; ++m) {
        ctx.d_matter[m] = ctx.d_matter_mem + (size_t)m * all;
    }
    for (int e = 0; e < BSSN_EM_SOURCE_COUNT; ++e) {
        ctx.d_em_source[e] = ctx.d_em_source_mem + (size_t)e * all;
    }

    return ctx;
}
|
|
|
|
/*
 * Drop the StepContext registered under block_tag, if there is one.
 *
 * The context's step allocation is detached and handed to
 * recycle_step_allocation(). The communication buffers created by
 * ensure_step_comm_buffer() / ensure_step_host_comm_buffer() live directly on
 * the context rather than in the StepAllocation, so they are released here
 * before the map entry is erased; otherwise they would be lost with it.
 * NOTE(review): this assumes StepContext has no destructor that already frees
 * them. The pointers are nulled after freeing, so a null-checking destructor
 * remains safe either way.
 *
 * Free failures are reported but not fatal, so teardown stays best-effort.
 */
static void release_step_ctx(void *block_tag)
{
    auto it = g_step_ctx.find(block_tag);
    if (it == g_step_ctx.end()) return;

    StepContext &ctx = it->second;

    StepAllocation alloc = detach_step_allocation(ctx);
    recycle_step_allocation(alloc);

    /* Device-side communication buffer. */
    if (ctx.d_comm_mem) {
        const cudaError_t err = cudaFree(ctx.d_comm_mem);
        if (err != cudaSuccess) {
            fprintf(stderr, "release_step_ctx: cudaFree failed: %s\n",
                    cudaGetErrorString(err));
        }
        ctx.d_comm_mem = nullptr;
    }
    ctx.cap_comm = 0;

    /* Host-side communication buffer: pinned or plain malloc. */
    if (ctx.h_comm_mem) {
        if (ctx.h_comm_pinned) {
            const cudaError_t err = cudaFreeHost(ctx.h_comm_mem);
            if (err != cudaSuccess) {
                fprintf(stderr, "release_step_ctx: cudaFreeHost failed: %s\n",
                        cudaGetErrorString(err));
            }
        } else {
            free(ctx.h_comm_mem);
        }
        ctx.h_comm_mem = nullptr;
        ctx.h_comm_pinned = false;
    }
    ctx.cap_h_comm = 0;

    g_step_ctx.erase(it);
}
|
|
|
|
/*
 * Return the context's device communication buffer, growing it so that it
 * holds at least needed_doubles doubles.
 *
 * A request for zero doubles returns nullptr and leaves the context untouched.
 * Growing frees the previous buffer first; its contents are not preserved.
 */
static double *ensure_step_comm_buffer(StepContext &ctx, size_t needed_doubles)
{
    if (needed_doubles == 0) return nullptr;

    /* Current buffer is already large enough. */
    if (ctx.cap_comm >= needed_doubles) return ctx.d_comm_mem;

    if (ctx.d_comm_mem != nullptr) {
        CUDA_CHECK(cudaFree(ctx.d_comm_mem));
        ctx.d_comm_mem = nullptr;
    }
    CUDA_CHECK(cudaMalloc(&ctx.d_comm_mem, needed_doubles * sizeof(double)));
    ctx.cap_comm = needed_doubles;
    return ctx.d_comm_mem;
}
|
|
|
|
/*
 * Return the context's host communication buffer, growing it so that it holds
 * at least needed_doubles doubles.
 *
 * Pinned memory (cudaMallocHost) is preferred. If that fails the buffer falls
 * back to plain malloc(), and ctx.h_comm_pinned records which allocator owns
 * the pointer so that it is released with the matching free routine.
 *
 * A request for zero doubles returns nullptr and leaves the context untouched.
 * Growing frees the previous buffer first; its contents are not preserved.
 * Exits the process if neither allocator can satisfy the request.
 */
static double *ensure_step_host_comm_buffer(StepContext &ctx, size_t needed_doubles)
{
    if (needed_doubles == 0) return nullptr;
    if (ctx.cap_h_comm >= needed_doubles) return ctx.h_comm_mem;

    if (ctx.h_comm_mem) {
        if (ctx.h_comm_pinned) {
            CUDA_CHECK(cudaFreeHost(ctx.h_comm_mem));
        } else {
            free(ctx.h_comm_mem);
        }
        ctx.h_comm_mem = nullptr;
        ctx.h_comm_pinned = false;
        /* Keep the recorded capacity in step with the (now absent) buffer. */
        ctx.cap_h_comm = 0;
    }

    const size_t bytes = needed_doubles * sizeof(double);
    const cudaError_t err = cudaMallocHost((void **)&ctx.h_comm_mem, bytes);
    if (err == cudaSuccess) {
        ctx.h_comm_pinned = true;
    } else {
        /*
         * The failed runtime call is remembered as the thread's "last error".
         * Clear it, otherwise the next cudaGetLastError() issued after an
         * unrelated kernel launch would report this stale allocation failure
         * even though we recover from it here.
         */
        (void)cudaGetLastError();

        ctx.h_comm_mem = (double *)malloc(bytes);
        ctx.h_comm_pinned = false;
        if (!ctx.h_comm_mem) {
            fprintf(stderr, "Host comm allocation failed (%zu bytes)\n", bytes);
            exit(EXIT_FAILURE);
        }
    }
    ctx.cap_h_comm = needed_doubles;
    return ctx.h_comm_mem;
}
|
|
|
|
/*
 * Return the file-global device buffer g_comm_segment_meta, growing it so that
 * it holds at least needed_ints ints.
 *
 * A request for zero ints returns nullptr without touching the buffer.
 * Growing frees the previous buffer first; its contents are not preserved.
 */
static int *ensure_comm_segment_meta_buffer(size_t needed_ints)
{
    if (needed_ints == 0) return nullptr;

    /* Existing capacity suffices. */
    if (g_comm_segment_meta_cap >= needed_ints) return g_comm_segment_meta;

    if (g_comm_segment_meta != nullptr) {
        CUDA_CHECK(cudaFree(g_comm_segment_meta));
        g_comm_segment_meta = nullptr;
    }
    CUDA_CHECK(cudaMalloc(&g_comm_segment_meta, needed_ints * sizeof(int)));
    g_comm_segment_meta_cap = needed_ints;
    return g_comm_segment_meta;
}
|
|
|
|
/*
 * Upload the per-field symmetry factors to the device symbol d_comm_state_soa.
 *
 * The symbol holds three doubles for each of BSSN_RESIDENT_STATE_CAPACITY
 * slots. The first min(state_count, capacity) triples are taken from
 * state_soa; every remaining entry is 1.0. If state_soa is null, or
 * state_count is not positive, the whole table is 1.0.
 */
static void upload_comm_state_soa(const double *state_soa, int state_count)
{
    constexpr int kEntries = 3 * BSSN_RESIDENT_STATE_CAPACITY;
    double host_table[kEntries];

    /* Number of leading triples that come from the caller. */
    int provided = 0;
    if (state_soa != nullptr) {
        provided = (state_count < BSSN_RESIDENT_STATE_CAPACITY)
                       ? state_count
                       : BSSN_RESIDENT_STATE_CAPACITY;
        if (provided < 0) provided = 0;
    }

    const int copied = 3 * provided;
    for (int e = 0; e < kEntries; ++e) {
        host_table[e] = (e < copied) ? state_soa[e] : 1.0;
    }

    CUDA_CHECK(cudaMemcpyToSymbol(d_comm_state_soa, host_table, sizeof(host_table)));
}
|
|
|
|
/*
 * Copy gp to the device symbol d_gp unless the host-side cache shows that a
 * byte-identical GridParams was the last one uploaded.
 *
 * The comparison is a raw memcmp over the whole struct; a mismatch only costs
 * one extra upload.
 */
static void upload_grid_params_if_needed(const GridParams &gp)
{
    const bool already_uploaded =
        g_gp_host_cache_valid &&
        std::memcmp(&g_gp_host_cache, &gp, sizeof(GridParams)) == 0;
    if (already_uploaded) return;

    CUDA_CHECK(cudaMemcpyToSymbol(d_gp, &gp, sizeof(GridParams)));
    g_gp_host_cache = gp;
    g_gp_host_cache_valid = true;
}
|
|
|
|
/* ================================================================== */
|
|
/* A. Symmetry boundary kernels (ord=2 and ord=3) */
|
|
/* ================================================================== */
|
|
|
|
/*
 * Step 1 (ord=2): copy the interior field `func` (ex[0] x ex[1] x ex[2],
 * x fastest) into the ghost-padded array `fh` (row length fh2_nx, plane
 * height fh2_ny). Zero-based interior point (i0,j0,k0) lands at storage
 * position (i0+2, j0+2, k0+2).
 *
 * Grid-stride loop over d_gp.all points; any 1-D launch shape is valid.
 * SoA0/SoA1/SoA2 are accepted but not used by this kernel.
 */
__global__ void kern_symbd_copy_interior_ord2(const double * __restrict__ func,
                                              double * __restrict__ fh,
                                              double SoA0, double SoA1, double SoA2)
{
    const int nx = d_gp.ex[0], ny = d_gp.ex[1];
    const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny;

    for (int p = blockIdx.x * blockDim.x + threadIdx.x;
         p < d_gp.all;
         p += blockDim.x * gridDim.x)
    {
        /* Decompose the flat interior index. */
        const int i0 = p % nx;
        const int j0 = (p / nx) % ny;
        const int k0 = p / (nx * ny);

        /* Storage coordinates inside the padded array. */
        const int si = i0 + 2;
        const int sj = j0 + 2;
        const int sk = k0 + 2;

        fh[si + sj * fnx + sk * fnx * fny] = func[p];
    }
}
|
|
|
|
/*
 * Fused symmetry pack (ord=2): build the whole padded array `fh`
 * (fh2_nx x fh2_ny x fh2_nz) from the interior field `func` in one pass.
 *
 * Each padded cell maps to a Fortran-style index g = storage - 1 per axis.
 * Cells with g <= 0 are ghosts: they read the mirrored interior point 1 - g
 * and pick up that axis' symmetry factor (SoA0 for x, SoA1 for y, SoA2 for z).
 * Factors multiply when a cell is a ghost on several axes.
 *
 * Grid-stride loop over every padded cell.
 */
__global__ void kern_symbd_pack_ord2(const double * __restrict__ func,
                                     double * __restrict__ fh,
                                     double SoA0, double SoA1, double SoA2)
{
    const int nx = d_gp.ex[0], ny = d_gp.ex[1];
    const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny, fnz = d_gp.fh2_nz;
    const int padded_cells = fnx * fny * fnz;

    for (int cell = blockIdx.x * blockDim.x + threadIdx.x;
         cell < padded_cells;
         cell += blockDim.x * gridDim.x)
    {
        /* Fortran-style coordinates of this padded cell. */
        const int gi = cell % fnx - 1;
        const int gj = (cell / fnx) % fny - 1;
        const int gk = cell / (fnx * fny) - 1;

        /* Mirror ghosts back into the interior, accumulating the parity. */
        double sign = 1.0;
        int mi = gi, mj = gj, mk = gk;
        if (gi <= 0) { mi = 1 - gi; sign *= SoA0; }
        if (gj <= 0) { mj = 1 - gj; sign *= SoA1; }
        if (gk <= 0) { mk = 1 - gk; sign *= SoA2; }

        const int src = (mi - 1) + (mj - 1) * nx + (mk - 1) * nx * ny;
        fh[cell] = sign * func[src];
    }
}
|
|
|
|
/*
 * Step 2 (ord=2): fill the two low-x ghost layers of `fh` by reflection.
 *
 * In Fortran-style indices, for n = 0,1 and interior jF = 1..ny, kF = 1..nz:
 *     fh(-n, jF, kF) = fh(n + 1, jF, kF) * SoA0
 * The storage position is the Fortran index + 1 on every axis.
 *
 * Grid-stride loop over 2 * ny * nz ghost cells.
 */
__global__ void kern_symbd_ighost_ord2(double * __restrict__ fh, double SoA0)
{
    const int ny = d_gp.ex[1], nz = d_gp.ex[2];
    const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny;
    const int plane = ny * nz;   /* interior (j,k) pairs per ghost layer */

    for (int t = blockIdx.x * blockDim.x + threadIdx.x;
         t < plane * 2;
         t += blockDim.x * gridDim.x)
    {
        const int layer = t / plane;          /* 0 or 1 */
        const int cell  = t % plane;
        const int sj = cell % ny + 2;         /* storage j of jF = j0 + 1 */
        const int sk = cell / ny + 2;         /* storage k of kF = k0 + 1 */

        const int dst_i = 1 - layer;          /* storage of iF = -layer    */
        const int src_i = layer + 2;          /* storage of iF = layer + 1 */

        fh[dst_i + sj * fnx + sk * fnx * fny] =
            fh[src_i + sj * fnx + sk * fnx * fny] * SoA0;
    }
}
|
|
|
|
/*
 * Step 3 (ord=2): fill the two low-y ghost layers of `fh` by reflection.
 *
 * In Fortran-style indices, for n = 0,1, iF = -1..nx (x ghosts included) and
 * interior kF = 1..nz:
 *     fh(iF, -n, kF) = fh(iF, n + 1, kF) * SoA1
 * The storage position is the Fortran index + 1 on every axis.
 *
 * Grid-stride loop over 2 * (nx + 2) * nz ghost cells.
 */
__global__ void kern_symbd_jghost_ord2(double * __restrict__ fh, double SoA1)
{
    const int nx = d_gp.ex[0], nz = d_gp.ex[2];
    const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny;
    const int span_i = nx + 2;           /* iF = -1 .. nx */
    const int plane  = span_i * nz;

    for (int t = blockIdx.x * blockDim.x + threadIdx.x;
         t < plane * 2;
         t += blockDim.x * gridDim.x)
    {
        const int layer = t / plane;     /* 0 or 1 */
        const int cell  = t % plane;
        const int si = cell % span_i;    /* storage i (= iF + 1)     */
        const int sk = cell / span_i + 2;/* storage k of kF = k0 + 1 */

        const int dst_j = 1 - layer;     /* storage of jF = -layer    */
        const int src_j = layer + 2;     /* storage of jF = layer + 1 */

        fh[si + dst_j * fnx + sk * fnx * fny] =
            fh[si + src_j * fnx + sk * fnx * fny] * SoA1;
    }
}
|
|
|
|
/*
 * Step 4 (ord=2): fill the two low-z ghost layers of `fh` by reflection.
 *
 * In Fortran-style indices, for n = 0,1, iF = -1..nx and jF = -1..ny
 * (x and y ghosts included):
 *     fh(iF, jF, -n) = fh(iF, jF, n + 1) * SoA2
 * The storage position is the Fortran index + 1 on every axis.
 *
 * Grid-stride loop over 2 * (nx + 2) * (ny + 2) ghost cells.
 */
__global__ void kern_symbd_kghost_ord2(double * __restrict__ fh, double SoA2)
{
    const int nx = d_gp.ex[0], ny = d_gp.ex[1];
    const int fnx = d_gp.fh2_nx, fny = d_gp.fh2_ny;
    const int span_i = nx + 2;           /* iF = -1 .. nx */
    const int span_j = ny + 2;           /* jF = -1 .. ny */
    const int plane  = span_i * span_j;

    for (int t = blockIdx.x * blockDim.x + threadIdx.x;
         t < plane * 2;
         t += blockDim.x * gridDim.x)
    {
        const int layer = t / plane;     /* 0 or 1 */
        const int cell  = t % plane;
        const int si = cell % span_i;    /* storage i (= iF + 1) */
        const int sj = cell / span_i;    /* storage j (= jF + 1) */

        const int dst_k = 1 - layer;     /* storage of kF = -layer    */
        const int src_k = layer + 2;     /* storage of kF = layer + 1 */

        fh[si + sj * fnx + dst_k * fnx * fny] =
            fh[si + sj * fnx + src_k * fnx * fny] * SoA2;
    }
}
|
|
|
|
/* ---- ord=3 variants (for lopsided / kodis) ---- */
|
|
|
|
/*
 * ord=3 interior copy: place the interior field `func` (ex[0] x ex[1] x ex[2],
 * x fastest) into the ghost-padded array `fh` (row length fh3_nx, plane
 * height fh3_ny). Zero-based interior point (i0,j0,k0) lands at storage
 * position (i0+3, j0+3, k0+3).
 *
 * Grid-stride loop over d_gp.all points.
 */
__global__ void kern_symbd_copy_interior_ord3(const double * __restrict__ func,
                                              double * __restrict__ fh)
{
    const int nx = d_gp.ex[0], ny = d_gp.ex[1];
    const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny;

    for (int p = blockIdx.x * blockDim.x + threadIdx.x;
         p < d_gp.all;
         p += blockDim.x * gridDim.x)
    {
        /* Decompose the flat interior index. */
        const int i0 = p % nx;
        const int j0 = (p / nx) % ny;
        const int k0 = p / (nx * ny);

        /* Storage coordinates inside the padded array. */
        const int si = i0 + 3;
        const int sj = j0 + 3;
        const int sk = k0 + 3;

        fh[si + sj * fnx + sk * fnx * fny] = func[p];
    }
}
|
|
|
|
/*
 * Fused symmetry pack (ord=3): build the whole padded array `fh`
 * (fh3_nx x fh3_ny x fh3_nz) from the interior field `func` in one pass.
 *
 * Each padded cell maps to a Fortran-style index g = storage - 2 per axis.
 * Cells with g <= 0 are ghosts: they read the mirrored interior point 1 - g
 * and pick up that axis' symmetry factor (SoA0 for x, SoA1 for y, SoA2 for z).
 * Factors multiply when a cell is a ghost on several axes.
 *
 * Grid-stride loop over every padded cell.
 */
__global__ void kern_symbd_pack_ord3(const double * __restrict__ func,
                                     double * __restrict__ fh,
                                     double SoA0, double SoA1, double SoA2)
{
    const int nx = d_gp.ex[0], ny = d_gp.ex[1];
    const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny, fnz = d_gp.fh3_nz;
    const int padded_cells = fnx * fny * fnz;

    for (int cell = blockIdx.x * blockDim.x + threadIdx.x;
         cell < padded_cells;
         cell += blockDim.x * gridDim.x)
    {
        /* Fortran-style coordinates of this padded cell. */
        const int gi = cell % fnx - 2;
        const int gj = (cell / fnx) % fny - 2;
        const int gk = cell / (fnx * fny) - 2;

        /* Mirror ghosts back into the interior, accumulating the parity. */
        double sign = 1.0;
        int mi = gi, mj = gj, mk = gk;
        if (gi <= 0) { mi = 1 - gi; sign *= SoA0; }
        if (gj <= 0) { mj = 1 - gj; sign *= SoA1; }
        if (gk <= 0) { mk = 1 - gk; sign *= SoA2; }

        const int src = (mi - 1) + (mj - 1) * nx + (mk - 1) * nx * ny;
        fh[cell] = sign * func[src];
    }
}
|
|
|
|
/*
 * ord=3: fill the three low-x ghost layers of `fh` by reflection.
 *
 * In Fortran-style indices, for n = 0,1,2 and interior jF = 1..ny, kF = 1..nz:
 *     fh(-n, jF, kF) = fh(n + 1, jF, kF) * SoA0
 * The storage position is the Fortran index + 2 on every axis.
 *
 * Grid-stride loop over 3 * ny * nz ghost cells.
 */
__global__ void kern_symbd_ighost_ord3(double * __restrict__ fh, double SoA0)
{
    const int ny = d_gp.ex[1], nz = d_gp.ex[2];
    const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny;
    const int plane = ny * nz;   /* interior (j,k) pairs per ghost layer */

    for (int t = blockIdx.x * blockDim.x + threadIdx.x;
         t < plane * 3;
         t += blockDim.x * gridDim.x)
    {
        const int layer = t / plane;          /* 0, 1 or 2 */
        const int cell  = t % plane;
        const int sj = cell % ny + 3;         /* storage j of jF = j0 + 1 */
        const int sk = cell / ny + 3;         /* storage k of kF = k0 + 1 */

        const int dst_i = 2 - layer;          /* storage of iF = -layer    */
        const int src_i = layer + 3;          /* storage of iF = layer + 1 */

        fh[dst_i + sj * fnx + sk * fnx * fny] =
            fh[src_i + sj * fnx + sk * fnx * fny] * SoA0;
    }
}
|
|
|
|
/*
 * ord=3: fill the three low-y ghost layers of `fh` by reflection.
 *
 * In Fortran-style indices, for n = 0,1,2, iF = -2..nx (x ghosts included)
 * and interior kF = 1..nz:
 *     fh(iF, -n, kF) = fh(iF, n + 1, kF) * SoA1
 * The storage position is the Fortran index + 2 on every axis.
 *
 * Grid-stride loop over 3 * (nx + 3) * nz ghost cells.
 */
__global__ void kern_symbd_jghost_ord3(double * __restrict__ fh, double SoA1)
{
    const int nx = d_gp.ex[0], nz = d_gp.ex[2];
    const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny;
    const int span_i = nx + 3;           /* iF = -2 .. nx */
    const int plane  = span_i * nz;

    for (int t = blockIdx.x * blockDim.x + threadIdx.x;
         t < plane * 3;
         t += blockDim.x * gridDim.x)
    {
        const int layer = t / plane;     /* 0, 1 or 2 */
        const int cell  = t % plane;
        const int si = cell % span_i;    /* storage i (= iF + 2)     */
        const int sk = cell / span_i + 3;/* storage k of kF = k0 + 1 */

        const int dst_j = 2 - layer;     /* storage of jF = -layer    */
        const int src_j = layer + 3;     /* storage of jF = layer + 1 */

        fh[si + dst_j * fnx + sk * fnx * fny] =
            fh[si + src_j * fnx + sk * fnx * fny] * SoA1;
    }
}
|
|
|
|
/*
 * ord=3: fill the three low-z ghost layers of `fh` by reflection.
 *
 * In Fortran-style indices, for n = 0,1,2, iF = -2..nx and jF = -2..ny
 * (x and y ghosts included):
 *     fh(iF, jF, -n) = fh(iF, jF, n + 1) * SoA2
 * The storage position is the Fortran index + 2 on every axis.
 *
 * Grid-stride loop over 3 * (nx + 3) * (ny + 3) ghost cells.
 */
__global__ void kern_symbd_kghost_ord3(double * __restrict__ fh, double SoA2)
{
    const int nx = d_gp.ex[0], ny = d_gp.ex[1];
    const int fnx = d_gp.fh3_nx, fny = d_gp.fh3_ny;
    const int span_i = nx + 3;           /* iF = -2 .. nx */
    const int span_j = ny + 3;           /* jF = -2 .. ny */
    const int plane  = span_i * span_j;

    for (int t = blockIdx.x * blockDim.x + threadIdx.x;
         t < plane * 3;
         t += blockDim.x * gridDim.x)
    {
        const int layer = t / plane;     /* 0, 1 or 2 */
        const int cell  = t % plane;
        const int si = cell % span_i;    /* storage i (= iF + 2) */
        const int sj = cell / span_i;    /* storage j (= jF + 2) */

        const int dst_k = 2 - layer;     /* storage of kF = -layer    */
        const int src_k = layer + 3;     /* storage of kF = layer + 1 */

        fh[si + sj * fnx + dst_k * fnx * fny] =
            fh[si + sj * fnx + src_k * fnx * fny] * SoA2;
    }
}
|
|
|
|
/* ================================================================== */
|
|
/* B. Stencil kernels */
|
|
/* ================================================================== */
|
|
|
|
/*
 * First derivatives of the ord=2 ghost-padded field `fh`, written to the
 * interior-sized arrays fx, fy, fz.
 *
 * Per point (Fortran-style index iF = i0 + 1, likewise j and k):
 *   - points on the upper faces (i0 == nx-1, j0 == ny-1 or k0 == nz-1) get 0;
 *   - if the +/-2 neighbourhood lies inside [iminF..imaxF] etc., the 4-point
 *     stencil (f[-2] - 8 f[-1] + 8 f[+1] - f[+2]) scaled by d12dx/d12dy/d12dz
 *     is used;
 *   - otherwise, if the +/-1 neighbourhood fits, the 2-point stencil
 *     (-f[-1] + f[+1]) scaled by d2dx/d2dy/d2dz is used;
 *   - otherwise all three outputs are 0.
 *
 * Grid-stride loop over d_gp.all points; launch bounds are 128 threads/block.
 */
__global__ __launch_bounds__(128, 4)
void kern_fderivs(const double * __restrict__ fh,
                  double * __restrict__ fx,
                  double * __restrict__ fy,
                  double * __restrict__ fz)
{
    const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
    const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
    const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF;

    for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
         tid < d_gp.all;
         tid += blockDim.x * gridDim.x)
    {
        const int i0 = tid % nx;
        const int j0 = (tid / nx) % ny;
        const int k0 = tid / (nx * ny);

        /* Upper-face points carry no derivative. */
        if (i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2) {
            fx[tid] = 0.0;
            fy[tid] = 0.0;
            fz[tid] = 0.0;
            continue;
        }

        const int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1;

        if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
            (jF + 2) <= jmaxF && (jF - 2) >= jminF &&
            (kF + 2) <= kmaxF && (kF - 2) >= kminF)
        {
            /* Wide stencil: gather the four neighbours along each axis. */
            const double xm2 = fh[idx_fh2(iF - 2, jF, kF)];
            const double xm1 = fh[idx_fh2(iF - 1, jF, kF)];
            const double xp1 = fh[idx_fh2(iF + 1, jF, kF)];
            const double xp2 = fh[idx_fh2(iF + 2, jF, kF)];

            const double ym2 = fh[idx_fh2(iF, jF - 2, kF)];
            const double ym1 = fh[idx_fh2(iF, jF - 1, kF)];
            const double yp1 = fh[idx_fh2(iF, jF + 1, kF)];
            const double yp2 = fh[idx_fh2(iF, jF + 2, kF)];

            const double zm2 = fh[idx_fh2(iF, jF, kF - 2)];
            const double zm1 = fh[idx_fh2(iF, jF, kF - 1)];
            const double zp1 = fh[idx_fh2(iF, jF, kF + 1)];
            const double zp2 = fh[idx_fh2(iF, jF, kF + 2)];

            fx[tid] = d_gp.d12dx * (xm2 - 8.0 * xm1 + 8.0 * xp1 - xp2);
            fy[tid] = d_gp.d12dy * (ym2 - 8.0 * ym1 + 8.0 * yp1 - yp2);
            fz[tid] = d_gp.d12dz * (zm2 - 8.0 * zm1 + 8.0 * zp1 - zp2);
        }
        else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
                 (jF + 1) <= jmaxF && (jF - 1) >= jminF &&
                 (kF + 1) <= kmaxF && (kF - 1) >= kminF)
        {
            /* Narrow stencil: nearest neighbours only. */
            const double xm1 = fh[idx_fh2(iF - 1, jF, kF)];
            const double xp1 = fh[idx_fh2(iF + 1, jF, kF)];
            const double ym1 = fh[idx_fh2(iF, jF - 1, kF)];
            const double yp1 = fh[idx_fh2(iF, jF + 1, kF)];
            const double zm1 = fh[idx_fh2(iF, jF, kF - 1)];
            const double zp1 = fh[idx_fh2(iF, jF, kF + 1)];

            fx[tid] = d_gp.d2dx * (-xm1 + xp1);
            fy[tid] = d_gp.d2dy * (-ym1 + yp1);
            fz[tid] = d_gp.d2dz * (-zm1 + zp1);
        }
        else {
            fx[tid] = 0.0;
            fy[tid] = 0.0;
            fz[tid] = 0.0;
        }
    }
}
|
|
|
|
/*
 * Second derivatives of the ord=2 ghost-padded field `fh`, written to the
 * interior-sized arrays fxx, fxy, fxz, fyy, fyz, fzz.
 *
 * Per point (Fortran-style index iF = i0 + 1, likewise j and k):
 *   - points on the upper faces (i0 == nx-1, j0 == ny-1 or k0 == nz-1) get 0;
 *   - if the +/-2 neighbourhood lies inside [iminF..imaxF] etc., the wide
 *     stencils are used: (-1, 16, -30, 16, -1) for the pure derivatives
 *     (scaled by Fdxdx/Fdydy/Fdzdz) and the (1, -8, 8, -1) stencil applied
 *     along both axes for the mixed ones (scaled by Fdxdy/Fdxdz/Fdydz);
 *   - otherwise, if the +/-1 neighbourhood fits, the narrow stencils scaled
 *     by Sdxdx.. / Sdxdy.. are used;
 *   - otherwise all six outputs are 0.
 *
 * Grid-stride loop over d_gp.all points; launch bounds are 128 threads/block.
 */

/* Value of fh at an offset from the current point (function-local helper). */
#define FDD_AT(di, dj, dk) fh[idx_fh2(iF + (di), jF + (dj), kF + (dk))]
/* (1,-8,8,-1) stencil along x, evaluated on the row offset by (dj, dk). */
#define FDD_ROW_X(dj, dk) \
    (FDD_AT(-2, dj, dk) - 8.0*FDD_AT(-1, dj, dk) + 8.0*FDD_AT(1, dj, dk) - FDD_AT(2, dj, dk))
/* (1,-8,8,-1) stencil along y, evaluated on the column offset by dk. */
#define FDD_ROW_Y(dk) \
    (FDD_AT(0, -2, dk) - 8.0*FDD_AT(0, -1, dk) + 8.0*FDD_AT(0, 1, dk) - FDD_AT(0, 2, dk))

__global__ __launch_bounds__(128, 4)
void kern_fdderivs(const double * __restrict__ fh,
                   double * __restrict__ fxx, double * __restrict__ fxy,
                   double * __restrict__ fxz, double * __restrict__ fyy,
                   double * __restrict__ fyz, double * __restrict__ fzz)
{
    const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
    const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
    const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF;

    for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
         tid < d_gp.all;
         tid += blockDim.x * gridDim.x)
    {
        const int i0 = tid % nx;
        const int j0 = (tid / nx) % ny;
        const int k0 = tid / (nx * ny);

        /* Upper-face points carry no derivative. */
        if (i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2) {
            fxx[tid] = 0; fxy[tid] = 0; fxz[tid] = 0;
            fyy[tid] = 0; fyz[tid] = 0; fzz[tid] = 0;
            continue;
        }

        const int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1;

        if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
            (jF + 2) <= jmaxF && (jF - 2) >= jminF &&
            (kF + 2) <= kmaxF && (kF - 2) >= kminF)
        {
            /* Pure second derivatives, wide stencil. */
            const double centre = FDD_AT(0, 0, 0);
            fxx[tid] = d_gp.Fdxdx * (
                -FDD_AT(-2, 0, 0) + 16.0*FDD_AT(-1, 0, 0)
                -30.0*centre + 16.0*FDD_AT(1, 0, 0) - FDD_AT(2, 0, 0));
            fyy[tid] = d_gp.Fdydy * (
                -FDD_AT(0, -2, 0) + 16.0*FDD_AT(0, -1, 0)
                -30.0*centre + 16.0*FDD_AT(0, 1, 0) - FDD_AT(0, 2, 0));
            fzz[tid] = d_gp.Fdzdz * (
                -FDD_AT(0, 0, -2) + 16.0*FDD_AT(0, 0, -1)
                -30.0*centre + 16.0*FDD_AT(0, 0, 1) - FDD_AT(0, 0, 2));

            /* fxy: x-stencil on four neighbouring y rows, combined along y. */
            {
                const double r_m2 = FDD_ROW_X(-2, 0);
                const double r_m1 = FDD_ROW_X(-1, 0);
                const double r_p1 = FDD_ROW_X(1, 0);
                const double r_p2 = FDD_ROW_X(2, 0);
                fxy[tid] = d_gp.Fdxdy * (r_m2 - 8.0*r_m1 + 8.0*r_p1 - r_p2);
            }
            /* fxz: x-stencil on four neighbouring z planes, combined along z. */
            {
                const double r_m2 = FDD_ROW_X(0, -2);
                const double r_m1 = FDD_ROW_X(0, -1);
                const double r_p1 = FDD_ROW_X(0, 1);
                const double r_p2 = FDD_ROW_X(0, 2);
                fxz[tid] = d_gp.Fdxdz * (r_m2 - 8.0*r_m1 + 8.0*r_p1 - r_p2);
            }
            /* fyz: y-stencil on four neighbouring z planes, combined along z. */
            {
                const double r_m2 = FDD_ROW_Y(-2);
                const double r_m1 = FDD_ROW_Y(-1);
                const double r_p1 = FDD_ROW_Y(1);
                const double r_p2 = FDD_ROW_Y(2);
                fyz[tid] = d_gp.Fdydz * (r_m2 - 8.0*r_m1 + 8.0*r_p1 - r_p2);
            }
        }
        else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
                 (jF + 1) <= jmaxF && (jF - 1) >= jminF &&
                 (kF + 1) <= kmaxF && (kF - 1) >= kminF)
        {
            /* Narrow stencils: nearest neighbours only. */
            const double centre = FDD_AT(0, 0, 0);
            fxx[tid] = d_gp.Sdxdx * (FDD_AT(-1, 0, 0) - 2.0*centre + FDD_AT(1, 0, 0));
            fyy[tid] = d_gp.Sdydy * (FDD_AT(0, -1, 0) - 2.0*centre + FDD_AT(0, 1, 0));
            fzz[tid] = d_gp.Sdzdz * (FDD_AT(0, 0, -1) - 2.0*centre + FDD_AT(0, 0, 1));
            fxy[tid] = d_gp.Sdxdy * (FDD_AT(-1, -1, 0) - FDD_AT(1, -1, 0)
                                    -FDD_AT(-1, 1, 0) + FDD_AT(1, 1, 0));
            fxz[tid] = d_gp.Sdxdz * (FDD_AT(-1, 0, -1) - FDD_AT(1, 0, -1)
                                    -FDD_AT(-1, 0, 1) + FDD_AT(1, 0, 1));
            fyz[tid] = d_gp.Sdydz * (FDD_AT(0, -1, -1) - FDD_AT(0, 1, -1)
                                    -FDD_AT(0, -1, 1) + FDD_AT(0, 1, 1));
        }
        else {
            fxx[tid] = 0; fxy[tid] = 0; fxz[tid] = 0;
            fyy[tid] = 0; fyz[tid] = 0; fzz[tid] = 0;
        }
    }
}

#undef FDD_ROW_Y
#undef FDD_ROW_X
#undef FDD_AT
|
|
|
|
/*
 * Lopsided (upwind) advection: adds Sfx*d/dx + Sfy*d/dy + Sfz*d/dz of the
 * ord=3 ghost-padded field `fh` to f_rhs.
 *
 * Points on the upper faces (i0 == nx-1, j0 == ny-1 or k0 == nz-1) are left
 * untouched. For every other point, each axis is handled independently by the
 * sign of its speed:
 *   speed > 0: forward-biased 5-point stencil when three points exist above,
 *              else the centred (1,-8,8,-1) stencil, else the backward-biased
 *              stencil;
 *   speed < 0: backward-biased stencil when three points exist below (w.r.t.
 *              iminF3/jminF3/kminF3), else centred, else forward-biased;
 *   speed == 0: no contribution.
 * All stencils are scaled by d12dx / d12dy / d12dz.
 *
 * fh is only read at the offsets of the stencil actually selected, which is
 * what keeps the one-sided cases inside the padded array.
 *
 * Grid-stride loop over d_gp.all points; launch bounds are 128 threads/block.
 */

/* fh at an offset `o` along one axis from the current point. */
#define LOPS_AT_X(o) fh[idx_fh3(iF + (o), jF, kF)]
#define LOPS_AT_Y(o) fh[idx_fh3(iF, jF + (o), kF)]
#define LOPS_AT_Z(o) fh[idx_fh3(iF, jF, kF + (o))]
/* One-sided stencil; s = +1 is forward-biased, s = -1 backward-biased. */
#define LOPS_BIASED(AT, s) \
    (-3.0*AT(-(s)) - 10.0*AT(0) + 18.0*AT(s) - 6.0*AT(2*(s)) + AT(3*(s)))
/* Centred (1,-8,8,-1) stencil. */
#define LOPS_CENTRED(AT) \
    (AT(-2) - 8.0*AT(-1) + 8.0*AT(1) - AT(2))

__global__ __launch_bounds__(128, 4)
void kern_lopsided(const double * __restrict__ fh,
                   double * __restrict__ f_rhs,
                   const double * __restrict__ Sfx,
                   const double * __restrict__ Sfy,
                   const double * __restrict__ Sfz)
{
    const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
    const int iminF = d_gp.iminF3, jminF = d_gp.jminF3, kminF = d_gp.kminF3;

    for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
         tid < d_gp.all;
         tid += blockDim.x * gridDim.x)
    {
        const int i0 = tid % nx;
        const int j0 = (tid / nx) % ny;
        const int k0 = tid / (nx * ny);

        /* Upper-face points receive no advection term. */
        if (i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2) continue;

        const int iF = i0 + 1, jF = j0 + 1, kF = k0 + 1;
        double val = 0.0;

        /* ---------------- x axis ---------------- */
        const double sfx = Sfx[tid];
        if (sfx > 0.0) {
            if (i0 <= nx - 4) {
                val += sfx * d_gp.d12dx * LOPS_BIASED(LOPS_AT_X, 1);
            } else if (i0 <= nx - 3) {
                val += sfx * d_gp.d12dx * LOPS_CENTRED(LOPS_AT_X);
            } else if (i0 <= nx - 2) {
                val -= sfx * d_gp.d12dx * LOPS_BIASED(LOPS_AT_X, -1);
            }
        } else if (sfx < 0.0) {
            if ((i0 - 2) >= iminF) {
                val -= sfx * d_gp.d12dx * LOPS_BIASED(LOPS_AT_X, -1);
            } else if ((i0 - 1) >= iminF) {
                val += sfx * d_gp.d12dx * LOPS_CENTRED(LOPS_AT_X);
            } else if (i0 >= iminF) {
                val += sfx * d_gp.d12dx * LOPS_BIASED(LOPS_AT_X, 1);
            }
        }

        /* ---------------- y axis ---------------- */
        const double sfy = Sfy[tid];
        if (sfy > 0.0) {
            if (j0 <= ny - 4) {
                val += sfy * d_gp.d12dy * LOPS_BIASED(LOPS_AT_Y, 1);
            } else if (j0 <= ny - 3) {
                val += sfy * d_gp.d12dy * LOPS_CENTRED(LOPS_AT_Y);
            } else if (j0 <= ny - 2) {
                val -= sfy * d_gp.d12dy * LOPS_BIASED(LOPS_AT_Y, -1);
            }
        } else if (sfy < 0.0) {
            if ((j0 - 2) >= jminF) {
                val -= sfy * d_gp.d12dy * LOPS_BIASED(LOPS_AT_Y, -1);
            } else if ((j0 - 1) >= jminF) {
                val += sfy * d_gp.d12dy * LOPS_CENTRED(LOPS_AT_Y);
            } else if (j0 >= jminF) {
                val += sfy * d_gp.d12dy * LOPS_BIASED(LOPS_AT_Y, 1);
            }
        }

        /* ---------------- z axis ---------------- */
        const double sfz = Sfz[tid];
        if (sfz > 0.0) {
            if (k0 <= nz - 4) {
                val += sfz * d_gp.d12dz * LOPS_BIASED(LOPS_AT_Z, 1);
            } else if (k0 <= nz - 3) {
                val += sfz * d_gp.d12dz * LOPS_CENTRED(LOPS_AT_Z);
            } else if (k0 <= nz - 2) {
                val -= sfz * d_gp.d12dz * LOPS_BIASED(LOPS_AT_Z, -1);
            }
        } else if (sfz < 0.0) {
            if ((k0 - 2) >= kminF) {
                val -= sfz * d_gp.d12dz * LOPS_BIASED(LOPS_AT_Z, -1);
            } else if ((k0 - 1) >= kminF) {
                val += sfz * d_gp.d12dz * LOPS_CENTRED(LOPS_AT_Z);
            } else if (k0 >= kminF) {
                val += sfz * d_gp.d12dz * LOPS_BIASED(LOPS_AT_Z, 1);
            }
        }

        f_rhs[tid] += val;
    }
}

#undef LOPS_CENTRED
#undef LOPS_BIASED
#undef LOPS_AT_Z
#undef LOPS_AT_Y
#undef LOPS_AT_X
|
|
|
|
/*
 * Kreiss-Oliger style dissipation on the ord=3 ghost-padded field `fh`.
 *
 * For every point whose +/-3 neighbourhood lies inside
 * [iminF3..imaxF] x [jminF3..jmaxF] x [kminF3..kmaxF] (Fortran-style indices),
 * adds to f_rhs:
 *     (eps_val / 64) * (Dx + Dy + Dz)
 * where each D is the 7-point stencil (1, -6, 15, -20, 15, -6, 1) along that
 * axis divided by the grid spacing dX / dY / dZ. Other points are untouched.
 *
 * Grid-stride loop over d_gp.all points; launch bounds are 128 threads/block.
 */

/* fh at an offset `o` along one axis from the current point. */
#define KODIS_AT_X(o) fh[idx_fh3(iF + (o), jF, kF)]
#define KODIS_AT_Y(o) fh[idx_fh3(iF, jF + (o), kF)]
#define KODIS_AT_Z(o) fh[idx_fh3(iF, jF, kF + (o))]
/* Undivided 7-point stencil along the axis selected by AT. */
#define KODIS_D6(AT) \
    ((AT(-3) + AT(3)) - 6.0*(AT(-2) + AT(2)) + 15.0*(AT(-1) + AT(1)) - 20.0*AT(0))

__global__ __launch_bounds__(128, 4)
void kern_kodis(const double * __restrict__ fh,
                double * __restrict__ f_rhs,
                double eps_val)
{
    const int nx = d_gp.ex[0], ny = d_gp.ex[1];
    const int iminF = d_gp.iminF3, jminF = d_gp.jminF3, kminF = d_gp.kminF3;
    const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
    const double cof = 64.0;
    const double strength = eps_val / cof;

    for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
         tid < d_gp.all;
         tid += blockDim.x * gridDim.x)
    {
        const int iF = tid % nx + 1;
        const int jF = (tid / nx) % ny + 1;
        const int kF = tid / (nx * ny) + 1;

        /* The full 7-point stencil must fit on all three axes. */
        const bool fits =
            (iF - 3) >= iminF && (iF + 3) <= imaxF &&
            (jF - 3) >= jminF && (jF + 3) <= jmaxF &&
            (kF - 3) >= kminF && (kF + 3) <= kmaxF;
        if (!fits) continue;

        const double Dx = KODIS_D6(KODIS_AT_X) / d_gp.dX;
        const double Dy = KODIS_D6(KODIS_AT_Y) / d_gp.dY;
        const double Dz = KODIS_D6(KODIS_AT_Z) / d_gp.dZ;

        f_rhs[tid] += strength * (Dx + Dy + Dz);
    }
}

#undef KODIS_D6
#undef KODIS_AT_Z
#undef KODIS_AT_Y
#undef KODIS_AT_X
|
|
|
|
/* ================================================================== */
|
|
/* Host wrapper helpers */
|
|
/* ================================================================== */
|
|
/*
 * Fixed-size per-field tables with BSSN_EM_LK_FIELD_COUNT slots.
 * NOTE(review): judging by the member names this is the argument block of a
 * batched lopsided-advection / KO-dissipation kernel; its consumer is not
 * shown next to this definition, so confirm before relying on that.
 */
struct LopsidedKodisTables {
    const double *adv_fields[BSSN_EM_LK_FIELD_COUNT]; /* read-only field pointers (advection input, presumably) */
    const double *ko_fields[BSSN_EM_LK_FIELD_COUNT];  /* read-only field pointers (dissipation input, presumably) */
    double       *rhs_fields[BSSN_EM_LK_FIELD_COUNT]; /* writable right-hand-side pointers */
    int           soa_signs[3 * BSSN_EM_LK_FIELD_COUNT]; /* three ints per field slot */
};
|
|
|
|
/*
 * Argument block for kern_fderivs_batched: one slot per field, up to
 * BSSN_STATE_COUNT fields. The kernel selects the slot with blockIdx.y.
 */
struct FDerivTables {
    const double *src_fields[BSSN_STATE_COUNT]; /* input field for each slot */
    double       *fx_fields[BSSN_STATE_COUNT];  /* output: x-derivative      */
    double       *fy_fields[BSSN_STATE_COUNT];  /* output: y-derivative      */
    double       *fz_fields[BSSN_STATE_COUNT];  /* output: z-derivative      */
    /* Symmetry factors, three per field: entries 3*f+0, 3*f+1, 3*f+2. */
    int           soa_signs[3 * BSSN_STATE_COUNT];
};
|
|
|
|
/*
 * Argument block for kern_fdderivs_batched: one slot per field, up to
 * BSSN_STATE_COUNT fields. The kernel selects the slot with blockIdx.y.
 */
struct FDDerivTables {
    const double *src_fields[BSSN_STATE_COUNT]; /* input field for each slot */
    double       *fxx_fields[BSSN_STATE_COUNT]; /* output: xx second derivative */
    double       *fxy_fields[BSSN_STATE_COUNT]; /* output: xy second derivative */
    double       *fxz_fields[BSSN_STATE_COUNT]; /* output: xz second derivative */
    double       *fyy_fields[BSSN_STATE_COUNT]; /* output: yy second derivative */
    double       *fyz_fields[BSSN_STATE_COUNT]; /* output: yz second derivative */
    double       *fzz_fields[BSSN_STATE_COUNT]; /* output: zz second derivative */
    /* Symmetry factors, three per field: entries 3*f+0, 3*f+1, 3*f+2. */
    int           soa_signs[3 * BSSN_STATE_COUNT];
};
|
|
|
|
/* Number of field slots in Phase10RicciTables. */
static constexpr int PHASE10_METRIC_FIELD_COUNT = 6;

/*
 * Fixed-size per-field tables with PHASE10_METRIC_FIELD_COUNT slots.
 * NOTE(review): the names suggest the six metric components feeding a Ricci
 * computation; the consumer is not shown next to this definition.
 */
struct Phase10RicciTables {
    const double *src_fields[PHASE10_METRIC_FIELD_COUNT]; /* read-only inputs  */
    double       *dst_fields[PHASE10_METRIC_FIELD_COUNT]; /* writable outputs  */
    int           soa_signs[3 * PHASE10_METRIC_FIELD_COUNT]; /* three ints per slot */
};
|
|
|
|
/*
 * Fixed-size per-field tables with BSSN_EM_STATE_COUNT slots.
 * NOTE(review): presumably the argument block of a batched RK4 finalize
 * kernel; the consumer is not shown next to this definition.
 */
struct Rk4FinalizeTables {
    const double *f0_fields[BSSN_EM_STATE_COUNT];    /* read-only field pointers  */
    double       *rhs_fields[BSSN_EM_STATE_COUNT];   /* writable RHS pointers     */
    double       *accum_fields[BSSN_EM_STATE_COUNT]; /* writable accumulator ptrs */
};
|
|
|
|
/*
 * Source/destination pointer pairs, one per field, with BSSN_EM_STATE_COUNT
 * slots.
 * NOTE(review): presumably used by a batched patch-boundary kernel; the
 * consumer is not shown next to this definition.
 */
struct PatchBoundaryTables {
    const double *src_fields[BSSN_EM_STATE_COUNT]; /* read-only sources      */
    double       *dst_fields[BSSN_EM_STATE_COUNT]; /* writable destinations  */
};
|
|
|
|
/*
 * Input/output pointer pairs, one per field, with BSSN_EM_STATE_COUNT slots.
 * NOTE(review): presumably used by a batched boundary kernel for the
 * escalar variant; the consumer is not shown next to this definition.
 */
struct EScalarBoundaryTables {
    const double *f0_fields[BSSN_EM_STATE_COUNT];  /* read-only inputs */
    double       *out_fields[BSSN_EM_STATE_COUNT]; /* writable outputs */
};
|
|
|
|
/* Threads per block used by the launch helpers in this file. */
static const int BLK = 128;

/*
 * Number of BLK-sized blocks needed to cover n work items.
 *
 * Returns 1 for n == 0, so the result is always usable as a grid dimension,
 * and never more than 2147483647.
 */
static inline int grid(size_t n) {
    if (n == 0) return 1;

    const size_t max_blocks = 2147483647u;
    const size_t blocks = (n + BLK - 1) / BLK;   /* ceil(n / BLK) */
    return (int)((blocks > max_blocks) ? max_blocks : blocks);
}
|
|
|
|
/*
 * Batched first derivatives: one thread per (grid point, field).
 *
 * Launch layout: blockIdx.x / threadIdx.x enumerate the d_gp.all grid points
 * (there is no grid-stride loop, so the x-grid must cover them all), and
 * blockIdx.y selects the field slot in `tables`. Threads whose field index is
 * >= field_count, or whose point index is >= d_gp.all, exit immediately.
 *
 * Points on the upper faces (i0 == nx-1, j0 == ny-1 or k0 == nz-1) get zero
 * derivatives. Every other point is delegated to fd_compute_first3(), which
 * receives the source field, the Fortran-style point index, the index bounds
 * and the field's three symmetry factors.
 */
__global__ __launch_bounds__(128, 4)
void kern_fderivs_batched(FDerivTables tables, int field_count)
{
    const int field = blockIdx.y;
    if (field >= field_count) return;

    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= d_gp.all) return;

    /* Output arrays of this field. */
    double *fx = tables.fx_fields[field];
    double *fy = tables.fy_fields[field];
    double *fz = tables.fz_fields[field];

    const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
    const int i0 = tid % nx;
    const int j0 = (tid / nx) % ny;
    const int k0 = tid / (nx * ny);

    /* Upper-face points carry no derivative. */
    if (i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2) {
        fx[tid] = 0.0;
        fy[tid] = 0.0;
        fz[tid] = 0.0;
        return;
    }

    const double *src = tables.src_fields[field];
    const int SoA0 = tables.soa_signs[3 * field + 0];
    const int SoA1 = tables.soa_signs[3 * field + 1];
    const int SoA2 = tables.soa_signs[3 * field + 2];

    const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
    const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF;

    /* Fortran-style (1-based) point index. */
    const int iF = i0 + 1;
    const int jF = j0 + 1;
    const int kF = k0 + 1;

    fd_compute_first3(src, iF, jF, kF,
                      iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
                      SoA0, SoA1, SoA2,
                      fx[tid], fy[tid], fz[tid]);
}
|
|
|
|
__global__ __launch_bounds__(128, 4)
|
|
void kern_fdderivs_batched(FDDerivTables tables, int field_count)
|
|
{
|
|
const int field = blockIdx.y;
|
|
if (field >= field_count) return;
|
|
|
|
const double *src = tables.src_fields[field];
|
|
double *fxx = tables.fxx_fields[field];
|
|
double *fxy = tables.fxy_fields[field];
|
|
double *fxz = tables.fxz_fields[field];
|
|
double *fyy = tables.fyy_fields[field];
|
|
double *fyz = tables.fyz_fields[field];
|
|
double *fzz = tables.fzz_fields[field];
|
|
const int SoA0 = tables.soa_signs[3 * field + 0];
|
|
const int SoA1 = tables.soa_signs[3 * field + 1];
|
|
const int SoA2 = tables.soa_signs[3 * field + 2];
|
|
const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
|
|
const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
|
|
const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF;
|
|
|
|
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
|
if (tid >= d_gp.all) return;
|
|
|
|
const int i0 = tid % nx;
|
|
const int j0 = (tid / nx) % ny;
|
|
const int k0 = tid / (nx * ny);
|
|
|
|
if (i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2) {
|
|
fxx[tid] = 0.0; fxy[tid] = 0.0; fxz[tid] = 0.0;
|
|
fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0;
|
|
return;
|
|
}
|
|
|
|
const int iF = i0 + 1;
|
|
const int jF = j0 + 1;
|
|
const int kF = k0 + 1;
|
|
|
|
#if ghost_width != 3
|
|
fd_compute_second6(src, iF, jF, kF,
|
|
iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
|
|
SoA0, SoA1, SoA2,
|
|
fxx[tid], fxy[tid], fxz[tid], fyy[tid], fyz[tid], fzz[tid]);
|
|
#else
|
|
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
|
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
|
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
|
{
|
|
const double c = fetch_sym_ord2_direct(src, iF, jF, kF, SoA0, SoA1, SoA2);
|
|
fxx[tid] = d_gp.Fdxdx * (
|
|
-fetch_sym_ord2_direct(src, iF - 2, jF, kF, SoA0, SoA1, SoA2)
|
|
+16.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
|
|
-30.0 * c
|
|
+16.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2)
|
|
- fetch_sym_ord2_direct(src, iF + 2, jF, kF, SoA0, SoA1, SoA2));
|
|
fyy[tid] = d_gp.Fdydy * (
|
|
-fetch_sym_ord2_direct(src, iF, jF - 2, kF, SoA0, SoA1, SoA2)
|
|
+16.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
|
|
-30.0 * c
|
|
+16.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2)
|
|
- fetch_sym_ord2_direct(src, iF, jF + 2, kF, SoA0, SoA1, SoA2));
|
|
fzz[tid] = d_gp.Fdzdz * (
|
|
-fetch_sym_ord2_direct(src, iF, jF, kF - 2, SoA0, SoA1, SoA2)
|
|
+16.0 * fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
|
|
-30.0 * c
|
|
+16.0 * fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2)
|
|
- fetch_sym_ord2_direct(src, iF, jF, kF + 2, SoA0, SoA1, SoA2));
|
|
|
|
const double t_jm2 =
|
|
fetch_sym_ord2_direct(src, iF - 2, jF - 2, kF, SoA0, SoA1, SoA2)
|
|
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF - 2, kF, SoA0, SoA1, SoA2)
|
|
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF - 2, kF, SoA0, SoA1, SoA2)
|
|
- fetch_sym_ord2_direct(src, iF + 2, jF - 2, kF, SoA0, SoA1, SoA2);
|
|
const double t_jm1 =
|
|
fetch_sym_ord2_direct(src, iF - 2, jF - 1, kF, SoA0, SoA1, SoA2)
|
|
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF - 1, kF, SoA0, SoA1, SoA2)
|
|
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF - 1, kF, SoA0, SoA1, SoA2)
|
|
- fetch_sym_ord2_direct(src, iF + 2, jF - 1, kF, SoA0, SoA1, SoA2);
|
|
const double t_jp1 =
|
|
fetch_sym_ord2_direct(src, iF - 2, jF + 1, kF, SoA0, SoA1, SoA2)
|
|
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF + 1, kF, SoA0, SoA1, SoA2)
|
|
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF + 1, kF, SoA0, SoA1, SoA2)
|
|
- fetch_sym_ord2_direct(src, iF + 2, jF + 1, kF, SoA0, SoA1, SoA2);
|
|
const double t_jp2 =
|
|
fetch_sym_ord2_direct(src, iF - 2, jF + 2, kF, SoA0, SoA1, SoA2)
|
|
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF + 2, kF, SoA0, SoA1, SoA2)
|
|
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF + 2, kF, SoA0, SoA1, SoA2)
|
|
- fetch_sym_ord2_direct(src, iF + 2, jF + 2, kF, SoA0, SoA1, SoA2);
|
|
fxy[tid] = d_gp.Fdxdy * (t_jm2 - 8.0 * t_jm1 + 8.0 * t_jp1 - t_jp2);
|
|
|
|
const double t_km2_x =
|
|
fetch_sym_ord2_direct(src, iF - 2, jF, kF - 2, SoA0, SoA1, SoA2)
|
|
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF - 2, SoA0, SoA1, SoA2)
|
|
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF - 2, SoA0, SoA1, SoA2)
|
|
- fetch_sym_ord2_direct(src, iF + 2, jF, kF - 2, SoA0, SoA1, SoA2);
|
|
const double t_km1_x =
|
|
fetch_sym_ord2_direct(src, iF - 2, jF, kF - 1, SoA0, SoA1, SoA2)
|
|
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF - 1, SoA0, SoA1, SoA2)
|
|
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF - 1, SoA0, SoA1, SoA2)
|
|
- fetch_sym_ord2_direct(src, iF + 2, jF, kF - 1, SoA0, SoA1, SoA2);
|
|
const double t_kp1_x =
|
|
fetch_sym_ord2_direct(src, iF - 2, jF, kF + 1, SoA0, SoA1, SoA2)
|
|
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF + 1, SoA0, SoA1, SoA2)
|
|
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF + 1, SoA0, SoA1, SoA2)
|
|
- fetch_sym_ord2_direct(src, iF + 2, jF, kF + 1, SoA0, SoA1, SoA2);
|
|
const double t_kp2_x =
|
|
fetch_sym_ord2_direct(src, iF - 2, jF, kF + 2, SoA0, SoA1, SoA2)
|
|
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF + 2, SoA0, SoA1, SoA2)
|
|
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF + 2, SoA0, SoA1, SoA2)
|
|
- fetch_sym_ord2_direct(src, iF + 2, jF, kF + 2, SoA0, SoA1, SoA2);
|
|
fxz[tid] = d_gp.Fdxdz * (t_km2_x - 8.0 * t_km1_x + 8.0 * t_kp1_x - t_kp2_x);
|
|
|
|
const double t_km2_y =
|
|
fetch_sym_ord2_direct(src, iF, jF - 2, kF - 2, SoA0, SoA1, SoA2)
|
|
- 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF - 2, SoA0, SoA1, SoA2)
|
|
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF - 2, SoA0, SoA1, SoA2)
|
|
- fetch_sym_ord2_direct(src, iF, jF + 2, kF - 2, SoA0, SoA1, SoA2);
|
|
const double t_km1_y =
|
|
fetch_sym_ord2_direct(src, iF, jF - 2, kF - 1, SoA0, SoA1, SoA2)
|
|
- 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF - 1, SoA0, SoA1, SoA2)
|
|
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF - 1, SoA0, SoA1, SoA2)
|
|
- fetch_sym_ord2_direct(src, iF, jF + 2, kF - 1, SoA0, SoA1, SoA2);
|
|
const double t_kp1_y =
|
|
fetch_sym_ord2_direct(src, iF, jF - 2, kF + 1, SoA0, SoA1, SoA2)
|
|
- 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF + 1, SoA0, SoA1, SoA2)
|
|
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF + 1, SoA0, SoA1, SoA2)
|
|
- fetch_sym_ord2_direct(src, iF, jF + 2, kF + 1, SoA0, SoA1, SoA2);
|
|
const double t_kp2_y =
|
|
fetch_sym_ord2_direct(src, iF, jF - 2, kF + 2, SoA0, SoA1, SoA2)
|
|
- 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF + 2, SoA0, SoA1, SoA2)
|
|
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF + 2, SoA0, SoA1, SoA2)
|
|
- fetch_sym_ord2_direct(src, iF, jF + 2, kF + 2, SoA0, SoA1, SoA2);
|
|
fyz[tid] = d_gp.Fdydz * (t_km2_y - 8.0 * t_km1_y + 8.0 * t_kp1_y - t_kp2_y);
|
|
}
|
|
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
|
|
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
|
|
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
|
|
{
|
|
const double c = fetch_sym_ord2_direct(src, iF, jF, kF, SoA0, SoA1, SoA2);
|
|
fxx[tid] = d_gp.Sdxdx * (
|
|
fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
|
|
- 2.0 * c
|
|
+ fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2));
|
|
fyy[tid] = d_gp.Sdydy * (
|
|
fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
|
|
- 2.0 * c
|
|
+ fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2));
|
|
fzz[tid] = d_gp.Sdzdz * (
|
|
fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
|
|
- 2.0 * c
|
|
+ fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2));
|
|
fxy[tid] = d_gp.Sdxdy * (
|
|
fetch_sym_ord2_direct(src, iF - 1, jF - 1, kF, SoA0, SoA1, SoA2)
|
|
- fetch_sym_ord2_direct(src, iF + 1, jF - 1, kF, SoA0, SoA1, SoA2)
|
|
- fetch_sym_ord2_direct(src, iF - 1, jF + 1, kF, SoA0, SoA1, SoA2)
|
|
+ fetch_sym_ord2_direct(src, iF + 1, jF + 1, kF, SoA0, SoA1, SoA2));
|
|
fxz[tid] = d_gp.Sdxdz * (
|
|
fetch_sym_ord2_direct(src, iF - 1, jF, kF - 1, SoA0, SoA1, SoA2)
|
|
- fetch_sym_ord2_direct(src, iF + 1, jF, kF - 1, SoA0, SoA1, SoA2)
|
|
- fetch_sym_ord2_direct(src, iF - 1, jF, kF + 1, SoA0, SoA1, SoA2)
|
|
+ fetch_sym_ord2_direct(src, iF + 1, jF, kF + 1, SoA0, SoA1, SoA2));
|
|
fyz[tid] = d_gp.Sdydz * (
|
|
fetch_sym_ord2_direct(src, iF, jF - 1, kF - 1, SoA0, SoA1, SoA2)
|
|
- fetch_sym_ord2_direct(src, iF, jF + 1, kF - 1, SoA0, SoA1, SoA2)
|
|
- fetch_sym_ord2_direct(src, iF, jF - 1, kF + 1, SoA0, SoA1, SoA2)
|
|
+ fetch_sym_ord2_direct(src, iF, jF + 1, kF + 1, SoA0, SoA1, SoA2));
|
|
}
|
|
else {
|
|
fxx[tid] = 0.0; fxy[tid] = 0.0; fxz[tid] = 0.0;
|
|
fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
/*
 * Forward declarations for the batched launchers, so that the single-field
 * wrappers (gpu_fderivs, gpu_fdderivs, gpu_lopsided_kodis) can call them
 * before their definitions appear.
 */

/* First derivatives of `field_count` fields; soa_signs holds three ints per field. */
static void gpu_fderivs_batch(int field_count,
                              double *const *src_fields,
                              double *const *fx_fields,
                              double *const *fy_fields,
                              double *const *fz_fields,
                              const int *soa_signs,
                              int all);

/* Six second derivatives of `field_count` fields; soa_signs holds three ints per field. */
static void gpu_fdderivs_batch(int field_count,
                               double *const *src_fields,
                               double *const *fxx_fields,
                               double *const *fxy_fields,
                               double *const *fxz_fields,
                               double *const *fyy_fields,
                               double *const *fyz_fields,
                               double *const *fzz_fields,
                               const int *soa_signs,
                               int all);

/* Single-field advection + KO path called by gpu_lopsided_kodis when ghost_width != 3. */
static void gpu_lopsided_kodis_single_batch(double *d_f_adv, double *d_f_ko, double *d_f_rhs,
                                            double *d_Sfx, double *d_Sfy, double *d_Sfz,
                                            double SoA0, double SoA1, double SoA2,
                                            double eps_val, int all);
|
|
|
|
/*
 * First derivatives of a single device field.
 *
 *   d_f              input field
 *   d_fx/d_fy/d_fz   outputs, one per direction
 *   SoA0..SoA2       symmetry factors, one per axis
 *   all              point count handed to grid() / the batched launcher
 *
 * ghost_width == 3: symmetry_bd for ord=2 is done on the GPU by
 * kern_symbd_pack_ord2 into g_buf.d_fh2, then kern_fderivs runs on that
 * buffer.  Any other ghost_width: forwards to gpu_fderivs_batch as a batch
 * of one, with the symmetry factors converted to int.
 */
static void gpu_fderivs(double *d_f, double *d_fx, double *d_fy, double *d_fz,
                        double SoA0, double SoA1, double SoA2, int all)
{
#if ghost_width == 3
    /* Packed extent is one (nx+2)*(ny+2)*(nz+2) box built from the cached grid size. */
    const size_t ext_x = (size_t)g_buf.prev_nx;
    const size_t ext_y = (size_t)g_buf.prev_ny;
    const size_t ext_z = (size_t)g_buf.prev_nz;
    const size_t packed_cells = (ext_x + 2ull) * (ext_y + 2ull) * (ext_z + 2ull);
    double *packed = g_buf.d_fh2;

    kern_symbd_pack_ord2<<<grid(packed_cells), BLK>>>(d_f, packed, SoA0, SoA1, SoA2);
    kern_fderivs<<<grid(all), BLK>>>(packed, d_fx, d_fy, d_fz);
#else
    /* One-element tables so the batched launcher can be reused unchanged. */
    double *in_tab[1] = {d_f};
    double *dx_tab[1] = {d_fx};
    double *dy_tab[1] = {d_fy};
    double *dz_tab[1] = {d_fz};
    const int parity[3] = {(int)SoA0, (int)SoA1, (int)SoA2};

    gpu_fderivs_batch(1, in_tab, dx_tab, dy_tab, dz_tab, parity, all);
#endif
}
|
|
|
|
/*
 * Second derivatives (xx, xy, xz, yy, yz, zz) of a single device field.
 *
 *   d_f            input field
 *   d_fxx..d_fzz   the six outputs
 *   SoA0..SoA2     symmetry factors, one per axis
 *   all            point count handed to grid() / the batched launcher
 *
 * ghost_width == 3: symmetry_bd for ord=2 is done on the GPU by
 * kern_symbd_pack_ord2 into g_buf.d_fh2, then kern_fdderivs runs on that
 * buffer.  Any other ghost_width: forwards to gpu_fdderivs_batch as a batch
 * of one, with the symmetry factors converted to int.
 */
static void gpu_fdderivs(double *d_f,
                         double *d_fxx, double *d_fxy, double *d_fxz,
                         double *d_fyy, double *d_fyz, double *d_fzz,
                         double SoA0, double SoA1, double SoA2, int all)
{
#if ghost_width == 3
    /* Packed extent is one (nx+2)*(ny+2)*(nz+2) box built from the cached grid size. */
    const size_t ext_x = (size_t)g_buf.prev_nx;
    const size_t ext_y = (size_t)g_buf.prev_ny;
    const size_t ext_z = (size_t)g_buf.prev_nz;
    const size_t packed_cells = (ext_x + 2ull) * (ext_y + 2ull) * (ext_z + 2ull);
    double *packed = g_buf.d_fh2;

    kern_symbd_pack_ord2<<<grid(packed_cells), BLK>>>(d_f, packed, SoA0, SoA1, SoA2);
    kern_fdderivs<<<grid(all), BLK>>>(packed, d_fxx, d_fxy, d_fxz, d_fyy, d_fyz, d_fzz);
#else
    /* One-element tables so the batched launcher can be reused unchanged. */
    double *in_tab[1] = {d_f};
    double *xx_tab[1] = {d_fxx};
    double *xy_tab[1] = {d_fxy};
    double *xz_tab[1] = {d_fxz};
    double *yy_tab[1] = {d_fyy};
    double *yz_tab[1] = {d_fyz};
    double *zz_tab[1] = {d_fzz};
    const int parity[3] = {(int)SoA0, (int)SoA1, (int)SoA2};

    gpu_fdderivs_batch(1, in_tab, xx_tab, xy_tab, xz_tab,
                       yy_tab, yz_tab, zz_tab, parity, all);
#endif
}
|
|
|
|
/*
 * Launch kern_fderivs_batched for `field_count` fields.
 *
 *   src_fields                     host array of field_count input pointers
 *   fx_fields/fy_fields/fz_fields  host arrays of field_count output pointers
 *   soa_signs                      3 * field_count ints, three per field
 *   all                            point count handed to grid() for the x dimension
 *
 * Launch layout: grid.x = grid(all) blocks of BLK threads, grid.y = one
 * slice per field in the table.  The pointer/sign tables are copied into an
 * FDerivTables value that is passed to the kernel by value.
 *
 * FDerivTables has a fixed capacity.  A request that exceeds it is split
 * into consecutive launches instead of writing past the end of the struct;
 * a request that fits results in exactly one launch.
 */
static void gpu_fderivs_batch(int field_count,
                              double *const *src_fields,
                              double *const *fx_fields,
                              double *const *fy_fields,
                              double *const *fz_fields,
                              const int *soa_signs,
                              int all)
{
    if (field_count <= 0) return;

    FDerivTables tables = {};

    /* Capacity is read off the struct itself so it tracks its definition. */
    auto lower = [](int a, int b) { return a < b ? a : b; };
    int cap = (int)(sizeof(tables.src_fields) / sizeof(tables.src_fields[0]));
    cap = lower(cap, (int)(sizeof(tables.fx_fields) / sizeof(tables.fx_fields[0])));
    cap = lower(cap, (int)(sizeof(tables.fy_fields) / sizeof(tables.fy_fields[0])));
    cap = lower(cap, (int)(sizeof(tables.fz_fields) / sizeof(tables.fz_fields[0])));
    cap = lower(cap, (int)(sizeof(tables.soa_signs) / sizeof(tables.soa_signs[0])) / 3);
    if (cap < 1) {
        fprintf(stderr, "gpu_fderivs_batch: FDerivTables cannot hold any field\n");
        return;
    }

    const unsigned int blocks_x = (unsigned int)grid((size_t)all);

    for (int first = 0; first < field_count; first += cap) {
        const int left = field_count - first;
        const int n = left < cap ? left : cap;

        /* Slots >= n may hold entries from a previous chunk; grid.y == n
         * means no block is launched for them. */
        for (int i = 0; i < n; ++i) {
            const int g = first + i;
            tables.src_fields[i] = src_fields[g];
            tables.fx_fields[i] = fx_fields[g];
            tables.fy_fields[i] = fy_fields[g];
            tables.fz_fields[i] = fz_fields[g];
            tables.soa_signs[3 * i + 0] = soa_signs[3 * g + 0];
            tables.soa_signs[3 * i + 1] = soa_signs[3 * g + 1];
            tables.soa_signs[3 * i + 2] = soa_signs[3 * g + 2];
        }

        dim3 launch_grid(blocks_x, (unsigned int)n);
        kern_fderivs_batched<<<launch_grid, BLK>>>(tables, n);
    }
}
|
|
|
|
/*
 * Launch kern_fdderivs_batched for `field_count` fields.
 *
 *   src_fields              host array of field_count input pointers
 *   fxx_fields..fzz_fields  host arrays of field_count output pointers,
 *                           one array per second-derivative component
 *   soa_signs               3 * field_count ints, three per field
 *   all                     point count handed to grid() for the x dimension
 *
 * Launch layout: grid.x = grid(all) blocks of BLK threads, grid.y = one
 * slice per field in the table.  The pointer/sign tables are copied into an
 * FDDerivTables value that is passed to the kernel by value.
 *
 * FDDerivTables has a fixed capacity.  A request that exceeds it is split
 * into consecutive launches instead of writing past the end of the struct;
 * a request that fits results in exactly one launch.
 */
static void gpu_fdderivs_batch(int field_count,
                               double *const *src_fields,
                               double *const *fxx_fields,
                               double *const *fxy_fields,
                               double *const *fxz_fields,
                               double *const *fyy_fields,
                               double *const *fyz_fields,
                               double *const *fzz_fields,
                               const int *soa_signs,
                               int all)
{
    if (field_count <= 0) return;

    FDDerivTables tables = {};

    /* Capacity is read off the struct itself so it tracks its definition. */
    auto lower = [](int a, int b) { return a < b ? a : b; };
    int cap = (int)(sizeof(tables.src_fields) / sizeof(tables.src_fields[0]));
    cap = lower(cap, (int)(sizeof(tables.fxx_fields) / sizeof(tables.fxx_fields[0])));
    cap = lower(cap, (int)(sizeof(tables.fxy_fields) / sizeof(tables.fxy_fields[0])));
    cap = lower(cap, (int)(sizeof(tables.fxz_fields) / sizeof(tables.fxz_fields[0])));
    cap = lower(cap, (int)(sizeof(tables.fyy_fields) / sizeof(tables.fyy_fields[0])));
    cap = lower(cap, (int)(sizeof(tables.fyz_fields) / sizeof(tables.fyz_fields[0])));
    cap = lower(cap, (int)(sizeof(tables.fzz_fields) / sizeof(tables.fzz_fields[0])));
    cap = lower(cap, (int)(sizeof(tables.soa_signs) / sizeof(tables.soa_signs[0])) / 3);
    if (cap < 1) {
        fprintf(stderr, "gpu_fdderivs_batch: FDDerivTables cannot hold any field\n");
        return;
    }

    const unsigned int blocks_x = (unsigned int)grid((size_t)all);

    for (int first = 0; first < field_count; first += cap) {
        const int left = field_count - first;
        const int n = left < cap ? left : cap;

        /* Slots >= n may hold entries from a previous chunk; grid.y == n
         * means no block is launched for them. */
        for (int i = 0; i < n; ++i) {
            const int g = first + i;
            tables.src_fields[i] = src_fields[g];
            tables.fxx_fields[i] = fxx_fields[g];
            tables.fxy_fields[i] = fxy_fields[g];
            tables.fxz_fields[i] = fxz_fields[g];
            tables.fyy_fields[i] = fyy_fields[g];
            tables.fyz_fields[i] = fyz_fields[g];
            tables.fzz_fields[i] = fzz_fields[g];
            tables.soa_signs[3 * i + 0] = soa_signs[3 * g + 0];
            tables.soa_signs[3 * i + 1] = soa_signs[3 * g + 1];
            tables.soa_signs[3 * i + 2] = soa_signs[3 * g + 2];
        }

        dim3 launch_grid(blocks_x, (unsigned int)n);
        kern_fdderivs_batched<<<launch_grid, BLK>>>(tables, n);
    }
}
|
|
|
|
/*
 * For the field selected by blockIdx.y, forms the six second derivatives of
 * tables.src_fields[field] at each point and stores their contraction with
 * the gup** coefficients in tables.dst_fields[field]:
 *
 *   dst = gupxx*fxx + gupyy*fyy + gupzz*fzz
 *       + 2*(gupxy*fxy + gupxz*fxz + gupyz*fyz)
 *
 * Launch layout: blockIdx.x/threadIdx.x give one thread per point
 * (tid < d_gp.all), blockIdx.y indexes the field
 * (< PHASE10_METRIC_FIELD_COUNT).
 *
 * Points on the last index plane of any axis use zero derivatives.  With
 * ghost_width == 3 the 5-point stencil is used where +-2 fits inside
 * [iminF..imaxF] etc., the 3-point stencil where only +-1 fits, and zero
 * derivatives otherwise; for other ghost widths the stencil choice is
 * delegated to fd_compute_second6.
 */
__global__ __launch_bounds__(128, 4)
void kern_phase10_ricci_batched(const double * __restrict__ gupxx,
                                const double * __restrict__ gupxy,
                                const double * __restrict__ gupxz,
                                const double * __restrict__ gupyy,
                                const double * __restrict__ gupyz,
                                const double * __restrict__ gupzz,
                                Phase10RicciTables tables)
{
    const int field = blockIdx.y;
    if (field >= PHASE10_METRIC_FIELD_COUNT) return;

    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= d_gp.all) return;

    const double *src = tables.src_fields[field];
    double *dst = tables.dst_fields[field];
    const int sx = tables.soa_signs[3 * field + 0];
    const int sy = tables.soa_signs[3 * field + 1];
    const int sz = tables.soa_signs[3 * field + 2];

    const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
    const int ilo = d_gp.iminF, jlo = d_gp.jminF, klo = d_gp.kminF;
    const int ihi = d_gp.imaxF, jhi = d_gp.jmaxF, khi = d_gp.kmaxF;

    /* Flat index -> zero-based (i0, j0, k0), x fastest. */
    const int k0 = tid / (nx * ny);
    const int j0 = (tid / nx) % ny;
    const int i0 = tid % nx;

    double dxx = 0.0, dxy = 0.0, dxz = 0.0;
    double dyy = 0.0, dyz = 0.0, dzz = 0.0;

    if (i0 <= nx - 2 && j0 <= ny - 2 && k0 <= nz - 2) {
        /* One-based indices expected by the fetch / fd helpers. */
        const int iF = i0 + 1;
        const int jF = j0 + 1;
        const int kF = k0 + 1;

#if ghost_width != 3
        fd_compute_second6(src, iF, jF, kF,
                           ilo, jlo, klo, ihi, jhi, khi,
                           sx, sy, sz,
                           dxx, dxy, dxz, dyy, dyz, dzz);
#else
        /* Value of the field at an integer offset from (iF, jF, kF). */
        auto at = [=](int di, int dj, int dk) -> double {
            return fetch_sym_ord2_direct(src, iF + di, jF + dj, kF + dk, sx, sy, sz);
        };
        /* f(-2) - 8 f(-1) + 8 f(+1) - f(+2) taken along x at a (dj, dk) offset. */
        auto row_x = [=](int dj, int dk) -> double {
            return at(-2, dj, dk)
                 - 8.0 * at(-1, dj, dk)
                 + 8.0 * at(1, dj, dk)
                 - at(2, dj, dk);
        };
        /* Same combination taken along y at a dk offset. */
        auto row_y = [=](int dk) -> double {
            return at(0, -2, dk)
                 - 8.0 * at(0, -1, dk)
                 + 8.0 * at(0, 1, dk)
                 - at(0, 2, dk);
        };

        const bool fits2 = (iF + 2) <= ihi && (iF - 2) >= ilo &&
                           (jF + 2) <= jhi && (jF - 2) >= jlo &&
                           (kF + 2) <= khi && (kF - 2) >= klo;
        const bool fits1 = (iF + 1) <= ihi && (iF - 1) >= ilo &&
                           (jF + 1) <= jhi && (jF - 1) >= jlo &&
                           (kF + 1) <= khi && (kF - 1) >= klo;

        if (fits2) {
            const double c = at(0, 0, 0);

            dxx = d_gp.Fdxdx * (
                -at(-2, 0, 0) + 16.0 * at(-1, 0, 0) - 30.0 * c
                + 16.0 * at(1, 0, 0) - at(2, 0, 0));
            dyy = d_gp.Fdydy * (
                -at(0, -2, 0) + 16.0 * at(0, -1, 0) - 30.0 * c
                + 16.0 * at(0, 1, 0) - at(0, 2, 0));
            dzz = d_gp.Fdzdz * (
                -at(0, 0, -2) + 16.0 * at(0, 0, -1) - 30.0 * c
                + 16.0 * at(0, 0, 1) - at(0, 0, 2));

            dxy = d_gp.Fdxdy * (row_x(-2, 0) - 8.0 * row_x(-1, 0) + 8.0 * row_x(1, 0) - row_x(2, 0));
            dxz = d_gp.Fdxdz * (row_x(0, -2) - 8.0 * row_x(0, -1) + 8.0 * row_x(0, 1) - row_x(0, 2));
            dyz = d_gp.Fdydz * (row_y(-2) - 8.0 * row_y(-1) + 8.0 * row_y(1) - row_y(2));
        } else if (fits1) {
            const double c = at(0, 0, 0);

            dxx = d_gp.Sdxdx * (at(-1, 0, 0) - 2.0 * c + at(1, 0, 0));
            dyy = d_gp.Sdydy * (at(0, -1, 0) - 2.0 * c + at(0, 1, 0));
            dzz = d_gp.Sdzdz * (at(0, 0, -1) - 2.0 * c + at(0, 0, 1));

            dxy = d_gp.Sdxdy * (at(-1, -1, 0) - at(1, -1, 0) - at(-1, 1, 0) + at(1, 1, 0));
            dxz = d_gp.Sdxdz * (at(-1, 0, -1) - at(1, 0, -1) - at(-1, 0, 1) + at(1, 0, 1));
            dyz = d_gp.Sdydz * (at(0, -1, -1) - at(0, 1, -1) - at(0, -1, 1) + at(0, 1, 1));
        }
#endif
    }

    dst[tid] = gupxx[tid] * dxx + gupyy[tid] * dyy + gupzz[tid] * dzz
             + 2.0 * (gupxy[tid] * dxy + gupxz[tid] * dxz + gupyz[tid] * dyz);
}
|
|
|
|
/*
 * Host launcher for kern_phase10_ricci_batched.
 *
 *   gupxx..gupzz  device arrays forwarded unchanged to the kernel
 *   src_fields    PHASE10_METRIC_FIELD_COUNT input pointers
 *   dst_fields    PHASE10_METRIC_FIELD_COUNT output pointers
 *   soa_signs     3 * PHASE10_METRIC_FIELD_COUNT ints, three per field
 *   all           point count handed to grid() for the x dimension
 *
 * The tables are copied into a Phase10RicciTables value passed by value;
 * grid.y carries one slice per field.
 */
static void gpu_phase10_ricci_batch(const double *gupxx,
                                    const double *gupxy,
                                    const double *gupxz,
                                    const double *gupyy,
                                    const double *gupyz,
                                    const double *gupzz,
                                    double *const *src_fields,
                                    double *const *dst_fields,
                                    const int *soa_signs,
                                    int all)
{
    Phase10RicciTables tables = {};

    for (int f = 0; f < PHASE10_METRIC_FIELD_COUNT; ++f) {
        tables.src_fields[f] = src_fields[f];
        tables.dst_fields[f] = dst_fields[f];

        const int base = 3 * f;
        for (int axis = 0; axis < 3; ++axis) {
            tables.soa_signs[base + axis] = soa_signs[base + axis];
        }
    }

    const unsigned int blocks_x = (unsigned int)grid((size_t)all);
    const unsigned int blocks_y = (unsigned int)PHASE10_METRIC_FIELD_COUNT;
    dim3 launch_grid(blocks_x, blocks_y);

    kern_phase10_ricci_batched<<<launch_grid, BLK>>>(
        gupxx, gupxy, gupxz, gupyy, gupyz, gupzz, tables);
}
|
|
|
|
/*
 * Computes, per point, the six second derivatives of Lap (into fxx..fzz)
 * and the three first derivatives of chi (into chix_out..chiz_out), using
 * symmetry factors (1, 1, 1) for both fields.
 *
 * Launch layout: 1-D, one thread per point (tid < d_gp.all).
 *
 * Points on the last index plane of any axis get all nine outputs set to
 * zero.  With ghost_width == 3 the wider stencils are used where +-2 fits
 * inside [iminF..imaxF] etc., the narrower ones where only +-1 fits, and
 * zeros are written otherwise; for other ghost widths the stencil choice is
 * delegated to fd_compute_second6 / fd_compute_first3.
 */
__global__ __launch_bounds__(128, 4)
void kern_phase14_lap_chi_derivs(const double * __restrict__ Lap,
                                 const double * __restrict__ chi,
                                 double * __restrict__ fxx,
                                 double * __restrict__ fxy,
                                 double * __restrict__ fxz,
                                 double * __restrict__ fyy,
                                 double * __restrict__ fyz,
                                 double * __restrict__ fzz,
                                 double * __restrict__ chix_out,
                                 double * __restrict__ chiy_out,
                                 double * __restrict__ chiz_out)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= d_gp.all) return;

    const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
    const int ilo = d_gp.iminF, jlo = d_gp.jminF, klo = d_gp.kminF;
    const int ihi = d_gp.imaxF, jhi = d_gp.jmaxF, khi = d_gp.kmaxF;

    /* Flat index -> zero-based (i0, j0, k0), x fastest. */
    const int k0 = tid / (nx * ny);
    const int j0 = (tid / nx) % ny;
    const int i0 = tid % nx;

    /* Set once any branch below has written all nine outputs. */
    bool written = false;

    if (i0 <= nx - 2 && j0 <= ny - 2 && k0 <= nz - 2) {
        /* One-based indices expected by the fetch / fd helpers. */
        const int iF = i0 + 1;
        const int jF = j0 + 1;
        const int kF = k0 + 1;

#if ghost_width != 3
        fd_compute_second6(Lap, iF, jF, kF,
                           ilo, jlo, klo, ihi, jhi, khi,
                           1, 1, 1,
                           fxx[tid], fxy[tid], fxz[tid], fyy[tid], fyz[tid], fzz[tid]);
        fd_compute_first3(chi, iF, jF, kF,
                          ilo, jlo, klo, ihi, jhi, khi,
                          1, 1, 1,
                          chix_out[tid], chiy_out[tid], chiz_out[tid]);
        written = true;
#else
        /* Lap / chi at an integer offset from (iF, jF, kF). */
        auto lap = [=](int di, int dj, int dk) -> double {
            return fetch_sym_ord2_direct(Lap, iF + di, jF + dj, kF + dk, 1, 1, 1);
        };
        auto chi_at = [=](int di, int dj, int dk) -> double {
            return fetch_sym_ord2_direct(chi, iF + di, jF + dj, kF + dk, 1, 1, 1);
        };
        /* f(-2) - 8 f(-1) + 8 f(+1) - f(+2) of Lap along x at a (dj, dk) offset. */
        auto row_x = [=](int dj, int dk) -> double {
            return lap(-2, dj, dk)
                 - 8.0 * lap(-1, dj, dk)
                 + 8.0 * lap(1, dj, dk)
                 - lap(2, dj, dk);
        };
        /* Same combination of Lap along y at a dk offset. */
        auto row_y = [=](int dk) -> double {
            return lap(0, -2, dk)
                 - 8.0 * lap(0, -1, dk)
                 + 8.0 * lap(0, 1, dk)
                 - lap(0, 2, dk);
        };

        const bool fits2 = (iF + 2) <= ihi && (iF - 2) >= ilo &&
                           (jF + 2) <= jhi && (jF - 2) >= jlo &&
                           (kF + 2) <= khi && (kF - 2) >= klo;
        const bool fits1 = (iF + 1) <= ihi && (iF - 1) >= ilo &&
                           (jF + 1) <= jhi && (jF - 1) >= jlo &&
                           (kF + 1) <= khi && (kF - 1) >= klo;

        if (fits2) {
            const double lap_c = lap(0, 0, 0);

            fxx[tid] = d_gp.Fdxdx * (
                -lap(-2, 0, 0) + 16.0 * lap(-1, 0, 0) - 30.0 * lap_c
                + 16.0 * lap(1, 0, 0) - lap(2, 0, 0));
            fyy[tid] = d_gp.Fdydy * (
                -lap(0, -2, 0) + 16.0 * lap(0, -1, 0) - 30.0 * lap_c
                + 16.0 * lap(0, 1, 0) - lap(0, 2, 0));
            fzz[tid] = d_gp.Fdzdz * (
                -lap(0, 0, -2) + 16.0 * lap(0, 0, -1) - 30.0 * lap_c
                + 16.0 * lap(0, 0, 1) - lap(0, 0, 2));

            fxy[tid] = d_gp.Fdxdy * (row_x(-2, 0) - 8.0 * row_x(-1, 0) + 8.0 * row_x(1, 0) - row_x(2, 0));
            fxz[tid] = d_gp.Fdxdz * (row_x(0, -2) - 8.0 * row_x(0, -1) + 8.0 * row_x(0, 1) - row_x(0, 2));
            fyz[tid] = d_gp.Fdydz * (row_y(-2) - 8.0 * row_y(-1) + 8.0 * row_y(1) - row_y(2));

            chix_out[tid] = d_gp.d12dx * (
                chi_at(-2, 0, 0) - 8.0 * chi_at(-1, 0, 0)
                + 8.0 * chi_at(1, 0, 0) - chi_at(2, 0, 0));
            chiy_out[tid] = d_gp.d12dy * (
                chi_at(0, -2, 0) - 8.0 * chi_at(0, -1, 0)
                + 8.0 * chi_at(0, 1, 0) - chi_at(0, 2, 0));
            chiz_out[tid] = d_gp.d12dz * (
                chi_at(0, 0, -2) - 8.0 * chi_at(0, 0, -1)
                + 8.0 * chi_at(0, 0, 1) - chi_at(0, 0, 2));
            written = true;
        } else if (fits1) {
            const double lap_c = lap(0, 0, 0);

            fxx[tid] = d_gp.Sdxdx * (lap(-1, 0, 0) - 2.0 * lap_c + lap(1, 0, 0));
            fyy[tid] = d_gp.Sdydy * (lap(0, -1, 0) - 2.0 * lap_c + lap(0, 1, 0));
            fzz[tid] = d_gp.Sdzdz * (lap(0, 0, -1) - 2.0 * lap_c + lap(0, 0, 1));

            fxy[tid] = d_gp.Sdxdy * (lap(-1, -1, 0) - lap(1, -1, 0) - lap(-1, 1, 0) + lap(1, 1, 0));
            fxz[tid] = d_gp.Sdxdz * (lap(-1, 0, -1) - lap(1, 0, -1) - lap(-1, 0, 1) + lap(1, 0, 1));
            fyz[tid] = d_gp.Sdydz * (lap(0, -1, -1) - lap(0, 1, -1) - lap(0, -1, 1) + lap(0, 1, 1));

            chix_out[tid] = d_gp.d2dx * (-chi_at(-1, 0, 0) + chi_at(1, 0, 0));
            chiy_out[tid] = d_gp.d2dy * (-chi_at(0, -1, 0) + chi_at(0, 1, 0));
            chiz_out[tid] = d_gp.d2dz * (-chi_at(0, 0, -1) + chi_at(0, 0, 1));
            written = true;
        }
#endif
    }

    if (!written) {
        /* Last-plane points and points where no stencil fits. */
        fxx[tid] = 0.0; fxy[tid] = 0.0; fxz[tid] = 0.0;
        fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0;
        chix_out[tid] = 0.0; chiy_out[tid] = 0.0; chiz_out[tid] = 0.0;
    }
}
|
|
|
|
/*
 * Combined ord=3 advection + KO dissipation for one field.
 *
 *   d_f_adv          field that is advected
 *   d_f_ko           field that the KO term is taken from
 *   d_f_rhs          right-hand side passed to both kernels
 *   d_Sfx/y/z        per-direction arrays passed to kern_lopsided
 *   SoA0..SoA2       symmetry factors, one per axis
 *   eps_val          KO strength; the KO stage is skipped unless eps_val > 0
 *   all              point count handed to grid()
 *
 * ghost_width == 3: the symmetry-packed copy in g_buf.d_fh3 is shared when
 * advection and KO use the same source field.  If they differ (e.g. gxx
 * advection + dxx KO), the buffer is repacked from d_f_ko before the KO
 * kernel.  Any other ghost_width forwards everything to
 * gpu_lopsided_kodis_single_batch.
 */
static void gpu_lopsided_kodis(double *d_f_adv, double *d_f_ko, double *d_f_rhs,
                               double *d_Sfx, double *d_Sfy, double *d_Sfz,
                               double SoA0, double SoA1, double SoA2,
                               double eps_val, int all)
{
#if ghost_width == 3
    /* Packed extent is one (nx+3)*(ny+3)*(nz+3) box built from the cached grid size. */
    const size_t ext_x = (size_t)g_buf.prev_nx;
    const size_t ext_y = (size_t)g_buf.prev_ny;
    const size_t ext_z = (size_t)g_buf.prev_nz;
    const size_t packed_cells = (ext_x + 3ull) * (ext_y + 3ull) * (ext_z + 3ull);
    double *packed = g_buf.d_fh3;

    /* Advection stage. */
    kern_symbd_pack_ord3<<<grid(packed_cells), BLK>>>(d_f_adv, packed, SoA0, SoA1, SoA2);
    kern_lopsided<<<grid(all), BLK>>>(packed, d_f_rhs, d_Sfx, d_Sfy, d_Sfz);

    /* Dissipation stage, only for a strictly positive eps_val. */
    if (!(eps_val > 0.0)) return;

    const bool ko_needs_repack = (d_f_ko != d_f_adv);
    if (ko_needs_repack) {
        kern_symbd_pack_ord3<<<grid(packed_cells), BLK>>>(d_f_ko, packed, SoA0, SoA1, SoA2);
    }
    kern_kodis<<<grid(all), BLK>>>(packed, d_f_rhs, eps_val);
#else
    gpu_lopsided_kodis_single_batch(d_f_adv, d_f_ko, d_f_rhs,
                                    d_Sfx, d_Sfy, d_Sfz,
                                    SoA0, SoA1, SoA2, eps_val, all);
#endif
}
|
|
|
|
/*
 * Fused lopsided-advection + KO-dissipation update for a batch of fields.
 *
 * Launch layout: blockIdx.x / threadIdx.x enumerate the d_gp.all grid points
 * (one thread per point, surplus threads exit immediately); blockIdx.y selects
 * the field, i.e. which entry of `tables` this thread reads and updates.
 *
 *   Sfx, Sfy, Sfz  per-point advection speeds along x, y, z; the sign of each
 *                  speed selects the stencil bias along that axis
 *   tables         per-field device pointers (advection source, dissipation
 *                  source, RHS accumulator) and three symmetry signs per field
 *   eps_val        dissipation strength
 *
 * All contributions are added to tables.rhs_fields[field][tid].  The field
 * index is not range-checked here, so the launch must not use more blockIdx.y
 * values than there are populated table entries.
 */
__global__ __launch_bounds__(128, 4)
void kern_lopsided_kodis_batched(const double * __restrict__ Sfx,
                                 const double * __restrict__ Sfy,
                                 const double * __restrict__ Sfz,
                                 LopsidedKodisTables tables,
                                 double eps_val)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= d_gp.all) return;

    const int field = blockIdx.y;
    const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
    const int iminF = d_gp.iminF3, jminF = d_gp.jminF3, kminF = d_gp.kminF3;
    const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
    const int SoA0 = tables.soa_signs[3 * field + 0];
    const int SoA1 = tables.soa_signs[3 * field + 1];
    const int SoA2 = tables.soa_signs[3 * field + 2];
    const double *adv_src = tables.adv_fields[field];
    const double *ko_src = tables.ko_fields[field];
    double *rhs = tables.rhs_fields[field];

    /* 0-based point coordinates (x fastest) and their 1-based counterparts. */
    const int i0 = tid % nx;
    const int j0 = (tid / nx) % ny;
    const int k0 = tid / (nx * ny);
    const int iF = i0 + 1;
    const int jF = j0 + 1;
    const int kF = k0 + 1;

#if ghost_width != 3
    if (i0 <= nx - 2 && j0 <= ny - 2 && k0 <= nz - 2) {
        const double val =
            fd_lopsided_axis(adv_src, iF, jF, kF, 0, Sfx[tid], iF, iminF, imaxF,
                             d_gp.dX, SoA0, SoA1, SoA2)
          + fd_lopsided_axis(adv_src, iF, jF, kF, 1, Sfy[tid], jF, jminF, jmaxF,
                             d_gp.dY, SoA0, SoA1, SoA2)
          + fd_lopsided_axis(adv_src, iF, jF, kF, 2, Sfz[tid], kF, kminF, kmaxF,
                             d_gp.dZ, SoA0, SoA1, SoA2);
        rhs[tid] += val;
    }

    rhs[tid] += fd_ko_term(ko_src, iF, jF, kF,
                           iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
                           eps_val, SoA0, SoA1, SoA2);
#else
    /* Advection: skipped on the last plane of every axis. */
    if (i0 <= nx - 2 && j0 <= ny - 2 && k0 <= nz - 2) {
        double val = 0.0;

        /*
         * Adds the advection contribution of one axis to `val`.
         *   sf          advection speed along the axis
         *   d12         d_gp.d12dx / d12dy / d12dz of the axis
         *   p0, np      0-based index and extent along the axis
         *   pminF       lower index bound of the axis (iminF / jminF / kminF)
         *   di, dj, dk  unit step of the axis (exactly one of them is 1)
         * The three stencils (biased towards +, centred, biased towards -) and
         * the order in which they are tried are identical for every axis.
         */
        auto advect_axis = [&](double sf, double d12, int p0, int np, int pminF,
                               int di, int dj, int dk) {
            /* Sample of the advected field m points away along the axis. */
            auto f = [&](int m) {
                return fetch_sym_ord3_direct(adv_src,
                                             iF + m * di, jF + m * dj, kF + m * dk,
                                             SoA0, SoA1, SoA2);
            };

            if (sf > 0.0) {
                if (p0 <= np - 4) {
                    val += sf * d12 * (
                        -3.0 * f(-1) - 10.0 * f(0) + 18.0 * f(1) - 6.0 * f(2) + f(3));
                } else if (p0 <= np - 3) {
                    val += sf * d12 * (
                        f(-2) - 8.0 * f(-1) + 8.0 * f(1) - f(2));
                } else {
                    val -= sf * d12 * (
                        -3.0 * f(1) - 10.0 * f(0) + 18.0 * f(-1) - 6.0 * f(-2) + f(-3));
                }
            } else if (sf < 0.0) {
                if ((p0 - 2) >= pminF) {
                    val -= sf * d12 * (
                        -3.0 * f(1) - 10.0 * f(0) + 18.0 * f(-1) - 6.0 * f(-2) + f(-3));
                } else if ((p0 - 1) >= pminF) {
                    val += sf * d12 * (
                        f(-2) - 8.0 * f(-1) + 8.0 * f(1) - f(2));
                } else if (p0 >= pminF) {
                    val += sf * d12 * (
                        -3.0 * f(-1) - 10.0 * f(0) + 18.0 * f(1) - 6.0 * f(2) + f(3));
                }
            }
        };

        advect_axis(Sfx[tid], d_gp.d12dx, i0, nx, iminF, 1, 0, 0);
        advect_axis(Sfy[tid], d_gp.d12dy, j0, ny, jminF, 0, 1, 0);
        advect_axis(Sfz[tid], d_gp.d12dz, k0, nz, kminF, 0, 0, 1);

        rhs[tid] += val;
    }

    /* Dissipation: only where the full +/-3 stencil fits inside the index bounds. */
    if (eps_val > 0.0 &&
        (iF - 3) >= iminF && (iF + 3) <= imaxF &&
        (jF - 3) >= jminF && (jF + 3) <= jmaxF &&
        (kF - 3) >= kminF && (kF + 3) <= kmaxF)
    {
        const double cof = 64.0;

        /* Seven-point (1, -6, 15, -20, 15, -6, 1) difference along one axis. */
        auto ko_axis = [&](int di, int dj, int dk) {
            auto f = [&](int m) {
                return fetch_sym_ord3_direct(ko_src,
                                             iF + m * di, jF + m * dj, kF + m * dk,
                                             SoA0, SoA1, SoA2);
            };
            return (f(-3) + f(3))
                 - 6.0 * (f(-2) + f(2))
                 + 15.0 * (f(-1) + f(1))
                 - 20.0 * f(0);
        };

        const double Dx = ko_axis(1, 0, 0);
        const double Dy = ko_axis(0, 1, 0);
        const double Dz = ko_axis(0, 0, 1);

        rhs[tid] += (eps_val / cof) * (Dx / d_gp.dX + Dy / d_gp.dY + Dz / d_gp.dZ);
    }
#endif
}
|
|
|
|
/*
 * Runs kern_lopsided_kodis_batched on a single field.
 *
 *   d_f_adv / d_f_ko   device pointers read by the advection / dissipation part
 *   d_f_rhs            device pointer the kernel accumulates into
 *   d_Sfx, d_Sfy, d_Sfz  device pointers to the per-point advection speeds
 *   SoA0..SoA2         symmetry signs; truncated to int for the kernel table
 *   eps_val            dissipation strength
 *   all                number of grid points used to size the launch
 *
 * The launch is asynchronous and is not error-checked here.
 */
static void gpu_lopsided_kodis_single_batch(double *d_f_adv, double *d_f_ko, double *d_f_rhs,
                                            double *d_Sfx, double *d_Sfy, double *d_Sfz,
                                            double SoA0, double SoA1, double SoA2,
                                            double eps_val, int all)
{
    /* Only table entry 0 is populated; the grid has a single y-slice. */
    LopsidedKodisTables one_field = {};
    one_field.adv_fields[0] = d_f_adv;
    one_field.ko_fields[0] = d_f_ko;
    one_field.rhs_fields[0] = d_f_rhs;

    const double symmetry[3] = {SoA0, SoA1, SoA2};
    for (int axis = 0; axis < 3; ++axis) {
        one_field.soa_signs[axis] = (int)symmetry[axis];
    }

    const dim3 blocks((unsigned int)grid((size_t)all), 1u);
    kern_lopsided_kodis_batched<<<blocks, BLK>>>(d_Sfx, d_Sfy, d_Sfz, one_field, eps_val);
}
|
|
|
|
/*
 * Runs kern_lopsided_kodis_batched on the BSSN_LK_FIELD_COUNT fields listed in
 * k_lk_adv_slots / k_lk_ko_slots / k_lk_rhs_slots, advected with the
 * S_betax / S_betay / S_betaz slots.
 *
 *   eps_val          dissipation strength
 *   all              number of grid points used to size the launch
 *   include_escalar  when true, S_Sphi and S_Spi (accumulating into S_Sphi_rhs
 *                    and S_Spi_rhs) are appended with all symmetry signs +1
 *
 * The launch is asynchronous and is not error-checked here.
 */
static void gpu_lopsided_kodis_state_batch(double eps_val, int all, bool include_escalar = false)
{
    LopsidedKodisTables tables = {};
    for (int f = 0; f < BSSN_LK_FIELD_COUNT; ++f) {
        tables.adv_fields[f] = g_buf.slot[k_lk_adv_slots[f]];
        tables.ko_fields[f] = g_buf.slot[k_lk_ko_slots[f]];
        tables.rhs_fields[f] = g_buf.slot[k_lk_rhs_slots[f]];
    }
    std::memcpy(tables.soa_signs, k_lk_soa_signs, sizeof(k_lk_soa_signs));

    int field_count = BSSN_LK_FIELD_COUNT;

    /* Appends a field that is both its own advection and dissipation source. */
    auto append_even_field = [&](int state_slot, int rhs_slot) {
        tables.adv_fields[field_count] = g_buf.slot[state_slot];
        tables.ko_fields[field_count] = g_buf.slot[state_slot];
        tables.rhs_fields[field_count] = g_buf.slot[rhs_slot];
        for (int axis = 0; axis < 3; ++axis) {
            tables.soa_signs[3 * field_count + axis] = 1;
        }
        ++field_count;
    };

    if (include_escalar) {
        append_even_field(S_Sphi, S_Sphi_rhs);
        append_even_field(S_Spi, S_Spi_rhs);
    }

    /* One grid y-slice per table entry. */
    const dim3 blocks((unsigned int)grid((size_t)all), (unsigned int)field_count);
    kern_lopsided_kodis_batched<<<blocks, BLK>>>(
        g_buf.slot[S_betax], g_buf.slot[S_betay], g_buf.slot[S_betaz], tables, eps_val);
}
|
|
|
|
/*
 * Point-wise right-hand sides and matter source terms for the scalar-field
 * pair (Sphi, Spi).
 *
 * Launch layout: 1-D grid-stride loop over the d_gp.all grid points.
 *
 * Inputs are per-point arrays.  In the caller visible in this file, Kx/Ky/Kz
 * receive the first derivatives of Sphi and fxx..fzz its second derivatives.
 * Outputs (all overwritten, not accumulated):
 *   Sphi_rhs, Spi_rhs           right-hand sides of the two scalar variables
 *   rho, Sx..Sz, Sxx..Szz       source terms built from the same quantities
 *
 *   escalar_a2   parameter A2 of the potential
 *                V = exp(-8 s phi) (1 - exp(4 s phi))^2 / (32 pi A2),
 *                with s = sqrt(pi / 3); it appears as a divisor and must be
 *                non-zero for finite results.
 */
__global__ __launch_bounds__(128, 4)
void kern_escalar_sources(
    const double * __restrict__ Sphi,
    const double * __restrict__ Spi,
    const double * __restrict__ alpn1,
    const double * __restrict__ chin1,
    const double * __restrict__ gxx,
    const double * __restrict__ gxy,
    const double * __restrict__ gxz,
    const double * __restrict__ gyy,
    const double * __restrict__ gyz,
    const double * __restrict__ gzz,
    const double * __restrict__ gupxx,
    const double * __restrict__ gupxy,
    const double * __restrict__ gupxz,
    const double * __restrict__ gupyy,
    const double * __restrict__ gupyz,
    const double * __restrict__ gupzz,
    const double * __restrict__ chix,
    const double * __restrict__ chiy,
    const double * __restrict__ chiz,
    const double * __restrict__ Lapx,
    const double * __restrict__ Lapy,
    const double * __restrict__ Lapz,
    const double * __restrict__ trK,
    const double * __restrict__ Gamx,
    const double * __restrict__ Gamy,
    const double * __restrict__ Gamz,
    const double * __restrict__ Kx,
    const double * __restrict__ Ky,
    const double * __restrict__ Kz,
    const double * __restrict__ fxx,
    const double * __restrict__ fxy,
    const double * __restrict__ fxz,
    const double * __restrict__ fyy,
    const double * __restrict__ fyz,
    const double * __restrict__ fzz,
    double * __restrict__ Sphi_rhs,
    double * __restrict__ Spi_rhs,
    double * __restrict__ rho,
    double * __restrict__ Sx,
    double * __restrict__ Sy,
    double * __restrict__ Sz,
    double * __restrict__ Sxx,
    double * __restrict__ Sxy,
    double * __restrict__ Sxz,
    double * __restrict__ Syy,
    double * __restrict__ Syz,
    double * __restrict__ Szz,
    double escalar_a2)
{
    constexpr double PI_V = 3.141592653589793238462643383279502884;
    constexpr double TWO = 2.0;
    constexpr double HALF = 0.5;
    const double A2 = escalar_a2;

    /* Exponent scale of the potential; the same for every point. */
    const double sqpi3 = sqrt(PI_V / 3.0);

    const int first = blockIdx.x * blockDim.x + threadIdx.x;
    const int step = blockDim.x * gridDim.x;

    for (int p = first; p < d_gp.all; p += step) {
        const double c1 = chin1[p];
        const double a = alpn1[p];
        const double sx = Kx[p];
        const double sy = Ky[p];
        const double sz = Kz[p];
        const double sp = Spi[p];
        const double phi = Sphi[p];

        const double uxx = gupxx[p];
        const double uxy = gupxy[p];
        const double uxz = gupxz[p];
        const double uyy = gupyy[p];
        const double uyz = gupyz[p];
        const double uzz = gupzz[p];

        /* Potential V and the dV term used in the Spi equation. */
        const double e4 = exp(4.0 * sqpi3 * phi);
        const double em8 = exp(-8.0 * sqpi3 * phi);
        const double V = em8 * (1.0 - e4) * (1.0 - e4) / (32.0 * PI_V * A2);
        const double dV = (1.0 / A2 / 12.0) * sqrt(3.0 / PI_V) * em8 * (-1.0 + e4);

        Sphi_rhs[p] = a * sp;

        /* Inverse metric contracted with the second-derivative inputs. */
        const double second = uxx * fxx[p] + uyy * fyy[p] + uzz * fzz[p]
                            + TWO * (uxy * fxy[p] + uxz * fxz[p] + uyz * fyz[p]);

        /* First-derivative coefficients: Gam^k plus a chi-gradient correction. */
        const double cx = chix[p];
        const double cy = chiy[p];
        const double cz = chiz[p];
        const double two_c1 = TWO * c1;
        const double coef_x = Gamx[p] + (uxx * cx + uxy * cy + uxz * cz) / two_c1;
        const double coef_y = Gamy[p] + (uxy * cx + uyy * cy + uyz * cz) / two_c1;
        const double coef_z = Gamz[p] + (uxz * cx + uyz * cy + uzz * cz) / two_c1;
        const double wave = second - (coef_x * sx + coef_y * sy + coef_z * sz);

        /* Inverse metric contracted with the Lap gradient and sx/sy/sz. */
        const double lx = Lapx[p];
        const double ly = Lapy[p];
        const double lz = Lapz[p];
        const double lap_grad = uxx * lx * sx + uxy * lx * sy + uxz * lx * sz
                              + uxy * ly * sx + uyy * ly * sy + uyz * ly * sz
                              + uxz * lz * sx + uyz * lz * sy + uzz * lz * sz;

        const double pi_rhs = wave * a + lap_grad;
        Spi_rhs[p] = pi_rhs * c1 + a * (trK[p] * sp - dV);

        /* Source terms. */
        const double grad = HALF * (uxx * sx * sx + uyy * sy * sy + uzz * sz * sz)
                          + uxy * sx * sy + uxz * sx * sz + uyz * sy * sz;
        const double rho_v = c1 * grad + HALF * sp * sp + V;
        rho[p] = rho_v;

        Sx[p] = -sp * sx;
        Sy[p] = -sp * sy;
        Sz[p] = -sp * sz;

        const double f = (rho_v - sp * sp) / c1;
        Sxx[p] = sx * sx - f * gxx[p];
        Sxy[p] = sx * sy - f * gxy[p];
        Sxz[p] = sx * sz - f * gxz[p];
        Syy[p] = sy * sy - f * gyy[p];
        Syz[p] = sy * sz - f * gyz[p];
        Szz[p] = sz * sz - f * gzz[p];
    }
}
|
|
|
|
/*
 * Differentiates S_Sphi (first and second derivatives, all symmetry factors
 * 1.0) into the S_Sphi_* slots, then launches kern_escalar_sources on the
 * g_buf slots to fill S_Sphi_rhs, S_Spi_rhs and the S_rho / S_S* source slots.
 *
 *   all          number of grid points used to size the launch
 *   escalar_a2   potential parameter forwarded to the kernel
 *
 * The launch is asynchronous and is not error-checked here.
 */
static void gpu_escalar_sources(int all, double escalar_a2)
{
    auto &slot = g_buf.slot;

    gpu_fderivs(slot[S_Sphi], slot[S_Sphi_x], slot[S_Sphi_y], slot[S_Sphi_z],
                1.0, 1.0, 1.0, all);
    gpu_fdderivs(slot[S_Sphi],
                 slot[S_Sphi_xx], slot[S_Sphi_xy], slot[S_Sphi_xz],
                 slot[S_Sphi_yy], slot[S_Sphi_yz], slot[S_Sphi_zz],
                 1.0, 1.0, 1.0, all);

    kern_escalar_sources<<<grid((size_t)all), BLK>>>(
        /* scalar pair */
        slot[S_Sphi], slot[S_Spi],
        slot[S_alpn1], slot[S_chin1],
        /* metric and inverse metric */
        slot[S_gxx], slot[S_gxy], slot[S_gxz], slot[S_gyy], slot[S_gyz], slot[S_gzz],
        slot[S_gupxx], slot[S_gupxy], slot[S_gupxz],
        slot[S_gupyy], slot[S_gupyz], slot[S_gupzz],
        /* gradients and connection-related inputs */
        slot[S_chix], slot[S_chiy], slot[S_chiz],
        slot[S_Lapx], slot[S_Lapy], slot[S_Lapz],
        slot[S_trK], slot[S_Gamx], slot[S_Gamy], slot[S_Gamz],
        /* Sphi derivatives (kernel parameters Kx..Kz and fxx..fzz) */
        slot[S_Sphi_x], slot[S_Sphi_y], slot[S_Sphi_z],
        slot[S_Sphi_xx], slot[S_Sphi_xy], slot[S_Sphi_xz],
        slot[S_Sphi_yy], slot[S_Sphi_yz], slot[S_Sphi_zz],
        /* outputs */
        slot[S_Sphi_rhs], slot[S_Spi_rhs],
        slot[S_rho], slot[S_Sx], slot[S_Sy], slot[S_Sz],
        slot[S_Sxx], slot[S_Sxy], slot[S_Sxz], slot[S_Syy], slot[S_Syz], slot[S_Szz],
        escalar_a2);
}
|
|
|
|
/*
 * Point-wise right-hand sides of the electromagnetic variables
 * (Kpsi, Kphi, Ex..Ez, Bx..Bz) and the source terms they generate
 * (rho, Sx..Sz, Sxx..Szz).
 *
 * Launch layout: one thread per grid point, 1-D grid covering d_gp.all points.
 *
 * Naming of derivative inputs: the last letter is the derivative direction,
 * e.g. gxyz = d/dz of the xy metric input, betaxy = d/dy of betax,
 * Eyx = d/dx of Ey.
 *
 * All output arrays are overwritten.  The parameters betax, betay, betaz,
 * gxxx, gyyy and gzzz are not needed by any output and are kept only so the
 * signature stays unchanged.
 */
__global__ __launch_bounds__(128, 3)
void kern_em_rhs_sources(
    const double * __restrict__ chi,
    const double * __restrict__ dxx, const double * __restrict__ dxy, const double * __restrict__ dxz,
    const double * __restrict__ dyy, const double * __restrict__ dyz, const double * __restrict__ dzz,
    const double * __restrict__ Lap,
    const double * __restrict__ betax, const double * __restrict__ betay, const double * __restrict__ betaz,
    const double * __restrict__ trK,
    const double * __restrict__ Ex, const double * __restrict__ Ey, const double * __restrict__ Ez,
    const double * __restrict__ Bx, const double * __restrict__ By, const double * __restrict__ Bz,
    const double * __restrict__ Kpsi, const double * __restrict__ Kphi,
    const double * __restrict__ Jx, const double * __restrict__ Jy, const double * __restrict__ Jz,
    const double * __restrict__ qchar,
    const double * __restrict__ chix, const double * __restrict__ chiy, const double * __restrict__ chiz,
    const double * __restrict__ gxxx, const double * __restrict__ gxyx, const double * __restrict__ gxzx,
    const double * __restrict__ gyyx, const double * __restrict__ gyzx, const double * __restrict__ gzzx,
    const double * __restrict__ gxxy, const double * __restrict__ gxyy, const double * __restrict__ gxzy,
    const double * __restrict__ gyyy, const double * __restrict__ gyzy, const double * __restrict__ gzzy,
    const double * __restrict__ gxxz, const double * __restrict__ gxyz, const double * __restrict__ gxzz,
    const double * __restrict__ gyyz, const double * __restrict__ gyzz, const double * __restrict__ gzzz,
    const double * __restrict__ Lapx, const double * __restrict__ Lapy, const double * __restrict__ Lapz,
    const double * __restrict__ betaxx, const double * __restrict__ betaxy, const double * __restrict__ betaxz,
    const double * __restrict__ betayx, const double * __restrict__ betayy, const double * __restrict__ betayz,
    const double * __restrict__ betazx, const double * __restrict__ betazy, const double * __restrict__ betazz,
    const double * __restrict__ Kpsix, const double * __restrict__ Kpsiy, const double * __restrict__ Kpsiz,
    const double * __restrict__ Kphix, const double * __restrict__ Kphiy, const double * __restrict__ Kphiz,
    const double * __restrict__ Exx, const double * __restrict__ Exy, const double * __restrict__ Exz,
    const double * __restrict__ Eyx, const double * __restrict__ Eyy, const double * __restrict__ Eyz,
    const double * __restrict__ Ezx, const double * __restrict__ Ezy, const double * __restrict__ Ezz,
    const double * __restrict__ Bxx, const double * __restrict__ Bxy, const double * __restrict__ Bxz,
    const double * __restrict__ Byx, const double * __restrict__ Byy, const double * __restrict__ Byz,
    const double * __restrict__ Bzx, const double * __restrict__ Bzy, const double * __restrict__ Bzz,
    double * __restrict__ Kpsi_rhs, double * __restrict__ Kphi_rhs,
    double * __restrict__ Ex_rhs, double * __restrict__ Ey_rhs, double * __restrict__ Ez_rhs,
    double * __restrict__ Bx_rhs, double * __restrict__ By_rhs, double * __restrict__ Bz_rhs,
    double * __restrict__ rho, double * __restrict__ Sx, double * __restrict__ Sy, double * __restrict__ Sz,
    double * __restrict__ Sxx, double * __restrict__ Sxy, double * __restrict__ Sxz,
    double * __restrict__ Syy, double * __restrict__ Syz, double * __restrict__ Szz)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= d_gp.all) return;

    constexpr double ONE = 1.0, TWO = 2.0, FOUR = 4.0, F3o2 = 1.5, EIT = 8.0, kappa = 1.0;
    constexpr double PI = 3.141592653589793238462643383279502884;

    /* Three components along x, y, z. */
    struct Triple { double x, y, z; };

    /* a.x*b.x + a.y*b.y + a.z*b.z, summed left to right. */
    auto dot = [](const Triple &a, const Triple &b) {
        return a.x * b.x + a.y * b.y + a.z * b.z;
    };

    const double alpn1 = Lap[i] + ONE;
    const double chin1 = chi[i] + ONE;
    const double sqc = sqrt(chin1);
    const double chi3o2 = sqc * sqc * sqc;

    /* Metric inputs rescaled by 1 / chin1 (unit offset on the diagonal). */
    const double gxxv = (dxx[i] + ONE) / chin1;
    const double gxyv = dxy[i] / chin1;
    const double gxzv = dxz[i] / chin1;
    const double gyyv = (dyy[i] + ONE) / chin1;
    const double gyzv = dyz[i] / chin1;
    const double gzzv = (dzz[i] + ONE) / chin1;

    const Triple rowx = {gxxv, gxyv, gxzv};
    const Triple rowy = {gxyv, gyyv, gyzv};
    const Triple rowz = {gxzv, gyzv, gzzv};

    /*
     * Derivative of one rescaled metric row: (d - g * dchi) / chin1 per
     * component, where d0..d2 are the raw derivative inputs of the row's
     * three components and dchi is the chi derivative in the same direction.
     */
    auto row_deriv = [&](double d0, double d1, double d2,
                         const Triple &row, double dchi) -> Triple {
        const Triple out = {(d0 - row.x * dchi) / chin1,
                            (d1 - row.y * dchi) / chin1,
                            (d2 - row.z * dchi) / chin1};
        return out;
    };

    /* Only the six (row, direction) pairs with row != direction are used. */
    const Triple dx_rowy = row_deriv(gxyx[i], gyyx[i], gyzx[i], rowy, chix[i]);
    const Triple dx_rowz = row_deriv(gxzx[i], gyzx[i], gzzx[i], rowz, chix[i]);
    const Triple dy_rowx = row_deriv(gxxy[i], gxyy[i], gxzy[i], rowx, chiy[i]);
    const Triple dy_rowz = row_deriv(gxzy[i], gyzy[i], gzzy[i], rowz, chiy[i]);
    const Triple dz_rowx = row_deriv(gxxz[i], gxyz[i], gxzz[i], rowx, chiz[i]);
    const Triple dz_rowy = row_deriv(gxyz[i], gyyz[i], gyzz[i], rowy, chiz[i]);

    /* Inverse of the rescaled metric. */
    const double det = gxxv*gyyv*gzzv + gxyv*gyzv*gxzv + gxzv*gxyv*gyzv
                     - gxzv*gyyv*gxzv - gxyv*gxyv*gzzv - gxxv*gyzv*gyzv;
    const double gupxx =  (gyyv*gzzv - gyzv*gyzv) / det;
    const double gupxy = -(gxyv*gzzv - gyzv*gxzv) / det;
    const double gupxz =  (gxyv*gyzv - gyyv*gxzv) / det;
    const double gupyy =  (gxxv*gzzv - gxzv*gxzv) / det;
    const double gupyz = -(gxxv*gyzv - gxyv*gxzv) / det;
    const double gupzz =  (gxxv*gyyv - gxyv*gxyv) / det;

    const Triple upx = {gupxx, gupxy, gupxz};
    const Triple upy = {gupxy, gupyy, gupyz};
    const Triple upz = {gupxz, gupyz, gupzz};

    /* Fields and their derivatives, grouped by derivative direction. */
    const Triple Ev = {Ex[i], Ey[i], Ez[i]};
    const Triple Bv = {Bx[i], By[i], Bz[i]};
    const Triple dEdx = {Exx[i], Eyx[i], Ezx[i]};
    const Triple dEdy = {Exy[i], Eyy[i], Ezy[i]};
    const Triple dEdz = {Exz[i], Eyz[i], Ezz[i]};
    const Triple dBdx = {Bxx[i], Byx[i], Bzx[i]};
    const Triple dBdy = {Bxy[i], Byy[i], Bzy[i]};
    const Triple dBdz = {Bxz[i], Byz[i], Bzz[i]};

    /* Gradients of each shift component, of chi and of the two K scalars. */
    const Triple dbetax = {betaxx[i], betaxy[i], betaxz[i]};
    const Triple dbetay = {betayx[i], betayy[i], betayz[i]};
    const Triple dbetaz = {betazx[i], betazy[i], betazz[i]};
    const Triple dchi = {chix[i], chiy[i], chiz[i]};
    const Triple dKpsi = {Kpsix[i], Kpsiy[i], Kpsiz[i]};
    const Triple dKphi = {Kphix[i], Kphiy[i], Kphiz[i]};

    const double lapx = Lapx[i];
    const double lapy = Lapy[i];
    const double lapz = Lapz[i];
    const double trk = trK[i];

    /*
     * Derivative of alpn1 * (row . v) in one direction, expanded by the
     * product rule: dlap is the Lap derivative, dv the derivative of v and
     * drow the derivative of the metric row, all in that direction.
     */
    auto curl_term = [&](const Triple &row, const Triple &v, double dlap,
                         const Triple &dv, const Triple &drow) {
        return dot(row, v) * dlap + alpn1 * dot(row, dv) + alpn1 * dot(v, drow);
    };

    Ex_rhs[i] = alpn1 * trk * Ev.x - dot(Ev, dbetax) - FOUR * PI * alpn1 * Jx[i]
              - alpn1 * dot(upx, dKpsi)
              + chi3o2 * (curl_term(rowz, Bv, lapy, dBdy, dy_rowz)
                        - curl_term(rowy, Bv, lapz, dBdz, dz_rowy));
    Ey_rhs[i] = alpn1 * trk * Ev.y - dot(Ev, dbetay) - FOUR * PI * alpn1 * Jy[i]
              - alpn1 * dot(upy, dKpsi)
              + chi3o2 * (curl_term(rowx, Bv, lapz, dBdz, dz_rowx)
                        - curl_term(rowz, Bv, lapx, dBdx, dx_rowz));
    Ez_rhs[i] = alpn1 * trk * Ev.z - dot(Ev, dbetaz) - FOUR * PI * alpn1 * Jz[i]
              - alpn1 * dot(upz, dKpsi)
              + chi3o2 * (curl_term(rowy, Bv, lapx, dBdx, dx_rowy)
                        - curl_term(rowx, Bv, lapy, dBdy, dy_rowx));

    /* Same structure with E and B exchanged, no J term, opposite curl sign. */
    Bx_rhs[i] = alpn1 * trk * Bv.x - dot(Bv, dbetax)
              - alpn1 * dot(upx, dKphi)
              - chi3o2 * (curl_term(rowz, Ev, lapy, dEdy, dy_rowz)
                        - curl_term(rowy, Ev, lapz, dEdz, dz_rowy));
    By_rhs[i] = alpn1 * trk * Bv.y - dot(Bv, dbetay)
              - alpn1 * dot(upy, dKphi)
              - chi3o2 * (curl_term(rowx, Ev, lapz, dEdz, dz_rowx)
                        - curl_term(rowz, Ev, lapx, dEdx, dx_rowz));
    Bz_rhs[i] = alpn1 * trk * Bv.z - dot(Bv, dbetaz)
              - alpn1 * dot(upz, dKphi)
              - chi3o2 * (curl_term(rowy, Ev, lapx, dEdx, dx_rowy)
                        - curl_term(rowx, Ev, lapy, dEdy, dy_rowx));

    Kpsi_rhs[i] = FOUR * PI * alpn1 * qchar[i] - alpn1 * kappa * Kpsi[i]
                - alpn1 * (dEdx.x + dEdy.y + dEdz.z - F3o2 / chin1 * dot(dchi, Ev));
    Kphi_rhs[i] = -alpn1 * kappa * Kphi[i]
                - alpn1 * (dBdx.x + dBdy.y + dBdz.z - F3o2 / chin1 * dot(dchi, Bv));

    /* Source terms. */
    const double lrho =
        (gxxv * (Ev.x * Ev.x + Bv.x * Bv.x)
       + gyyv * (Ev.y * Ev.y + Bv.y * Bv.y)
       + gzzv * (Ev.z * Ev.z + Bv.z * Bv.z)
       + TWO * (gxyv * (Ev.x * Ev.y + Bv.x * Bv.y)
              + gxzv * (Ev.x * Ev.z + Bv.x * Bv.z)
              + gyzv * (Ev.y * Ev.z + Bv.y * Bv.z))) / EIT / PI;
    rho[i] = lrho;

    Sx[i] = (Ev.y * Bv.z - Ev.z * Bv.y) / FOUR / PI / chi3o2;
    Sy[i] = (Ev.z * Bv.x - Ev.x * Bv.z) / FOUR / PI / chi3o2;
    Sz[i] = (Ev.x * Bv.y - Ev.y * Bv.x) / FOUR / PI / chi3o2;

    /* E and B contracted with the rescaled metric rows. */
    const double lEx = dot(rowx, Ev), lEy = dot(rowy, Ev), lEz = dot(rowz, Ev);
    const double lBx = dot(rowx, Bv), lBy = dot(rowy, Bv), lBz = dot(rowz, Bv);

    Sxx[i] = lrho * gxxv - (lEx * lEx + lBx * lBx) / FOUR / PI;
    Sxy[i] = lrho * gxyv - (lEx * lEy + lBx * lBy) / FOUR / PI;
    Sxz[i] = lrho * gxzv - (lEx * lEz + lBx * lBz) / FOUR / PI;
    Syy[i] = lrho * gyyv - (lEy * lEy + lBy * lBy) / FOUR / PI;
    Syz[i] = lrho * gyzv - (lEy * lEz + lBy * lBz) / FOUR / PI;
    Szz[i] = lrho * gzzv - (lEz * lEz + lBz * lBz) / FOUR / PI;
}
|
|
|
|
/*
 * Runs kern_lopsided_kodis_batched on the eight electromagnetic fields
 * (Kpsi, Kphi, Ex, Ey, Ez, Bx, By, Bz), advected with the
 * S_betax / S_betay / S_betaz slots.  Each field is both its own advection
 * and dissipation source and accumulates into its *_rhs slot.
 *
 *   eps_val   dissipation strength
 *   all       number of grid points used to size the launch
 *
 * The launch is asynchronous and is not error-checked here.
 */
static void gpu_lopsided_kodis_em_batch(double eps_val, int all)
{
    constexpr int kNumEmFields = 8;

    /* One row per field: state slot, RHS slot, symmetry signs for x, y, z. */
    struct EmFieldSpec {
        int state_slot;
        int rhs_slot;
        int sign[3];
    };
    const EmFieldSpec specs[kNumEmFields] = {
        {S_EM_Kpsi, S_EM_Kpsi_rhs, { 1,  1,  1}},
        {S_EM_Kphi, S_EM_Kphi_rhs, { 1,  1,  1}},
        {S_EM_Ex,   S_EM_Ex_rhs,   {-1,  1,  1}},
        {S_EM_Ey,   S_EM_Ey_rhs,   { 1, -1,  1}},
        {S_EM_Ez,   S_EM_Ez_rhs,   { 1,  1, -1}},
        {S_EM_Bx,   S_EM_Bx_rhs,   { 1, -1, -1}},
        {S_EM_By,   S_EM_By_rhs,   {-1,  1, -1}},
        {S_EM_Bz,   S_EM_Bz_rhs,   {-1, -1,  1}},
    };

    LopsidedKodisTables tables = {};
    for (int f = 0; f < kNumEmFields; ++f) {
        tables.adv_fields[f] = g_buf.slot[specs[f].state_slot];
        tables.ko_fields[f] = g_buf.slot[specs[f].state_slot];
        tables.rhs_fields[f] = g_buf.slot[specs[f].rhs_slot];
        for (int axis = 0; axis < 3; ++axis) {
            tables.soa_signs[3 * f + axis] = specs[f].sign[axis];
        }
    }

    const dim3 blocks((unsigned int)grid((size_t)all), 8u);
    kern_lopsided_kodis_batched<<<blocks, BLK>>>(
        g_buf.slot[S_betax], g_buf.slot[S_betay], g_buf.slot[S_betaz], tables, eps_val);
}
|
|
|
|
/*
 * Electromagnetic RHS pipeline for one evaluation:
 *   1. gpu_fderivs_batch: first derivatives of 19 fields into their
 *      x/y/z derivative slots,
 *   2. kern_em_rhs_sources: point-wise RHS and source terms,
 *   3. gpu_lopsided_kodis_em_batch: advection and dissipation added to the
 *      EM right-hand sides.
 *
 *   all   number of grid points used to size the launches
 *   eps   dissipation strength forwarded to step 3
 *
 * The kernel launch is asynchronous and is not error-checked here.
 */
static void gpu_em_rhs_sources(int all, double eps)
{
    auto &slot = g_buf.slot;

    constexpr int kNumFields = 19;

    /* One row per differentiated field: source slot, d/dx, d/dy, d/dz
       destination slots, and the symmetry signs for x, y, z. */
    struct DerivSpec {
        int src;
        int dx, dy, dz;
        int sign[3];
    };
    const DerivSpec specs[kNumFields] = {
        {S_Lap,     S_Lapx,     S_Lapy,     S_Lapz,     { 1,  1,  1}},
        {S_betax,   S_betaxx,   S_betaxy,   S_betaxz,   {-1,  1,  1}},
        {S_betay,   S_betayx,   S_betayy,   S_betayz,   { 1, -1,  1}},
        {S_betaz,   S_betazx,   S_betazy,   S_betazz,   { 1,  1, -1}},
        {S_chi,     S_chix,     S_chiy,     S_chiz,     { 1,  1,  1}},
        {S_dxx,     S_gxxx,     S_gxxy,     S_gxxz,     { 1,  1,  1}},
        {S_gxy,     S_gxyx,     S_gxyy,     S_gxyz,     {-1, -1,  1}},
        {S_gxz,     S_gxzx,     S_gxzy,     S_gxzz,     {-1,  1, -1}},
        {S_dyy,     S_gyyx,     S_gyyy,     S_gyyz,     { 1,  1,  1}},
        {S_gyz,     S_gyzx,     S_gyzy,     S_gyzz,     { 1, -1, -1}},
        {S_dzz,     S_gzzx,     S_gzzy,     S_gzzz,     { 1,  1,  1}},
        {S_EM_Kpsi, S_EM_Kpsix, S_EM_Kpsiy, S_EM_Kpsiz, { 1,  1,  1}},
        {S_EM_Kphi, S_EM_Kphix, S_EM_Kphiy, S_EM_Kphiz, { 1,  1,  1}},
        {S_EM_Ex,   S_EM_Exx,   S_EM_Exy,   S_EM_Exz,   {-1,  1,  1}},
        {S_EM_Ey,   S_EM_Eyx,   S_EM_Eyy,   S_EM_Eyz,   { 1, -1,  1}},
        {S_EM_Ez,   S_EM_Ezx,   S_EM_Ezy,   S_EM_Ezz,   { 1,  1, -1}},
        {S_EM_Bx,   S_EM_Bxx,   S_EM_Bxy,   S_EM_Bxz,   { 1, -1, -1}},
        {S_EM_By,   S_EM_Byx,   S_EM_Byy,   S_EM_Byz,   {-1,  1, -1}},
        {S_EM_Bz,   S_EM_Bzx,   S_EM_Bzy,   S_EM_Bzz,   {-1, -1,  1}},
    };

    /* Flatten the rows into the parallel arrays gpu_fderivs_batch takes. */
    double *src[kNumFields];
    double *fx[kNumFields];
    double *fy[kNumFields];
    double *fz[kNumFields];
    int soa[3 * kNumFields];
    for (int f = 0; f < kNumFields; ++f) {
        src[f] = slot[specs[f].src];
        fx[f] = slot[specs[f].dx];
        fy[f] = slot[specs[f].dy];
        fz[f] = slot[specs[f].dz];
        for (int axis = 0; axis < 3; ++axis) {
            soa[3 * f + axis] = specs[f].sign[axis];
        }
    }
    gpu_fderivs_batch(19, src, fx, fy, fz, soa, all);

    kern_em_rhs_sources<<<grid((size_t)all), BLK>>>(
        /* geometry and gauge */
        slot[S_chi],
        slot[S_dxx], slot[S_gxy], slot[S_gxz], slot[S_dyy], slot[S_gyz], slot[S_dzz],
        slot[S_Lap], slot[S_betax], slot[S_betay], slot[S_betaz], slot[S_trK],
        /* EM fields, currents and charge */
        slot[S_EM_Ex], slot[S_EM_Ey], slot[S_EM_Ez],
        slot[S_EM_Bx], slot[S_EM_By], slot[S_EM_Bz],
        slot[S_EM_Kpsi], slot[S_EM_Kphi],
        slot[S_EM_Jx], slot[S_EM_Jy], slot[S_EM_Jz], slot[S_EM_qchar],
        /* chi gradient */
        slot[S_chix], slot[S_chiy], slot[S_chiz],
        /* metric derivatives: x, then y, then z */
        slot[S_gxxx], slot[S_gxyx], slot[S_gxzx], slot[S_gyyx], slot[S_gyzx], slot[S_gzzx],
        slot[S_gxxy], slot[S_gxyy], slot[S_gxzy], slot[S_gyyy], slot[S_gyzy], slot[S_gzzy],
        slot[S_gxxz], slot[S_gxyz], slot[S_gxzz], slot[S_gyyz], slot[S_gyzz], slot[S_gzzz],
        /* Lap and shift derivatives */
        slot[S_Lapx], slot[S_Lapy], slot[S_Lapz],
        slot[S_betaxx], slot[S_betaxy], slot[S_betaxz],
        slot[S_betayx], slot[S_betayy], slot[S_betayz],
        slot[S_betazx], slot[S_betazy], slot[S_betazz],
        /* derivatives of the EM variables */
        slot[S_EM_Kpsix], slot[S_EM_Kpsiy], slot[S_EM_Kpsiz],
        slot[S_EM_Kphix], slot[S_EM_Kphiy], slot[S_EM_Kphiz],
        slot[S_EM_Exx], slot[S_EM_Exy], slot[S_EM_Exz],
        slot[S_EM_Eyx], slot[S_EM_Eyy], slot[S_EM_Eyz],
        slot[S_EM_Ezx], slot[S_EM_Ezy], slot[S_EM_Ezz],
        slot[S_EM_Bxx], slot[S_EM_Bxy], slot[S_EM_Bxz],
        slot[S_EM_Byx], slot[S_EM_Byy], slot[S_EM_Byz],
        slot[S_EM_Bzx], slot[S_EM_Bzy], slot[S_EM_Bzz],
        /* outputs: right-hand sides */
        slot[S_EM_Kpsi_rhs], slot[S_EM_Kphi_rhs],
        slot[S_EM_Ex_rhs], slot[S_EM_Ey_rhs], slot[S_EM_Ez_rhs],
        slot[S_EM_Bx_rhs], slot[S_EM_By_rhs], slot[S_EM_Bz_rhs],
        /* outputs: source terms */
        slot[S_rho], slot[S_Sx], slot[S_Sy], slot[S_Sz],
        slot[S_Sxx], slot[S_Sxy], slot[S_Sxz], slot[S_Syy], slot[S_Syz], slot[S_Szz]);

    gpu_lopsided_kodis_em_batch(eps, all);
}
|
|
|
|
/*
 * One Runge-Kutta stage update for a single field, in place.
 *
 * Launch layout: 1-D grid-stride loop over the d_gp.all grid points.
 *
 *   f0         field values at the start of the step (read only)
 *   frhs       in: right-hand side of this stage; out: field value to use next
 *   accum      running weighted sum of stage right-hand sides
 *   dT         time step
 *   rk4_stage  0, 1, 2 select the first three stages; any other value selects
 *              the final combination f0 + dT/6 * (accum + rhs)
 */
__global__ void kern_rk4_finalize(const double * __restrict__ f0,
                                  double * __restrict__ frhs,
                                  double * __restrict__ accum,
                                  double dT,
                                  int rk4_stage)
{
    const int first = blockIdx.x * blockDim.x + threadIdx.x;
    const int step = blockDim.x * gridDim.x;

    for (int p = first; p < d_gp.all; p += step) {
        const double k = frhs[p];

        if (rk4_stage == 0) {
            /* First stage starts the sum and takes a half step. */
            accum[p] = k;
            frhs[p] = f0[p] + 0.5 * dT * k;
        } else if (rk4_stage == 1) {
            accum[p] += 2.0 * k;
            frhs[p] = f0[p] + 0.5 * dT * k;
        } else if (rk4_stage == 2) {
            /* Third stage takes a full step. */
            accum[p] += 2.0 * k;
            frhs[p] = f0[p] + dT * k;
        } else {
            /* Final stage: accum is read but not updated. */
            frhs[p] = f0[p] + (dT / 6.0) * (accum[p] + k);
        }
    }
}
|
|
|
|
/*
 * One Runge-Kutta stage update for a batch of fields, in place.
 *
 * Launch layout: blockIdx.x / threadIdx.x enumerate the d_gp.all grid points
 * (one thread per point); blockIdx.y selects the field entry of `tables`.
 *
 *   tables     per-field pointers: step-start values, RHS / result buffer,
 *              and stage accumulator
 *   dT         time step
 *   rk4_stage  0, 1, 2 select the first three stages; any other value selects
 *              the final combination f0 + dT/6 * (accum + rhs)
 *   chitiny    lower bound applied to the result of field 0 only
 */
__global__ __launch_bounds__(128, 4)
void kern_rk4_finalize_batched(Rk4FinalizeTables tables,
                               double dT,
                               int rk4_stage,
                               double chitiny)
{
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= d_gp.all) return;

    const int field = blockIdx.y;
    const double *start = tables.f0_fields[field];
    double *out = tables.rhs_fields[field];
    double *sum = tables.accum_fields[field];

    const double k = out[tid];
    double next;

    if (rk4_stage == 0) {
        sum[tid] = k;
        next = start[tid] + 0.5 * dT * k;
    } else if (rk4_stage == 1) {
        sum[tid] += 2.0 * k;
        next = start[tid] + 0.5 * dT * k;
    } else if (rk4_stage == 2) {
        sum[tid] += 2.0 * k;
        next = start[tid] + dT * k;
    } else {
        next = start[tid] + (dT / 6.0) * (sum[tid] + k);
    }

    /* Floor for the first field of the table; a NaN result is left as is. */
    if (field == 0 && next < chitiny) next = chitiny;

    out[tid] = next;
}
|
|
|
|
/*
 * Applies one Runge-Kutta stage update to the BSSN_STATE_COUNT state fields
 * with a single batched launch.
 *
 *   ctx        supplies the step-start (d_state0) and accumulator (d_accum)
 *              pointers, indexed like k_state_rhs_slots
 *   all        number of grid points used to size the launch
 *   dT         time step
 *   rk4_stage  stage selector forwarded to the kernel
 *   chitiny    lower bound the kernel applies to field 0
 *
 * The launch is asynchronous and is not error-checked here.
 */
static void gpu_rk4_finalize_batch(const StepContext &ctx,
                                   size_t all,
                                   double dT,
                                   int rk4_stage,
                                   double chitiny)
{
    Rk4FinalizeTables fields = {};
    for (int f = 0; f < BSSN_STATE_COUNT; ++f) {
        fields.f0_fields[f] = ctx.d_state0[f];
        fields.rhs_fields[f] = g_buf.slot[k_state_rhs_slots[f]];
        fields.accum_fields[f] = ctx.d_accum[f];
    }

    /* x covers the grid points, y has one slice per state field. */
    const unsigned int blocks_x = (unsigned int)grid(all);
    const dim3 blocks(blocks_x, (unsigned int)BSSN_STATE_COUNT);
    kern_rk4_finalize_batched<<<blocks, BLK>>>(fields, dT, rk4_stage, chitiny);
}
|
|
|
|
/*
 * Host launcher: RK4 stage update for the scalar-field variant.
 *
 * The first BSSN_STATE_COUNT fields go through one batched launch, exactly as
 * in the plain BSSN case.  The two additional fields (state indices 24 and 25,
 * RHS slots S_Sphi_rhs and S_Spi_rhs) are then advanced with the single-field
 * kernel; no chitiny floor is applied to them.
 * NOTE(review): the literal indices 24/25 presumably equal BSSN_STATE_COUNT
 * and BSSN_STATE_COUNT + 1 -- confirm before replacing them.
 */
static void gpu_escalar_rk4_finalize_batch(const StepContext &ctx,
                                           size_t all,
                                           double dT,
                                           int rk4_stage,
                                           double chitiny)
{
    Rk4FinalizeTables tbl = {};
    for (int f = 0; f < BSSN_STATE_COUNT; ++f) {
        tbl.f0_fields[f]    = ctx.d_state0[f];
        tbl.rhs_fields[f]   = g_buf.slot[k_state_rhs_slots[f]];
        tbl.accum_fields[f] = ctx.d_accum[f];
    }

    /* Geometry fields: one grid row per field. */
    const dim3 batched_grid((unsigned int)grid(all), (unsigned int)BSSN_STATE_COUNT);
    kern_rk4_finalize_batched<<<batched_grid, BLK>>>(tbl, dT, rk4_stage, chitiny);

    /* Scalar field Sphi. */
    kern_rk4_finalize<<<grid(all), BLK>>>(
        ctx.d_state0[24], g_buf.slot[S_Sphi_rhs], ctx.d_accum[24], dT, rk4_stage);

    /* Scalar field Spi. */
    kern_rk4_finalize<<<grid(all), BLK>>>(
        ctx.d_state0[25], g_buf.slot[S_Spi_rhs], ctx.d_accum[25], dT, rk4_stage);
}
|
|
|
|
/*
 * Host launcher: RK4 stage update for the electromagnetic variant.
 *
 * All BSSN_EM_STATE_COUNT fields are advanced by a single batched launch; the
 * RHS buffers are looked up through k_em_state_rhs_slots.
 * NOTE(review): this reuses Rk4FinalizeTables, so that struct's arrays must
 * hold at least BSSN_EM_STATE_COUNT entries -- confirm at the declaration.
 */
static void gpu_em_rk4_finalize_batch(const StepContext &ctx,
                                      size_t all,
                                      double dT,
                                      int rk4_stage,
                                      double chitiny)
{
    Rk4FinalizeTables tbl = {};
    for (int f = 0; f < BSSN_EM_STATE_COUNT; ++f) {
        tbl.f0_fields[f]    = ctx.d_state0[f];
        tbl.rhs_fields[f]   = g_buf.slot[k_em_state_rhs_slots[f]];
        tbl.accum_fields[f] = ctx.d_accum[f];
    }

    const unsigned int blocks_x = (unsigned int)grid(all);
    const unsigned int blocks_y = (unsigned int)BSSN_EM_STATE_COUNT;
    kern_rk4_finalize_batched<<<dim3(blocks_x, blocks_y), BLK>>>(tbl, dT, rk4_stage, chitiny);
}
|
|
|
|
/*
 * Copy src -> dst on the selected outer faces of the patch, for a batch of
 * fields.
 *
 * Launch layout: blockIdx.y selects the field; the x dimension covers the
 * d_gp.all linear indices.  The x index strides by blockDim.x * gridDim.x, so
 * the kernel is correct for any number of x-blocks instead of requiring one
 * thread per point.
 *
 * A linear index p is decoded as (i, j, k) with i fastest using the extents
 * d_gp.ex[0..2].  A point is copied when it lies on a face whose touch_* flag
 * is non-zero: i == 0 / nx-1, j == 0 / ny-1, k == 0 / nz-1.  Points on no
 * selected face are left untouched.
 */
__global__ __launch_bounds__(128, 4)
void kern_copy_patch_boundary_batched(PatchBoundaryTables tables,
                                      int touch_xmin, int touch_xmax,
                                      int touch_ymin, int touch_ymax,
                                      int touch_zmin, int touch_zmax)
{
    /* Loop-invariant extents. */
    const int nx = d_gp.ex[0];
    const int ny = d_gp.ex[1];
    const int nz = d_gp.ex[2];
    const int field = blockIdx.y;

    for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
         tid < d_gp.all;
         tid += blockDim.x * gridDim.x)
    {
        const int i0 = tid % nx;
        const int j0 = (tid / nx) % ny;
        const int k0 = tid / (nx * ny);

        const bool on_boundary =
            (touch_xmin && i0 == 0) ||
            (touch_xmax && i0 == nx - 1) ||
            (touch_ymin && j0 == 0) ||
            (touch_ymax && j0 == ny - 1) ||
            (touch_zmin && k0 == 0) ||
            (touch_zmax && k0 == nz - 1);
        if (!on_boundary) continue;

        tables.dst_fields[field][tid] = tables.src_fields[field][tid];
    }
}
|
|
|
|
/*
 * Host launcher: for each of the BSSN_STATE_COUNT fields, copy the input
 * buffer (k_state_input_slots) into the RHS buffer (k_state_rhs_slots) on the
 * patch faces whose touch_* flag is non-zero.
 *
 * Returns without launching anything when no face is flagged.
 */
static void gpu_copy_patch_boundary_batch(int all,
                                          int touch_xmin, int touch_xmax,
                                          int touch_ymin, int touch_ymax,
                                          int touch_zmin, int touch_zmax)
{
    const bool any_face = touch_xmin || touch_xmax ||
                          touch_ymin || touch_ymax ||
                          touch_zmin || touch_zmax;
    if (!any_face)
        return;

    PatchBoundaryTables tbl = {};
    for (int f = 0; f < BSSN_STATE_COUNT; ++f) {
        tbl.src_fields[f] = g_buf.slot[k_state_input_slots[f]];
        tbl.dst_fields[f] = g_buf.slot[k_state_rhs_slots[f]];
    }

    const unsigned int blocks_x = (unsigned int)grid((size_t)all);
    const unsigned int blocks_y = (unsigned int)BSSN_STATE_COUNT;
    kern_copy_patch_boundary_batched<<<dim3(blocks_x, blocks_y), BLK>>>(
        tbl,
        touch_xmin, touch_xmax,
        touch_ymin, touch_ymax,
        touch_zmin, touch_zmax);
}
|
|
|
|
/*
 * Overwrite out_fields with f0_fields on the selected outer faces of the
 * patch, for a batch of fields.
 *
 * Launch layout: blockIdx.y selects the field; the x dimension covers the
 * d_gp.all linear indices.  The x index strides by blockDim.x * gridDim.x, so
 * the kernel is correct for any number of x-blocks instead of requiring one
 * thread per point.
 *
 * A linear index p is decoded as (i, j, k) with i fastest using the extents
 * d_gp.ex[0..2]; a point is restored when it lies on a face whose touch_*
 * flag is non-zero.  All other points are left untouched.
 */
__global__ __launch_bounds__(128, 4)
void kern_escalar_restore_patch_boundary_batched(EScalarBoundaryTables tables,
                                                 int touch_xmin, int touch_xmax,
                                                 int touch_ymin, int touch_ymax,
                                                 int touch_zmin, int touch_zmax)
{
    /* Loop-invariant extents. */
    const int nx = d_gp.ex[0];
    const int ny = d_gp.ex[1];
    const int nz = d_gp.ex[2];
    const int field = blockIdx.y;

    for (int tid = blockIdx.x * blockDim.x + threadIdx.x;
         tid < d_gp.all;
         tid += blockDim.x * gridDim.x)
    {
        const int i0 = tid % nx;
        const int j0 = (tid / nx) % ny;
        const int k0 = tid / (nx * ny);

        const bool on_boundary =
            (touch_xmin && i0 == 0) ||
            (touch_xmax && i0 == nx - 1) ||
            (touch_ymin && j0 == 0) ||
            (touch_ymax && j0 == ny - 1) ||
            (touch_zmin && k0 == 0) ||
            (touch_zmax && k0 == nz - 1);
        if (!on_boundary) continue;

        tables.out_fields[field][tid] = tables.f0_fields[field][tid];
    }
}
|
|
|
|
/*
 * Host launcher (scalar-field variant): on every flagged patch face, reset the
 * RHS buffers (k_escalar_state_rhs_slots) of all BSSN_ESCALAR_STATE_COUNT
 * fields to the step-start state held in ctx.d_state0.
 *
 * Returns without launching anything when no face is flagged.
 */
static void gpu_escalar_restore_patch_boundary_batch(const StepContext &ctx,
                                                     int all,
                                                     int touch_xmin, int touch_xmax,
                                                     int touch_ymin, int touch_ymax,
                                                     int touch_zmin, int touch_zmax)
{
    const bool any_face = touch_xmin || touch_xmax ||
                          touch_ymax || touch_ymin ||
                          touch_zmin || touch_zmax;
    if (!any_face)
        return;

    EScalarBoundaryTables tbl = {};
    for (int f = 0; f < BSSN_ESCALAR_STATE_COUNT; ++f) {
        tbl.f0_fields[f]  = ctx.d_state0[f];
        tbl.out_fields[f] = g_buf.slot[k_escalar_state_rhs_slots[f]];
    }

    const unsigned int blocks_x = (unsigned int)grid((size_t)all);
    const unsigned int blocks_y = (unsigned int)BSSN_ESCALAR_STATE_COUNT;
    kern_escalar_restore_patch_boundary_batched<<<dim3(blocks_x, blocks_y), BLK>>>(
        tbl,
        touch_xmin, touch_xmax, touch_ymin, touch_ymax, touch_zmin, touch_zmax);
}
|
|
|
|
/*
 * Host launcher (electromagnetic variant): on every flagged patch face, reset
 * the RHS buffers (k_em_state_rhs_slots) of all BSSN_EM_STATE_COUNT fields to
 * the step-start state held in ctx.d_state0.
 *
 * Returns without launching anything when no face is flagged.
 * NOTE(review): this reuses EScalarBoundaryTables and the escalar kernel, so
 * that struct's arrays must hold at least BSSN_EM_STATE_COUNT entries --
 * confirm at the declaration.
 */
static void gpu_em_restore_patch_boundary_batch(const StepContext &ctx,
                                                int all,
                                                int touch_xmin, int touch_xmax,
                                                int touch_ymin, int touch_ymax,
                                                int touch_zmin, int touch_zmax)
{
    const bool any_face = touch_xmin || touch_xmax ||
                          touch_ymax || touch_ymin ||
                          touch_zmin || touch_zmax;
    if (!any_face)
        return;

    EScalarBoundaryTables tbl = {};
    for (int f = 0; f < BSSN_EM_STATE_COUNT; ++f) {
        tbl.f0_fields[f]  = ctx.d_state0[f];
        tbl.out_fields[f] = g_buf.slot[k_em_state_rhs_slots[f]];
    }

    const unsigned int blocks_x = (unsigned int)grid((size_t)all);
    const unsigned int blocks_y = (unsigned int)BSSN_EM_STATE_COUNT;
    kern_escalar_restore_patch_boundary_batched<<<dim3(blocks_x, blocks_y), BLK>>>(
        tbl,
        touch_xmin, touch_xmax, touch_ymin, touch_ymax, touch_zmin, touch_zmax);
}
|
|
|
|
/*
 * Enforce the algebraic constraints on the conformal metric and on A_ij,
 * point by point and in place.
 *
 *  1. Form the metric g = (dxx+1, gxy, gxz, dyy+1, gyz, dzz+1), compute its
 *     determinant and rescale every component by det^(-1/3), so the rescaled
 *     metric has unit determinant.  The diagonal is stored back minus one.
 *  2. Build the cofactor matrix of the rescaled metric (used as the inverse;
 *     no division by the determinant is performed), take the trace of A with
 *     it, and subtract (1/3) * g_ij * trace from each A_ij.
 *
 * All twelve arrays are read and written.  Each thread strides over d_gp.all
 * entries, so any 1-D launch shape works.
 */
__global__ void kern_enforce_ga_cuda(double * __restrict__ dxx,
                                     double * __restrict__ gxy,
                                     double * __restrict__ gxz,
                                     double * __restrict__ dyy,
                                     double * __restrict__ gyz,
                                     double * __restrict__ dzz,
                                     double * __restrict__ Axx,
                                     double * __restrict__ Axy,
                                     double * __restrict__ Axz,
                                     double * __restrict__ Ayy,
                                     double * __restrict__ Ayz,
                                     double * __restrict__ Azz)
{
    constexpr double F1O3 = 1.0 / 3.0;
    constexpr double ONE = 1.0;
    constexpr double TWO = 2.0;

    const int first  = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = blockDim.x * gridDim.x;

    for (int p = first; p < d_gp.all; p += stride) {
        /* Full metric components at this point. */
        double hxx = dxx[p] + ONE;
        double hyy = dyy[p] + ONE;
        double hzz = dzz[p] + ONE;
        double hxy = gxy[p];
        double hxz = gxz[p];
        double hyz = gyz[p];

        /* Determinant of the symmetric 3x3 metric. */
        const double det = hxx * hyy * hzz
                         + hxy * hyz * hxz
                         + hxz * hxy * hyz
                         - hxz * hyy * hxz
                         - hxy * hxy * hzz
                         - hxx * hyz * hyz;

        /* Rescale so that the determinant becomes one. */
        const double scale = ONE / cbrt(det);
        hxx *= scale;
        hxy *= scale;
        hxz *= scale;
        hyy *= scale;
        hyz *= scale;
        hzz *= scale;

        dxx[p] = hxx - ONE;
        gxy[p] = hxy;
        gxz[p] = hxz;
        dyy[p] = hyy - ONE;
        gyz[p] = hyz;
        dzz[p] = hzz - ONE;

        /* Cofactors of the rescaled metric. */
        const double ixx =   (hyy * hzz - hyz * hyz);
        const double ixy = - (hxy * hzz - hyz * hxz);
        const double ixz =   (hxy * hyz - hyy * hxz);
        const double iyy =   (hxx * hzz - hxz * hxz);
        const double iyz = - (hxx * hyz - hxy * hxz);
        const double izz =   (hxx * hyy - hxy * hxy);

        const double traceA = ixx * Axx[p] + iyy * Ayy[p] + izz * Azz[p]
                            + TWO * (ixy * Axy[p] + ixz * Axz[p] + iyz * Ayz[p]);

        /* Remove the trace part from A_ij. */
        Axx[p] -= F1O3 * hxx * traceA;
        Axy[p] -= F1O3 * hxy * traceA;
        Axz[p] -= F1O3 * hxz * traceA;
        Ayy[p] -= F1O3 * hyy * traceA;
        Ayz[p] -= F1O3 * hyz * traceA;
        Azz[p] -= F1O3 * hzz * traceA;
    }
}
|
|
|
|
/*
 * Clamp chi from below: every entry smaller than `tinny` is replaced by
 * `tinny`; all other entries (including NaN, for which the comparison is
 * false) are left unchanged.  Each thread strides over d_gp.all entries.
 */
__global__ void kern_lowerboundset_cuda(double * __restrict__ chi, double tinny)
{
    const int stride = blockDim.x * gridDim.x;
    int p = blockIdx.x * blockDim.x + threadIdx.x;

    while (p < d_gp.all) {
        if (chi[p] < tinny) {
            chi[p] = tinny;
        }
        p += stride;
    }
}
|
|
|
|
/*
 * Identifies one of the six outer faces of a patch for the Sommerfeld
 * boundary kernel.  The numeric values are passed to the kernel as plain
 * ints, so they must not change.
 */
enum SommerFace {
    FACE_XMAX = 0,  /* i == nx - 1 */
    FACE_YMAX = 1,  /* j == ny - 1 */
    FACE_ZMAX = 2,  /* k == nz - 1 */
    FACE_XMIN = 3,  /* i == 0 */
    FACE_YMIN = 4,  /* j == 0 */
    FACE_ZMIN = 5   /* k == 0 */
};
|
|
|
|
/*
 * Sommerfeld-type (outgoing-wave) boundary right-hand side on one patch face.
 *
 * For every point of the selected face the kernel writes
 *     f_rhs = -velocity * (x*df/dx + y*df/dy + z*df/dz + f) / r
 * where (x, y, z) = (x0 + i*dX, y0 + j*dY, z0 + k*dZ) and r is its norm.
 * Points with r == 0 are skipped.
 *
 * Each derivative uses a three-point stencil chosen from the sign of the
 * corresponding component of velocity * (x, y, z) / r:
 *   component > 0 : backward one-sided, falling back to centred, then to
 *                   forward one-sided as the lower index limit is approached;
 *   component < 0 : forward one-sided, falling back to centred, then to
 *                   backward one-sided as the upper index limit is approached;
 *   component == 0: derivative taken as zero.
 * Index limits are expressed in 1-based indices (iF = i + 1): the upper limit
 * is the extent, the lower limit is 1, or 0 when the Symmetry setting and a
 * patch origin within one cell of zero allow it.
 *
 * fh      padded copy of the field, addressed through idx_fh2 with the
 *         1-based indices (read only)
 * f_rhs   output, addressed through idx_ex_d with 0-based indices
 * face    SommerFace value; any value other than the X/Y faces is treated as
 *         a Z face (lower Z unless it equals FACE_ZMAX)
 *
 * Threads stride over the points of the face plane, so any 1-D launch works.
 */
__global__ void kern_sommerfeld_face_bam(const double * __restrict__ fh,
                                         double * __restrict__ f_rhs,
                                         int face,
                                         double velocity,
                                         double x0,
                                         double y0,
                                         double z0)
{
    const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];

    /* Lowest 1-based index a stencil may reach in each direction. */
    const int imin = (d_gp.Symmetry > 1 && fabs(x0) < d_gp.dX) ? 0 : 1;
    const int jmin = (d_gp.Symmetry > 1 && fabs(y0) < d_gp.dY) ? 0 : 1;
    const int kmin = (d_gp.Symmetry > 0 && fabs(z0) < d_gp.dZ) ? 0 : 1;
    const int imax = nx, jmax = ny, kmax = nz;

    const bool x_face = (face == FACE_XMAX || face == FACE_XMIN);
    const bool y_face = (face == FACE_YMAX || face == FACE_YMIN);
    const int plane_count = x_face ? ny * nz : (y_face ? nx * nz : nx * ny);

    const int stride = blockDim.x * gridDim.x;

    for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < plane_count; tid += stride) {
        /* Decode the in-plane index into 0-based (i0, j0, k0). */
        int i0 = 0, j0 = 0, k0 = 0;
        if (x_face) {
            j0 = tid % ny;
            k0 = tid / ny;
            i0 = (face == FACE_XMAX) ? (nx - 1) : 0;
        } else if (y_face) {
            i0 = tid % nx;
            k0 = tid / nx;
            j0 = (face == FACE_YMAX) ? (ny - 1) : 0;
        } else {
            i0 = tid % nx;
            j0 = tid / nx;
            k0 = (face == FACE_ZMAX) ? (nz - 1) : 0;
        }

        /* 1-based indices used by the padded array. */
        const int iF = i0 + 1;
        const int jF = j0 + 1;
        const int kF = k0 + 1;
        const int p = idx_ex_d(i0, j0, k0);

        const double x = x0 + i0 * d_gp.dX;
        const double y = y0 + j0 * d_gp.dY;
        const double z = z0 + k0 * d_gp.dZ;
        const double r = sqrt(x * x + y * y + z * z);
        if (r == 0.0) continue;

        const double wx = velocity * x / r;
        const double wy = velocity * y / r;
        const double wz = velocity * z / r;

        /* Field value at the boundary point itself. */
        const double fc = fh[idx_fh2(iF, jF, kF)];

        double fx = 0.0, fy = 0.0, fz = 0.0;

        /* d/dx */
        if (wx > 0.0) {
            if (iF - 2 >= imin)
                fx = d_gp.d2dx * (3.0 * fc - 4.0 * fh[idx_fh2(iF - 1, jF, kF)] + fh[idx_fh2(iF - 2, jF, kF)]);
            else if (iF - 1 >= imin)
                fx = d_gp.d2dx * (-fh[idx_fh2(iF - 1, jF, kF)] + fh[idx_fh2(iF + 1, jF, kF)]);
            else
                fx = d_gp.d2dx * (-fh[idx_fh2(iF + 2, jF, kF)] + 4.0 * fh[idx_fh2(iF + 1, jF, kF)] - 3.0 * fc);
        } else if (wx < 0.0) {
            if (iF + 2 <= imax)
                fx = d_gp.d2dx * (-fh[idx_fh2(iF + 2, jF, kF)] + 4.0 * fh[idx_fh2(iF + 1, jF, kF)] - 3.0 * fc);
            else if (iF + 1 <= imax)
                fx = d_gp.d2dx * (-fh[idx_fh2(iF - 1, jF, kF)] + fh[idx_fh2(iF + 1, jF, kF)]);
            else
                fx = d_gp.d2dx * (3.0 * fc - 4.0 * fh[idx_fh2(iF - 1, jF, kF)] + fh[idx_fh2(iF - 2, jF, kF)]);
        }

        /* d/dy */
        if (wy > 0.0) {
            if (jF - 2 >= jmin)
                fy = d_gp.d2dy * (3.0 * fc - 4.0 * fh[idx_fh2(iF, jF - 1, kF)] + fh[idx_fh2(iF, jF - 2, kF)]);
            else if (jF - 1 >= jmin)
                fy = d_gp.d2dy * (-fh[idx_fh2(iF, jF - 1, kF)] + fh[idx_fh2(iF, jF + 1, kF)]);
            else
                fy = d_gp.d2dy * (-fh[idx_fh2(iF, jF + 2, kF)] + 4.0 * fh[idx_fh2(iF, jF + 1, kF)] - 3.0 * fc);
        } else if (wy < 0.0) {
            if (jF + 2 <= jmax)
                fy = d_gp.d2dy * (-fh[idx_fh2(iF, jF + 2, kF)] + 4.0 * fh[idx_fh2(iF, jF + 1, kF)] - 3.0 * fc);
            else if (jF + 1 <= jmax)
                fy = d_gp.d2dy * (-fh[idx_fh2(iF, jF - 1, kF)] + fh[idx_fh2(iF, jF + 1, kF)]);
            else
                fy = d_gp.d2dy * (3.0 * fc - 4.0 * fh[idx_fh2(iF, jF - 1, kF)] + fh[idx_fh2(iF, jF - 2, kF)]);
        }

        /* d/dz */
        if (wz > 0.0) {
            if (kF - 2 >= kmin)
                fz = d_gp.d2dz * (3.0 * fc - 4.0 * fh[idx_fh2(iF, jF, kF - 1)] + fh[idx_fh2(iF, jF, kF - 2)]);
            else if (kF - 1 >= kmin)
                fz = d_gp.d2dz * (-fh[idx_fh2(iF, jF, kF - 1)] + fh[idx_fh2(iF, jF, kF + 1)]);
            else
                fz = d_gp.d2dz * (-fh[idx_fh2(iF, jF, kF + 2)] + 4.0 * fh[idx_fh2(iF, jF, kF + 1)] - 3.0 * fc);
        } else if (wz < 0.0) {
            if (kF + 2 <= kmax)
                fz = d_gp.d2dz * (-fh[idx_fh2(iF, jF, kF + 2)] + 4.0 * fh[idx_fh2(iF, jF, kF + 1)] - 3.0 * fc);
            else if (kF + 1 <= kmax)
                fz = d_gp.d2dz * (-fh[idx_fh2(iF, jF, kF - 1)] + fh[idx_fh2(iF, jF, kF + 1)]);
            else
                fz = d_gp.d2dz * (3.0 * fc - 4.0 * fh[idx_fh2(iF, jF, kF - 1)] + fh[idx_fh2(iF, jF, kF - 2)]);
        }

        f_rhs[p] = -velocity * (fx * x + fy * y + fz * z + fc) / r;
    }
}
|
|
|
|
/*
 * Host driver: apply the Sommerfeld boundary RHS for one field on every patch
 * face that coincides with the outer domain boundary.
 *
 * d_f0, d_f_rhs  device buffers: field values and the RHS to overwrite on the
 *                boundary faces
 * velocity       wave speed; nothing is done when it is exactly zero
 * SoA0..SoA2     forwarded unchanged to kern_symbd_pack_ord2
 * X, Y, Z        coordinate arrays, dereferenced here on the host; the grid
 *                spacing is taken from the first two entries of each
 * bbox           domain bounds: [0..2] lower x/y/z, [3..5] upper x/y/z
 * Symmetry       symmetry setting; a lower face lying on a symmetry plane is
 *                not treated as an outer boundary (x, y for Symmetry == 2,
 *                z for Symmetry > 0)
 *
 * Patch extents are read from g_buf.prev_nx/ny/nz.  The field is first packed
 * into the padded buffer g_buf.d_fh2 ((nx+2)*(ny+2)*(nz+2) entries are
 * covered), then one face kernel is launched per touching face in the order
 * xmax, ymax, zmax, xmin, ymin, zmin.
 */
static void gpu_sommerfeld_routbam(double *d_f0, double *d_f_rhs,
                                   double velocity,
                                   double SoA0, double SoA1, double SoA2,
                                   double *X, double *Y, double *Z,
                                   const double *bbox,
                                   int Symmetry)
{
    if (velocity == 0.0) return;

    const int nx = g_buf.prev_nx;
    const int ny = g_buf.prev_ny;
    const int nz = g_buf.prev_nz;
    const double dX = X[1] - X[0];
    const double dY = Y[1] - Y[0];
    const double dZ = Z[1] - Z[0];

    /* A face touches the outer boundary when its coordinate is within one
     * cell of the corresponding bbox entry. */
    const bool touch_xmax = fabs(X[nx - 1] - bbox[3]) < dX;
    const bool touch_ymax = fabs(Y[ny - 1] - bbox[4]) < dY;
    const bool touch_zmax = fabs(Z[nz - 1] - bbox[5]) < dZ;
    const bool touch_xmin = fabs(X[0] - bbox[0]) < dX &&
                            !(Symmetry == 2 && fabs(bbox[0]) < dX * 0.5);
    const bool touch_ymin = fabs(Y[0] - bbox[1]) < dY &&
                            !(Symmetry == 2 && fabs(bbox[1]) < dY * 0.5);
    const bool touch_zmin = fabs(Z[0] - bbox[2]) < dZ &&
                            !(Symmetry > 0 && fabs(bbox[2]) < dZ * 0.5);

    const size_t w_pack = (size_t)(nx + 2) * (ny + 2) * (nz + 2);
    kern_symbd_pack_ord2<<<grid(w_pack), BLK>>>(d_f0, g_buf.d_fh2, SoA0, SoA1, SoA2);

    /* One entry per face, in launch order. */
    struct FaceJob {
        bool   active;
        size_t points;
        int    face;
    };
    const FaceJob jobs[6] = {
        { touch_xmax, (size_t)ny * nz, FACE_XMAX },
        { touch_ymax, (size_t)nx * nz, FACE_YMAX },
        { touch_zmax, (size_t)nx * ny, FACE_ZMAX },
        { touch_xmin, (size_t)ny * nz, FACE_XMIN },
        { touch_ymin, (size_t)nx * nz, FACE_YMIN },
        { touch_zmin, (size_t)nx * ny, FACE_ZMIN },
    };

    for (int n = 0; n < 6; ++n) {
        if (!jobs[n].active) continue;
        kern_sommerfeld_face_bam<<<grid(jobs[n].points), BLK>>>(
            g_buf.d_fh2, d_f_rhs, jobs[n].face, velocity, X[0], Y[0], Z[0]);
    }
}
|
|
|
|
/* ================================================================== */
|
|
/* C. Point-wise computation kernels */
|
|
/* ================================================================== */
|
|
|
|
/*
 * Phase 1: shift the stored "minus one" variables back to their full values.
 *   alpn1 = Lap + 1, chin1 = chi + 1,
 *   gxx = dxx + 1,   gyy = dyy + 1,   gzz = dzz + 1
 * Each thread strides over d_gp.all entries.
 */
__global__ void kern_phase1_prep(
    const double* __restrict__ Lap, const double* __restrict__ chi,
    const double* __restrict__ dxx, const double* __restrict__ dyy,
    const double* __restrict__ dzz,
    double* __restrict__ alpn1, double* __restrict__ chin1,
    double* __restrict__ gxx, double* __restrict__ gyy, double* __restrict__ gzz)
{
    const int first  = blockIdx.x * blockDim.x + threadIdx.x;
    const int stride = blockDim.x * gridDim.x;

    for (int p = first; p < d_gp.all; p += stride) {
        const double lapse = Lap[p];
        const double conf  = chi[p];
        const double hxx   = dxx[p];
        const double hyy   = dyy[p];
        const double hzz   = dzz[p];

        alpn1[p] = lapse + 1.0;
        chin1[p] = conf + 1.0;
        gxx[p]   = hxx + 1.0;
        gyy[p]   = hyy + 1.0;
        gzz[p]   = hzz + 1.0;
    }
}
|
|
|
|
/*
 * Phase 2a: right-hand sides of chi and of the six metric components.
 *
 * With div = betaxx + betayy + betazz:
 *   chi_rhs = (2/3) * chin1 * (alpn1 * trK - div)
 *   diagonal g_rhs     = -2*alpn1*A - (2/3)*g*div + 2*(g . beta-derivative row)
 *   off-diagonal g_rhs = -2*alpn1*A + (1/3)*g*div + mixed g*beta terms
 * exactly as spelled out in the statements below.  The beta?? inputs are the
 * nine shift-derivative arrays (index convention is defined by the caller).
 * Each thread strides over d_gp.all entries.
 */
__global__ void kern_phase2_metric_rhs(
    const double* __restrict__ alpn1, const double* __restrict__ chin1,
    const double* __restrict__ gxx, const double* __restrict__ gxy,
    const double* __restrict__ gxz, const double* __restrict__ gyy,
    const double* __restrict__ gyz, const double* __restrict__ gzz,
    const double* __restrict__ trK,
    const double* __restrict__ Axx, const double* __restrict__ Axy,
    const double* __restrict__ Axz, const double* __restrict__ Ayy,
    const double* __restrict__ Ayz, const double* __restrict__ Azz,
    const double* __restrict__ betaxx, const double* __restrict__ betaxy,
    const double* __restrict__ betaxz, const double* __restrict__ betayx,
    const double* __restrict__ betayy, const double* __restrict__ betayz,
    const double* __restrict__ betazx, const double* __restrict__ betazy,
    const double* __restrict__ betazz,
    double* __restrict__ chi_rhs, double* __restrict__ gxx_rhs,
    double* __restrict__ gyy_rhs, double* __restrict__ gzz_rhs,
    double* __restrict__ gxy_rhs, double* __restrict__ gyz_rhs,
    double* __restrict__ gxz_rhs)
{
    const double F2o3 = 2.0/3.0, F1o3 = 1.0/3.0, TWO = 2.0;
    const int stride = blockDim.x * gridDim.x;

    for (int p = blockIdx.x * blockDim.x + threadIdx.x; p < d_gp.all; p += stride) {
        /* Gather everything that is used more than once. */
        const double a   = alpn1[p];
        const double hxx = gxx[p], hxy = gxy[p], hxz = gxz[p];
        const double hyy = gyy[p], hyz = gyz[p], hzz = gzz[p];
        const double bxx = betaxx[p], bxy = betaxy[p], bxz = betaxz[p];
        const double byx = betayx[p], byy = betayy[p], byz = betayz[p];
        const double bzx = betazx[p], bzy = betazy[p], bzz = betazz[p];

        const double div_beta = bxx + byy + bzz;

        chi_rhs[p] = F2o3 * chin1[p] * (a * trK[p] - div_beta);

        gxx_rhs[p] = -TWO*a*Axx[p] - F2o3*hxx*div_beta
                   + TWO*(hxx*bxx + hxy*byx + hxz*bzx);
        gyy_rhs[p] = -TWO*a*Ayy[p] - F2o3*hyy*div_beta
                   + TWO*(hxy*bxy + hyy*byy + hyz*bzy);
        gzz_rhs[p] = -TWO*a*Azz[p] - F2o3*hzz*div_beta
                   + TWO*(hxz*bxz + hyz*byz + hzz*bzz);

        gxy_rhs[p] = -TWO*a*Axy[p] + F1o3*hxy*div_beta
                   + hxx*bxy + hxz*bzy + hyy*byx
                   + hyz*bzx - hxy*bzz;
        gyz_rhs[p] = -TWO*a*Ayz[p] + F1o3*hyz*div_beta
                   + hxy*bxz + hyy*byz + hxz*bxy
                   + hzz*bzy - hyz*bxx;
        gxz_rhs[p] = -TWO*a*Axz[p] + F1o3*hxz*div_beta
                   + hxx*bxz + hxy*byz + hyz*byx
                   + hzz*bzx - hxz*byy;
    }
}
|
|
|
|
/*
 * Phase 2b: inverse of the symmetric 3x3 metric at every point.
 *
 * The inverse is formed as cofactor / determinant.  No guard is applied for a
 * vanishing determinant.  Each thread strides over d_gp.all entries.
 */
__global__ void kern_phase2_inverse(
    const double* __restrict__ gxx, const double* __restrict__ gxy,
    const double* __restrict__ gxz, const double* __restrict__ gyy,
    const double* __restrict__ gyz, const double* __restrict__ gzz,
    double* __restrict__ gupxx, double* __restrict__ gupxy,
    double* __restrict__ gupxz, double* __restrict__ gupyy,
    double* __restrict__ gupyz, double* __restrict__ gupzz)
{
    const int stride = blockDim.x * gridDim.x;

    for (int p = blockIdx.x * blockDim.x + threadIdx.x; p < d_gp.all; p += stride) {
        const double hxx = gxx[p], hxy = gxy[p], hxz = gxz[p];
        const double hyy = gyy[p], hyz = gyz[p], hzz = gzz[p];

        const double det = hxx*hyy*hzz + hxy*hyz*hxz + hxz*hxy*hyz
                         - hxz*hyy*hxz - hxy*hxy*hzz - hxx*hyz*hyz;
        const double inv = 1.0 / det;

        gupxx[p] =  (hyy*hzz - hyz*hyz) * inv;
        gupxy[p] = -(hxy*hzz - hyz*hxz) * inv;
        gupxz[p] =  (hxy*hyz - hyy*hxz) * inv;
        gupyy[p] =  (hxx*hzz - hxz*hxz) * inv;
        gupyz[p] = -(hxx*hyz - hxy*hxz) * inv;
        gupzz[p] =  (hxx*hyy - hxy*hxy) * inv;
    }
}
|
|
|
|
/*
 * Phase 3: Gamma constraint residuals (co==0 only).
 *
 * For each direction the residual is the evolved Gam? minus the same quantity
 * rebuilt from the inverse metric (gup??) and the first derivatives of the
 * metric (g???, last letter = derivative direction as implied by the
 * argument grouping).  The rebuilt value is a double contraction of two
 * inverse-metric factors with the derivative array, written out term by term.
 * Each thread strides over d_gp.all entries.
 */
__global__ void kern_phase3_gamma_constraint(
    const double* __restrict__ Gamx, const double* __restrict__ Gamy,
    const double* __restrict__ Gamz,
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ gxxx, const double* __restrict__ gxyx,
    const double* __restrict__ gxzx, const double* __restrict__ gyyx,
    const double* __restrict__ gyzx, const double* __restrict__ gzzx,
    const double* __restrict__ gxxy, const double* __restrict__ gxyy,
    const double* __restrict__ gxzy, const double* __restrict__ gyyy,
    const double* __restrict__ gyzy, const double* __restrict__ gzzy,
    const double* __restrict__ gxxz, const double* __restrict__ gxyz,
    const double* __restrict__ gxzz, const double* __restrict__ gyyz,
    const double* __restrict__ gyzz, const double* __restrict__ gzzz,
    double* __restrict__ Gmx_Res, double* __restrict__ Gmy_Res,
    double* __restrict__ Gmz_Res)
{
    const int stride = blockDim.x * gridDim.x;

    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < d_gp.all; i += stride) {
        const double uxx = gupxx[i];
        const double uxy = gupxy[i];
        const double uxz = gupxz[i];
        const double uyy = gupyy[i];
        const double uyz = gupyz[i];
        const double uzz = gupzz[i];

        /* x component rebuilt from the metric. */
        const double rebuilt_x = (
            uxx*(uxx*gxxx[i]+uxy*gxyx[i]+uxz*gxzx[i]) +
            uxy*(uxx*gxyx[i]+uxy*gyyx[i]+uxz*gyzx[i]) +
            uxz*(uxx*gxzx[i]+uxy*gyzx[i]+uxz*gzzx[i]) +
            uxx*(uxy*gxxy[i]+uyy*gxyy[i]+uyz*gxzy[i]) +
            uxy*(uxy*gxyy[i]+uyy*gyyy[i]+uyz*gyzy[i]) +
            uxz*(uxy*gxzy[i]+uyy*gyzy[i]+uyz*gzzy[i]) +
            uxx*(uxz*gxxz[i]+uyz*gxyz[i]+uzz*gxzz[i]) +
            uxy*(uxz*gxyz[i]+uyz*gyyz[i]+uzz*gyzz[i]) +
            uxz*(uxz*gxzz[i]+uyz*gyzz[i]+uzz*gzzz[i]));
        Gmx_Res[i] = Gamx[i] - rebuilt_x;

        /* y component rebuilt from the metric. */
        const double rebuilt_y = (
            uxx*(uxy*gxxx[i]+uyy*gxyx[i]+uyz*gxzx[i]) +
            uxy*(uxy*gxyx[i]+uyy*gyyx[i]+uyz*gyzx[i]) +
            uxz*(uxy*gxzx[i]+uyy*gyzx[i]+uyz*gzzx[i]) +
            uxy*(uxy*gxxy[i]+uyy*gxyy[i]+uyz*gxzy[i]) +
            uyy*(uxy*gxyy[i]+uyy*gyyy[i]+uyz*gyzy[i]) +
            uyz*(uxy*gxzy[i]+uyy*gyzy[i]+uyz*gzzy[i]) +
            uxy*(uxz*gxxz[i]+uyz*gxyz[i]+uzz*gxzz[i]) +
            uyy*(uxz*gxyz[i]+uyz*gyyz[i]+uzz*gyzz[i]) +
            uyz*(uxz*gxzz[i]+uyz*gyzz[i]+uzz*gzzz[i]));
        Gmy_Res[i] = Gamy[i] - rebuilt_y;

        /* z component rebuilt from the metric. */
        const double rebuilt_z = (
            uxx*(uxz*gxxx[i]+uyz*gxyx[i]+uzz*gxzx[i]) +
            uxy*(uxz*gxyx[i]+uyz*gyyx[i]+uzz*gyzx[i]) +
            uxz*(uxz*gxzx[i]+uyz*gyzx[i]+uzz*gzzx[i]) +
            uxy*(uxz*gxxy[i]+uyz*gxyy[i]+uzz*gxzy[i]) +
            uyy*(uxz*gxyy[i]+uyz*gyyy[i]+uzz*gyzy[i]) +
            uyz*(uxz*gxzy[i]+uyz*gyzy[i]+uzz*gzzy[i]) +
            uxz*(uxz*gxxz[i]+uyz*gxyz[i]+uzz*gxzz[i]) +
            uyz*(uxz*gxyz[i]+uyz*gyyz[i]+uzz*gyzz[i]) +
            uzz*(uxz*gxzz[i]+uyz*gyzz[i]+uzz*gzzz[i]));
        Gmz_Res[i] = Gamz[i] - rebuilt_z;
    }
}
|
|
|
|
/*
 * Phase 4: the 18 independent Christoffel symbols of the metric.
 *
 * Output naming: G<upper><lower><lower>, e.g. Gxyz is the x-upper symbol with
 * lower indices y, z.  Each symbol is one half of the inverse metric
 * contracted with the usual combination of first metric derivatives; the
 * derivative combinations shared by the three upper indices are formed once
 * per lower-index pair.
 * Each thread strides over d_gp.all entries.
 */
__global__ __launch_bounds__(128, 4)
void kern_phase4_christoffel(
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ gxxx, const double* __restrict__ gxyx,
    const double* __restrict__ gxzx, const double* __restrict__ gyyx,
    const double* __restrict__ gyzx, const double* __restrict__ gzzx,
    const double* __restrict__ gxxy, const double* __restrict__ gxyy,
    const double* __restrict__ gxzy, const double* __restrict__ gyyy,
    const double* __restrict__ gyzy, const double* __restrict__ gzzy,
    const double* __restrict__ gxxz, const double* __restrict__ gxyz,
    const double* __restrict__ gxzz, const double* __restrict__ gyyz,
    const double* __restrict__ gyzz, const double* __restrict__ gzzz,
    double* __restrict__ Gxxx, double* __restrict__ Gxxy, double* __restrict__ Gxxz,
    double* __restrict__ Gxyy, double* __restrict__ Gxyz, double* __restrict__ Gxzz,
    double* __restrict__ Gyxx, double* __restrict__ Gyxy, double* __restrict__ Gyxz,
    double* __restrict__ Gyyy, double* __restrict__ Gyyz, double* __restrict__ Gyzz,
    double* __restrict__ Gzxx, double* __restrict__ Gzxy, double* __restrict__ Gzxz,
    double* __restrict__ Gzyy, double* __restrict__ Gzyz, double* __restrict__ Gzzz_o)
{
    const double H = 0.5, TWO = 2.0;
    const int stride = blockDim.x * gridDim.x;

    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < d_gp.all; i += stride) {
        const double uxx = gupxx[i], uxy = gupxy[i], uxz = gupxz[i];
        const double uyy = gupyy[i], uyz = gupyz[i], uzz = gupzz[i];

        /* Lower indices xx */
        {
            const double ty = TWO*gxyx[i]-gxxy[i];
            const double tz = TWO*gxzx[i]-gxxz[i];
            Gxxx[i] = H*(uxx*gxxx[i]+uxy*ty+uxz*tz);
            Gyxx[i] = H*(uxy*gxxx[i]+uyy*ty+uyz*tz);
            Gzxx[i] = H*(uxz*gxxx[i]+uyz*ty+uzz*tz);
        }

        /* Lower indices yy */
        {
            const double tx = TWO*gxyy[i]-gyyx[i];
            const double tz = TWO*gyzy[i]-gyyz[i];
            Gxyy[i] = H*(uxx*tx+uxy*gyyy[i]+uxz*tz);
            Gyyy[i] = H*(uxy*tx+uyy*gyyy[i]+uyz*tz);
            Gzyy[i] = H*(uxz*tx+uyz*gyyy[i]+uzz*tz);
        }

        /* Lower indices zz */
        {
            const double tx = TWO*gxzz[i]-gzzx[i];
            const double ty = TWO*gyzz[i]-gzzy[i];
            Gxzz[i]   = H*(uxx*tx+uxy*ty+uxz*gzzz[i]);
            Gyzz[i]   = H*(uxy*tx+uyy*ty+uyz*gzzz[i]);
            Gzzz_o[i] = H*(uxz*tx+uyz*ty+uzz*gzzz[i]);
        }

        /* Lower indices xy */
        {
            const double tz = gxzy[i]+gyzx[i]-gxyz[i];
            Gxxy[i] = H*(uxx*gxxy[i]+uxy*gyyx[i]+uxz*tz);
            Gyxy[i] = H*(uxy*gxxy[i]+uyy*gyyx[i]+uyz*tz);
            Gzxy[i] = H*(uxz*gxxy[i]+uyz*gyyx[i]+uzz*tz);
        }

        /* Lower indices xz */
        {
            const double ty = gxyz[i]+gyzx[i]-gxzy[i];
            Gxxz[i] = H*(uxx*gxxz[i]+uxy*ty+uxz*gzzx[i]);
            Gyxz[i] = H*(uxy*gxxz[i]+uyy*ty+uyz*gzzx[i]);
            Gzxz[i] = H*(uxz*gxxz[i]+uyz*ty+uzz*gzzx[i]);
        }

        /* Lower indices yz */
        {
            const double tx = gxyz[i]+gxzy[i]-gyzx[i];
            Gxyz[i] = H*(uxx*tx+uxy*gyyz[i]+uxz*gzzy[i]);
            Gyyz[i] = H*(uxy*tx+uyy*gyyz[i]+uyz*gzzy[i]);
            Gzyz[i] = H*(uxz*tx+uyz*gyyz[i]+uzz*gzzy[i]);
        }
    }
}
|
|
|
|
/*
 * Phase 5: raise both indices of A, A^ij = gup^ia gup^jb A_ab.
 *
 * The six independent components of the result are stored temporarily in the
 * Rxx..Rzz buffers.  Each thread strides over d_gp.all entries.
 */
__global__ void kern_phase5_raise_A(
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ Axx, const double* __restrict__ Axy,
    const double* __restrict__ Axz, const double* __restrict__ Ayy,
    const double* __restrict__ Ayz, const double* __restrict__ Azz,
    double* __restrict__ Rxx, double* __restrict__ Rxy, double* __restrict__ Rxz,
    double* __restrict__ Ryy, double* __restrict__ Ryz, double* __restrict__ Rzz)
{
    const double TWO = 2.0;
    const int stride = blockDim.x * gridDim.x;

    for (int p = blockIdx.x * blockDim.x + threadIdx.x; p < d_gp.all; p += stride) {
        const double uxx = gupxx[p], uxy = gupxy[p], uxz = gupxz[p];
        const double uyy = gupyy[p], uyz = gupyz[p], uzz = gupzz[p];
        const double axx = Axx[p], axy = Axy[p], axz = Axz[p];
        const double ayy = Ayy[p], ayz = Ayz[p], azz = Azz[p];

        /* Diagonal components. */
        Rxx[p] = uxx*uxx*axx+uxy*uxy*ayy+uxz*uxz*azz
               +TWO*(uxx*uxy*axy+uxx*uxz*axz+uxy*uxz*ayz);
        Ryy[p] = uxy*uxy*axx+uyy*uyy*ayy+uyz*uyz*azz
               +TWO*(uxy*uyy*axy+uxy*uyz*axz+uyy*uyz*ayz);
        Rzz[p] = uxz*uxz*axx+uyz*uyz*ayy+uzz*uzz*azz
               +TWO*(uxz*uyz*axy+uxz*uzz*axz+uyz*uzz*ayz);

        /* Off-diagonal components. */
        Rxy[p] = uxx*uxy*axx+uxy*uyy*ayy+uxz*uyz*azz
               +(uxx*uyy+uxy*uxy)*axy+(uxx*uyz+uxz*uxy)*axz+(uxy*uyz+uxz*uyy)*ayz;
        Rxz[p] = uxx*uxz*axx+uxy*uyz*ayy+uxz*uzz*azz
               +(uxx*uyz+uxy*uxz)*axy+(uxx*uzz+uxz*uxz)*axz+(uxy*uzz+uxz*uyz)*ayz;
        Ryz[p] = uxy*uxz*axx+uyy*uyz*ayy+uyz*uzz*azz
               +(uxy*uyz+uyy*uxz)*axy+(uxy*uzz+uyz*uxz)*axz+(uyy*uzz+uyz*uyz)*ayz;
    }
}
|
|
|
|
/*
 * Phase 6: Gamma_rhs part 1 (before fdderivs(beta) and fderivs(Gamma)).
 *
 * Inputs Rxx..Rzz are read as the six components of a symmetric tensor
 * (NOTE(review): presumably the raised A^ij written by kern_phase5_raise_A --
 * confirm at the call site).  For each direction the kernel writes
 *
 *   Gam_rhs = -2 * (Lap-gradient . R-row)
 *           + 2*alpn1 * ( -(3/2)/chin1 * (chi-gradient . R-row)
 *                         - gup-row . ((2/3)*K-gradient + 8*pi*S)
 *                         + Christoffel(upper = direction) contracted with R )
 *
 * The per-direction source term (2/3)*K? + 8*pi*S? is shared by all three
 * outputs and is formed once.  Each thread strides over d_gp.all entries.
 */
__global__ __launch_bounds__(128, 4)
void kern_phase6_gamma_rhs_part1(
    const double* __restrict__ Lapx, const double* __restrict__ Lapy,
    const double* __restrict__ Lapz,
    const double* __restrict__ alpn1, const double* __restrict__ chin1,
    const double* __restrict__ chix, const double* __restrict__ chiy,
    const double* __restrict__ chiz,
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ Kx, const double* __restrict__ Ky,
    const double* __restrict__ Kz,
    const double* __restrict__ Sx, const double* __restrict__ Sy,
    const double* __restrict__ Sz,
    const double* __restrict__ Rxx, const double* __restrict__ Rxy,
    const double* __restrict__ Rxz, const double* __restrict__ Ryy,
    const double* __restrict__ Ryz, const double* __restrict__ Rzz,
    const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
    const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
    const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
    const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
    const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
    const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
    const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
    const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
    const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
    double* __restrict__ Gamx_rhs, double* __restrict__ Gamy_rhs,
    double* __restrict__ Gamz_rhs)
{
    const double TWO=2.0, F3o2=1.5, F2o3=2.0/3.0, EIGHT=8.0;
    const double PI_V = 3.14159265358979323846;
    const int stride = blockDim.x * gridDim.x;

    for (int p = blockIdx.x * blockDim.x + threadIdx.x; p < d_gp.all; p += stride) {
        const double uxx = gupxx[p], uxy = gupxy[p], uxz = gupxz[p];
        const double uyy = gupyy[p], uyz = gupyz[p], uzz = gupzz[p];
        const double lx = Lapx[p], ly = Lapy[p], lz = Lapz[p];
        const double a = alpn1[p], c1 = chin1[p];
        const double cx = chix[p], cy = chiy[p], cz = chiz[p];

        /* Symmetric tensor components used by all three outputs. */
        const double rxx = Rxx[p], rxy = Rxy[p], rxz = Rxz[p];
        const double ryy = Ryy[p], ryz = Ryz[p], rzz = Rzz[p];

        /* (2/3) * grad K + 8*pi*S, per direction. */
        const double mx = F2o3*Kx[p]+EIGHT*PI_V*Sx[p];
        const double my = F2o3*Ky[p]+EIGHT*PI_V*Sy[p];
        const double mz = F2o3*Kz[p]+EIGHT*PI_V*Sz[p];

        Gamx_rhs[p] = -TWO*(lx*rxx+ly*rxy+lz*rxz)
            + TWO*a*(
                -F3o2/c1*(cx*rxx+cy*rxy+cz*rxz)
                -uxx*(mx)
                -uxy*(my)
                -uxz*(mz)
                +Gxxx[p]*rxx+Gxyy[p]*ryy+Gxzz[p]*rzz
                +TWO*(Gxxy[p]*rxy+Gxxz[p]*rxz+Gxyz[p]*ryz));

        Gamy_rhs[p] = -TWO*(lx*rxy+ly*ryy+lz*ryz)
            + TWO*a*(
                -F3o2/c1*(cx*rxy+cy*ryy+cz*ryz)
                -uxy*(mx)
                -uyy*(my)
                -uyz*(mz)
                +Gyxx[p]*rxx+Gyyy[p]*ryy+Gyzz[p]*rzz
                +TWO*(Gyxy[p]*rxy+Gyxz[p]*rxz+Gyyz[p]*ryz));

        Gamz_rhs[p] = -TWO*(lx*rxz+ly*ryz+lz*rzz)
            + TWO*a*(
                -F3o2/c1*(cx*rxz+cy*ryz+cz*rzz)
                -uxz*(mx)
                -uyz*(my)
                -uzz*(mz)
                +Gzxx[p]*rxx+Gzyy[p]*ryy+Gzzz[p]*rzz
                +TWO*(Gzxy[p]*rxy+Gzxz[p]*rxz+Gzyz[p]*ryz));
    }
}
|
|
|
|
/* Phase 5+6 fused: raise A^ij in registers, then consume immediately in Gamma_rhs. */
|
|
__global__ __launch_bounds__(128, 4)
|
|
void kern_phase5_6_gamma_rhs_part1_fused(
|
|
const double* __restrict__ Lapx, const double* __restrict__ Lapy,
|
|
const double* __restrict__ Lapz,
|
|
const double* __restrict__ alpn1, const double* __restrict__ chin1,
|
|
const double* __restrict__ chix, const double* __restrict__ chiy,
|
|
const double* __restrict__ chiz,
|
|
const double* __restrict__ gupxx, const double* __restrict__ gupxy,
|
|
const double* __restrict__ gupxz, const double* __restrict__ gupyy,
|
|
const double* __restrict__ gupyz, const double* __restrict__ gupzz,
|
|
const double* __restrict__ Axx, const double* __restrict__ Axy,
|
|
const double* __restrict__ Axz, const double* __restrict__ Ayy,
|
|
const double* __restrict__ Ayz, const double* __restrict__ Azz,
|
|
const double* __restrict__ Kx, const double* __restrict__ Ky,
|
|
const double* __restrict__ Kz,
|
|
const double* __restrict__ Sx, const double* __restrict__ Sy,
|
|
const double* __restrict__ Sz,
|
|
const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
|
|
const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
|
|
const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
|
|
const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
|
|
const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
|
|
const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
|
|
const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
|
|
const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
|
|
const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
|
|
double* __restrict__ Gamx_rhs, double* __restrict__ Gamy_rhs,
|
|
double* __restrict__ Gamz_rhs)
|
|
{
|
|
const double TWO = 2.0, F3o2 = 1.5, F2o3 = 2.0 / 3.0, EIGHT = 8.0;
|
|
const double PI_V = 3.14159265358979323846;
|
|
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < d_gp.all; i += blockDim.x * gridDim.x) {
|
|
const double uxx = gupxx[i], uxy = gupxy[i], uxz = gupxz[i];
|
|
const double uyy = gupyy[i], uyz = gupyz[i], uzz = gupzz[i];
|
|
const double Axx_v = Axx[i], Axy_v = Axy[i], Axz_v = Axz[i];
|
|
const double Ayy_v = Ayy[i], Ayz_v = Ayz[i], Azz_v = Azz[i];
|
|
|
|
const double Rxx_v = uxx * uxx * Axx_v + uxy * uxy * Ayy_v + uxz * uxz * Azz_v
|
|
+ TWO * (uxx * uxy * Axy_v + uxx * uxz * Axz_v + uxy * uxz * Ayz_v);
|
|
const double Ryy_v = uxy * uxy * Axx_v + uyy * uyy * Ayy_v + uyz * uyz * Azz_v
|
|
+ TWO * (uxy * uyy * Axy_v + uxy * uyz * Axz_v + uyy * uyz * Ayz_v);
|
|
const double Rzz_v = uxz * uxz * Axx_v + uyz * uyz * Ayy_v + uzz * uzz * Azz_v
|
|
+ TWO * (uxz * uyz * Axy_v + uxz * uzz * Axz_v + uyz * uzz * Ayz_v);
|
|
const double Rxy_v = uxx * uxy * Axx_v + uxy * uyy * Ayy_v + uxz * uyz * Azz_v
|
|
+ (uxx * uyy + uxy * uxy) * Axy_v
|
|
+ (uxx * uyz + uxz * uxy) * Axz_v
|
|
+ (uxy * uyz + uxz * uyy) * Ayz_v;
|
|
const double Rxz_v = uxx * uxz * Axx_v + uxy * uyz * Ayy_v + uxz * uzz * Azz_v
|
|
+ (uxx * uyz + uxy * uxz) * Axy_v
|
|
+ (uxx * uzz + uxz * uxz) * Axz_v
|
|
+ (uxy * uzz + uxz * uyz) * Ayz_v;
|
|
const double Ryz_v = uxy * uxz * Axx_v + uyy * uyz * Ayy_v + uyz * uzz * Azz_v
|
|
+ (uxy * uyz + uyy * uxz) * Axy_v
|
|
+ (uxy * uzz + uyz * uxz) * Axz_v
|
|
+ (uyy * uzz + uyz * uyz) * Ayz_v;
|
|
|
|
const double lx = Lapx[i], ly = Lapy[i], lz = Lapz[i];
|
|
const double a = alpn1[i], c1 = chin1[i];
|
|
const double cx = chix[i], cy = chiy[i], cz = chiz[i];
|
|
|
|
Gamx_rhs[i] = -TWO * (lx * Rxx_v + ly * Rxy_v + lz * Rxz_v)
|
|
+ TWO * a * (
|
|
-F3o2 / c1 * (cx * Rxx_v + cy * Rxy_v + cz * Rxz_v)
|
|
-uxx * (F2o3 * Kx[i] + EIGHT * PI_V * Sx[i])
|
|
-uxy * (F2o3 * Ky[i] + EIGHT * PI_V * Sy[i])
|
|
-uxz * (F2o3 * Kz[i] + EIGHT * PI_V * Sz[i])
|
|
+ Gxxx[i] * Rxx_v + Gxyy[i] * Ryy_v + Gxzz[i] * Rzz_v
|
|
+ TWO * (Gxxy[i] * Rxy_v + Gxxz[i] * Rxz_v + Gxyz[i] * Ryz_v));
|
|
|
|
Gamy_rhs[i] = -TWO * (lx * Rxy_v + ly * Ryy_v + lz * Ryz_v)
|
|
+ TWO * a * (
|
|
-F3o2 / c1 * (cx * Rxy_v + cy * Ryy_v + cz * Ryz_v)
|
|
-uxy * (F2o3 * Kx[i] + EIGHT * PI_V * Sx[i])
|
|
-uyy * (F2o3 * Ky[i] + EIGHT * PI_V * Sy[i])
|
|
-uyz * (F2o3 * Kz[i] + EIGHT * PI_V * Sz[i])
|
|
+ Gyxx[i] * Rxx_v + Gyyy[i] * Ryy_v + Gyzz[i] * Rzz_v
|
|
+ TWO * (Gyxy[i] * Rxy_v + Gyxz[i] * Rxz_v + Gyyz[i] * Ryz_v));
|
|
|
|
Gamz_rhs[i] = -TWO * (lx * Rxz_v + ly * Ryz_v + lz * Rzz_v)
|
|
+ TWO * a * (
|
|
-F3o2 / c1 * (cx * Rxz_v + cy * Ryz_v + cz * Rzz_v)
|
|
-uxz * (F2o3 * Kx[i] + EIGHT * PI_V * Sx[i])
|
|
-uyz * (F2o3 * Ky[i] + EIGHT * PI_V * Sy[i])
|
|
-uzz * (F2o3 * Kz[i] + EIGHT * PI_V * Sz[i])
|
|
+ Gzxx[i] * Rxx_v + Gzyy[i] * Ryy_v + Gzzz[i] * Rzz_v
|
|
+ TWO * (Gzxy[i] * Rxy_v + Gzxz[i] * Rxz_v + Gzyz[i] * Ryz_v));
|
|
}
|
|
}
|
|
|
|
/* Phase 8: Gamma_rhs part 2 — launched after fdderivs(beta) and fderivs(Gamma).
 *
 * For every point index i < d_gp.all (grid-stride loop) this kernel
 *   1. sums second derivatives of the shift into f_x / f_y / f_z,
 *   2. contracts gup with the Christoffel symbols and stores the result in
 *      Gamxa_out / Gamya_out / Gamza_out,
 *   3. adds the shift-dependent terms to Gamx_rhs / Gamy_rhs / Gamz_rhs in place.
 *
 * bxx_*, bxy_*, bxz_* carry fdderivs(betax), fdderivs(betay), fdderivs(betaz).
 * The nine fderivs(Gamma) pointers (Gamxx .. Gamzz_d) belong to the signature
 * but are never dereferenced in this kernel.
 *
 * __launch_bounds__(128, 4): at most 128 threads per block, at least 4
 * resident blocks per SM requested.
 */
__global__ __launch_bounds__(128, 4)
void kern_phase8_gamma_rhs_part2(
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    /* fdderivs(betax) -> gxxx,gxyx,gxzx,gyyx,gyzx,gzzx */
    const double* __restrict__ bxx_xx, const double* __restrict__ bxx_xy,
    const double* __restrict__ bxx_xz, const double* __restrict__ bxx_yy,
    const double* __restrict__ bxx_yz, const double* __restrict__ bxx_zz,
    /* fdderivs(betay) -> gxxy,gxyy,gxzy,gyyy,gyzy,gzzy */
    const double* __restrict__ bxy_xx, const double* __restrict__ bxy_xy,
    const double* __restrict__ bxy_xz, const double* __restrict__ bxy_yy,
    const double* __restrict__ bxy_yz, const double* __restrict__ bxy_zz,
    /* fdderivs(betaz) -> gxxz,gxyz,gxzz,gyyz,gyzz,gzzz */
    const double* __restrict__ bxz_xx, const double* __restrict__ bxz_xy,
    const double* __restrict__ bxz_xz, const double* __restrict__ bxz_yy,
    const double* __restrict__ bxz_yz, const double* __restrict__ bxz_zz,
    /* fderivs(Gamx) -> Gamxx,Gamxy,Gamxz (unused here) */
    const double* __restrict__ Gamxx, const double* __restrict__ Gamxy,
    const double* __restrict__ Gamxz,
    /* fderivs(Gamy) -> Gamyx,Gamyy,Gamyz (unused here) */
    const double* __restrict__ Gamyx, const double* __restrict__ Gamyy_d,
    const double* __restrict__ Gamyz_d,
    /* fderivs(Gamz) -> Gamzx,Gamzy,Gamzz (unused here) */
    const double* __restrict__ Gamzx, const double* __restrict__ Gamzy,
    const double* __restrict__ Gamzz_d,
    /* Christoffel symbols */
    const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
    const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
    const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
    const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
    const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
    const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
    const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
    const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
    const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
    /* first derivatives of the shift components */
    const double* __restrict__ betaxx, const double* __restrict__ betaxy,
    const double* __restrict__ betaxz, const double* __restrict__ betayx,
    const double* __restrict__ betayy, const double* __restrict__ betayz,
    const double* __restrict__ betazx, const double* __restrict__ betazy,
    const double* __restrict__ betazz,
    /* updated in place */
    double* __restrict__ Gamx_rhs, double* __restrict__ Gamy_rhs,
    double* __restrict__ Gamz_rhs,
    /* written */
    double* __restrict__ Gamxa_out, double* __restrict__ Gamya_out,
    double* __restrict__ Gamza_out)
{
    const double TWO = 2.0, F2o3 = 2.0 / 3.0, F1o3 = 1.0 / 3.0;
    const unsigned int stride = blockDim.x * gridDim.x;

    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < d_gp.all; i += stride) {
        /* gup components at this point */
        const double u_xx = gupxx[i], u_xy = gupxy[i], u_xz = gupxz[i];
        const double u_yy = gupyy[i], u_yz = gupyz[i], u_zz = gupzz[i];

        /* second derivatives of betax / betay / betaz */
        const double d2bx_xx = bxx_xx[i], d2bx_xy = bxx_xy[i], d2bx_xz = bxx_xz[i];
        const double d2bx_yy = bxx_yy[i], d2bx_yz = bxx_yz[i], d2bx_zz = bxx_zz[i];
        const double d2by_xx = bxy_xx[i], d2by_xy = bxy_xy[i], d2by_xz = bxy_xz[i];
        const double d2by_yy = bxy_yy[i], d2by_yz = bxy_yz[i], d2by_zz = bxy_zz[i];
        const double d2bz_xx = bxz_xx[i], d2bz_xy = bxz_xy[i], d2bz_xz = bxz_xz[i];
        const double d2bz_yy = bxz_yy[i], d2bz_yz = bxz_yz[i], d2bz_zz = bxz_zz[i];

        /* first derivatives of the shift */
        const double sxx = betaxx[i], sxy = betaxy[i], sxz = betaxz[i];
        const double syx = betayx[i], syy = betayy[i], syz = betayz[i];
        const double szx = betazx[i], szy = betazy[i], szz = betazz[i];

        /* sums of second derivatives (the original fxx / fxy / fxz) */
        const double f_x = d2bx_xx + d2by_xy + d2bz_xz;
        const double f_y = d2bx_xy + d2by_yy + d2bz_yz;
        const double f_z = d2bx_xz + d2by_yz + d2bz_zz;

        /* gup contracted with the Christoffel symbols */
        const double gam_x = u_xx * Gxxx[i] + u_yy * Gxyy[i] + u_zz * Gxzz[i]
                           + TWO * (u_xy * Gxxy[i] + u_xz * Gxxz[i] + u_yz * Gxyz[i]);
        const double gam_y = u_xx * Gyxx[i] + u_yy * Gyyy[i] + u_zz * Gyzz[i]
                           + TWO * (u_xy * Gyxy[i] + u_xz * Gyxz[i] + u_yz * Gyyz[i]);
        const double gam_z = u_xx * Gzxx[i] + u_yy * Gzyy[i] + u_zz * Gzzz[i]
                           + TWO * (u_xy * Gzxy[i] + u_xz * Gzxz[i] + u_yz * Gzyz[i]);

        Gamxa_out[i] = gam_x;
        Gamya_out[i] = gam_y;
        Gamza_out[i] = gam_z;

        /* trace of the shift first-derivative matrix */
        const double div_b = sxx + syy + szz;

        Gamx_rhs[i] += F2o3 * gam_x * div_b
                     - gam_x * sxx - gam_y * sxy - gam_z * sxz
                     + F1o3 * (u_xx * f_x + u_xy * f_y + u_xz * f_z)
                     + u_xx * d2bx_xx + u_yy * d2bx_yy + u_zz * d2bx_zz
                     + TWO * (u_xy * d2bx_xy + u_xz * d2bx_xz + u_yz * d2bx_yz);

        Gamy_rhs[i] += F2o3 * gam_y * div_b
                     - gam_x * syx - gam_y * syy - gam_z * syz
                     + F1o3 * (u_xy * f_x + u_yy * f_y + u_yz * f_z)
                     + u_xx * d2by_xx + u_yy * d2by_yy + u_zz * d2by_zz
                     + TWO * (u_xy * d2by_xy + u_xz * d2by_xz + u_yz * d2by_yz);

        Gamz_rhs[i] += F2o3 * gam_z * div_b
                     - gam_x * szx - gam_y * szy - gam_z * szz
                     + F1o3 * (u_xz * f_x + u_yz * f_y + u_zz * f_z)
                     + u_xx * d2bz_xx + u_yy * d2bz_yy + u_zz * d2bz_zz
                     + TWO * (u_xy * d2bz_xy + u_xz * d2bz_xz + u_yz * d2bz_yz);
    }
}
|
|
|
|
/* Phase 9: Christoffel contract — lowered products g_{ia} Gamma^a_{bc}.
 *
 * For every point index i < d_gp.all (grid-stride loop) and every lower index
 * pair (b,c) in {xx,xy,xz,yy,yz,zz}, writes
 *     o_g{bc}{r}[i] = g_{rx} G^x_{bc} + g_{ry} G^y_{bc} + g_{rz} G^z_{bc},  r = x,y,z
 * The output buffers (gxxx..gzzz in the caller's naming) are overwritten; the
 * results are the lowered Christoffel products needed for the Ricci assembly.
 */
__global__ __launch_bounds__(128, 4)
void kern_phase9_christoffel_contract(
    const double* __restrict__ gxx, const double* __restrict__ gxy,
    const double* __restrict__ gxz, const double* __restrict__ gyy,
    const double* __restrict__ gyz, const double* __restrict__ gzz,
    const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
    const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
    const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
    const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
    const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
    const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
    const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
    const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
    const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
    /* output: lowered products g_{ia} Gamma^a_{bc} */
    double* __restrict__ o_gxxx, double* __restrict__ o_gxyx,
    double* __restrict__ o_gxzx, double* __restrict__ o_gyyx,
    double* __restrict__ o_gyzx, double* __restrict__ o_gzzx,
    double* __restrict__ o_gxxy, double* __restrict__ o_gxyy,
    double* __restrict__ o_gxzy, double* __restrict__ o_gyyy,
    double* __restrict__ o_gyzy, double* __restrict__ o_gzzy,
    double* __restrict__ o_gxxz, double* __restrict__ o_gxyz,
    double* __restrict__ o_gxzz, double* __restrict__ o_gyyz,
    double* __restrict__ o_gyzz, double* __restrict__ o_gzzz)
{
    const unsigned int stride = blockDim.x * gridDim.x;

    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < d_gp.all; i += stride) {
        /* symmetric metric components at this point */
        const double m_xx = gxx[i], m_xy = gxy[i], m_xz = gxz[i];
        const double m_yy = gyy[i], m_yz = gyz[i], m_zz = gzz[i];

        { /* (b,c) = (x,x) */
            const double cx = Gxxx[i], cy = Gyxx[i], cz = Gzxx[i];
            o_gxxx[i] = m_xx * cx + m_xy * cy + m_xz * cz;
            o_gxxy[i] = m_xy * cx + m_yy * cy + m_yz * cz;
            o_gxxz[i] = m_xz * cx + m_yz * cy + m_zz * cz;
        }
        { /* (b,c) = (x,y) */
            const double cx = Gxxy[i], cy = Gyxy[i], cz = Gzxy[i];
            o_gxyx[i] = m_xx * cx + m_xy * cy + m_xz * cz;
            o_gxyy[i] = m_xy * cx + m_yy * cy + m_yz * cz;
            o_gxyz[i] = m_xz * cx + m_yz * cy + m_zz * cz;
        }
        { /* (b,c) = (x,z) */
            const double cx = Gxxz[i], cy = Gyxz[i], cz = Gzxz[i];
            o_gxzx[i] = m_xx * cx + m_xy * cy + m_xz * cz;
            o_gxzy[i] = m_xy * cx + m_yy * cy + m_yz * cz;
            o_gxzz[i] = m_xz * cx + m_yz * cy + m_zz * cz;
        }
        { /* (b,c) = (y,y) */
            const double cx = Gxyy[i], cy = Gyyy[i], cz = Gzyy[i];
            o_gyyx[i] = m_xx * cx + m_xy * cy + m_xz * cz;
            o_gyyy[i] = m_xy * cx + m_yy * cy + m_yz * cz;
            o_gyyz[i] = m_xz * cx + m_yz * cy + m_zz * cz;
        }
        { /* (b,c) = (y,z) */
            const double cx = Gxyz[i], cy = Gyyz[i], cz = Gzyz[i];
            o_gyzx[i] = m_xx * cx + m_xy * cy + m_xz * cz;
            o_gyzy[i] = m_xy * cx + m_yy * cy + m_yz * cz;
            o_gyzz[i] = m_xz * cx + m_yz * cy + m_zz * cz;
        }
        { /* (b,c) = (z,z) */
            const double cx = Gxzz[i], cy = Gyzz[i], cz = Gzzz[i];
            o_gzzx[i] = m_xx * cx + m_xy * cy + m_xz * cz;
            o_gzzy[i] = m_xy * cx + m_yy * cy + m_yz * cz;
            o_gzzz[i] = m_xz * cx + m_yz * cy + m_zz * cz;
        }
    }
}
|
|
|
|
/* Phase 8+9 fused: one pass per grid point that
 *   - contracts gup with the Christoffel symbols (-> Gamxa_out/Gamya_out/Gamza_out),
 *   - adds the shift-dependent terms to Gamx_rhs/Gamy_rhs/Gamz_rhs in place,
 *   - writes the lowered products g_{ra} Gamma^a_{bc} into o_g{bc}{r}.
 *
 * Point indices are walked with a grid-stride loop bounded by d_gp.all.
 * bxx_*, bxy_*, bxz_* are the second derivatives of betax, betay, betaz;
 * beta?? are the first derivatives of the shift components.
 */
__global__ __launch_bounds__(128, 2)
void kern_phase8_9_gamma_rhs_contract_fused(
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ bxx_xx, const double* __restrict__ bxx_xy,
    const double* __restrict__ bxx_xz, const double* __restrict__ bxx_yy,
    const double* __restrict__ bxx_yz, const double* __restrict__ bxx_zz,
    const double* __restrict__ bxy_xx, const double* __restrict__ bxy_xy,
    const double* __restrict__ bxy_xz, const double* __restrict__ bxy_yy,
    const double* __restrict__ bxy_yz, const double* __restrict__ bxy_zz,
    const double* __restrict__ bxz_xx, const double* __restrict__ bxz_xy,
    const double* __restrict__ bxz_xz, const double* __restrict__ bxz_yy,
    const double* __restrict__ bxz_yz, const double* __restrict__ bxz_zz,
    const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
    const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
    const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
    const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
    const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
    const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
    const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
    const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
    const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
    const double* __restrict__ betaxx, const double* __restrict__ betaxy,
    const double* __restrict__ betaxz, const double* __restrict__ betayx,
    const double* __restrict__ betayy, const double* __restrict__ betayz,
    const double* __restrict__ betazx, const double* __restrict__ betazy,
    const double* __restrict__ betazz,
    const double* __restrict__ gxx, const double* __restrict__ gxy,
    const double* __restrict__ gxz, const double* __restrict__ gyy,
    const double* __restrict__ gyz, const double* __restrict__ gzz,
    double* __restrict__ Gamx_rhs, double* __restrict__ Gamy_rhs,
    double* __restrict__ Gamz_rhs,
    double* __restrict__ Gamxa_out, double* __restrict__ Gamya_out,
    double* __restrict__ Gamza_out,
    double* __restrict__ o_gxxx, double* __restrict__ o_gxyx,
    double* __restrict__ o_gxzx, double* __restrict__ o_gyyx,
    double* __restrict__ o_gyzx, double* __restrict__ o_gzzx,
    double* __restrict__ o_gxxy, double* __restrict__ o_gxyy,
    double* __restrict__ o_gxzy, double* __restrict__ o_gyyy,
    double* __restrict__ o_gyzy, double* __restrict__ o_gzzy,
    double* __restrict__ o_gxxz, double* __restrict__ o_gxyz,
    double* __restrict__ o_gxzz, double* __restrict__ o_gyyz,
    double* __restrict__ o_gyzz, double* __restrict__ o_gzzz)
{
    const double TWO = 2.0, F2o3 = 2.0 / 3.0, F1o3 = 1.0 / 3.0;

    int i = blockIdx.x * blockDim.x + threadIdx.x;
    while (i < d_gp.all) {
        /* gup and metric components */
        const double gu_xx = gupxx[i], gu_xy = gupxy[i], gu_xz = gupxz[i];
        const double gu_yy = gupyy[i], gu_yz = gupyz[i], gu_zz = gupzz[i];
        const double g_xx = gxx[i], g_xy = gxy[i], g_xz = gxz[i];
        const double g_yy = gyy[i], g_yz = gyz[i], g_zz = gzz[i];

        /* Christoffel symbols: G<upper>_<lower pair> */
        const double Gx_xx = Gxxx[i], Gx_xy = Gxxy[i], Gx_xz = Gxxz[i];
        const double Gx_yy = Gxyy[i], Gx_yz = Gxyz[i], Gx_zz = Gxzz[i];
        const double Gy_xx = Gyxx[i], Gy_xy = Gyxy[i], Gy_xz = Gyxz[i];
        const double Gy_yy = Gyyy[i], Gy_yz = Gyyz[i], Gy_zz = Gyzz[i];
        const double Gz_xx = Gzxx[i], Gz_xy = Gzxy[i], Gz_xz = Gzxz[i];
        const double Gz_yy = Gzyy[i], Gz_yz = Gzyz[i], Gz_zz = Gzzz[i];

        /* second derivatives of betax / betay / betaz */
        const double ddbx_xx = bxx_xx[i], ddbx_xy = bxx_xy[i], ddbx_xz = bxx_xz[i];
        const double ddbx_yy = bxx_yy[i], ddbx_yz = bxx_yz[i], ddbx_zz = bxx_zz[i];
        const double ddby_xx = bxy_xx[i], ddby_xy = bxy_xy[i], ddby_xz = bxy_xz[i];
        const double ddby_yy = bxy_yy[i], ddby_yz = bxy_yz[i], ddby_zz = bxy_zz[i];
        const double ddbz_xx = bxz_xx[i], ddbz_xy = bxz_xy[i], ddbz_xz = bxz_xz[i];
        const double ddbz_yy = bxz_yy[i], ddbz_yz = bxz_yz[i], ddbz_zz = bxz_zz[i];

        /* first derivatives of the shift: db<component>_<second index> */
        const double dbx_x = betaxx[i], dbx_y = betaxy[i], dbx_z = betaxz[i];
        const double dby_x = betayx[i], dby_y = betayy[i], dby_z = betayz[i];
        const double dbz_x = betazx[i], dbz_y = betazy[i], dbz_z = betazz[i];

        /* sums of second derivatives (the original fxx / fxy / fxz) */
        const double f_x = ddbx_xx + ddby_xy + ddbz_xz;
        const double f_y = ddbx_xy + ddby_yy + ddbz_yz;
        const double f_z = ddbx_xz + ddby_yz + ddbz_zz;

        /* gup contracted with the Christoffel symbols */
        const double gam_x = gu_xx * Gx_xx + gu_yy * Gx_yy + gu_zz * Gx_zz
                           + TWO * (gu_xy * Gx_xy + gu_xz * Gx_xz + gu_yz * Gx_yz);
        const double gam_y = gu_xx * Gy_xx + gu_yy * Gy_yy + gu_zz * Gy_zz
                           + TWO * (gu_xy * Gy_xy + gu_xz * Gy_xz + gu_yz * Gy_yz);
        const double gam_z = gu_xx * Gz_xx + gu_yy * Gz_yy + gu_zz * Gz_zz
                           + TWO * (gu_xy * Gz_xy + gu_xz * Gz_xz + gu_yz * Gz_yz);

        Gamxa_out[i] = gam_x;
        Gamya_out[i] = gam_y;
        Gamza_out[i] = gam_z;

        /* trace of the shift first-derivative matrix */
        const double div_b = dbx_x + dby_y + dbz_z;

        Gamx_rhs[i] += F2o3 * gam_x * div_b
                     - gam_x * dbx_x - gam_y * dbx_y - gam_z * dbx_z
                     + F1o3 * (gu_xx * f_x + gu_xy * f_y + gu_xz * f_z)
                     + gu_xx * ddbx_xx + gu_yy * ddbx_yy + gu_zz * ddbx_zz
                     + TWO * (gu_xy * ddbx_xy + gu_xz * ddbx_xz + gu_yz * ddbx_yz);

        Gamy_rhs[i] += F2o3 * gam_y * div_b
                     - gam_x * dby_x - gam_y * dby_y - gam_z * dby_z
                     + F1o3 * (gu_xy * f_x + gu_yy * f_y + gu_yz * f_z)
                     + gu_xx * ddby_xx + gu_yy * ddby_yy + gu_zz * ddby_zz
                     + TWO * (gu_xy * ddby_xy + gu_xz * ddby_xz + gu_yz * ddby_yz);

        Gamz_rhs[i] += F2o3 * gam_z * div_b
                     - gam_x * dbz_x - gam_y * dbz_y - gam_z * dbz_z
                     + F1o3 * (gu_xz * f_x + gu_yz * f_y + gu_zz * f_z)
                     + gu_xx * ddbz_xx + gu_yy * ddbz_yy + gu_zz * ddbz_zz
                     + TWO * (gu_xy * ddbz_xy + gu_xz * ddbz_xz + gu_yz * ddbz_yz);

        /* lowered products, grouped by lower index pair (b,c):
         * o_g{bc}{r} = g_{rx} Gx_{bc} + g_{ry} Gy_{bc} + g_{rz} Gz_{bc} */
        o_gxxx[i] = g_xx * Gx_xx + g_xy * Gy_xx + g_xz * Gz_xx;
        o_gxxy[i] = g_xy * Gx_xx + g_yy * Gy_xx + g_yz * Gz_xx;
        o_gxxz[i] = g_xz * Gx_xx + g_yz * Gy_xx + g_zz * Gz_xx;

        o_gxyx[i] = g_xx * Gx_xy + g_xy * Gy_xy + g_xz * Gz_xy;
        o_gxyy[i] = g_xy * Gx_xy + g_yy * Gy_xy + g_yz * Gz_xy;
        o_gxyz[i] = g_xz * Gx_xy + g_yz * Gy_xy + g_zz * Gz_xy;

        o_gxzx[i] = g_xx * Gx_xz + g_xy * Gy_xz + g_xz * Gz_xz;
        o_gxzy[i] = g_xy * Gx_xz + g_yy * Gy_xz + g_yz * Gz_xz;
        o_gxzz[i] = g_xz * Gx_xz + g_yz * Gy_xz + g_zz * Gz_xz;

        o_gyyx[i] = g_xx * Gx_yy + g_xy * Gy_yy + g_xz * Gz_yy;
        o_gyyy[i] = g_xy * Gx_yy + g_yy * Gy_yy + g_yz * Gz_yy;
        o_gyyz[i] = g_xz * Gx_yy + g_yz * Gy_yy + g_zz * Gz_yy;

        o_gyzx[i] = g_xx * Gx_yz + g_xy * Gy_yz + g_xz * Gz_yz;
        o_gyzy[i] = g_xy * Gx_yz + g_yy * Gy_yz + g_yz * Gz_yz;
        o_gyzz[i] = g_xz * Gx_yz + g_yz * Gy_yz + g_zz * Gz_yz;

        o_gzzx[i] = g_xx * Gx_zz + g_xy * Gy_zz + g_xz * Gz_zz;
        o_gzzy[i] = g_xy * Gx_zz + g_yy * Gy_zz + g_yz * Gz_zz;
        o_gzzz[i] = g_xz * Gx_zz + g_yz * Gy_zz + g_zz * Gz_zz;

        i += blockDim.x * gridDim.x;
    }
}
|
|
|
|
/* Phase 10: contract the second derivatives of one metric component with gup:
 *   R_comp = gup^xx*fxx + gup^yy*fyy + gup^zz*fzz
 *          + 2*(gup^xy*fxy + gup^xz*fxz + gup^yz*fyz)
 * evaluated independently at every point index i < d_gp.all (grid-stride loop).
 * fxx..fzz are the fdderivs outputs for that component; R_comp is overwritten.
 */
__global__ void kern_phase10_ricci_contract(
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ fxx, const double* __restrict__ fxy,
    const double* __restrict__ fxz, const double* __restrict__ fyy,
    const double* __restrict__ fyz, const double* __restrict__ fzz,
    double* __restrict__ R_comp)
{
    const unsigned int stride = blockDim.x * gridDim.x;

    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < d_gp.all; i += stride) {
        /* diagonal (xx, yy, zz) part of the contraction */
        const double diag_sum = gupxx[i] * fxx[i] + gupyy[i] * fyy[i] + gupzz[i] * fzz[i];
        /* off-diagonal part; counted twice because both tensors are symmetric */
        const double cross_sum = gupxy[i] * fxy[i] + gupxz[i] * fxz[i] + gupyz[i] * fyz[i];

        R_comp[i] = diag_sum + 2.0 * cross_sum;
    }
}
|
|
|
|
/* Phase 11a: Ricci diagonal assembly (Rxx, Ryy, Rzz).
 *
 * Rxx/Ryy/Rzz are read-modify-write: on entry each holds a previously computed
 * value that enters with weight -1/2; on exit it holds the assembled component.
 * l??? are the lowered Christoffel products (named o_g{bc}{r} where they are
 * produced), G??? the Christoffel symbols, Gam?a the contracted Christoffels and
 * Gam?? the first derivatives of Gamma.
 *
 * Every per-point input is loaded once into a `_v` local; the arithmetic below
 * is otherwise written term by term. One grid-stride loop over i < d_gp.all.
 */
__global__ __launch_bounds__(128, 4)
void kern_phase11_ricci_diag(
    const double* __restrict__ gxx, const double* __restrict__ gxy,
    const double* __restrict__ gxz, const double* __restrict__ gyy,
    const double* __restrict__ gyz, const double* __restrict__ gzz,
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ Gamxa, const double* __restrict__ Gamya,
    const double* __restrict__ Gamza,
    const double* __restrict__ Gamxx, const double* __restrict__ Gamxy,
    const double* __restrict__ Gamxz,
    const double* __restrict__ Gamyx, const double* __restrict__ Gamyy_d,
    const double* __restrict__ Gamyz_d,
    const double* __restrict__ Gamzx, const double* __restrict__ Gamzy,
    const double* __restrict__ Gamzz_d,
    const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
    const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
    const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
    const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
    const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
    const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
    const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
    const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
    const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
    /* lowered Christoffel products */
    const double* __restrict__ lxxx, const double* __restrict__ lxyx,
    const double* __restrict__ lxzx, const double* __restrict__ lyyx,
    const double* __restrict__ lyzx, const double* __restrict__ lzzx,
    const double* __restrict__ lxxy, const double* __restrict__ lxyy,
    const double* __restrict__ lxzy, const double* __restrict__ lyyy,
    const double* __restrict__ lyzy, const double* __restrict__ lzzy,
    const double* __restrict__ lxxz, const double* __restrict__ lxyz,
    const double* __restrict__ lxzz, const double* __restrict__ lyyz,
    const double* __restrict__ lyzz, const double* __restrict__ lzzz,
    double* __restrict__ Rxx, double* __restrict__ Ryy, double* __restrict__ Rzz)
{
    const double H = 0.5, TWO = 2.0;
    const unsigned int stride = blockDim.x * gridDim.x;

    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < d_gp.all; i += stride) {
        const double uxx = gupxx[i], uxy = gupxy[i], uxz = gupxz[i];
        const double uyy = gupyy[i], uyz = gupyz[i], uzz = gupzz[i];

        const double gxx_v = gxx[i], gxy_v = gxy[i], gxz_v = gxz[i];
        const double gyy_v = gyy[i], gyz_v = gyz[i], gzz_v = gzz[i];

        const double Gamxa_v = Gamxa[i], Gamya_v = Gamya[i], Gamza_v = Gamza[i];

        /* Christoffel symbols */
        const double Gxxx_v = Gxxx[i], Gxxy_v = Gxxy[i], Gxxz_v = Gxxz[i];
        const double Gxyy_v = Gxyy[i], Gxyz_v = Gxyz[i], Gxzz_v = Gxzz[i];
        const double Gyxx_v = Gyxx[i], Gyxy_v = Gyxy[i], Gyxz_v = Gyxz[i];
        const double Gyyy_v = Gyyy[i], Gyyz_v = Gyyz[i], Gyzz_v = Gyzz[i];
        const double Gzxx_v = Gzxx[i], Gzxy_v = Gzxy[i], Gzxz_v = Gzxz[i];
        const double Gzyy_v = Gzyy[i], Gzyz_v = Gzyz[i], Gzzz_v = Gzzz[i];

        /* lowered Christoffel products */
        const double lxxx_v = lxxx[i], lxyx_v = lxyx[i], lxzx_v = lxzx[i];
        const double lyyx_v = lyyx[i], lyzx_v = lyzx[i], lzzx_v = lzzx[i];
        const double lxxy_v = lxxy[i], lxyy_v = lxyy[i], lxzy_v = lxzy[i];
        const double lyyy_v = lyyy[i], lyzy_v = lyzy[i], lzzy_v = lzzy[i];
        const double lxxz_v = lxxz[i], lxyz_v = lxyz[i], lxzz_v = lxzz[i];
        const double lyyz_v = lyyz[i], lyzz_v = lyzz[i], lzzz_v = lzzz[i];

        /* Rxx */
        Rxx[i] = -H*Rxx[i]
            + gxx_v*Gamxx[i] + gxy_v*Gamyx[i] + gxz_v*Gamzx[i]
            + Gamxa_v*lxxx_v + Gamya_v*lxyx_v + Gamza_v*lxzx_v
            + uxx*(TWO*(Gxxx_v*lxxx_v + Gyxx_v*lxyx_v + Gzxx_v*lxzx_v)
                   +(Gxxx_v*lxxx_v + Gyxx_v*lxxy_v + Gzxx_v*lxxz_v))
            + uxy*(TWO*(Gxxx_v*lxyx_v + Gyxx_v*lyyx_v + Gzxx_v*lyzx_v
                       +Gxxy_v*lxxx_v + Gyxy_v*lxyx_v + Gzxy_v*lxzx_v)
                   +(Gxxy_v*lxxx_v + Gyxy_v*lxxy_v + Gzxy_v*lxxz_v)
                   +(Gxxx_v*lxyx_v + Gyxx_v*lxyy_v + Gzxx_v*lxyz_v))
            + uxz*(TWO*(Gxxx_v*lxzx_v + Gyxx_v*lyzx_v + Gzxx_v*lzzx_v
                       +Gxxz_v*lxxx_v + Gyxz_v*lxyx_v + Gzxz_v*lxzx_v)
                   +(Gxxz_v*lxxx_v + Gyxz_v*lxxy_v + Gzxz_v*lxxz_v)
                   +(Gxxx_v*lxzx_v + Gyxx_v*lxzy_v + Gzxx_v*lxzz_v))
            + uyy*(TWO*(Gxxy_v*lxyx_v + Gyxy_v*lyyx_v + Gzxy_v*lyzx_v)
                   +(Gxxy_v*lxyx_v + Gyxy_v*lxyy_v + Gzxy_v*lxyz_v))
            + uyz*(TWO*(Gxxy_v*lxzx_v + Gyxy_v*lyzx_v + Gzxy_v*lzzx_v
                       +Gxxz_v*lxyx_v + Gyxz_v*lyyx_v + Gzxz_v*lyzx_v)
                   +(Gxxz_v*lxyx_v + Gyxz_v*lxyy_v + Gzxz_v*lxyz_v)
                   +(Gxxy_v*lxzx_v + Gyxy_v*lxzy_v + Gzxy_v*lxzz_v))
            + uzz*(TWO*(Gxxz_v*lxzx_v + Gyxz_v*lyzx_v + Gzxz_v*lzzx_v)
                   +(Gxxz_v*lxzx_v + Gyxz_v*lxzy_v + Gzxz_v*lxzz_v));

        /* Ryy */
        Ryy[i] = -H*Ryy[i]
            + gxy_v*Gamxy[i] + gyy_v*Gamyy_d[i] + gyz_v*Gamzy[i]
            + Gamxa_v*lxyy_v + Gamya_v*lyyy_v + Gamza_v*lyzy_v
            + uxx*(TWO*(Gxxy_v*lxxy_v + Gyxy_v*lxyy_v + Gzxy_v*lxzy_v)
                   +(Gxxy_v*lxyx_v + Gyxy_v*lxyy_v + Gzxy_v*lxyz_v))
            + uxy*(TWO*(Gxxy_v*lxyy_v + Gyxy_v*lyyy_v + Gzxy_v*lyzy_v
                       +Gxyy_v*lxxy_v + Gyyy_v*lxyy_v + Gzyy_v*lxzy_v)
                   +(Gxyy_v*lxyx_v + Gyyy_v*lxyy_v + Gzyy_v*lxyz_v)
                   +(Gxxy_v*lyyx_v + Gyxy_v*lyyy_v + Gzxy_v*lyyz_v))
            + uxz*(TWO*(Gxxy_v*lxzy_v + Gyxy_v*lyzy_v + Gzxy_v*lzzy_v
                       +Gxyz_v*lxxy_v + Gyyz_v*lxyy_v + Gzyz_v*lxzy_v)
                   +(Gxyz_v*lxyx_v + Gyyz_v*lxyy_v + Gzyz_v*lxyz_v)
                   +(Gxxy_v*lyzx_v + Gyxy_v*lyzy_v + Gzxy_v*lyzz_v))
            + uyy*(TWO*(Gxyy_v*lxyy_v + Gyyy_v*lyyy_v + Gzyy_v*lyzy_v)
                   +(Gxyy_v*lyyx_v + Gyyy_v*lyyy_v + Gzyy_v*lyyz_v))
            + uyz*(TWO*(Gxyy_v*lxzy_v + Gyyy_v*lyzy_v + Gzyy_v*lzzy_v
                       +Gxyz_v*lxyy_v + Gyyz_v*lyyy_v + Gzyz_v*lyzy_v)
                   +(Gxyz_v*lyyx_v + Gyyz_v*lyyy_v + Gzyz_v*lyyz_v)
                   +(Gxyy_v*lyzx_v + Gyyy_v*lyzy_v + Gzyy_v*lyzz_v))
            + uzz*(TWO*(Gxyz_v*lxzy_v + Gyyz_v*lyzy_v + Gzyz_v*lzzy_v)
                   +(Gxyz_v*lyzx_v + Gyyz_v*lyzy_v + Gzyz_v*lyzz_v));

        /* Rzz */
        Rzz[i] = -H*Rzz[i]
            + gxz_v*Gamxz[i] + gyz_v*Gamyz_d[i] + gzz_v*Gamzz_d[i]
            + Gamxa_v*lxzz_v + Gamya_v*lyzz_v + Gamza_v*lzzz_v
            + uxx*(TWO*(Gxxz_v*lxxz_v + Gyxz_v*lxyz_v + Gzxz_v*lxzz_v)
                   +(Gxxz_v*lxzx_v + Gyxz_v*lxzy_v + Gzxz_v*lxzz_v))
            + uxy*(TWO*(Gxxz_v*lxyz_v + Gyxz_v*lyyz_v + Gzxz_v*lyzz_v
                       +Gxyz_v*lxxz_v + Gyyz_v*lxyz_v + Gzyz_v*lxzz_v)
                   +(Gxyz_v*lxzx_v + Gyyz_v*lxzy_v + Gzyz_v*lxzz_v)
                   +(Gxxz_v*lyzx_v + Gyxz_v*lyzy_v + Gzxz_v*lyzz_v))
            + uxz*(TWO*(Gxxz_v*lxzz_v + Gyxz_v*lyzz_v + Gzxz_v*lzzz_v
                       +Gxzz_v*lxxz_v + Gyzz_v*lxyz_v + Gzzz_v*lxzz_v)
                   +(Gxzz_v*lxzx_v + Gyzz_v*lxzy_v + Gzzz_v*lxzz_v)
                   +(Gxxz_v*lzzx_v + Gyxz_v*lzzy_v + Gzxz_v*lzzz_v))
            + uyy*(TWO*(Gxyz_v*lxyz_v + Gyyz_v*lyyz_v + Gzyz_v*lyzz_v)
                   +(Gxyz_v*lyzx_v + Gyyz_v*lyzy_v + Gzyz_v*lyzz_v))
            + uyz*(TWO*(Gxyz_v*lxzz_v + Gyyz_v*lyzz_v + Gzyz_v*lzzz_v
                       +Gxzz_v*lxyz_v + Gyzz_v*lyyz_v + Gzzz_v*lyzz_v)
                   +(Gxzz_v*lyzx_v + Gyzz_v*lyzy_v + Gzzz_v*lyzz_v)
                   +(Gxyz_v*lzzx_v + Gyyz_v*lzzy_v + Gzyz_v*lzzz_v))
            + uzz*(TWO*(Gxzz_v*lxzz_v + Gyzz_v*lyzz_v + Gzzz_v*lzzz_v)
                   +(Gxzz_v*lzzx_v + Gyzz_v*lzzy_v + Gzzz_v*lzzz_v));
    }
}
|
|
|
|
/* Phase 11b: Ricci off-diagonal assembly (Rxy, Rxz, Ryz).
 *
 * Rxy/Rxz/Ryz are read-modify-write: the value present on entry is negated and
 * combined with the Gamma-derivative and Gam?a*l terms under a common factor
 * of 1/2; the gup-weighted Christoffel products are added with unit weight.
 * l??? are the lowered Christoffel products, G??? the Christoffel symbols.
 *
 * Every per-point input is loaded once into a `_v` local; the arithmetic below
 * is otherwise written term by term. One grid-stride loop over i < d_gp.all.
 */
__global__ __launch_bounds__(128, 4)
void kern_phase11_ricci_offdiag(
    const double* __restrict__ gxx, const double* __restrict__ gxy,
    const double* __restrict__ gxz, const double* __restrict__ gyy,
    const double* __restrict__ gyz, const double* __restrict__ gzz,
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ Gamxa, const double* __restrict__ Gamya,
    const double* __restrict__ Gamza,
    const double* __restrict__ Gamxx, const double* __restrict__ Gamxy,
    const double* __restrict__ Gamxz,
    const double* __restrict__ Gamyx, const double* __restrict__ Gamyy_d,
    const double* __restrict__ Gamyz_d,
    const double* __restrict__ Gamzx, const double* __restrict__ Gamzy,
    const double* __restrict__ Gamzz_d,
    const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
    const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
    const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
    const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
    const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
    const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
    const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
    const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
    const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
    /* lowered Christoffel products */
    const double* __restrict__ lxxx, const double* __restrict__ lxyx,
    const double* __restrict__ lxzx, const double* __restrict__ lyyx,
    const double* __restrict__ lyzx, const double* __restrict__ lzzx,
    const double* __restrict__ lxxy, const double* __restrict__ lxyy,
    const double* __restrict__ lxzy, const double* __restrict__ lyyy,
    const double* __restrict__ lyzy, const double* __restrict__ lzzy,
    const double* __restrict__ lxxz, const double* __restrict__ lxyz,
    const double* __restrict__ lxzz, const double* __restrict__ lyyz,
    const double* __restrict__ lyzz, const double* __restrict__ lzzz,
    double* __restrict__ Rxy, double* __restrict__ Rxz, double* __restrict__ Ryz)
{
    const double H = 0.5, TWO = 2.0;
    const unsigned int stride = blockDim.x * gridDim.x;

    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < d_gp.all; i += stride) {
        const double uxx = gupxx[i], uxy = gupxy[i], uxz = gupxz[i];
        const double uyy = gupyy[i], uyz = gupyz[i], uzz = gupzz[i];

        const double gxx_v = gxx[i], gxy_v = gxy[i], gxz_v = gxz[i];
        const double gyy_v = gyy[i], gyz_v = gyz[i], gzz_v = gzz[i];

        const double Gamxa_v = Gamxa[i], Gamya_v = Gamya[i], Gamza_v = Gamza[i];

        /* first derivatives of Gamma */
        const double Gamxx_v = Gamxx[i], Gamxy_v = Gamxy[i], Gamxz_v = Gamxz[i];
        const double Gamyx_v = Gamyx[i], Gamyy_v = Gamyy_d[i], Gamyz_v = Gamyz_d[i];
        const double Gamzx_v = Gamzx[i], Gamzy_v = Gamzy[i], Gamzz_v = Gamzz_d[i];

        /* Christoffel symbols */
        const double Gxxx_v = Gxxx[i], Gxxy_v = Gxxy[i], Gxxz_v = Gxxz[i];
        const double Gxyy_v = Gxyy[i], Gxyz_v = Gxyz[i], Gxzz_v = Gxzz[i];
        const double Gyxx_v = Gyxx[i], Gyxy_v = Gyxy[i], Gyxz_v = Gyxz[i];
        const double Gyyy_v = Gyyy[i], Gyyz_v = Gyyz[i], Gyzz_v = Gyzz[i];
        const double Gzxx_v = Gzxx[i], Gzxy_v = Gzxy[i], Gzxz_v = Gzxz[i];
        const double Gzyy_v = Gzyy[i], Gzyz_v = Gzyz[i], Gzzz_v = Gzzz[i];

        /* lowered Christoffel products */
        const double lxxx_v = lxxx[i], lxyx_v = lxyx[i], lxzx_v = lxzx[i];
        const double lyyx_v = lyyx[i], lyzx_v = lyzx[i], lzzx_v = lzzx[i];
        const double lxxy_v = lxxy[i], lxyy_v = lxyy[i], lxzy_v = lxzy[i];
        const double lyyy_v = lyyy[i], lyzy_v = lyzy[i], lzzy_v = lzzy[i];
        const double lxxz_v = lxxz[i], lxyz_v = lxyz[i], lxzz_v = lxzz[i];
        const double lyyz_v = lyyz[i], lyzz_v = lyzz[i], lzzz_v = lzzz[i];

        /* Rxy */
        Rxy[i] = H*(
                -Rxy[i]
                +gxx_v*Gamxy_v + gxy_v*Gamyy_v + gxz_v*Gamzy_v
                +gxy_v*Gamxx_v + gyy_v*Gamyx_v + gyz_v*Gamzx_v
                +Gamxa_v*lxyx_v + Gamya_v*lyyx_v + Gamza_v*lyzx_v
                +Gamxa_v*lxxy_v + Gamya_v*lxyy_v + Gamza_v*lxzy_v)
            +uxx*(Gxxx_v*lxxy_v + Gyxx_v*lxyy_v + Gzxx_v*lxzy_v
                 +Gxxy_v*lxxx_v + Gyxy_v*lxyx_v + Gzxy_v*lxzx_v
                 +Gxxx_v*lxyx_v + Gyxx_v*lxyy_v + Gzxx_v*lxyz_v)
            +uxy*(Gxxx_v*lxyy_v + Gyxx_v*lyyy_v + Gzxx_v*lyzy_v
                 +Gxxy_v*lxyx_v + Gyxy_v*lyyx_v + Gzxy_v*lyzx_v
                 +Gxxy_v*lxyx_v + Gyxy_v*lxyy_v + Gzxy_v*lxyz_v
                 +Gxxy_v*lxxy_v + Gyxy_v*lxyy_v + Gzxy_v*lxzy_v
                 +Gxyy_v*lxxx_v + Gyyy_v*lxyx_v + Gzyy_v*lxzx_v
                 +Gxxx_v*lyyx_v + Gyxx_v*lyyy_v + Gzxx_v*lyyz_v)
            +uxz*(Gxxx_v*lxzy_v + Gyxx_v*lyzy_v + Gzxx_v*lzzy_v
                 +Gxxy_v*lxzx_v + Gyxy_v*lyzx_v + Gzxy_v*lzzx_v
                 +Gxxz_v*lxyx_v + Gyxz_v*lxyy_v + Gzxz_v*lxyz_v
                 +Gxxz_v*lxxy_v + Gyxz_v*lxyy_v + Gzxz_v*lxzy_v
                 +Gxyz_v*lxxx_v + Gyyz_v*lxyx_v + Gzyz_v*lxzx_v
                 +Gxxx_v*lyzx_v + Gyxx_v*lyzy_v + Gzxx_v*lyzz_v)
            +uyy*(Gxxy_v*lxyy_v + Gyxy_v*lyyy_v + Gzxy_v*lyzy_v
                 +Gxyy_v*lxyx_v + Gyyy_v*lyyx_v + Gzyy_v*lyzx_v
                 +Gxxy_v*lyyx_v + Gyxy_v*lyyy_v + Gzxy_v*lyyz_v)
            +uyz*(Gxxy_v*lxzy_v + Gyxy_v*lyzy_v + Gzxy_v*lzzy_v
                 +Gxyy_v*lxzx_v + Gyyy_v*lyzx_v + Gzyy_v*lzzx_v
                 +Gxxz_v*lyyx_v + Gyxz_v*lyyy_v + Gzxz_v*lyyz_v
                 +Gxxz_v*lxyy_v + Gyxz_v*lyyy_v + Gzxz_v*lyzy_v
                 +Gxyz_v*lxyx_v + Gyyz_v*lyyx_v + Gzyz_v*lyzx_v
                 +Gxxy_v*lyzx_v + Gyxy_v*lyzy_v + Gzxy_v*lyzz_v)
            +uzz*(Gxxz_v*lxzy_v + Gyxz_v*lyzy_v + Gzxz_v*lzzy_v
                 +Gxyz_v*lxzx_v + Gyyz_v*lyzx_v + Gzyz_v*lzzx_v
                 +Gxxz_v*lyzx_v + Gyxz_v*lyzy_v + Gzxz_v*lyzz_v);

        /* Rxz */
        Rxz[i] = H*(
                -Rxz[i]
                +gxx_v*Gamxz_v + gxy_v*Gamyz_v + gxz_v*Gamzz_v
                +gxz_v*Gamxx_v + gyz_v*Gamyx_v + gzz_v*Gamzx_v
                +Gamxa_v*lxzx_v + Gamya_v*lyzx_v + Gamza_v*lzzx_v
                +Gamxa_v*lxxz_v + Gamya_v*lxyz_v + Gamza_v*lxzz_v)
            +uxx*(Gxxx_v*lxxz_v + Gyxx_v*lxyz_v + Gzxx_v*lxzz_v
                 +Gxxz_v*lxxx_v + Gyxz_v*lxyx_v + Gzxz_v*lxzx_v
                 +Gxxx_v*lxzx_v + Gyxx_v*lxzy_v + Gzxx_v*lxzz_v)
            +uxy*(Gxxx_v*lxyz_v + Gyxx_v*lyyz_v + Gzxx_v*lyzz_v
                 +Gxxz_v*lxyx_v + Gyxz_v*lyyx_v + Gzxz_v*lyzx_v
                 +Gxxy_v*lxzx_v + Gyxy_v*lxzy_v + Gzxy_v*lxzz_v
                 +Gxxy_v*lxxz_v + Gyxy_v*lxyz_v + Gzxy_v*lxzz_v
                 +Gxyz_v*lxxx_v + Gyyz_v*lxyx_v + Gzyz_v*lxzx_v
                 +Gxxx_v*lyzx_v + Gyxx_v*lyzy_v + Gzxx_v*lyzz_v)
            +uxz*(Gxxx_v*lxzz_v + Gyxx_v*lyzz_v + Gzxx_v*lzzz_v
                 +Gxxz_v*lxzx_v + Gyxz_v*lyzx_v + Gzxz_v*lzzx_v
                 +Gxxz_v*lxzx_v + Gyxz_v*lxzy_v + Gzxz_v*lxzz_v
                 +Gxxz_v*lxxz_v + Gyxz_v*lxyz_v + Gzxz_v*lxzz_v
                 +Gxzz_v*lxxx_v + Gyzz_v*lxyx_v + Gzzz_v*lxzx_v
                 +Gxxx_v*lzzx_v + Gyxx_v*lzzy_v + Gzxx_v*lzzz_v)
            +uyy*(Gxxy_v*lxyz_v + Gyxy_v*lyyz_v + Gzxy_v*lyzz_v
                 +Gxyz_v*lxyx_v + Gyyz_v*lyyx_v + Gzyz_v*lyzx_v
                 +Gxxy_v*lyzx_v + Gyxy_v*lyzy_v + Gzxy_v*lyzz_v)
            +uyz*(Gxxy_v*lxzz_v + Gyxy_v*lyzz_v + Gzxy_v*lzzz_v
                 +Gxyz_v*lxzx_v + Gyyz_v*lyzx_v + Gzyz_v*lzzx_v
                 +Gxxz_v*lyzx_v + Gyxz_v*lyzy_v + Gzxz_v*lyzz_v
                 +Gxxz_v*lxyz_v + Gyxz_v*lyyz_v + Gzxz_v*lyzz_v
                 +Gxzz_v*lxyx_v + Gyzz_v*lyyx_v + Gzzz_v*lyzx_v
                 +Gxxy_v*lzzx_v + Gyxy_v*lzzy_v + Gzxy_v*lzzz_v)
            +uzz*(Gxxz_v*lxzz_v + Gyxz_v*lyzz_v + Gzxz_v*lzzz_v
                 +Gxzz_v*lxzx_v + Gyzz_v*lyzx_v + Gzzz_v*lzzx_v
                 +Gxxz_v*lzzx_v + Gyxz_v*lzzy_v + Gzxz_v*lzzz_v);

        /* Ryz */
        Ryz[i] = H*(
                -Ryz[i]
                +gxy_v*Gamxz_v + gyy_v*Gamyz_v + gyz_v*Gamzz_v
                +gxz_v*Gamxy_v + gyz_v*Gamyy_v + gzz_v*Gamzy_v
                +Gamxa_v*lxzy_v + Gamya_v*lyzy_v + Gamza_v*lzzy_v
                +Gamxa_v*lxyz_v + Gamya_v*lyyz_v + Gamza_v*lyzz_v)
            +uxx*(Gxxy_v*lxxz_v + Gyxy_v*lxyz_v + Gzxy_v*lxzz_v
                 +Gxxz_v*lxxy_v + Gyxz_v*lxyy_v + Gzxz_v*lxzy_v
                 +Gxxy_v*lxzx_v + Gyxy_v*lxzy_v + Gzxy_v*lxzz_v)
            +uxy*(Gxxy_v*lxyz_v + Gyxy_v*lyyz_v + Gzxy_v*lyzz_v
                 +Gxxz_v*lxyy_v + Gyxz_v*lyyy_v + Gzxz_v*lyzy_v
                 +Gxyy_v*lxzx_v + Gyyy_v*lxzy_v + Gzyy_v*lxzz_v
                 +Gxyy_v*lxxz_v + Gyyy_v*lxyz_v + Gzyy_v*lxzz_v
                 +Gxyz_v*lxxy_v + Gyyz_v*lxyy_v + Gzyz_v*lxzy_v
                 +Gxxy_v*lyzx_v + Gyxy_v*lyzy_v + Gzxy_v*lyzz_v)
            +uxz*(Gxxy_v*lxzz_v + Gyxy_v*lyzz_v + Gzxy_v*lzzz_v
                 +Gxxz_v*lxzy_v + Gyxz_v*lyzy_v + Gzxz_v*lzzy_v
                 +Gxyz_v*lxzx_v + Gyyz_v*lxzy_v + Gzyz_v*lxzz_v
                 +Gxyz_v*lxxz_v + Gyyz_v*lxyz_v + Gzyz_v*lxzz_v
                 +Gxzz_v*lxxy_v + Gyzz_v*lxyy_v + Gzzz_v*lxzy_v
                 +Gxxy_v*lzzx_v + Gyxy_v*lzzy_v + Gzxy_v*lzzz_v)
            +uyy*(Gxyy_v*lxyz_v + Gyyy_v*lyyz_v + Gzyy_v*lyzz_v
                 +Gxyz_v*lxyy_v + Gyyz_v*lyyy_v + Gzyz_v*lyzy_v
                 +Gxyy_v*lyzx_v + Gyyy_v*lyzy_v + Gzyy_v*lyzz_v)
            +uyz*(Gxyy_v*lxzz_v + Gyyy_v*lyzz_v + Gzyy_v*lzzz_v
                 +Gxyz_v*lxzy_v + Gyyz_v*lyzy_v + Gzyz_v*lzzy_v
                 +Gxyz_v*lyzx_v + Gyyz_v*lyzy_v + Gzyz_v*lyzz_v
                 +Gxyz_v*lxyz_v + Gyyz_v*lyyz_v + Gzyz_v*lyzz_v
                 +Gxzz_v*lxyy_v + Gyzz_v*lyyy_v + Gzzz_v*lyzy_v
                 +Gxyy_v*lzzx_v + Gyyy_v*lzzy_v + Gzyy_v*lzzz_v)
            +uzz*(Gxyz_v*lxzz_v + Gyyz_v*lyzz_v + Gzyz_v*lzzz_v
                 +Gxzz_v*lxzy_v + Gyzz_v*lyzy_v + Gzzz_v*lzzy_v
                 +Gxyz_v*lzzx_v + Gyyz_v*lzzy_v + Gzyz_v*lzzz_v);
    }
}
|
|
|
|
/* Phase 11: fused Ricci assembly (diag + off-diag)
 *
 * Purpose: update all six arrays Rxx, Ryy, Rzz, Rxy, Rxz, Ryz in place in a
 * single kernel (diagonal components first, then off-diagonal ones).
 *
 * Execution: each thread starts at its flat global thread index and advances
 * by the total number of launched threads until d_gp.all is reached
 * (grid-stride loop), so any 1D launch configuration covers every point.
 * Every array is read/written only at the thread's own index i; the kernel
 * uses no shared memory and no synchronization.
 *
 * Inputs (all indexed by the same flat index i):
 *   gxx..gzz          - multiplied with the Gam?? arrays in the leading terms
 *                       (presumably the conformal metric - TODO confirm)
 *   gupxx..gupzz      - weights of the quadratic G*l contractions
 *                       (presumably the inverse metric - TODO confirm)
 *   Gamxa, Gamya,     - multiplied with the l??? arrays
 *   Gamza               (NOTE(review): meaning not visible here)
 *   Gamxx..Gamzz_d    - NOTE(review): presumably derivatives of Gam^i;
 *                       verify against the caller
 *   Gxxx..Gzzz        - presumably Christoffel symbols - TODO confirm
 *   lxxx..lzzz        - presumably index-lowered Christoffel combinations -
 *                       TODO confirm
 * In/out:
 *   Rxx..Rzz          - the incoming value is consumed and overwritten:
 *                       diagonal:     R = -H*R_in + (...)
 *                       off-diagonal: R = H*(-R_in + (...)) + (...)
 *
 * d_gp is defined outside this block; only d_gp.all (loop bound) is used.
 */
|
|
__global__ __launch_bounds__(128, 2)
|
|
void kern_phase11_ricci_fused(
|
|
const double* __restrict__ gxx, const double* __restrict__ gxy,
|
|
const double* __restrict__ gxz, const double* __restrict__ gyy,
|
|
const double* __restrict__ gyz, const double* __restrict__ gzz,
|
|
const double* __restrict__ gupxx, const double* __restrict__ gupxy,
|
|
const double* __restrict__ gupxz, const double* __restrict__ gupyy,
|
|
const double* __restrict__ gupyz, const double* __restrict__ gupzz,
|
|
const double* __restrict__ Gamxa, const double* __restrict__ Gamya,
|
|
const double* __restrict__ Gamza,
|
|
const double* __restrict__ Gamxx, const double* __restrict__ Gamxy,
|
|
const double* __restrict__ Gamxz,
|
|
const double* __restrict__ Gamyx, const double* __restrict__ Gamyy_d,
|
|
const double* __restrict__ Gamyz_d,
|
|
const double* __restrict__ Gamzx, const double* __restrict__ Gamzy,
|
|
const double* __restrict__ Gamzz_d,
|
|
const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
|
|
const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
|
|
const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
|
|
const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
|
|
const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
|
|
const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
|
|
const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
|
|
const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
|
|
const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
|
|
const double* __restrict__ lxxx, const double* __restrict__ lxyx,
|
|
const double* __restrict__ lxzx, const double* __restrict__ lyyx,
|
|
const double* __restrict__ lyzx, const double* __restrict__ lzzx,
|
|
const double* __restrict__ lxxy, const double* __restrict__ lxyy,
|
|
const double* __restrict__ lxzy, const double* __restrict__ lyyy,
|
|
const double* __restrict__ lyzy, const double* __restrict__ lzzy,
|
|
const double* __restrict__ lxxz, const double* __restrict__ lxyz,
|
|
const double* __restrict__ lxzz, const double* __restrict__ lyyz,
|
|
const double* __restrict__ lyzz, const double* __restrict__ lzzz,
|
|
double* __restrict__ Rxx, double* __restrict__ Rxy, double* __restrict__ Rxz,
|
|
double* __restrict__ Ryy, double* __restrict__ Ryz, double* __restrict__ Rzz)
|
|
{
|
|
const double H = 0.5, TWO = 2.0;
|
|
/* Grid-stride loop over all d_gp.all points. */
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < d_gp.all; i += blockDim.x * gridDim.x) {
|
|
/* Cache the six gup components: each one scales a whole group of terms in all six updates below. */
double uxx = gupxx[i], uxy = gupxy[i], uxz = gupxz[i];
|
|
double uyy = gupyy[i], uyz = gupyz[i], uzz = gupzz[i];
|
|
|
|
/* Rxx: diagonal form, -H * old value plus linear and gup-weighted quadratic terms. */
Rxx[i] = -H*Rxx[i]
|
|
+ gxx[i]*Gamxx[i]+gxy[i]*Gamyx[i]+gxz[i]*Gamzx[i]
|
|
+ Gamxa[i]*lxxx[i]+Gamya[i]*lxyx[i]+Gamza[i]*lxzx[i]
|
|
+ uxx*(TWO*(Gxxx[i]*lxxx[i]+Gyxx[i]*lxyx[i]+Gzxx[i]*lxzx[i])
|
|
+(Gxxx[i]*lxxx[i]+Gyxx[i]*lxxy[i]+Gzxx[i]*lxxz[i]))
|
|
+ uxy*(TWO*(Gxxx[i]*lxyx[i]+Gyxx[i]*lyyx[i]+Gzxx[i]*lyzx[i]
|
|
+Gxxy[i]*lxxx[i]+Gyxy[i]*lxyx[i]+Gzxy[i]*lxzx[i])
|
|
+(Gxxy[i]*lxxx[i]+Gyxy[i]*lxxy[i]+Gzxy[i]*lxxz[i])
|
|
+(Gxxx[i]*lxyx[i]+Gyxx[i]*lxyy[i]+Gzxx[i]*lxyz[i]))
|
|
+ uxz*(TWO*(Gxxx[i]*lxzx[i]+Gyxx[i]*lyzx[i]+Gzxx[i]*lzzx[i]
|
|
+Gxxz[i]*lxxx[i]+Gyxz[i]*lxyx[i]+Gzxz[i]*lxzx[i])
|
|
+(Gxxz[i]*lxxx[i]+Gyxz[i]*lxxy[i]+Gzxz[i]*lxxz[i])
|
|
+(Gxxx[i]*lxzx[i]+Gyxx[i]*lxzy[i]+Gzxx[i]*lxzz[i]))
|
|
+ uyy*(TWO*(Gxxy[i]*lxyx[i]+Gyxy[i]*lyyx[i]+Gzxy[i]*lyzx[i])
|
|
+(Gxxy[i]*lxyx[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxyz[i]))
|
|
+ uyz*(TWO*(Gxxy[i]*lxzx[i]+Gyxy[i]*lyzx[i]+Gzxy[i]*lzzx[i]
|
|
+Gxxz[i]*lxyx[i]+Gyxz[i]*lyyx[i]+Gzxz[i]*lyzx[i])
|
|
+(Gxxz[i]*lxyx[i]+Gyxz[i]*lxyy[i]+Gzxz[i]*lxyz[i])
|
|
+(Gxxy[i]*lxzx[i]+Gyxy[i]*lxzy[i]+Gzxy[i]*lxzz[i]))
|
|
+ uzz*(TWO*(Gxxz[i]*lxzx[i]+Gyxz[i]*lyzx[i]+Gzxz[i]*lzzx[i])
|
|
+(Gxxz[i]*lxzx[i]+Gyxz[i]*lxzy[i]+Gzxz[i]*lxzz[i]));
|
|
|
|
/* Ryy: same structure as Rxx with the y-index arrays. */
Ryy[i] = -H*Ryy[i]
|
|
+ gxy[i]*Gamxy[i]+gyy[i]*Gamyy_d[i]+gyz[i]*Gamzy[i]
|
|
+ Gamxa[i]*lxyy[i]+Gamya[i]*lyyy[i]+Gamza[i]*lyzy[i]
|
|
+ uxx*(TWO*(Gxxy[i]*lxxy[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxzy[i])
|
|
+(Gxxy[i]*lxyx[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxyz[i]))
|
|
+ uxy*(TWO*(Gxxy[i]*lxyy[i]+Gyxy[i]*lyyy[i]+Gzxy[i]*lyzy[i]
|
|
+Gxyy[i]*lxxy[i]+Gyyy[i]*lxyy[i]+Gzyy[i]*lxzy[i])
|
|
+(Gxyy[i]*lxyx[i]+Gyyy[i]*lxyy[i]+Gzyy[i]*lxyz[i])
|
|
+(Gxxy[i]*lyyx[i]+Gyxy[i]*lyyy[i]+Gzxy[i]*lyyz[i]))
|
|
+ uxz*(TWO*(Gxxy[i]*lxzy[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lzzy[i]
|
|
+Gxyz[i]*lxxy[i]+Gyyz[i]*lxyy[i]+Gzyz[i]*lxzy[i])
|
|
+(Gxyz[i]*lxyx[i]+Gyyz[i]*lxyy[i]+Gzyz[i]*lxyz[i])
|
|
+(Gxxy[i]*lyzx[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lyzz[i]))
|
|
+ uyy*(TWO*(Gxyy[i]*lxyy[i]+Gyyy[i]*lyyy[i]+Gzyy[i]*lyzy[i])
|
|
+(Gxyy[i]*lyyx[i]+Gyyy[i]*lyyy[i]+Gzyy[i]*lyyz[i]))
|
|
+ uyz*(TWO*(Gxyy[i]*lxzy[i]+Gyyy[i]*lyzy[i]+Gzyy[i]*lzzy[i]
|
|
+Gxyz[i]*lxyy[i]+Gyyz[i]*lyyy[i]+Gzyz[i]*lyzy[i])
|
|
+(Gxyz[i]*lyyx[i]+Gyyz[i]*lyyy[i]+Gzyz[i]*lyyz[i])
|
|
+(Gxyy[i]*lyzx[i]+Gyyy[i]*lyzy[i]+Gzyy[i]*lyzz[i]))
|
|
+ uzz*(TWO*(Gxyz[i]*lxzy[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lzzy[i])
|
|
+(Gxyz[i]*lyzx[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lyzz[i]));
|
|
|
|
/* Rzz: same structure as Rxx with the z-index arrays. */
Rzz[i] = -H*Rzz[i]
|
|
+ gxz[i]*Gamxz[i]+gyz[i]*Gamyz_d[i]+gzz[i]*Gamzz_d[i]
|
|
+ Gamxa[i]*lxzz[i]+Gamya[i]*lyzz[i]+Gamza[i]*lzzz[i]
|
|
+ uxx*(TWO*(Gxxz[i]*lxxz[i]+Gyxz[i]*lxyz[i]+Gzxz[i]*lxzz[i])
|
|
+(Gxxz[i]*lxzx[i]+Gyxz[i]*lxzy[i]+Gzxz[i]*lxzz[i]))
|
|
+ uxy*(TWO*(Gxxz[i]*lxyz[i]+Gyxz[i]*lyyz[i]+Gzxz[i]*lyzz[i]
|
|
+Gxyz[i]*lxxz[i]+Gyyz[i]*lxyz[i]+Gzyz[i]*lxzz[i])
|
|
+(Gxyz[i]*lxzx[i]+Gyyz[i]*lxzy[i]+Gzyz[i]*lxzz[i])
|
|
+(Gxxz[i]*lyzx[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lyzz[i]))
|
|
+ uxz*(TWO*(Gxxz[i]*lxzz[i]+Gyxz[i]*lyzz[i]+Gzxz[i]*lzzz[i]
|
|
+Gxzz[i]*lxxz[i]+Gyzz[i]*lxyz[i]+Gzzz[i]*lxzz[i])
|
|
+(Gxzz[i]*lxzx[i]+Gyzz[i]*lxzy[i]+Gzzz[i]*lxzz[i])
|
|
+(Gxxz[i]*lzzx[i]+Gyxz[i]*lzzy[i]+Gzxz[i]*lzzz[i]))
|
|
+ uyy*(TWO*(Gxyz[i]*lxyz[i]+Gyyz[i]*lyyz[i]+Gzyz[i]*lyzz[i])
|
|
+(Gxyz[i]*lyzx[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lyzz[i]))
|
|
+ uyz*(TWO*(Gxyz[i]*lxzz[i]+Gyyz[i]*lyzz[i]+Gzyz[i]*lzzz[i]
|
|
+Gxzz[i]*lxyz[i]+Gyzz[i]*lyyz[i]+Gzzz[i]*lyzz[i])
|
|
+(Gxzz[i]*lyzx[i]+Gyzz[i]*lyzy[i]+Gzzz[i]*lyzz[i])
|
|
+(Gxyz[i]*lzzx[i]+Gyyz[i]*lzzy[i]+Gzyz[i]*lzzz[i]))
|
|
+ uzz*(TWO*(Gxzz[i]*lxzz[i]+Gyzz[i]*lyzz[i]+Gzzz[i]*lzzz[i])
|
|
+(Gxzz[i]*lzzx[i]+Gyzz[i]*lzzy[i]+Gzzz[i]*lzzz[i]));
|
|
|
|
/* Rxy: off-diagonal form; H scales the old value and the linear terms only, the gup-weighted terms are added unscaled. */
Rxy[i] = H*(
|
|
-Rxy[i]
|
|
+gxx[i]*Gamxy[i]+gxy[i]*Gamyy_d[i]+gxz[i]*Gamzy[i]
|
|
+gxy[i]*Gamxx[i]+gyy[i]*Gamyx[i]+gyz[i]*Gamzx[i]
|
|
+Gamxa[i]*lxyx[i]+Gamya[i]*lyyx[i]+Gamza[i]*lyzx[i]
|
|
+Gamxa[i]*lxxy[i]+Gamya[i]*lxyy[i]+Gamza[i]*lxzy[i])
|
|
+uxx*(Gxxx[i]*lxxy[i]+Gyxx[i]*lxyy[i]+Gzxx[i]*lxzy[i]
|
|
+Gxxy[i]*lxxx[i]+Gyxy[i]*lxyx[i]+Gzxy[i]*lxzx[i]
|
|
+Gxxx[i]*lxyx[i]+Gyxx[i]*lxyy[i]+Gzxx[i]*lxyz[i])
|
|
+uxy*(Gxxx[i]*lxyy[i]+Gyxx[i]*lyyy[i]+Gzxx[i]*lyzy[i]
|
|
+Gxxy[i]*lxyx[i]+Gyxy[i]*lyyx[i]+Gzxy[i]*lyzx[i]
|
|
+Gxxy[i]*lxyx[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxyz[i]
|
|
+Gxxy[i]*lxxy[i]+Gyxy[i]*lxyy[i]+Gzxy[i]*lxzy[i]
|
|
+Gxyy[i]*lxxx[i]+Gyyy[i]*lxyx[i]+Gzyy[i]*lxzx[i]
|
|
+Gxxx[i]*lyyx[i]+Gyxx[i]*lyyy[i]+Gzxx[i]*lyyz[i])
|
|
+uxz*(Gxxx[i]*lxzy[i]+Gyxx[i]*lyzy[i]+Gzxx[i]*lzzy[i]
|
|
+Gxxy[i]*lxzx[i]+Gyxy[i]*lyzx[i]+Gzxy[i]*lzzx[i]
|
|
+Gxxz[i]*lxyx[i]+Gyxz[i]*lxyy[i]+Gzxz[i]*lxyz[i]
|
|
+Gxxz[i]*lxxy[i]+Gyxz[i]*lxyy[i]+Gzxz[i]*lxzy[i]
|
|
+Gxyz[i]*lxxx[i]+Gyyz[i]*lxyx[i]+Gzyz[i]*lxzx[i]
|
|
+Gxxx[i]*lyzx[i]+Gyxx[i]*lyzy[i]+Gzxx[i]*lyzz[i])
|
|
+uyy*(Gxxy[i]*lxyy[i]+Gyxy[i]*lyyy[i]+Gzxy[i]*lyzy[i]
|
|
+Gxyy[i]*lxyx[i]+Gyyy[i]*lyyx[i]+Gzyy[i]*lyzx[i]
|
|
+Gxxy[i]*lyyx[i]+Gyxy[i]*lyyy[i]+Gzxy[i]*lyyz[i])
|
|
+uyz*(Gxxy[i]*lxzy[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lzzy[i]
|
|
+Gxyy[i]*lxzx[i]+Gyyy[i]*lyzx[i]+Gzyy[i]*lzzx[i]
|
|
+Gxxz[i]*lyyx[i]+Gyxz[i]*lyyy[i]+Gzxz[i]*lyyz[i]
|
|
+Gxxz[i]*lxyy[i]+Gyxz[i]*lyyy[i]+Gzxz[i]*lyzy[i]
|
|
+Gxyz[i]*lxyx[i]+Gyyz[i]*lyyx[i]+Gzyz[i]*lyzx[i]
|
|
+Gxxy[i]*lyzx[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lyzz[i])
|
|
+uzz*(Gxxz[i]*lxzy[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lzzy[i]
|
|
+Gxyz[i]*lxzx[i]+Gyyz[i]*lyzx[i]+Gzyz[i]*lzzx[i]
|
|
+Gxxz[i]*lyzx[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lyzz[i]);
|
|
|
|
/* Rxz: off-diagonal form, x/z index pair. */
Rxz[i] = H*(
|
|
-Rxz[i]
|
|
+gxx[i]*Gamxz[i]+gxy[i]*Gamyz_d[i]+gxz[i]*Gamzz_d[i]
|
|
+gxz[i]*Gamxx[i]+gyz[i]*Gamyx[i]+gzz[i]*Gamzx[i]
|
|
+Gamxa[i]*lxzx[i]+Gamya[i]*lyzx[i]+Gamza[i]*lzzx[i]
|
|
+Gamxa[i]*lxxz[i]+Gamya[i]*lxyz[i]+Gamza[i]*lxzz[i])
|
|
+uxx*(Gxxx[i]*lxxz[i]+Gyxx[i]*lxyz[i]+Gzxx[i]*lxzz[i]
|
|
+Gxxz[i]*lxxx[i]+Gyxz[i]*lxyx[i]+Gzxz[i]*lxzx[i]
|
|
+Gxxx[i]*lxzx[i]+Gyxx[i]*lxzy[i]+Gzxx[i]*lxzz[i])
|
|
+uxy*(Gxxx[i]*lxyz[i]+Gyxx[i]*lyyz[i]+Gzxx[i]*lyzz[i]
|
|
+Gxxz[i]*lxyx[i]+Gyxz[i]*lyyx[i]+Gzxz[i]*lyzx[i]
|
|
+Gxxy[i]*lxzx[i]+Gyxy[i]*lxzy[i]+Gzxy[i]*lxzz[i]
|
|
+Gxxy[i]*lxxz[i]+Gyxy[i]*lxyz[i]+Gzxy[i]*lxzz[i]
|
|
+Gxyz[i]*lxxx[i]+Gyyz[i]*lxyx[i]+Gzyz[i]*lxzx[i]
|
|
+Gxxx[i]*lyzx[i]+Gyxx[i]*lyzy[i]+Gzxx[i]*lyzz[i])
|
|
+uxz*(Gxxx[i]*lxzz[i]+Gyxx[i]*lyzz[i]+Gzxx[i]*lzzz[i]
|
|
+Gxxz[i]*lxzx[i]+Gyxz[i]*lyzx[i]+Gzxz[i]*lzzx[i]
|
|
+Gxxz[i]*lxzx[i]+Gyxz[i]*lxzy[i]+Gzxz[i]*lxzz[i]
|
|
+Gxxz[i]*lxxz[i]+Gyxz[i]*lxyz[i]+Gzxz[i]*lxzz[i]
|
|
+Gxzz[i]*lxxx[i]+Gyzz[i]*lxyx[i]+Gzzz[i]*lxzx[i]
|
|
+Gxxx[i]*lzzx[i]+Gyxx[i]*lzzy[i]+Gzxx[i]*lzzz[i])
|
|
+uyy*(Gxxy[i]*lxyz[i]+Gyxy[i]*lyyz[i]+Gzxy[i]*lyzz[i]
|
|
+Gxyz[i]*lxyx[i]+Gyyz[i]*lyyx[i]+Gzyz[i]*lyzx[i]
|
|
+Gxxy[i]*lyzx[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lyzz[i])
|
|
+uyz*(Gxxy[i]*lxzz[i]+Gyxy[i]*lyzz[i]+Gzxy[i]*lzzz[i]
|
|
+Gxyz[i]*lxzx[i]+Gyyz[i]*lyzx[i]+Gzyz[i]*lzzx[i]
|
|
+Gxxz[i]*lyzx[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lyzz[i]
|
|
+Gxxz[i]*lxyz[i]+Gyxz[i]*lyyz[i]+Gzxz[i]*lyzz[i]
|
|
+Gxzz[i]*lxyx[i]+Gyzz[i]*lyyx[i]+Gzzz[i]*lyzx[i]
|
|
+Gxxy[i]*lzzx[i]+Gyxy[i]*lzzy[i]+Gzxy[i]*lzzz[i])
|
|
+uzz*(Gxxz[i]*lxzz[i]+Gyxz[i]*lyzz[i]+Gzxz[i]*lzzz[i]
|
|
+Gxzz[i]*lxzx[i]+Gyzz[i]*lyzx[i]+Gzzz[i]*lzzx[i]
|
|
+Gxxz[i]*lzzx[i]+Gyxz[i]*lzzy[i]+Gzxz[i]*lzzz[i]);
|
|
|
|
/* Ryz: off-diagonal form, y/z index pair. */
Ryz[i] = H*(
|
|
-Ryz[i]
|
|
+gxy[i]*Gamxz[i]+gyy[i]*Gamyz_d[i]+gyz[i]*Gamzz_d[i]
|
|
+gxz[i]*Gamxy[i]+gyz[i]*Gamyy_d[i]+gzz[i]*Gamzy[i]
|
|
+Gamxa[i]*lxzy[i]+Gamya[i]*lyzy[i]+Gamza[i]*lzzy[i]
|
|
+Gamxa[i]*lxyz[i]+Gamya[i]*lyyz[i]+Gamza[i]*lyzz[i])
|
|
+uxx*(Gxxy[i]*lxxz[i]+Gyxy[i]*lxyz[i]+Gzxy[i]*lxzz[i]
|
|
+Gxxz[i]*lxxy[i]+Gyxz[i]*lxyy[i]+Gzxz[i]*lxzy[i]
|
|
+Gxxy[i]*lxzx[i]+Gyxy[i]*lxzy[i]+Gzxy[i]*lxzz[i])
|
|
+uxy*(Gxxy[i]*lxyz[i]+Gyxy[i]*lyyz[i]+Gzxy[i]*lyzz[i]
|
|
+Gxxz[i]*lxyy[i]+Gyxz[i]*lyyy[i]+Gzxz[i]*lyzy[i]
|
|
+Gxyy[i]*lxzx[i]+Gyyy[i]*lxzy[i]+Gzyy[i]*lxzz[i]
|
|
+Gxyy[i]*lxxz[i]+Gyyy[i]*lxyz[i]+Gzyy[i]*lxzz[i]
|
|
+Gxyz[i]*lxxy[i]+Gyyz[i]*lxyy[i]+Gzyz[i]*lxzy[i]
|
|
+Gxxy[i]*lyzx[i]+Gyxy[i]*lyzy[i]+Gzxy[i]*lyzz[i])
|
|
+uxz*(Gxxy[i]*lxzz[i]+Gyxy[i]*lyzz[i]+Gzxy[i]*lzzz[i]
|
|
+Gxxz[i]*lxzy[i]+Gyxz[i]*lyzy[i]+Gzxz[i]*lzzy[i]
|
|
+Gxyz[i]*lxzx[i]+Gyyz[i]*lxzy[i]+Gzyz[i]*lxzz[i]
|
|
+Gxyz[i]*lxxz[i]+Gyyz[i]*lxyz[i]+Gzyz[i]*lxzz[i]
|
|
+Gxzz[i]*lxxy[i]+Gyzz[i]*lxyy[i]+Gzzz[i]*lxzy[i]
|
|
+Gxxy[i]*lzzx[i]+Gyxy[i]*lzzy[i]+Gzxy[i]*lzzz[i])
|
|
+uyy*(Gxyy[i]*lxyz[i]+Gyyy[i]*lyyz[i]+Gzyy[i]*lyzz[i]
|
|
+Gxyz[i]*lxyy[i]+Gyyz[i]*lyyy[i]+Gzyz[i]*lyzy[i]
|
|
+Gxyy[i]*lyzx[i]+Gyyy[i]*lyzy[i]+Gzyy[i]*lyzz[i])
|
|
+uyz*(Gxyy[i]*lxzz[i]+Gyyy[i]*lyzz[i]+Gzyy[i]*lzzz[i]
|
|
+Gxyz[i]*lxzy[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lzzy[i]
|
|
+Gxyz[i]*lyzx[i]+Gyyz[i]*lyzy[i]+Gzyz[i]*lyzz[i]
|
|
+Gxyz[i]*lxyz[i]+Gyyz[i]*lyyz[i]+Gzyz[i]*lyzz[i]
|
|
+Gxzz[i]*lxyy[i]+Gyzz[i]*lyyy[i]+Gzzz[i]*lyzy[i]
|
|
+Gxyy[i]*lzzx[i]+Gyyy[i]*lzzy[i]+Gzyy[i]*lzzz[i])
|
|
+uzz*(Gxyz[i]*lxzz[i]+Gyyz[i]*lyzz[i]+Gzyz[i]*lzzz[i]
|
|
+Gxzz[i]*lxzy[i]+Gyzz[i]*lyzy[i]+Gzzz[i]*lzzy[i]
|
|
+Gxyz[i]*lzzx[i]+Gyyz[i]*lzzy[i]+Gzyz[i]*lzzz[i]);
|
|
}
|
|
}
|
|
|
|
/* Phase 12+13 (fused): chi second derivatives + chi correction to the Ricci tensor.
 *
 * For every grid point the kernel
 *   1. evaluates the six second derivatives of chi (cxx..czz) in registers,
 *   2. subtracts Christoffel * chi_deriv (G??? contracted with chix/chiy/chiz),
 *   3. forms the scalar f_val by contracting the result with gup??, and
 *   4. adds the chi contribution to Rxx..Rzz in place.
 *
 * Derivative evaluation:
 *   - ghost_width != 3: delegated to fd_compute_second6().
 *   - ghost_width == 3: inline stencils read through fetch_sym_ord2_direct().
 *     A 5-point-per-axis stencil (d_gp.Fd?d? prefactors) is used when the
 *     +/-2 neighbours lie inside [iminF..imaxF] x [jminF..jmaxF] x
 *     [kminF..kmaxF]; otherwise a 3-point-per-axis stencil (d_gp.Sd?d?
 *     prefactors) is used when the +/-1 neighbours do; otherwise the
 *     derivatives stay 0.
 *   - Points in the last plane of any axis (i0 > nx-2, ...) are never
 *     differentiated; their second derivatives stay 0 but steps 2-4 still run.
 *
 * Launch: 1D grid of 1D blocks. The kernel uses a grid-stride loop, so any
 * grid size is valid; with a grid that covers d_gp.all points each thread
 * executes exactly one iteration. No shared memory, no synchronization.
 *
 * Parameters:
 *   chi            field whose second derivatives are taken (stencil reads)
 *   chin1          divisor of the correction terms
 *                  (NOTE(review): presumably chi + 1 - confirm with caller)
 *   chix/y/z       NOTE(review): presumably first derivatives of chi
 *   gxx..gzz       multiplied with f_val in the final update
 *   gupxx..gupzz   contraction weights for f_val
 *   Gxxx..Gzzz     coefficients of the term subtracted from the second
 *                  derivatives (presumably Christoffel symbols - TODO confirm)
 *   Rxx..Rzz       in/out, incremented in place
 *
 * d_gp, fd_compute_second6 and fetch_sym_ord2_direct are defined elsewhere
 * in this file.
 */
__global__ __launch_bounds__(128, 4)
void kern_phase12_13_chi_correction_fused(
    const double* __restrict__ chi,
    const double* __restrict__ chin1,
    const double* __restrict__ chix, const double* __restrict__ chiy,
    const double* __restrict__ chiz,
    const double* __restrict__ gxx, const double* __restrict__ gxy,
    const double* __restrict__ gxz, const double* __restrict__ gyy,
    const double* __restrict__ gyz, const double* __restrict__ gzz,
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
    const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
    const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
    const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
    const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
    const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
    const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
    const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
    const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
    double* __restrict__ Rxx, double* __restrict__ Rxy,
    double* __restrict__ Rxz, double* __restrict__ Ryy,
    double* __restrict__ Ryz, double* __restrict__ Rzz)
{
    const double TWO = 2.0;
    const double F3o2 = 1.5;
    const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
    const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
    const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF;
    const int stride = blockDim.x * gridDim.x;

    /* Grid-stride loop: correct for any grid size, identical to the former
     * one-point-per-thread behaviour when the grid covers d_gp.all. */
    for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < d_gp.all; tid += stride) {
        /* Decompose the flat index into 0-based (i0, j0, k0), x fastest. */
        const int i0 = tid % nx;
        const int j0 = (tid / nx) % ny;
        const int k0 = tid / (nx * ny);

        /* Second derivatives of chi; remain 0 where no stencil applies. */
        double cxx = 0.0, cxy = 0.0, cxz = 0.0;
        double cyy = 0.0, cyz = 0.0, czz = 0.0;

        if (!(i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2)) {
            /* 1-based indices, as expected by the stencil helpers. */
            const int iF = i0 + 1;
            const int jF = j0 + 1;
            const int kF = k0 + 1;

#if ghost_width != 3
            fd_compute_second6(chi, iF, jF, kF,
                               iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
                               1, 1, 1,
                               cxx, cxy, cxz, cyy, cyz, czz);
#else
            /* chi at offset (di, dj, dk) from the current point. */
            auto at = [&](int di, int dj, int dk) -> double {
                return fetch_sym_ord2_direct(chi, iF + di, jF + dj, kF + dk, 1, 1, 1);
            };

            if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
                (jF + 2) <= jmaxF && (jF - 2) >= jminF &&
                (kF + 2) <= kmaxF && (kF - 2) >= kminF)
            {
                /* Wide stencil: weights (-1, 16, -30, 16, -1) for pure
                 * derivatives, nested (1, -8, 8, -1) for mixed ones. */
                const double c = at(0, 0, 0);

                cxx = d_gp.Fdxdx * (
                      -at(-2, 0, 0) + 16.0 * at(-1, 0, 0) - 30.0 * c
                      + 16.0 * at(1, 0, 0) - at(2, 0, 0));
                cyy = d_gp.Fdydy * (
                      -at(0, -2, 0) + 16.0 * at(0, -1, 0) - 30.0 * c
                      + 16.0 * at(0, 1, 0) - at(0, 2, 0));
                czz = d_gp.Fdzdz * (
                      -at(0, 0, -2) + 16.0 * at(0, 0, -1) - 30.0 * c
                      + 16.0 * at(0, 0, 1) - at(0, 0, 2));

                /* Unscaled x-difference evaluated on the row shifted by (dj, dk). */
                auto dx_at = [&](int dj, int dk) -> double {
                    return at(-2, dj, dk) - 8.0 * at(-1, dj, dk)
                         + 8.0 * at(1, dj, dk) - at(2, dj, dk);
                };
                /* Unscaled y-difference evaluated on the column shifted by dk. */
                auto dy_at = [&](int dk) -> double {
                    return at(0, -2, dk) - 8.0 * at(0, -1, dk)
                         + 8.0 * at(0, 1, dk) - at(0, 2, dk);
                };

                cxy = d_gp.Fdxdy * (dx_at(-2, 0) - 8.0 * dx_at(-1, 0)
                                    + 8.0 * dx_at(1, 0) - dx_at(2, 0));
                cxz = d_gp.Fdxdz * (dx_at(0, -2) - 8.0 * dx_at(0, -1)
                                    + 8.0 * dx_at(0, 1) - dx_at(0, 2));
                cyz = d_gp.Fdydz * (dy_at(-2) - 8.0 * dy_at(-1)
                                    + 8.0 * dy_at(1) - dy_at(2));
            }
            else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
                     (jF + 1) <= jmaxF && (jF - 1) >= jminF &&
                     (kF + 1) <= kmaxF && (kF - 1) >= kminF)
            {
                /* Narrow fallback stencil using only the +/-1 neighbours. */
                const double c = at(0, 0, 0);

                cxx = d_gp.Sdxdx * (at(-1, 0, 0) - 2.0 * c + at(1, 0, 0));
                cyy = d_gp.Sdydy * (at(0, -1, 0) - 2.0 * c + at(0, 1, 0));
                czz = d_gp.Sdzdz * (at(0, 0, -1) - 2.0 * c + at(0, 0, 1));

                cxy = d_gp.Sdxdy * (at(-1, -1, 0) - at(1, -1, 0)
                                    - at(-1, 1, 0) + at(1, 1, 0));
                cxz = d_gp.Sdxdz * (at(-1, 0, -1) - at(1, 0, -1)
                                    - at(-1, 0, 1) + at(1, 0, 1));
                cyz = d_gp.Sdydz * (at(0, -1, -1) - at(0, 1, -1)
                                    - at(0, -1, 1) + at(0, 1, 1));
            }
#endif
        }

        const double cx = chix[tid];
        const double cy = chiy[tid];
        const double cz = chiz[tid];
        const double c1 = chin1[tid];

        /* Subtract Christoffel * chi_deriv from the raw second derivatives. */
        cxx -= Gxxx[tid] * cx + Gyxx[tid] * cy + Gzxx[tid] * cz;
        cxy -= Gxxy[tid] * cx + Gyxy[tid] * cy + Gzxy[tid] * cz;
        cxz -= Gxxz[tid] * cx + Gyxz[tid] * cy + Gzxz[tid] * cz;
        cyy -= Gxyy[tid] * cx + Gyyy[tid] * cy + Gzyy[tid] * cz;
        cyz -= Gxyz[tid] * cx + Gyyz[tid] * cy + Gzyz[tid] * cz;
        czz -= Gxzz[tid] * cx + Gyzz[tid] * cy + Gzzz[tid] * cz;

        const double uxx = gupxx[tid], uxy = gupxy[tid], uxz = gupxz[tid];
        const double uyy = gupyy[tid], uyz = gupyz[tid], uzz = gupzz[tid];

        /* 1.5 / c1 is shared by all six terms of f_val; evaluation order of
         * each product is the same as writing F3o2 / c1 * a * b inline. */
        const double k32 = F3o2 / c1;
        const double f_val = uxx * (cxx - k32 * cx * cx)
                           + uyy * (cyy - k32 * cy * cy)
                           + uzz * (czz - k32 * cz * cz)
                           + TWO * uxy * (cxy - k32 * cx * cy)
                           + TWO * uxz * (cxz - k32 * cx * cz)
                           + TWO * uyz * (cyz - k32 * cy * cz);

        /* Add the chi contribution to the Ricci components in place. */
        const double inv2c = 1.0 / (c1 * TWO);
        Rxx[tid] += (cxx - cx * cx * inv2c + gxx[tid] * f_val) * inv2c;
        Ryy[tid] += (cyy - cy * cy * inv2c + gyy[tid] * f_val) * inv2c;
        Rzz[tid] += (czz - cz * cz * inv2c + gzz[tid] * f_val) * inv2c;
        Rxy[tid] += (cxy - cx * cy * inv2c + gxy[tid] * f_val) * inv2c;
        Rxz[tid] += (cxz - cx * cz * inv2c + gxz[tid] * f_val) * inv2c;
        Ryz[tid] += (cyz - cy * cz * inv2c + gyz[tid] * f_val) * inv2c;
    }
}
|
|
|
|
/* Phase 13 (unfused): chi correction to the Ricci tensor.
 *
 * For every grid point:
 *   1. subtract Christoffel * chi_deriv (G??? contracted with chix/chiy/chiz)
 *      from fxx..fzz and store the result back to fxx..fzz,
 *   2. form the scalar f_val by contracting the corrected values with gup??,
 *   3. add the chi contribution to Rxx..Rzz in place.
 *
 * Launch: 1D grid of 1D blocks, grid-stride loop over d_gp.all points, so any
 * grid size is valid. No shared memory, no synchronization; every array is
 * accessed only at the thread's own index.
 *
 * Parameters:
 *   chin1          divisor of the correction terms
 *                  (NOTE(review): presumably chi + 1 - confirm with caller)
 *   chix/y/z       NOTE(review): presumably first derivatives of chi
 *   gxx..gzz       multiplied with f_val in the final update
 *   gupxx..gupzz   contraction weights for f_val
 *   Gxxx..Gzzz     coefficients of the subtracted term
 *                  (presumably Christoffel symbols - TODO confirm)
 *   fxx..fzz       in/out; on entry presumably the second derivatives of chi
 *                  (TODO confirm), on exit the same values minus the
 *                  Christoffel term
 *   Rxx..Rzz       in/out, incremented in place
 *
 * d_gp is defined elsewhere in this file.
 */
__global__ __launch_bounds__(128, 4)
void kern_phase13_chi_correction(
    const double* __restrict__ chin1,
    const double* __restrict__ chix, const double* __restrict__ chiy,
    const double* __restrict__ chiz,
    const double* __restrict__ gxx, const double* __restrict__ gxy,
    const double* __restrict__ gxz, const double* __restrict__ gyy,
    const double* __restrict__ gyz, const double* __restrict__ gzz,
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
    const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
    const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
    const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
    const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
    const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
    const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
    const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
    const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
    double* __restrict__ fxx, double* __restrict__ fxy,
    double* __restrict__ fxz, double* __restrict__ fyy,
    double* __restrict__ fyz, double* __restrict__ fzz,
    double* __restrict__ Rxx, double* __restrict__ Rxy,
    double* __restrict__ Rxz, double* __restrict__ Ryy,
    double* __restrict__ Ryz, double* __restrict__ Rzz)
{
    const double TWO = 2.0, F3o2 = 1.5;
    const int stride = blockDim.x * gridDim.x;

    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < d_gp.all; i += stride) {
        const double cx = chix[i], cy = chiy[i], cz = chiz[i], c1 = chin1[i];

        /* Subtract Christoffel * chi_deriv. The corrected values are kept in
         * registers for the rest of the iteration and stored once, instead of
         * being re-read from global memory after each write. */
        const double cxx = fxx[i] - (Gxxx[i] * cx + Gyxx[i] * cy + Gzxx[i] * cz);
        const double cxy = fxy[i] - (Gxxy[i] * cx + Gyxy[i] * cy + Gzxy[i] * cz);
        const double cxz = fxz[i] - (Gxxz[i] * cx + Gyxz[i] * cy + Gzxz[i] * cz);
        const double cyy = fyy[i] - (Gxyy[i] * cx + Gyyy[i] * cy + Gzyy[i] * cz);
        const double cyz = fyz[i] - (Gxyz[i] * cx + Gyyz[i] * cy + Gzyz[i] * cz);
        const double czz = fzz[i] - (Gxzz[i] * cx + Gyzz[i] * cy + Gzzz[i] * cz);

        /* fxx..fzz are outputs too: publish the corrected values. */
        fxx[i] = cxx;
        fxy[i] = cxy;
        fxz[i] = cxz;
        fyy[i] = cyy;
        fyz[i] = cyz;
        fzz[i] = czz;

        const double uxx = gupxx[i], uxy = gupxy[i], uxz = gupxz[i];
        const double uyy = gupyy[i], uyz = gupyz[i], uzz = gupzz[i];

        /* 1.5 / c1 is shared by all six terms of f_val; evaluation order of
         * each product is the same as writing F3o2 / c1 * a * b inline. */
        const double k32 = F3o2 / c1;
        const double f_val = uxx * (cxx - k32 * cx * cx)
                           + uyy * (cyy - k32 * cy * cy)
                           + uzz * (czz - k32 * cz * cz)
                           + TWO * uxy * (cxy - k32 * cx * cy)
                           + TWO * uxz * (cxz - k32 * cx * cz)
                           + TWO * uyz * (cyz - k32 * cy * cz);

        /* Add the chi contribution to the Ricci components in place. */
        const double inv2c = 1.0 / (c1 * TWO);
        Rxx[i] += (cxx - cx * cx * inv2c + gxx[i] * f_val) * inv2c;
        Ryy[i] += (cyy - cy * cy * inv2c + gyy[i] * f_val) * inv2c;
        Rzz[i] += (czz - cz * cz * inv2c + gzz[i] * f_val) * inv2c;
        Rxy[i] += (cxy - cx * cy * inv2c + gxy[i] * f_val) * inv2c;
        Rxz[i] += (cxz - cx * cz * inv2c + gxz[i] * f_val) * inv2c;
        Ryz[i] += (cyz - cy * cz * inv2c + gyz[i] * f_val) * inv2c;
    }
}
|
|
|
|
/* Phase 15: trK_rhs, Aij_rhs, gauge.
|
|
* Also updates Christoffel with physical chi correction and computes Lap second derivatives on the fly.
|
|
*/
|
|
__global__ __launch_bounds__(128, 4)
|
|
void kern_phase15_trK_Aij_gauge(
|
|
const double* __restrict__ alpn1, const double* __restrict__ chin1,
|
|
const double* __restrict__ chix, const double* __restrict__ chiy,
|
|
const double* __restrict__ chiz,
|
|
const double* __restrict__ gxx, const double* __restrict__ gxy,
|
|
const double* __restrict__ gxz, const double* __restrict__ gyy,
|
|
const double* __restrict__ gyz, const double* __restrict__ gzz,
|
|
const double* __restrict__ gupxx, const double* __restrict__ gupxy,
|
|
const double* __restrict__ gupxz, const double* __restrict__ gupyy,
|
|
const double* __restrict__ gupyz, const double* __restrict__ gupzz,
|
|
const double* __restrict__ trK,
|
|
const double* __restrict__ Axx, const double* __restrict__ Axy,
|
|
const double* __restrict__ Axz, const double* __restrict__ Ayy,
|
|
const double* __restrict__ Ayz, const double* __restrict__ Azz,
|
|
const double* __restrict__ Lapx, const double* __restrict__ Lapy,
|
|
const double* __restrict__ Lapz,
|
|
const double* __restrict__ betaxx, const double* __restrict__ betaxy,
|
|
const double* __restrict__ betaxz, const double* __restrict__ betayx,
|
|
const double* __restrict__ betayy, const double* __restrict__ betayz,
|
|
const double* __restrict__ betazx, const double* __restrict__ betazy,
|
|
const double* __restrict__ betazz,
|
|
const double* __restrict__ rho,
|
|
const double* __restrict__ Sx_m, const double* __restrict__ Sy_m,
|
|
const double* __restrict__ Sz_m,
|
|
const double* __restrict__ Sxx_m, const double* __restrict__ Sxy_m,
|
|
const double* __restrict__ Sxz_m, const double* __restrict__ Syy_m,
|
|
const double* __restrict__ Syz_m, const double* __restrict__ Szz_m,
|
|
const double* __restrict__ dtSfx, const double* __restrict__ dtSfy,
|
|
const double* __restrict__ dtSfz,
|
|
const double* __restrict__ Rxx, const double* __restrict__ Rxy,
|
|
const double* __restrict__ Rxz, const double* __restrict__ Ryy,
|
|
const double* __restrict__ Ryz, const double* __restrict__ Rzz,
|
|
double* __restrict__ Gxxx, double* __restrict__ Gxxy,
|
|
double* __restrict__ Gxxz, double* __restrict__ Gxyy,
|
|
double* __restrict__ Gxyz_o, double* __restrict__ Gxzz,
|
|
double* __restrict__ Gyxx, double* __restrict__ Gyxy,
|
|
double* __restrict__ Gyxz, double* __restrict__ Gyyy,
|
|
double* __restrict__ Gyyz, double* __restrict__ Gyzz,
|
|
double* __restrict__ Gzxx, double* __restrict__ Gzxy,
|
|
double* __restrict__ Gzxz, double* __restrict__ Gzyy,
|
|
double* __restrict__ Gzyz, double* __restrict__ Gzzz,
|
|
double* __restrict__ dtSfx_rhs, double* __restrict__ dtSfy_rhs,
|
|
double* __restrict__ dtSfz_rhs,
|
|
double* __restrict__ trK_rhs,
|
|
double* __restrict__ Axx_rhs, double* __restrict__ Axy_rhs,
|
|
double* __restrict__ Axz_rhs, double* __restrict__ Ayy_rhs,
|
|
double* __restrict__ Ayz_rhs, double* __restrict__ Azz_rhs,
|
|
double* __restrict__ Lap_rhs,
|
|
double* __restrict__ betax_rhs, double* __restrict__ betay_rhs,
|
|
double* __restrict__ betaz_rhs,
|
|
double* __restrict__ Gamx_rhs, double* __restrict__ Gamy_rhs,
|
|
double* __restrict__ Gamz_rhs,
|
|
double* __restrict__ f_arr, double* __restrict__ S_arr)
|
|
{
|
|
const double TWO=2.0, FOUR=4.0, EIGHT=8.0, H=0.5;
|
|
const double F1o3=1.0/3.0, F2o3=2.0/3.0, F3o2=1.5;
|
|
const double PI_V=3.14159265358979323846;
|
|
const double F16=16.0, F8=8.0;
|
|
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < d_gp.all; i += blockDim.x*gridDim.x) {
|
|
const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
|
|
const int i0 = i % nx;
|
|
const int j0 = (i / nx) % ny;
|
|
const int k0 = i / (nx * ny);
|
|
const int iF = i0 + 1;
|
|
const int jF = j0 + 1;
|
|
const int kF = k0 + 1;
|
|
const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
|
|
const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF;
|
|
double uxx=gupxx[i],uxy=gupxy[i],uxz=gupxz[i];
|
|
double uyy=gupyy[i],uyz=gupyz[i],uzz=gupzz[i];
|
|
double a=alpn1[i], c1=chin1[i];
|
|
double cx=chix[i],cy=chiy[i],cz=chiz[i];
|
|
double lx=Lapx[i],ly=Lapy[i],lz=Lapz[i];
|
|
double fxx_v = 0.0, fxy_v = 0.0, fxz_v = 0.0;
|
|
double fyy_v = 0.0, fyz_v = 0.0, fzz_v = 0.0;
|
|
|
|
if (!(i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2)) {
|
|
#if ghost_width != 3
|
|
fd_compute_second6(alpn1, iF, jF, kF,
|
|
iminF, jminF, kminF, imaxF, jmaxF, kmaxF,
|
|
1, 1, 1,
|
|
fxx_v, fxy_v, fxz_v, fyy_v, fyz_v, fzz_v);
|
|
#else
|
|
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
|
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
|
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
|
{
|
|
const double c = fetch_sym_ord2_direct(alpn1, iF, jF, kF, 1, 1, 1);
|
|
fxx_v = d_gp.Fdxdx * (
|
|
-fetch_sym_ord2_direct(alpn1, iF - 2, jF, kF, 1, 1, 1)
|
|
+16.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF, 1, 1, 1)
|
|
-30.0 * c
|
|
+16.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF, 1, 1, 1)
|
|
- fetch_sym_ord2_direct(alpn1, iF + 2, jF, kF, 1, 1, 1));
|
|
fyy_v = d_gp.Fdydy * (
|
|
-fetch_sym_ord2_direct(alpn1, iF, jF - 2, kF, 1, 1, 1)
|
|
+16.0 * fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF, 1, 1, 1)
|
|
-30.0 * c
|
|
+16.0 * fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF, 1, 1, 1)
|
|
- fetch_sym_ord2_direct(alpn1, iF, jF + 2, kF, 1, 1, 1));
|
|
fzz_v = d_gp.Fdzdz * (
|
|
-fetch_sym_ord2_direct(alpn1, iF, jF, kF - 2, 1, 1, 1)
|
|
+16.0 * fetch_sym_ord2_direct(alpn1, iF, jF, kF - 1, 1, 1, 1)
|
|
-30.0 * c
|
|
+16.0 * fetch_sym_ord2_direct(alpn1, iF, jF, kF + 1, 1, 1, 1)
|
|
- fetch_sym_ord2_direct(alpn1, iF, jF, kF + 2, 1, 1, 1));
|
|
|
|
const double t_jm2 =
|
|
fetch_sym_ord2_direct(alpn1, iF - 2, jF - 2, kF, 1, 1, 1)
|
|
- 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF - 2, kF, 1, 1, 1)
|
|
+ 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF - 2, kF, 1, 1, 1)
|
|
- fetch_sym_ord2_direct(alpn1, iF + 2, jF - 2, kF, 1, 1, 1);
|
|
const double t_jm1 =
|
|
fetch_sym_ord2_direct(alpn1, iF - 2, jF - 1, kF, 1, 1, 1)
|
|
- 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF - 1, kF, 1, 1, 1)
|
|
+ 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF - 1, kF, 1, 1, 1)
|
|
- fetch_sym_ord2_direct(alpn1, iF + 2, jF - 1, kF, 1, 1, 1);
|
|
const double t_jp1 =
|
|
fetch_sym_ord2_direct(alpn1, iF - 2, jF + 1, kF, 1, 1, 1)
|
|
- 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF + 1, kF, 1, 1, 1)
|
|
+ 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF + 1, kF, 1, 1, 1)
|
|
- fetch_sym_ord2_direct(alpn1, iF + 2, jF + 1, kF, 1, 1, 1);
|
|
const double t_jp2 =
|
|
fetch_sym_ord2_direct(alpn1, iF - 2, jF + 2, kF, 1, 1, 1)
|
|
- 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF + 2, kF, 1, 1, 1)
|
|
+ 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF + 2, kF, 1, 1, 1)
|
|
- fetch_sym_ord2_direct(alpn1, iF + 2, jF + 2, kF, 1, 1, 1);
|
|
fxy_v = d_gp.Fdxdy * (t_jm2 - 8.0 * t_jm1 + 8.0 * t_jp1 - t_jp2);
|
|
|
|
const double t_km2_x =
|
|
fetch_sym_ord2_direct(alpn1, iF - 2, jF, kF - 2, 1, 1, 1)
|
|
- 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF - 2, 1, 1, 1)
|
|
+ 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF - 2, 1, 1, 1)
|
|
- fetch_sym_ord2_direct(alpn1, iF + 2, jF, kF - 2, 1, 1, 1);
|
|
const double t_km1_x =
|
|
fetch_sym_ord2_direct(alpn1, iF - 2, jF, kF - 1, 1, 1, 1)
|
|
- 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF - 1, 1, 1, 1)
|
|
+ 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF - 1, 1, 1, 1)
|
|
- fetch_sym_ord2_direct(alpn1, iF + 2, jF, kF - 1, 1, 1, 1);
|
|
const double t_kp1_x =
|
|
fetch_sym_ord2_direct(alpn1, iF - 2, jF, kF + 1, 1, 1, 1)
|
|
- 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF + 1, 1, 1, 1)
|
|
+ 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF + 1, 1, 1, 1)
|
|
- fetch_sym_ord2_direct(alpn1, iF + 2, jF, kF + 1, 1, 1, 1);
|
|
const double t_kp2_x =
|
|
fetch_sym_ord2_direct(alpn1, iF - 2, jF, kF + 2, 1, 1, 1)
|
|
- 8.0 * fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF + 2, 1, 1, 1)
|
|
+ 8.0 * fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF + 2, 1, 1, 1)
|
|
- fetch_sym_ord2_direct(alpn1, iF + 2, jF, kF + 2, 1, 1, 1);
|
|
fxz_v = d_gp.Fdxdz * (t_km2_x - 8.0 * t_km1_x + 8.0 * t_kp1_x - t_kp2_x);
|
|
|
|
const double t_km2_y =
|
|
fetch_sym_ord2_direct(alpn1, iF, jF - 2, kF - 2, 1, 1, 1)
|
|
- 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF - 2, 1, 1, 1)
|
|
+ 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF - 2, 1, 1, 1)
|
|
- fetch_sym_ord2_direct(alpn1, iF, jF + 2, kF - 2, 1, 1, 1);
|
|
const double t_km1_y =
|
|
fetch_sym_ord2_direct(alpn1, iF, jF - 2, kF - 1, 1, 1, 1)
|
|
- 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF - 1, 1, 1, 1)
|
|
+ 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF - 1, 1, 1, 1)
|
|
- fetch_sym_ord2_direct(alpn1, iF, jF + 2, kF - 1, 1, 1, 1);
|
|
const double t_kp1_y =
|
|
fetch_sym_ord2_direct(alpn1, iF, jF - 2, kF + 1, 1, 1, 1)
|
|
- 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF + 1, 1, 1, 1)
|
|
+ 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF + 1, 1, 1, 1)
|
|
- fetch_sym_ord2_direct(alpn1, iF, jF + 2, kF + 1, 1, 1, 1);
|
|
const double t_kp2_y =
|
|
fetch_sym_ord2_direct(alpn1, iF, jF - 2, kF + 2, 1, 1, 1)
|
|
- 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF + 2, 1, 1, 1)
|
|
+ 8.0 * fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF + 2, 1, 1, 1)
|
|
- fetch_sym_ord2_direct(alpn1, iF, jF + 2, kF + 2, 1, 1, 1);
|
|
fyz_v = d_gp.Fdydz * (t_km2_y - 8.0 * t_km1_y + 8.0 * t_kp1_y - t_kp2_y);
|
|
}
|
|
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
|
|
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
|
|
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
|
|
{
|
|
const double c = fetch_sym_ord2_direct(alpn1, iF, jF, kF, 1, 1, 1);
|
|
fxx_v = d_gp.Sdxdx * (
|
|
fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF, 1, 1, 1)
|
|
- 2.0 * c
|
|
+ fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF, 1, 1, 1));
|
|
fyy_v = d_gp.Sdydy * (
|
|
fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF, 1, 1, 1)
|
|
- 2.0 * c
|
|
+ fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF, 1, 1, 1));
|
|
fzz_v = d_gp.Sdzdz * (
|
|
fetch_sym_ord2_direct(alpn1, iF, jF, kF - 1, 1, 1, 1)
|
|
- 2.0 * c
|
|
+ fetch_sym_ord2_direct(alpn1, iF, jF, kF + 1, 1, 1, 1));
|
|
fxy_v = d_gp.Sdxdy * (
|
|
fetch_sym_ord2_direct(alpn1, iF - 1, jF - 1, kF, 1, 1, 1)
|
|
- fetch_sym_ord2_direct(alpn1, iF + 1, jF - 1, kF, 1, 1, 1)
|
|
- fetch_sym_ord2_direct(alpn1, iF - 1, jF + 1, kF, 1, 1, 1)
|
|
+ fetch_sym_ord2_direct(alpn1, iF + 1, jF + 1, kF, 1, 1, 1));
|
|
fxz_v = d_gp.Sdxdz * (
|
|
fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF - 1, 1, 1, 1)
|
|
- fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF - 1, 1, 1, 1)
|
|
- fetch_sym_ord2_direct(alpn1, iF - 1, jF, kF + 1, 1, 1, 1)
|
|
+ fetch_sym_ord2_direct(alpn1, iF + 1, jF, kF + 1, 1, 1, 1));
|
|
fyz_v = d_gp.Sdydz * (
|
|
fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF - 1, 1, 1, 1)
|
|
- fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF - 1, 1, 1, 1)
|
|
- fetch_sym_ord2_direct(alpn1, iF, jF - 1, kF + 1, 1, 1, 1)
|
|
+ fetch_sym_ord2_direct(alpn1, iF, jF + 1, kF + 1, 1, 1, 1));
|
|
}
|
|
#endif
|
|
}
|
|
|
|
/* raised chi/chi */
|
|
double gx=(uxx*cx+uxy*cy+uxz*cz)/c1;
|
|
double gy=(uxy*cx+uyy*cy+uyz*cz)/c1;
|
|
double gz=(uxz*cx+uyz*cy+uzz*cz)/c1;
|
|
|
|
/* Christoffel physical correction */
|
|
Gxxx[i]-=((cx+cx)/c1-gxx[i]*gx)*H;
|
|
Gyxx[i]-=(0.0-gxx[i]*gy)*H;
|
|
Gzxx[i]-=(0.0-gxx[i]*gz)*H;
|
|
Gxyy[i]-=(0.0-gyy[i]*gx)*H;
|
|
Gyyy[i]-=((cy+cy)/c1-gyy[i]*gy)*H;
|
|
Gzyy[i]-=(0.0-gyy[i]*gz)*H;
|
|
Gxzz[i]-=(0.0-gzz[i]*gx)*H;
|
|
Gyzz[i]-=(0.0-gzz[i]*gy)*H;
|
|
Gzzz[i]-=((cz+cz)/c1-gzz[i]*gz)*H;
|
|
Gxxy[i]-=(cy/c1-gxy[i]*gx)*H;
|
|
Gyxy[i]-=(cx/c1-gxy[i]*gy)*H;
|
|
Gzxy[i]-=(0.0-gxy[i]*gz)*H;
|
|
Gxxz[i]-=(cz/c1-gxz[i]*gx)*H;
|
|
Gyxz[i]-=(0.0-gxz[i]*gy)*H;
|
|
Gzxz[i]-=(cx/c1-gxz[i]*gz)*H;
|
|
Gxyz_o[i]-=(0.0-gyz[i]*gx)*H;
|
|
Gyyz[i]-=(cz/c1-gyz[i]*gy)*H;
|
|
Gzyz[i]-=(cy/c1-gyz[i]*gz)*H;
|
|
|
|
/* Lap second-derivative correction: subtract Gamma*Lap_deriv */
|
|
fxx_v -= Gxxx[i]*lx+Gyxx[i]*ly+Gzxx[i]*lz;
|
|
fyy_v -= Gxyy[i]*lx+Gyyy[i]*ly+Gzyy[i]*lz;
|
|
fzz_v -= Gxzz[i]*lx+Gyzz[i]*ly+Gzzz[i]*lz;
|
|
fxy_v -= Gxxy[i]*lx+Gyxy[i]*ly+Gzxy[i]*lz;
|
|
fxz_v -= Gxxz[i]*lx+Gyxz[i]*ly+Gzxz[i]*lz;
|
|
fyz_v -= Gxyz_o[i]*lx+Gyyz[i]*ly+Gzyz[i]*lz;
|
|
|
|
/* D^i D_i alpha */
|
|
double DDA = uxx*fxx_v+uyy*fyy_v+uzz*fzz_v
|
|
+TWO*(uxy*fxy_v+uxz*fxz_v+uyz*fyz_v);
|
|
|
|
/* trace of S_ij (physical) */
|
|
double S_v = c1*(uxx*Sxx_m[i]+uyy*Syy_m[i]+uzz*Szz_m[i]
|
|
+TWO*(uxy*Sxy_m[i]+uxz*Sxz_m[i]+uyz*Syz_m[i]));
|
|
|
|
/* A^ij A_ij */
|
|
double AijAij =
|
|
uxx*(uxx*Axx[i]*Axx[i]+uyy*Axy[i]*Axy[i]+uzz*Axz[i]*Axz[i]
|
|
+TWO*(uxy*Axx[i]*Axy[i]+uxz*Axx[i]*Axz[i]+uyz*Axy[i]*Axz[i]))
|
|
+uyy*(uxx*Axy[i]*Axy[i]+uyy*Ayy[i]*Ayy[i]+uzz*Ayz[i]*Ayz[i]
|
|
+TWO*(uxy*Axy[i]*Ayy[i]+uxz*Axy[i]*Ayz[i]+uyz*Ayy[i]*Ayz[i]))
|
|
+uzz*(uxx*Axz[i]*Axz[i]+uyy*Ayz[i]*Ayz[i]+uzz*Azz[i]*Azz[i]
|
|
+TWO*(uxy*Axz[i]*Ayz[i]+uxz*Axz[i]*Azz[i]+uyz*Ayz[i]*Azz[i]))
|
|
+TWO*(
|
|
uxy*(uxx*Axx[i]*Axy[i]+uyy*Axy[i]*Ayy[i]+uzz*Axz[i]*Ayz[i]
|
|
+uxy*(Axx[i]*Ayy[i]+Axy[i]*Axy[i])
|
|
+uxz*(Axx[i]*Ayz[i]+Axz[i]*Axy[i])
|
|
+uyz*(Axy[i]*Ayz[i]+Axz[i]*Ayy[i]))
|
|
+uxz*(uxx*Axx[i]*Axz[i]+uyy*Axy[i]*Ayz[i]+uzz*Axz[i]*Azz[i]
|
|
+uxy*(Axx[i]*Ayz[i]+Axy[i]*Axz[i])
|
|
+uxz*(Axx[i]*Azz[i]+Axz[i]*Axz[i])
|
|
+uyz*(Axy[i]*Azz[i]+Axz[i]*Ayz[i]))
|
|
+uyz*(uxx*Axy[i]*Axz[i]+uyy*Ayy[i]*Ayz[i]+uzz*Ayz[i]*Azz[i]
|
|
+uxy*(Axy[i]*Ayz[i]+Ayy[i]*Axz[i])
|
|
+uxz*(Axy[i]*Azz[i]+Ayz[i]*Axz[i])
|
|
+uyz*(Ayy[i]*Azz[i]+Ayz[i]*Ayz[i])));
|
|
|
|
double trK_v = trK[i];
|
|
double db = betaxx[i] + betayy[i] + betazz[i];
|
|
|
|
/* trK_rhs step 1: store D^iD_i alpha * chin1 */
|
|
trK_rhs[i] = c1 * DDA;
|
|
|
|
/* f_arr = -(1/3) * (DDA + alpha/chi * (2/3*K^2 - AijAij - 16pi*rho + 8pi*S)) */
|
|
double f_v = F2o3*trK_v*trK_v - AijAij - F16*PI_V*rho[i] + EIGHT*PI_V*S_v;
|
|
f_arr[i] = -F1o3*(uxx*fxx_v+uyy*fyy_v+uzz*fzz_v
|
|
+TWO*(uxy*fxy_v+uxz*fxz_v+uyz*fyz_v)
|
|
+(a/c1)*f_v);
|
|
|
|
/* fij = alpha*(Rij - 8pi*Sij) - D_iD_j alpha */
|
|
double fij_xx=a*(Rxx[i]-EIGHT*PI_V*Sxx_m[i])-fxx_v;
|
|
double fij_xy=a*(Rxy[i]-EIGHT*PI_V*Sxy_m[i])-fxy_v;
|
|
double fij_xz=a*(Rxz[i]-EIGHT*PI_V*Sxz_m[i])-fxz_v;
|
|
double fij_yy=a*(Ryy[i]-EIGHT*PI_V*Syy_m[i])-fyy_v;
|
|
double fij_yz=a*(Ryz[i]-EIGHT*PI_V*Syz_m[i])-fyz_v;
|
|
double fij_zz=a*(Rzz[i]-EIGHT*PI_V*Szz_m[i])-fzz_v;
|
|
|
|
/* Aij_rhs = chi*(fij - gij*f) */
|
|
Axx_rhs[i]=fij_xx-gxx[i]*f_arr[i];
|
|
Ayy_rhs[i]=fij_yy-gyy[i]*f_arr[i];
|
|
Azz_rhs[i]=fij_zz-gzz[i]*f_arr[i];
|
|
Axy_rhs[i]=fij_xy-gxy[i]*f_arr[i];
|
|
Axz_rhs[i]=fij_xz-gxz[i]*f_arr[i];
|
|
Ayz_rhs[i]=fij_yz-gyz[i]*f_arr[i];
|
|
|
|
/* A_il A^l_j */
|
|
double AA_xx=uxx*Axx[i]*Axx[i]+uyy*Axy[i]*Axy[i]+uzz*Axz[i]*Axz[i]
|
|
+TWO*(uxy*Axx[i]*Axy[i]+uxz*Axx[i]*Axz[i]+uyz*Axy[i]*Axz[i]);
|
|
double AA_yy=uxx*Axy[i]*Axy[i]+uyy*Ayy[i]*Ayy[i]+uzz*Ayz[i]*Ayz[i]
|
|
+TWO*(uxy*Axy[i]*Ayy[i]+uxz*Axy[i]*Ayz[i]+uyz*Ayy[i]*Ayz[i]);
|
|
double AA_zz=uxx*Axz[i]*Axz[i]+uyy*Ayz[i]*Ayz[i]+uzz*Azz[i]*Azz[i]
|
|
+TWO*(uxy*Axz[i]*Ayz[i]+uxz*Axz[i]*Azz[i]+uyz*Ayz[i]*Azz[i]);
|
|
double AA_xy=uxx*Axx[i]*Axy[i]+uyy*Axy[i]*Ayy[i]+uzz*Axz[i]*Ayz[i]
|
|
+uxy*(Axx[i]*Ayy[i]+Axy[i]*Axy[i])
|
|
+uxz*(Axx[i]*Ayz[i]+Axz[i]*Axy[i])
|
|
+uyz*(Axy[i]*Ayz[i]+Axz[i]*Ayy[i]);
|
|
double AA_xz=uxx*Axx[i]*Axz[i]+uyy*Axy[i]*Ayz[i]+uzz*Axz[i]*Azz[i]
|
|
+uxy*(Axx[i]*Ayz[i]+Axy[i]*Axz[i])
|
|
+uxz*(Axx[i]*Azz[i]+Axz[i]*Axz[i])
|
|
+uyz*(Axy[i]*Azz[i]+Axz[i]*Ayz[i]);
|
|
double AA_yz=uxx*Axy[i]*Axz[i]+uyy*Ayy[i]*Ayz[i]+uzz*Ayz[i]*Azz[i]
|
|
+uxy*(Axy[i]*Ayz[i]+Ayy[i]*Axz[i])
|
|
+uxz*(Axy[i]*Azz[i]+Ayz[i]*Axz[i])
|
|
+uyz*(Ayy[i]*Azz[i]+Ayz[i]*Ayz[i]);
|
|
|
|
/* trK_rhs final */
|
|
trK_rhs[i] = -trK_rhs[i]
|
|
+ a*(F1o3*trK_v*trK_v
|
|
+uxx*AA_xx+uyy*AA_yy+uzz*AA_zz
|
|
+TWO*(uxy*AA_xy+uxz*AA_xz+uyz*AA_yz)
|
|
+FOUR*PI_V*(rho[i]+S_v));
|
|
|
|
/* Aij_rhs final */
|
|
Axx_rhs[i]=c1*Axx_rhs[i]+a*(trK_v*Axx[i]-TWO*AA_xx)
|
|
+TWO*(Axx[i]*betaxx[i]+Axy[i]*betayx[i]+Axz[i]*betazx[i])-F2o3*Axx[i]*db;
|
|
Ayy_rhs[i]=c1*Ayy_rhs[i]+a*(trK_v*Ayy[i]-TWO*AA_yy)
|
|
+TWO*(Axy[i]*betaxy[i]+Ayy[i]*betayy[i]+Ayz[i]*betazy[i])-F2o3*Ayy[i]*db;
|
|
Azz_rhs[i]=c1*Azz_rhs[i]+a*(trK_v*Azz[i]-TWO*AA_zz)
|
|
+TWO*(Axz[i]*betaxz[i]+Ayz[i]*betayz[i]+Azz[i]*betazz[i])-F2o3*Azz[i]*db;
|
|
Axy_rhs[i]=c1*Axy_rhs[i]+a*(trK_v*Axy[i]-TWO*AA_xy)
|
|
+Axx[i]*betaxy[i]+Axz[i]*betazy[i]+Ayy[i]*betayx[i]
|
|
+Ayz[i]*betazx[i]+F1o3*Axy[i]*db-Axy[i]*betazz[i];
|
|
Ayz_rhs[i]=c1*Ayz_rhs[i]+a*(trK_v*Ayz[i]-TWO*AA_yz)
|
|
+Axy[i]*betaxz[i]+Ayy[i]*betayz[i]+Axz[i]*betaxy[i]
|
|
+Azz[i]*betazy[i]+F1o3*Ayz[i]*db-Ayz[i]*betaxx[i];
|
|
Axz_rhs[i]=c1*Axz_rhs[i]+a*(trK_v*Axz[i]-TWO*AA_xz)
|
|
+Axx[i]*betaxz[i]+Axy[i]*betayz[i]+Ayz[i]*betayx[i]
|
|
+Azz[i]*betazx[i]+F1o3*Axz[i]*db-Axz[i]*betayy[i];
|
|
|
|
/* gauge */
|
|
Lap_rhs[i] = -TWO*a*trK_v;
|
|
betax_rhs[i] = 0.75*dtSfx[i];
|
|
betay_rhs[i] = 0.75*dtSfy[i];
|
|
betaz_rhs[i] = 0.75*dtSfz[i];
|
|
#if (GAUGE == 0)
|
|
dtSfx_rhs[i] = Gamx_rhs[i] - 2.0*dtSfx[i];
|
|
dtSfy_rhs[i] = Gamy_rhs[i] - 2.0*dtSfy[i];
|
|
dtSfz_rhs[i] = Gamz_rhs[i] - 2.0*dtSfz[i];
|
|
#endif
|
|
}
|
|
}
|
|
|
|
/*
 * Phase 18: Hamiltonian and momentum constraint residuals (co==0 only).
 *
 * Launch: 1-D grid, at most 128 threads per block (__launch_bounds__).
 * Each thread starts at its global thread id and advances by the total
 * number of launched threads until it passes d_gp.all, so any grid size
 * covers the whole flattened patch.  No shared memory is used and every
 * array is accessed only at the thread's own flat index.
 *
 * Written per point:
 *   ham_Res  = chin1 * (gup^ij R_ij) + (2/3) trK^2 - A^ij A_ij - 16 pi rho
 *   mov?_Res = gup^jk m_{j,k?} - (2/3) K? - 8 pi S?_m
 * with m_{k,ij} = dAij_k - (Christoffel contractions) - chi_k * Aij / chin1.
 * NOTE(review): Kx/Ky/Kz look like the gradient of trK and chix/chiy/chiz the
 * gradient of chi -- confirm against the caller.
 */
__global__ __launch_bounds__(128, 4)
void kern_phase18_constraints(
    const double* __restrict__ chin1,
    const double* __restrict__ chix, const double* __restrict__ chiy,
    const double* __restrict__ chiz,
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ trK,
    const double* __restrict__ Axx, const double* __restrict__ Axy,
    const double* __restrict__ Axz, const double* __restrict__ Ayy,
    const double* __restrict__ Ayz, const double* __restrict__ Azz,
    const double* __restrict__ Rxx, const double* __restrict__ Rxy,
    const double* __restrict__ Rxz, const double* __restrict__ Ryy,
    const double* __restrict__ Ryz, const double* __restrict__ Rzz,
    const double* __restrict__ rho,
    const double* __restrict__ Sx_m, const double* __restrict__ Sy_m,
    const double* __restrict__ Sz_m,
    const double* __restrict__ Kx, const double* __restrict__ Ky,
    const double* __restrict__ Kz,
    const double* __restrict__ Gxxx, const double* __restrict__ Gxxy,
    const double* __restrict__ Gxxz, const double* __restrict__ Gxyy,
    const double* __restrict__ Gxyz, const double* __restrict__ Gxzz,
    const double* __restrict__ Gyxx, const double* __restrict__ Gyxy,
    const double* __restrict__ Gyxz, const double* __restrict__ Gyyy,
    const double* __restrict__ Gyyz, const double* __restrict__ Gyzz,
    const double* __restrict__ Gzxx, const double* __restrict__ Gzxy,
    const double* __restrict__ Gzxz, const double* __restrict__ Gzyy,
    const double* __restrict__ Gzyz, const double* __restrict__ Gzzz,
    /* first derivatives of Aij: dA<ij>_<k> is the k-derivative of A_ij */
    const double* __restrict__ dAxx_x, const double* __restrict__ dAxx_y,
    const double* __restrict__ dAxx_z,
    const double* __restrict__ dAxy_x, const double* __restrict__ dAxy_y,
    const double* __restrict__ dAxy_z,
    const double* __restrict__ dAxz_x, const double* __restrict__ dAxz_y,
    const double* __restrict__ dAxz_z,
    const double* __restrict__ dAyy_x, const double* __restrict__ dAyy_y,
    const double* __restrict__ dAyy_z,
    const double* __restrict__ dAyz_x, const double* __restrict__ dAyz_y,
    const double* __restrict__ dAyz_z,
    const double* __restrict__ dAzz_x, const double* __restrict__ dAzz_y,
    const double* __restrict__ dAzz_z,
    double* __restrict__ ham_Res,
    double* __restrict__ movx_Res, double* __restrict__ movy_Res,
    double* __restrict__ movz_Res)
{
    const double TWO = 2.0, F2o3 = 2.0 / 3.0, F8 = 8.0, F16 = 16.0;
    const double PI_V = 3.14159265358979323846;
    const unsigned int stride = blockDim.x * gridDim.x;

    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < d_gp.all; idx += stride)
    {
        /* Inverse metric, chin1 and the six A_ij components at this point. */
        const double uxx = gupxx[idx], uxy = gupxy[idx], uxz = gupxz[idx];
        const double uyy = gupyy[idx], uyz = gupyz[idx], uzz = gupzz[idx];
        const double c1 = chin1[idx];
        const double axx = Axx[idx], axy = Axy[idx], axz = Axz[idx];
        const double ayy = Ayy[idx], ayz = Ayz[idx], azz = Azz[idx];

        /* ---- Hamiltonian constraint ---- */
        const double R_sc = uxx*Rxx[idx]+uyy*Ryy[idx]+uzz*Rzz[idx]
                           +TWO*(uxy*Rxy[idx]+uxz*Rxz[idx]+uyz*Ryz[idx]);

        /* A^ij A_ij; the summation order is kept fixed so rounding is unchanged. */
        const double AijAij =
              uxx*(uxx*axx*axx+uyy*axy*axy+uzz*axz*axz
                  +TWO*(uxy*axx*axy+uxz*axx*axz+uyz*axy*axz))
             +uyy*(uxx*axy*axy+uyy*ayy*ayy+uzz*ayz*ayz
                  +TWO*(uxy*axy*ayy+uxz*axy*ayz+uyz*ayy*ayz))
             +uzz*(uxx*axz*axz+uyy*ayz*ayz+uzz*azz*azz
                  +TWO*(uxy*axz*ayz+uxz*axz*azz+uyz*ayz*azz))
             +TWO*(uxy*(uxx*axx*axy+uyy*axy*ayy+uzz*axz*ayz
                       +uxy*(axx*ayy+axy*axy)
                       +uxz*(axx*ayz+axz*axy)
                       +uyz*(axy*ayz+axz*ayy))
                  +uxz*(uxx*axx*axz+uyy*axy*ayz+uzz*axz*azz
                       +uxy*(axx*ayz+axy*axz)
                       +uxz*(axx*azz+axz*axz)
                       +uyz*(axy*azz+axz*ayz))
                  +uyz*(uxx*axy*axz+uyy*ayy*ayz+uzz*ayz*azz
                       +uxy*(axy*ayz+ayy*axz)
                       +uxz*(axy*azz+ayz*axz)
                       +uyz*(ayy*azz+ayz*ayz)));

        const double K = trK[idx];
        ham_Res[idx] = c1*R_sc + F2o3*K*K - AijAij - F16*PI_V*rho[idx];

        /* ---- Momentum constraints ---- */
        const double cx = chix[idx], cy = chiy[idx], cz = chiz[idx];

        /* Christoffel symbols; Ga_bc holds the input array G<a><b><c>. */
        const double Gx_xx = Gxxx[idx], Gx_xy = Gxxy[idx], Gx_xz = Gxxz[idx];
        const double Gx_yy = Gxyy[idx], Gx_yz = Gxyz[idx], Gx_zz = Gxzz[idx];
        const double Gy_xx = Gyxx[idx], Gy_xy = Gyxy[idx], Gy_xz = Gyxz[idx];
        const double Gy_yy = Gyyy[idx], Gy_yz = Gyyz[idx], Gy_zz = Gyzz[idx];
        const double Gz_xx = Gzxx[idx], Gz_xy = Gzxy[idx], Gz_xz = Gzxz[idx];
        const double Gz_yy = Gzyy[idx], Gz_yz = Gzyz[idx], Gz_zz = Gzzz[idx];

        /* m<k>_<ij>: k-derivative of A_ij minus connection and chi terms. */
        const double mx_xx = dAxx_x[idx]-(Gx_xx*axx+Gy_xx*axy+Gz_xx*axz
                                         +Gx_xx*axx+Gy_xx*axy+Gz_xx*axz)-cx*axx/c1;
        const double mx_xy = dAxy_x[idx]-(Gx_xy*axx+Gy_xy*axy+Gz_xy*axz
                                         +Gx_xx*axy+Gy_xx*ayy+Gz_xx*ayz)-cx*axy/c1;
        const double mx_xz = dAxz_x[idx]-(Gx_xz*axx+Gy_xz*axy+Gz_xz*axz
                                         +Gx_xx*axz+Gy_xx*ayz+Gz_xx*azz)-cx*axz/c1;
        const double mx_yy = dAyy_x[idx]-(Gx_xy*axy+Gy_xy*ayy+Gz_xy*ayz
                                         +Gx_xy*axy+Gy_xy*ayy+Gz_xy*ayz)-cx*ayy/c1;
        const double mx_yz = dAyz_x[idx]-(Gx_xz*axy+Gy_xz*ayy+Gz_xz*ayz
                                         +Gx_xy*axz+Gy_xy*ayz+Gz_xy*azz)-cx*ayz/c1;
        const double mx_zz = dAzz_x[idx]-(Gx_xz*axz+Gy_xz*ayz+Gz_xz*azz
                                         +Gx_xz*axz+Gy_xz*ayz+Gz_xz*azz)-cx*azz/c1;

        const double my_xx = dAxx_y[idx]-(Gx_xy*axx+Gy_xy*axy+Gz_xy*axz
                                         +Gx_xy*axx+Gy_xy*axy+Gz_xy*axz)-cy*axx/c1;
        const double my_xy = dAxy_y[idx]-(Gx_yy*axx+Gy_yy*axy+Gz_yy*axz
                                         +Gx_xy*axy+Gy_xy*ayy+Gz_xy*ayz)-cy*axy/c1;
        const double my_xz = dAxz_y[idx]-(Gx_yz*axx+Gy_yz*axy+Gz_yz*axz
                                         +Gx_xy*axz+Gy_xy*ayz+Gz_xy*azz)-cy*axz/c1;
        const double my_yy = dAyy_y[idx]-(Gx_yy*axy+Gy_yy*ayy+Gz_yy*ayz
                                         +Gx_yy*axy+Gy_yy*ayy+Gz_yy*ayz)-cy*ayy/c1;
        const double my_yz = dAyz_y[idx]-(Gx_yz*axy+Gy_yz*ayy+Gz_yz*ayz
                                         +Gx_yy*axz+Gy_yy*ayz+Gz_yy*azz)-cy*ayz/c1;
        const double my_zz = dAzz_y[idx]-(Gx_yz*axz+Gy_yz*ayz+Gz_yz*azz
                                         +Gx_yz*axz+Gy_yz*ayz+Gz_yz*azz)-cy*azz/c1;

        const double mz_xx = dAxx_z[idx]-(Gx_xz*axx+Gy_xz*axy+Gz_xz*axz
                                         +Gx_xz*axx+Gy_xz*axy+Gz_xz*axz)-cz*axx/c1;
        const double mz_xy = dAxy_z[idx]-(Gx_yz*axx+Gy_yz*axy+Gz_yz*axz
                                         +Gx_xz*axy+Gy_xz*ayy+Gz_xz*ayz)-cz*axy/c1;
        const double mz_xz = dAxz_z[idx]-(Gx_zz*axx+Gy_zz*axy+Gz_zz*axz
                                         +Gx_xz*axz+Gy_xz*ayz+Gz_xz*azz)-cz*axz/c1;
        const double mz_yy = dAyy_z[idx]-(Gx_yz*axy+Gy_yz*ayy+Gz_yz*ayz
                                         +Gx_yz*axy+Gy_yz*ayy+Gz_yz*ayz)-cz*ayy/c1;
        const double mz_yz = dAyz_z[idx]-(Gx_zz*axy+Gy_zz*ayy+Gz_zz*ayz
                                         +Gx_yz*axz+Gy_yz*ayz+Gz_yz*azz)-cz*ayz/c1;
        const double mz_zz = dAzz_z[idx]-(Gx_zz*axz+Gy_zz*ayz+Gz_zz*azz
                                         +Gx_zz*axz+Gy_zz*ayz+Gz_zz*azz)-cz*azz/c1;

        /* Contract with the inverse metric and subtract the trK and source terms. */
        movx_Res[idx] = uxx*mx_xx+uyy*my_xy+uzz*mz_xz
                       +uxy*mx_xy+uxz*mx_xz+uyz*my_xz
                       +uxy*my_xx+uxz*mz_xx+uyz*mz_xy
                       - F2o3*Kx[idx] - F8*PI_V*Sx_m[idx];
        movy_Res[idx] = uxx*mx_xy+uyy*my_yy+uzz*mz_yz
                       +uxy*mx_yy+uxz*mx_yz+uyz*my_yz
                       +uxy*my_xy+uxz*mz_xy+uyz*mz_yy
                       - F2o3*Ky[idx] - F8*PI_V*Sy_m[idx];
        movz_Res[idx] = uxx*mx_xz+uyy*my_yz+uzz*mz_zz
                       +uxy*mx_yz+uxz*mx_zz+uyz*my_zz
                       +uxy*my_xz+uxz*mz_xz+uyz*mz_yz
                       - F2o3*Kz[idx] - F8*PI_V*Sz_m[idx];
    }
}
|
|
|
|
/*
 * Scalar-field f(R) constraint residual.
 *
 * Launch: 1-D grid, at most 128 threads per block (__launch_bounds__).
 * Each thread starts at its global thread id and advances by the total
 * number of launched threads until it passes d_gp.all.  No shared memory.
 *
 * Per point:
 *   RR      = A^ij A_ij - (2/3) trK^2 - chin1 * (gup^ij R_ij)
 *             - 8 pi (3 rho - chin1 * gup^ij S_ij)
 *   fprim   = 1 + 2 * a2 * RR
 *   Cons_fR = Sphi - (1/4) sqrt(3/pi) * log(fprim)
 * log() is applied without a sign check, so fprim <= 0 yields NaN/-inf.
 */
__global__ __launch_bounds__(128, 4)
void kern_escalar_constraint_fr(
    const double* __restrict__ chin1,
    const double* __restrict__ gupxx, const double* __restrict__ gupxy,
    const double* __restrict__ gupxz, const double* __restrict__ gupyy,
    const double* __restrict__ gupyz, const double* __restrict__ gupzz,
    const double* __restrict__ trK,
    const double* __restrict__ Axx, const double* __restrict__ Axy,
    const double* __restrict__ Axz, const double* __restrict__ Ayy,
    const double* __restrict__ Ayz, const double* __restrict__ Azz,
    const double* __restrict__ Rxx, const double* __restrict__ Rxy,
    const double* __restrict__ Rxz, const double* __restrict__ Ryy,
    const double* __restrict__ Ryz, const double* __restrict__ Rzz,
    const double* __restrict__ rho,
    const double* __restrict__ Sxx, const double* __restrict__ Sxy,
    const double* __restrict__ Sxz, const double* __restrict__ Syy,
    const double* __restrict__ Syz, const double* __restrict__ Szz,
    const double* __restrict__ Sphi,
    double a2,
    double* __restrict__ Cons_fR)
{
    const double TWO = 2.0;
    const double F2o3 = 2.0 / 3.0;
    const double F8 = 8.0;
    const double PI_V = 3.14159265358979323846;
    const double SQRT3OPI_OVER4 = 0.25 * sqrt(3.0 / PI_V);
    const unsigned int stride = blockDim.x * gridDim.x;

    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < d_gp.all; idx += stride)
    {
        const double uxx = gupxx[idx], uxy = gupxy[idx], uxz = gupxz[idx];
        const double uyy = gupyy[idx], uyz = gupyz[idx], uzz = gupzz[idx];
        const double c1 = chin1[idx];
        const double axx = Axx[idx], axy = Axy[idx], axz = Axz[idx];
        const double ayy = Ayy[idx], ayz = Ayz[idx], azz = Azz[idx];

        /* A^ij A_ij; the summation order is kept fixed so rounding is unchanged. */
        const double AijAij =
              uxx * (uxx * axx * axx + uyy * axy * axy + uzz * axz * axz
                    + TWO * (uxy * axx * axy + uxz * axx * axz + uyz * axy * axz))
            + uyy * (uxx * axy * axy + uyy * ayy * ayy + uzz * ayz * ayz
                    + TWO * (uxy * axy * ayy + uxz * axy * ayz + uyz * ayy * ayz))
            + uzz * (uxx * axz * axz + uyy * ayz * ayz + uzz * azz * azz
                    + TWO * (uxy * axz * ayz + uxz * axz * azz + uyz * ayz * azz))
            + TWO * (uxy * (uxx * axx * axy + uyy * axy * ayy + uzz * axz * ayz
                           + uxy * (axx * ayy + axy * axy)
                           + uxz * (axx * ayz + axz * axy)
                           + uyz * (axy * ayz + axz * ayy))
                   + uxz * (uxx * axx * axz + uyy * axy * ayz + uzz * axz * azz
                           + uxy * (axx * ayz + axy * axz)
                           + uxz * (axx * azz + axz * axz)
                           + uyz * (axy * azz + axz * ayz))
                   + uyz * (uxx * axy * axz + uyy * ayy * ayz + uzz * ayz * azz
                           + uxy * (axy * ayz + ayy * axz)
                           + uxz * (axy * azz + ayz * axz)
                           + uyz * (ayy * azz + ayz * ayz)));

        /* Traces of R_ij and S_ij taken with the inverse metric. */
        const double R_sc = uxx * Rxx[idx] + uyy * Ryy[idx] + uzz * Rzz[idx]
                          + TWO * (uxy * Rxy[idx] + uxz * Rxz[idx] + uyz * Ryz[idx]);
        const double trS = uxx * Sxx[idx] + uyy * Syy[idx] + uzz * Szz[idx]
                         + TWO * (uxy * Sxy[idx] + uxz * Sxz[idx] + uyz * Syz[idx]);

        const double K = trK[idx];
        const double RR = AijAij - F2o3 * K * K
                        - R_sc * c1
                        - F8 * PI_V * (3.0 * rho[idx] - trS * c1);
        const double fprim = 1.0 + 2.0 * a2 * RR;

        Cons_fR[idx] = Sphi[idx] - SQRT3OPI_OVER4 * log(fprim);
    }
}
|
|
|
|
/*
 * Fill a GridParams record for one patch and pass it to
 * upload_grid_params_if_needed().
 *
 *   ex        extents {nx, ny, nz}
 *   X, Y, Z   coordinate arrays; the spacing is taken from the first two
 *             entries, so each array must hold at least two points
 *   Symmetry  0 = none, 1 = equatorial; larger values also reflect x and y
 *   eps, co   copied into the record unchanged
 *
 * ensure_gpu_buffers(nx, ny, nz) is called before the record is built.
 */
static void setup_grid_params(int *ex,
                              double *X, double *Y, double *Z,
                              int Symmetry, double eps, int co)
{
    const int NO_SYMM = 0;
    const int EQ_SYMM = 1;

    const int nx = ex[0];
    const int ny = ex[1];
    const int nz = ex[2];
    const double hx = X[1] - X[0];
    const double hy = Y[1] - Y[0];
    const double hz = Z[1] - Z[0];

    ensure_gpu_buffers(nx, ny, nz);

    /* A lower face is a reflection plane when the symmetry applies to that
     * axis and the first grid point lies within one spacing of zero. */
    const bool reflect_z = (Symmetry > NO_SYMM) && (fabs(Z[0]) < hz);
    const bool reflect_x = (Symmetry > EQ_SYMM) && (fabs(X[0]) < hx);
    const bool reflect_y = (Symmetry > EQ_SYMM) && (fabs(Y[0]) < hy);

    GridParams gp = {};

    /* Extents and spacings. */
    gp.ex[0] = nx;
    gp.ex[1] = ny;
    gp.ex[2] = nz;
    gp.all = nx * ny * nz;
    gp.dX = hx;
    gp.dY = hy;
    gp.dZ = hz;

    /* First-derivative factors (1/(12h) and 1/(2h)). */
    gp.d12dx = 1.0 / (12.0 * hx);
    gp.d12dy = 1.0 / (12.0 * hy);
    gp.d12dz = 1.0 / (12.0 * hz);
    gp.d2dx = 1.0 / (2.0 * hx);
    gp.d2dy = 1.0 / (2.0 * hy);
    gp.d2dz = 1.0 / (2.0 * hz);

    /* Second-derivative factors: F* = wide stencil, S* = compact stencil. */
    gp.Fdxdx = 1.0 / (12.0 * hx * hx);
    gp.Fdydy = 1.0 / (12.0 * hy * hy);
    gp.Fdzdz = 1.0 / (12.0 * hz * hz);
    gp.Sdxdx = 1.0 / (hx * hx);
    gp.Sdydy = 1.0 / (hy * hy);
    gp.Sdzdz = 1.0 / (hz * hz);
    gp.Fdxdy = 1.0 / (144.0 * hx * hy);
    gp.Fdxdz = 1.0 / (144.0 * hx * hz);
    gp.Fdydz = 1.0 / (144.0 * hy * hz);
    gp.Sdxdy = 0.25 / (hx * hy);
    gp.Sdxdz = 0.25 / (hx * hz);
    gp.Sdydz = 0.25 / (hy * hz);

    /* 1-based stencil index limits; reflected faces extend below 1. */
    gp.iminF = reflect_x ? (2 - ghost_width) : 1;
    gp.jminF = reflect_y ? (2 - ghost_width) : 1;
    gp.kminF = reflect_z ? (2 - ghost_width) : 1;
    gp.imaxF = nx;
    gp.jmaxF = ny;
    gp.kmaxF = nz;

    /* Same lower limits for the "F3" variant, one point further out. */
    gp.iminF3 = reflect_x ? (1 - ghost_width) : 1;
    gp.jminF3 = reflect_y ? (1 - ghost_width) : 1;
    gp.kminF3 = reflect_z ? (1 - ghost_width) : 1;

    gp.Symmetry = Symmetry;
    gp.eps = eps;
    gp.co = co;

    /* Extents of the padded work arrays (2 and 3 extra points per axis). */
    gp.fh2_nx = nx + 2;
    gp.fh2_ny = ny + 2;
    gp.fh2_nz = nz + 2;
    gp.fh3_nx = nx + 3;
    gp.fh3_ny = ny + 3;
    gp.fh3_nz = nz + 3;

    upload_grid_params_if_needed(gp);
}
|
|
|
|
/*
 * Decide, per face, whether this patch touches the bounding box `bbox`
 * (layout {xmin, ymin, zmin, xmax, ymax, zmax}).
 *
 * A face "touches" when the patch's first/last coordinate is within one grid
 * spacing of the matching bbox edge.  A lower face is reported as 0 when it
 * coincides with a symmetry plane instead: the x/y lower faces in octant
 * symmetry (Symmetry == 2), the z lower face for any Symmetry > 0, each only
 * if the bbox edge lies within half a spacing of zero.
 *
 * Each touch_* output is set to 1 or 0.
 */
static void compute_patch_boundary_flags(int *ex,
                                         double *X, double *Y, double *Z,
                                         const double *bbox,
                                         int Symmetry,
                                         int &touch_xmin, int &touch_xmax,
                                         int &touch_ymin, int &touch_ymax,
                                         int &touch_zmin, int &touch_zmax)
{
    const int NO_SYMM = 0;
    const int OCTANT = 2;

    const double hx = X[1] - X[0];
    const double hy = Y[1] - Y[0];
    const double hz = Z[1] - Z[0];

    /* Upper face: last coordinate within one spacing of the bbox edge. */
    auto upper_face = [](double last, double edge, double h) -> int {
        return (std::fabs(last - edge) < h) ? 1 : 0;
    };
    /* Lower face: as above, but suppressed on a symmetry plane. */
    auto lower_face = [](double first, double edge, double h, bool mirrored) -> int {
        if (!(std::fabs(first - edge) < h))
            return 0;
        if (mirrored && std::fabs(edge) < h / 2.0)
            return 0;
        return 1;
    };

    touch_xmax = upper_face(X[ex[0] - 1], bbox[3], hx);
    touch_ymax = upper_face(Y[ex[1] - 1], bbox[4], hy);
    touch_zmax = upper_face(Z[ex[2] - 1], bbox[5], hz);

    touch_xmin = lower_face(X[0], bbox[0], hx, Symmetry == OCTANT);
    touch_ymin = lower_face(Y[0], bbox[1], hy, Symmetry == OCTANT);
    touch_zmin = lower_face(Z[0], bbox[2], hz, Symmetry > NO_SYMM);
}
|
|
|
|
/*
 * Copy the BSSN_STATE_COUNT host state arrays (each `all` doubles) to the
 * device.
 *
 * Two paths, chosen once per process from AMSS_CUDA_DIRECT_STATE_UPLOAD
 * (unset or non-zero selects the direct path):
 *   direct : one cudaMemcpyAsync per array into its own input slot
 *   staged : pack all arrays into g_buf.h_stage, then one blocking cudaMemcpy
 *            starting at g_buf.slot[S_chi]
 *            NOTE(review): assumes the input slots are contiguous from S_chi.
 *
 * When profiling is enabled the call count, elapsed time and transferred
 * volume are accumulated; the direct path synchronises first so the time
 * covers the copies.
 */
static void upload_state_inputs(double **state_host, size_t all)
{
    const size_t field_bytes = all * sizeof(double);
    const bool profiling = cuda_profile_enabled();
    const double t_start = profiling ? cuda_profile_now_ms() : 0.0;

    /* -1 = not decided yet; resolved on the first call. */
    static int direct_upload = -1;
    if (direct_upload < 0) {
        const char *setting = getenv("AMSS_CUDA_DIRECT_STATE_UPLOAD");
        if (!setting)
            direct_upload = 1;
        else
            direct_upload = (atoi(setting) != 0) ? 1 : 0;
    }

    /* Shared profiling bookkeeping for both paths. */
    auto record_upload = [&]() {
        CudaProfileStats &stats = cuda_profile_stats();
        stats.upload_calls++;
        stats.upload_ms += cuda_profile_now_ms() - t_start;
        stats.upload_gb += (double)((size_t)BSSN_STATE_COUNT * field_bytes) / 1.0e9;
    };

    if (!direct_upload) {
        /* Staged path: gather on the host, then a single blocking copy. */
        for (int f = 0; f < BSSN_STATE_COUNT; ++f) {
            std::memcpy(g_buf.h_stage + (size_t)f * all, state_host[f], field_bytes);
        }
        CUDA_CHECK(cudaMemcpy(g_buf.slot[S_chi], g_buf.h_stage,
                              (size_t)BSSN_STATE_COUNT * field_bytes,
                              cudaMemcpyHostToDevice));
        if (profiling)
            record_upload();
        return;
    }

    /* Direct path: one asynchronous copy per state array. */
    for (int f = 0; f < BSSN_STATE_COUNT; ++f) {
        CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[k_state_input_slots[f]], state_host[f],
                                   field_bytes, cudaMemcpyHostToDevice));
    }
    if (profiling) {
        cuda_profile_sync();
        record_upload();
    }
}
|
|
|
|
/*
 * Pack the BSSN_MATTER_COUNT host matter arrays (each `all` doubles)
 * back to back into g_buf.h_stage, copy them to ctx.d_matter_mem with one
 * blocking cudaMemcpy, and flag the matter cache as ready.
 */
static void upload_matter_cache(StepContext &ctx,
                                double **matter_host,
                                size_t all)
{
    const size_t field_bytes = all * sizeof(double);

    auto *cursor = g_buf.h_stage;
    for (int f = 0; f < BSSN_MATTER_COUNT; ++f) {
        std::memcpy(cursor, matter_host[f], field_bytes);
        cursor += all;   /* next array starts `all` elements later */
    }

    const size_t total_bytes = (size_t)BSSN_MATTER_COUNT * field_bytes;
    CUDA_CHECK(cudaMemcpy(ctx.d_matter_mem, g_buf.h_stage,
                          total_bytes, cudaMemcpyHostToDevice));
    ctx.matter_ready = true;
}
|
|
|
|
/*
 * Zero the whole device matter cache (BSSN_MATTER_COUNT arrays of `all`
 * doubles in ctx.d_matter_mem) and flag it as ready.
 */
static void zero_matter_cache(StepContext &ctx, size_t all)
{
    const size_t total_bytes = (size_t)BSSN_MATTER_COUNT * all * sizeof(double);
    CUDA_CHECK(cudaMemset(ctx.d_matter_mem, 0, total_bytes));
    ctx.matter_ready = true;
}
|
|
|
|
/* Point every slot listed in k_matter_slots at the matching cached matter
 * array ctx.d_matter[n]. */
static void bind_matter_slots(const StepContext &ctx)
{
    auto &slots = g_buf.slot;
    int n = 0;
    while (n < BSSN_MATTER_COUNT) {
        slots[k_matter_slots[n]] = ctx.d_matter[n];
        ++n;
    }
}
|
|
|
|
/* Point the first BSSN_STATE_COUNT input slots (k_state_input_slots) at the
 * device arrays held in `state`. */
static void bind_state_input_slots(const std::array<double *, BSSN_RESIDENT_STATE_CAPACITY> &state)
{
    auto &slots = g_buf.slot;
    int n = 0;
    while (n < BSSN_STATE_COUNT) {
        slots[k_state_input_slots[n]] = state[n];
        ++n;
    }
}
|
|
|
|
/* Point the first BSSN_STATE_COUNT RHS slots (k_state_rhs_slots) at the
 * device arrays held in `state`. */
static void bind_state_output_slots(const std::array<double *, BSSN_RESIDENT_STATE_CAPACITY> &state)
{
    auto &slots = g_buf.slot;
    int n = 0;
    while (n < BSSN_STATE_COUNT) {
        slots[k_state_rhs_slots[n]] = state[n];
        ++n;
    }
}
|
|
|
|
/* Point the BSSN_ESCALAR_STATE_COUNT input slots (k_escalar_state_input_slots)
 * at the device arrays held in `state`. */
static void bind_escalar_state_input_slots(const std::array<double *, BSSN_RESIDENT_STATE_CAPACITY> &state)
{
    auto &slots = g_buf.slot;
    int n = 0;
    while (n < BSSN_ESCALAR_STATE_COUNT) {
        slots[k_escalar_state_input_slots[n]] = state[n];
        ++n;
    }
}
|
|
|
|
/* Point the BSSN_ESCALAR_STATE_COUNT RHS slots (k_escalar_state_rhs_slots)
 * at the device arrays held in `state`. */
static void bind_escalar_state_output_slots(const std::array<double *, BSSN_RESIDENT_STATE_CAPACITY> &state)
{
    auto &slots = g_buf.slot;
    int n = 0;
    while (n < BSSN_ESCALAR_STATE_COUNT) {
        slots[k_escalar_state_rhs_slots[n]] = state[n];
        ++n;
    }
}
|
|
|
|
/* Point the BSSN_EM_STATE_COUNT input slots (k_em_state_input_slots) at the
 * device arrays held in `state`. */
static void bind_em_state_input_slots(const std::array<double *, BSSN_RESIDENT_STATE_CAPACITY> &state)
{
    auto &slots = g_buf.slot;
    int n = 0;
    while (n < BSSN_EM_STATE_COUNT) {
        slots[k_em_state_input_slots[n]] = state[n];
        ++n;
    }
}
|
|
|
|
/* Point the BSSN_EM_STATE_COUNT RHS slots (k_em_state_rhs_slots) at the
 * device arrays held in `state`. */
static void bind_em_state_output_slots(const std::array<double *, BSSN_RESIDENT_STATE_CAPACITY> &state)
{
    auto &slots = g_buf.slot;
    int n = 0;
    while (n < BSSN_EM_STATE_COUNT) {
        slots[k_em_state_rhs_slots[n]] = state[n];
        ++n;
    }
}
|
|
|
|
static void upload_escalar_state_inputs(double **state_host, size_t all);
|
|
static void upload_em_state_inputs(double **state_host, size_t all);
|
|
|
|
/*
 * True when the first `state_count` host pointers recorded for `bank` are
 * exactly the non-null pointers in `host_key`.
 *
 * Returns false for a null key, a bank outside [0, BSSN_RESIDENT_BANK_COUNT),
 * a count outside [1, BSSN_RESIDENT_STATE_CAPACITY], or any null key entry.
 */
static bool resident_key_matches_count(const StepContext &ctx, int bank, double **host_key, int state_count)
{
    const bool bank_ok = host_key && bank >= 0 && bank < BSSN_RESIDENT_BANK_COUNT;
    const bool count_ok = state_count > 0 && state_count <= BSSN_RESIDENT_STATE_CAPACITY;
    if (!bank_ok || !count_ok)
        return false;

    const auto &recorded = ctx.resident_host[bank];
    int matched = 0;
    while (matched < state_count &&
           host_key[matched] &&
           recorded[matched] == host_key[matched]) {
        ++matched;
    }
    return matched == state_count;
}
|
|
|
|
/* resident_key_matches_count() for the standard BSSN_STATE_COUNT states. */
static bool resident_key_matches(const StepContext &ctx, int bank, double **host_key)
{
    const int state_count = BSSN_STATE_COUNT;
    return resident_key_matches_count(ctx, bank, host_key, state_count);
}
|
|
|
|
/*
 * Find the resident bank whose first `state_count` host pointers equal
 * `host_key`.
 *
 * Among matching banks a valid one is preferred, and within each group the
 * one with the largest resident_age wins (first one found on ties).
 * Returns the bank index, or -1 when the key is null or nothing matches.
 */
static int find_resident_bank_count(const StepContext &ctx, double **host_key, int state_count)
{
    if (!host_key) return -1;

    /* Entry 0 tracks the best valid bank, entry 1 the best invalid one. */
    int pick[2] = {-1, -1};
    unsigned long long pick_age[2] = {0, 0};

    for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
        if (resident_key_matches_count(ctx, b, host_key, state_count)) {
            const int group = ctx.resident_valid[b] ? 0 : 1;
            const auto age = ctx.resident_age[b];
            if (pick[group] < 0 || age > pick_age[group]) {
                pick[group] = b;
                pick_age[group] = age;
            }
        }
    }

    if (pick[0] >= 0)
        return pick[0];
    return pick[1];
}
|
|
|
|
/*
 * Find the resident bank that holds a given subset of states.
 *
 * host_key[i] is compared with the bank's recorded host pointer at state
 * index state_indices[i], for i in [0, subset_count).  A bank matches only
 * if every index is inside [0, BSSN_RESIDENT_STATE_CAPACITY), every key
 * entry is non-null and all pointers agree.
 *
 * Among matching banks a valid one is preferred, and within each group the
 * one with the largest resident_age wins (first one found on ties).
 * Returns the bank index, or -1 on bad arguments or when nothing matches.
 */
static int find_resident_bank_subset(const StepContext &ctx,
                                     double **host_key,
                                     const int *state_indices,
                                     int subset_count)
{
    if (!host_key || !state_indices || subset_count <= 0)
        return -1;

    /* Does bank `b` hold every requested state under the given host pointer? */
    auto bank_matches = [&](int b) -> bool {
        for (int i = 0; i < subset_count; ++i) {
            const int state_index = state_indices[i];
            const bool index_ok =
                state_index >= 0 && state_index < BSSN_RESIDENT_STATE_CAPACITY;
            if (!index_ok || !host_key[i])
                return false;
            if (ctx.resident_host[b][state_index] != host_key[i])
                return false;
        }
        return true;
    };

    /* Entry 0 tracks the best valid bank, entry 1 the best invalid one. */
    int pick[2] = {-1, -1};
    unsigned long long pick_age[2] = {0, 0};

    for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
        if (!bank_matches(b))
            continue;
        const int group = ctx.resident_valid[b] ? 0 : 1;
        const auto age = ctx.resident_age[b];
        if (pick[group] < 0 || age > pick_age[group]) {
            pick[group] = b;
            pick_age[group] = age;
        }
    }

    if (pick[0] >= 0)
        return pick[0];
    return pick[1];
}
|
|
|
|
/* find_resident_bank_count() for the standard BSSN_STATE_COUNT states. */
static int find_resident_bank(const StepContext &ctx, double **host_key)
{
    const int state_count = BSSN_STATE_COUNT;
    return find_resident_bank_count(ctx, host_key, state_count);
}
|
|
|
|
/*
 * True when `host_key` is non-null, `state_count` lies in
 * [1, BSSN_RESIDENT_STATE_CAPACITY] and the first `state_count` key entries
 * are all non-null.
 */
static bool resident_key_usable_count(double **host_key, int state_count)
{
    const bool count_ok =
        state_count > 0 && state_count <= BSSN_RESIDENT_STATE_CAPACITY;
    if (!host_key || !count_ok)
        return false;

    int checked = 0;
    while (checked < state_count && host_key[checked])
        ++checked;
    return checked == state_count;
}
|
|
|
|
/* resident_key_usable_count() for the standard BSSN_STATE_COUNT states. */
static bool resident_key_usable(double **host_key)
{
    const int state_count = BSSN_STATE_COUNT;
    return resident_key_usable_count(host_key, state_count);
}
|
|
|
|
/* Set the host-clean flag of every state in `bank` to 1 (clean) or 0.
 * A bank index outside [0, BSSN_RESIDENT_BANK_COUNT) is ignored. */
static void set_resident_host_clean(StepContext &ctx, int bank, bool clean)
{
    const bool bank_ok = bank >= 0 && bank < BSSN_RESIDENT_BANK_COUNT;
    if (bank_ok) {
        ctx.resident_host_clean[bank].fill(clean ? 1 : 0);
    }
}
|
|
|
|
/*
 * True when every selected state of `bank` is flagged host-clean.
 *
 * The selection is state_indices[0..subset_count) or, when state_indices is
 * null, the state indices 0..subset_count-1.  Returns false for a bank or
 * state index out of range; an empty selection counts as clean.
 */
static bool resident_host_subset_clean(const StepContext &ctx,
                                       int bank,
                                       int subset_count,
                                       const int *state_indices)
{
    if (bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT) return false;

    const auto &flags = ctx.resident_host_clean[bank];
    for (int i = 0; i < subset_count; ++i) {
        int state_index = i;
        if (state_indices)
            state_index = state_indices[i];

        const bool index_ok =
            state_index >= 0 && state_index < BSSN_RESIDENT_STATE_CAPACITY;
        if (!index_ok || !flags[state_index])
            return false;
    }
    return true;
}
|
|
|
|
/*
 * Set the host-clean flag (1 or 0) of the selected states of `bank`.
 *
 * The selection is state_indices[0..subset_count) or, when state_indices is
 * null, the state indices 0..subset_count-1.  State indices out of range are
 * skipped; an out-of-range bank makes the call a no-op.
 */
static void mark_resident_host_subset_clean(StepContext &ctx,
                                            int bank,
                                            int subset_count,
                                            const int *state_indices,
                                            bool clean)
{
    if (bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT) return;

    auto &flags = ctx.resident_host_clean[bank];
    for (int i = 0; i < subset_count; ++i) {
        int state_index = i;
        if (state_indices)
            state_index = state_indices[i];

        if (state_index < 0 || state_index >= BSSN_RESIDENT_STATE_CAPACITY)
            continue;
        flags[state_index] = clean ? 1 : 0;
    }
}
|
|
|
|
/* Set the host-clean flag (1 or 0) of a single state of `bank`.
 * Out-of-range bank or state indices make the call a no-op. */
static void mark_resident_host_state_clean(StepContext &ctx,
                                           int bank,
                                           int state_index,
                                           bool clean)
{
    const bool bank_ok = bank >= 0 && bank < BSSN_RESIDENT_BANK_COUNT;
    const bool state_ok =
        state_index >= 0 && state_index < BSSN_RESIDENT_STATE_CAPACITY;
    if (bank_ok && state_ok) {
        ctx.resident_host_clean[bank][state_index] = clean ? 1 : 0;
    }
}
|
|
|
|
/*
 * Make `bank` the current bank: record its index, point the "current state"
 * device handles at the bank's resident storage, and set ctx.state_ready to
 * the bank's validity flag.  An out-of-range bank is ignored.
 */
static void mark_resident_current_bank(StepContext &ctx, int bank)
{
    if (bank >= 0 && bank < BSSN_RESIDENT_BANK_COUNT) {
        ctx.state_ready = ctx.resident_valid[bank];
        ctx.d_state_curr = ctx.d_resident[bank];
        ctx.d_state_curr_mem = ctx.d_resident_mem[bank];
        ctx.current_bank = bank;
    }
}
|
|
|
|
/*
 * Point the "next state" device handles at the resident storage of `bank`.
 * An out-of-range bank is ignored.
 */
static void mark_resident_next_bank(StepContext &ctx, int bank)
{
    if (bank >= 0 && bank < BSSN_RESIDENT_BANK_COUNT) {
        ctx.d_state_next = ctx.d_resident[bank];
        ctx.d_state_next_mem = ctx.d_resident_mem[bank];
    }
}
|
|
|
|
/* Return true when at least one resident bank has its validity flag set. */
static bool any_resident_bank_valid(const StepContext &ctx)
{
    int b = 0;
    while (b < BSSN_RESIDENT_BANK_COUNT && !ctx.resident_valid[b])
        ++b;
    /* The scan stops before the end only when a valid bank was found. */
    return b < BSSN_RESIDENT_BANK_COUNT;
}
|
|
|
|
/* Recompute ctx.state_ready as "some resident bank is valid". */
static void update_state_ready(StepContext &ctx)
{
    const bool some_bank_valid = any_resident_bank_valid(ctx);
    ctx.state_ready = some_bank_valid;
}
|
|
|
|
/*
 * Copy the first state_count device arrays of `bank` back to the host
 * pointers recorded as that bank's key, then flag the whole bank host-clean.
 *
 * Each copy transfers all * sizeof(double) bytes and is issued with
 * cudaMemcpyAsync (no explicit stream argument); cudaDeviceSynchronize() is
 * called after the last copy.
 *
 * The call is a no-op when the bank index is out of range, the bank is not
 * valid, state_count is outside [1, BSSN_RESIDENT_STATE_CAPACITY], or any of
 * the first state_count host pointers is null.
 *
 * NOTE(review): the null-pointer case skips the write-back silently; if a
 * bank can be keyed with fewer states than the count requested here, the
 * device data would not be copied back -- confirm against the callers.
 *
 * When auxiliary profiling is enabled the elapsed time, call count and
 * transferred volume are accumulated into the aux profile stats.
 */
static void writeback_resident_bank_count(StepContext &ctx, int bank, size_t all, int state_count)
{
    const bool bank_in_range = bank >= 0 && bank < BSSN_RESIDENT_BANK_COUNT;
    if (!bank_in_range || !ctx.resident_valid[bank])
        return;
    if (!(state_count > 0 && state_count <= BSSN_RESIDENT_STATE_CAPACITY))
        return;

    /* Refuse a partial write-back: every destination must be known. */
    for (int s = 0; s < state_count; ++s) {
        if (ctx.resident_host[bank][s] == nullptr)
            return;
    }

    const bool profiling = cuda_aux_profile_enabled();
    const double start_ms = profiling ? cuda_profile_now_ms() : 0.0;
    const size_t bytes_per_state = all * sizeof(double);

    for (int s = 0; s < state_count; ++s) {
        CUDA_CHECK(cudaMemcpyAsync(ctx.resident_host[bank][s],
                                   ctx.d_resident[bank][s],
                                   bytes_per_state, cudaMemcpyDeviceToHost));
    }
    CUDA_CHECK(cudaDeviceSynchronize());
    set_resident_host_clean(ctx, bank, true);

    if (!profiling)
        return;

    CudaAuxProfileStats &stats = cuda_aux_profile_stats();
    stats.writeback_calls++;
    stats.writeback_ms += cuda_profile_now_ms() - start_ms;
    stats.writeback_gb += (double)((size_t)state_count * bytes_per_state) / 1.0e9;
    cuda_aux_profile_maybe_log();
}
|
|
|
|
/*
 * Write back the first BSSN_STATE_COUNT states of `bank`; see
 * writeback_resident_bank_count() for the conditions under which nothing is
 * copied.
 */
static void writeback_resident_bank(StepContext &ctx, int bank, size_t all)
{
    const int states_to_copy = BSSN_STATE_COUNT;
    writeback_resident_bank_count(ctx, bank, all, states_to_copy);
}
|
|
|
|
/*
 * Pick a resident bank that may be re-keyed, steering clear of avoid_bank.
 *
 * An invalid bank other than avoid_bank is returned as-is, with no state
 * change.  Otherwise the non-avoided bank with the smallest resident_age
 * (lowest index on ties) is evicted: it is written back through
 * writeback_resident_bank(), marked invalid, its host key and host-clean
 * flags are cleared and its age reset to 0.  If the evicted bank was the
 * current bank, the current-bank index and device handles are cleared too.
 * When no non-avoided candidate exists, bank 0 is evicted.
 *
 * Returns the index of the chosen bank.
 */
static int choose_resident_bank_for_reuse(StepContext &ctx, int avoid_bank, size_t all)
{
    /* Pass 1: an invalid bank costs nothing to take over. */
    for (int cand = 0; cand < BSSN_RESIDENT_BANK_COUNT; ++cand) {
        const bool is_free = !ctx.resident_valid[cand];
        if (is_free && cand != avoid_bank)
            return cand;
    }

    /* Pass 2: least-recently-stamped bank among the non-avoided ones. */
    int victim = -1;
    unsigned long long victim_age = 0;
    for (int cand = 0; cand < BSSN_RESIDENT_BANK_COUNT; ++cand) {
        if (cand == avoid_bank)
            continue;
        const bool takes_over =
            (victim < 0) || (ctx.resident_age[cand] < victim_age);
        if (!takes_over)
            continue;
        victim = cand;
        victim_age = ctx.resident_age[cand];
    }
    if (victim < 0)
        victim = 0;

    writeback_resident_bank(ctx, victim, all);
    ctx.resident_valid[victim] = false;
    ctx.resident_host[victim].fill(nullptr);
    ctx.resident_host_clean[victim].fill(0);
    ctx.resident_age[victim] = 0;

    const bool was_current = (ctx.current_bank == victim);
    if (was_current) {
        ctx.d_state_curr.fill(nullptr);
        ctx.d_state_curr_mem = nullptr;
        ctx.current_bank = -1;
    }
    update_state_ready(ctx);
    return victim;
}
|
|
|
|
/*
 * Escalar variant of choose_resident_bank_for_reuse(): identical selection
 * and eviction steps, except that the evicted bank is written back with
 * BSSN_ESCALAR_STATE_COUNT states.
 *
 * An invalid bank other than avoid_bank is returned unchanged; otherwise the
 * non-avoided bank with the smallest resident_age is written back,
 * invalidated and un-keyed (bank 0 if no candidate exists).  Returns the
 * chosen bank index.
 */
static int choose_escalar_resident_bank_for_reuse(StepContext &ctx, int avoid_bank, size_t all)
{
    for (int cand = 0; cand < BSSN_RESIDENT_BANK_COUNT; ++cand) {
        const bool is_free = !ctx.resident_valid[cand];
        if (is_free && cand != avoid_bank)
            return cand;
    }

    int victim = -1;
    unsigned long long victim_age = 0;
    for (int cand = 0; cand < BSSN_RESIDENT_BANK_COUNT; ++cand) {
        if (cand == avoid_bank)
            continue;
        const bool takes_over =
            (victim < 0) || (ctx.resident_age[cand] < victim_age);
        if (!takes_over)
            continue;
        victim = cand;
        victim_age = ctx.resident_age[cand];
    }
    if (victim < 0)
        victim = 0;

    writeback_resident_bank_count(ctx, victim, all, BSSN_ESCALAR_STATE_COUNT);
    ctx.resident_valid[victim] = false;
    ctx.resident_host[victim].fill(nullptr);
    ctx.resident_host_clean[victim].fill(0);
    ctx.resident_age[victim] = 0;

    /* Drop the "current" handles if they referred to the evicted bank. */
    const bool was_current = (ctx.current_bank == victim);
    if (was_current) {
        ctx.d_state_curr.fill(nullptr);
        ctx.d_state_curr_mem = nullptr;
        ctx.current_bank = -1;
    }
    update_state_ready(ctx);
    return victim;
}
|
|
|
|
/*
 * EM variant of choose_resident_bank_for_reuse(): identical selection and
 * eviction steps, except that the evicted bank is written back with
 * BSSN_EM_STATE_COUNT states.
 *
 * An invalid bank other than avoid_bank is returned unchanged; otherwise the
 * non-avoided bank with the smallest resident_age is written back,
 * invalidated and un-keyed (bank 0 if no candidate exists).  Returns the
 * chosen bank index.
 */
static int choose_em_resident_bank_for_reuse(StepContext &ctx, int avoid_bank, size_t all)
{
    for (int cand = 0; cand < BSSN_RESIDENT_BANK_COUNT; ++cand) {
        const bool is_free = !ctx.resident_valid[cand];
        if (is_free && cand != avoid_bank)
            return cand;
    }

    int victim = -1;
    unsigned long long victim_age = 0;
    for (int cand = 0; cand < BSSN_RESIDENT_BANK_COUNT; ++cand) {
        if (cand == avoid_bank)
            continue;
        const bool takes_over =
            (victim < 0) || (ctx.resident_age[cand] < victim_age);
        if (!takes_over)
            continue;
        victim = cand;
        victim_age = ctx.resident_age[cand];
    }
    if (victim < 0)
        victim = 0;

    writeback_resident_bank_count(ctx, victim, all, BSSN_EM_STATE_COUNT);
    ctx.resident_valid[victim] = false;
    ctx.resident_host[victim].fill(nullptr);
    ctx.resident_host_clean[victim].fill(0);
    ctx.resident_age[victim] = 0;

    /* Drop the "current" handles if they referred to the evicted bank. */
    const bool was_current = (ctx.current_bank == victim);
    if (was_current) {
        ctx.d_state_curr.fill(nullptr);
        ctx.d_state_curr_mem = nullptr;
        ctx.current_bank = -1;
    }
    update_state_ready(ctx);
    return victim;
}
|
|
|
|
/*
 * Record host_key as the key of `bank`: copy its first state_count host
 * pointers into ctx.resident_host[bank], mark every host-clean flag of the
 * bank as not clean, and stamp the bank with a fresh age taken from
 * ctx.resident_clock.
 *
 * Arguments that would previously have indexed out of bounds or dereferenced
 * a null pointer are now rejected, matching the range checks performed by
 * the other resident-bank helpers:
 *   - bank outside [0, BSSN_RESIDENT_BANK_COUNT),
 *   - state_count above BSSN_RESIDENT_STATE_CAPACITY (the bound the sibling
 *     helpers apply to state counts),
 *   - a null host_key when state_count > 0.
 * In those cases the context is left untouched.  A state_count <= 0 still
 * copies nothing but marks the bank dirty and refreshes its age, as before.
 */
static void assign_resident_key_count(StepContext &ctx, int bank, double **host_key, int state_count)
{
    if (bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT)
        return;
    if (state_count > BSSN_RESIDENT_STATE_CAPACITY)
        return;
    if (state_count > 0 && host_key == nullptr)
        return;

    for (int i = 0; i < state_count; ++i) {
        ctx.resident_host[bank][i] = host_key[i];
    }
    /* A newly keyed bank has not been synchronised with the host yet. */
    set_resident_host_clean(ctx, bank, false);
    ctx.resident_age[bank] = ++ctx.resident_clock;
}
|
|
|
|
/*
 * Key `bank` with the first BSSN_STATE_COUNT host pointers of host_key via
 * assign_resident_key_count().
 */
static void assign_resident_key(StepContext &ctx, int bank, double **host_key)
{
    const int keyed_states = BSSN_STATE_COUNT;
    assign_resident_key_count(ctx, bank, host_key, keyed_states);
}
|
|
|
|
/*
 * Return the resident bank keyed by host_key (BSSN_STATE_COUNT states),
 * keying a bank for it when none is found.
 *
 * - Unusable key: returns ctx.current_bank when it is non-negative, else 0.
 * - Key already assigned to a bank: the bank's age is refreshed; if the bank
 *   is not valid and upload_if_missing is set, the host data is uploaded
 *   into it and the bank becomes valid with all host-clean flags set.
 * - Otherwise a bank comes from choose_resident_bank_for_reuse() (which may
 *   evict one and skips avoid_bank where possible), is keyed, and is then
 *   either uploaded and validated or left invalid; update_state_ready() is
 *   called afterwards.
 *
 * NOTE(review): the "already assigned" path returns without calling
 * update_state_ready(); confirm callers do not depend on ctx.state_ready
 * being refreshed there.
 */
static int ensure_resident_bank(StepContext &ctx,
                                double **host_key,
                                size_t all,
                                bool upload_if_missing,
                                int avoid_bank = -1)
{
    if (!resident_key_usable(host_key))
        return (ctx.current_bank >= 0) ? ctx.current_bank : 0;

    /* Shared upload sequence for both the hit and the miss path. */
    auto upload_and_validate = [&](int target) {
        bind_state_input_slots(ctx.d_resident[target]);
        upload_state_inputs(host_key, all);
        ctx.resident_valid[target] = true;
        set_resident_host_clean(ctx, target, true);
    };

    const int hit = find_resident_bank(ctx, host_key);
    if (hit >= 0) {
        ctx.resident_age[hit] = ++ctx.resident_clock;
        if (upload_if_missing && !ctx.resident_valid[hit])
            upload_and_validate(hit);
        return hit;
    }

    const int fresh = choose_resident_bank_for_reuse(ctx, avoid_bank, all);
    assign_resident_key(ctx, fresh, host_key);
    if (upload_if_missing) {
        upload_and_validate(fresh);
    } else {
        ctx.resident_valid[fresh] = false;
        set_resident_host_clean(ctx, fresh, false);
    }
    update_state_ready(ctx);
    return fresh;
}
|
|
|
|
/*
 * Escalar counterpart of ensure_resident_bank(): the key is compared over
 * BSSN_ESCALAR_STATE_COUNT states, the escalar bind/upload helpers are used,
 * and each upload is followed by cudaDeviceSynchronize().
 *
 * - Unusable key: returns ctx.current_bank when it is non-negative, else 0.
 * - Key already assigned: refresh the age and, if the bank is invalid and
 *   upload_if_missing is set, upload and validate it; return the bank.
 * - Otherwise take a bank from choose_escalar_resident_bank_for_reuse(),
 *   key it, upload-and-validate or leave it invalid, then call
 *   update_state_ready().
 */
static int ensure_escalar_resident_bank(StepContext &ctx,
                                        double **host_key,
                                        size_t all,
                                        bool upload_if_missing,
                                        int avoid_bank = -1)
{
    if (!resident_key_usable_count(host_key, BSSN_ESCALAR_STATE_COUNT))
        return (ctx.current_bank >= 0) ? ctx.current_bank : 0;

    auto upload_and_validate = [&](int target) {
        bind_escalar_state_input_slots(ctx.d_resident[target]);
        upload_escalar_state_inputs(host_key, all);
        CUDA_CHECK(cudaDeviceSynchronize());
        ctx.resident_valid[target] = true;
        set_resident_host_clean(ctx, target, true);
    };

    const int hit = find_resident_bank_count(ctx, host_key, BSSN_ESCALAR_STATE_COUNT);
    if (hit >= 0) {
        ctx.resident_age[hit] = ++ctx.resident_clock;
        if (upload_if_missing && !ctx.resident_valid[hit])
            upload_and_validate(hit);
        return hit;
    }

    const int fresh = choose_escalar_resident_bank_for_reuse(ctx, avoid_bank, all);
    assign_resident_key_count(ctx, fresh, host_key, BSSN_ESCALAR_STATE_COUNT);
    if (upload_if_missing) {
        upload_and_validate(fresh);
    } else {
        ctx.resident_valid[fresh] = false;
        set_resident_host_clean(ctx, fresh, false);
    }
    update_state_ready(ctx);
    return fresh;
}
|
|
|
|
/*
 * EM counterpart of ensure_resident_bank(): the key is compared over
 * BSSN_EM_STATE_COUNT states, the EM bind/upload helpers are used, and each
 * upload is followed by cudaDeviceSynchronize().
 *
 * - Unusable key: returns ctx.current_bank when it is non-negative, else 0.
 * - Key already assigned: refresh the age and, if the bank is invalid and
 *   upload_if_missing is set, upload and validate it; return the bank.
 * - Otherwise take a bank from choose_em_resident_bank_for_reuse(), key it,
 *   upload-and-validate or leave it invalid, then call update_state_ready().
 */
static int ensure_em_resident_bank(StepContext &ctx,
                                   double **host_key,
                                   size_t all,
                                   bool upload_if_missing,
                                   int avoid_bank = -1)
{
    if (!resident_key_usable_count(host_key, BSSN_EM_STATE_COUNT))
        return (ctx.current_bank >= 0) ? ctx.current_bank : 0;

    auto upload_and_validate = [&](int target) {
        bind_em_state_input_slots(ctx.d_resident[target]);
        upload_em_state_inputs(host_key, all);
        CUDA_CHECK(cudaDeviceSynchronize());
        ctx.resident_valid[target] = true;
        set_resident_host_clean(ctx, target, true);
    };

    const int hit = find_resident_bank_count(ctx, host_key, BSSN_EM_STATE_COUNT);
    if (hit >= 0) {
        ctx.resident_age[hit] = ++ctx.resident_clock;
        if (upload_if_missing && !ctx.resident_valid[hit])
            upload_and_validate(hit);
        return hit;
    }

    const int fresh = choose_em_resident_bank_for_reuse(ctx, avoid_bank, all);
    assign_resident_key_count(ctx, fresh, host_key, BSSN_EM_STATE_COUNT);
    if (upload_if_missing) {
        upload_and_validate(fresh);
    } else {
        ctx.resident_valid[fresh] = false;
        set_resident_host_clean(ctx, fresh, false);
    }
    update_state_ready(ctx);
    return fresh;
}
|
|
|
|
/*
 * Reserve a bank to receive output keyed by host_key (BSSN_STATE_COUNT
 * states).
 *
 * - Unusable key: returns ctx.current_bank when it is non-negative, else 0.
 * - input_bank already carries this key: it is returned unchanged.
 * - Otherwise the bank already keyed by host_key is reused, or one is taken
 *   from choose_resident_bank_for_reuse() with input_bank as the bank to
 *   avoid.  The bank is (re)keyed, marked invalid, given a fresh age, and
 *   update_state_ready() is called.
 */
static int reserve_resident_output_bank(StepContext &ctx,
                                        double **host_key,
                                        size_t all,
                                        int input_bank)
{
    if (!resident_key_usable(host_key)) {
        const int fallback = (ctx.current_bank >= 0) ? ctx.current_bank : 0;
        return fallback;
    }
    if (resident_key_matches(ctx, input_bank, host_key))
        return input_bank;

    const int hit = find_resident_bank(ctx, host_key);
    const int bank =
        (hit >= 0) ? hit : choose_resident_bank_for_reuse(ctx, input_bank, all);

    assign_resident_key(ctx, bank, host_key);
    /* Contents are about to be produced, so the bank is not valid yet. */
    ctx.resident_valid[bank] = false;
    ctx.resident_age[bank] = ++ctx.resident_clock;
    update_state_ready(ctx);
    return bank;
}
|
|
|
|
/*
 * Escalar counterpart of reserve_resident_output_bank(): keys are compared
 * and assigned over BSSN_ESCALAR_STATE_COUNT states and eviction goes through
 * choose_escalar_resident_bank_for_reuse().
 *
 * Returns ctx.current_bank (or 0) for an unusable key, input_bank when it
 * already carries the key, and otherwise the re-keyed bank, which is left
 * invalid with a fresh age.
 */
static int reserve_escalar_resident_output_bank(StepContext &ctx,
                                                double **host_key,
                                                size_t all,
                                                int input_bank)
{
    if (!resident_key_usable_count(host_key, BSSN_ESCALAR_STATE_COUNT)) {
        const int fallback = (ctx.current_bank >= 0) ? ctx.current_bank : 0;
        return fallback;
    }
    if (resident_key_matches_count(ctx, input_bank, host_key, BSSN_ESCALAR_STATE_COUNT))
        return input_bank;

    const int hit = find_resident_bank_count(ctx, host_key, BSSN_ESCALAR_STATE_COUNT);
    const int bank = (hit >= 0)
        ? hit
        : choose_escalar_resident_bank_for_reuse(ctx, input_bank, all);

    assign_resident_key_count(ctx, bank, host_key, BSSN_ESCALAR_STATE_COUNT);
    ctx.resident_valid[bank] = false;
    ctx.resident_age[bank] = ++ctx.resident_clock;
    update_state_ready(ctx);
    return bank;
}
|
|
|
|
/*
 * EM counterpart of reserve_resident_output_bank(): keys are compared and
 * assigned over BSSN_EM_STATE_COUNT states and eviction goes through
 * choose_em_resident_bank_for_reuse().
 *
 * Returns ctx.current_bank (or 0) for an unusable key, input_bank when it
 * already carries the key, and otherwise the re-keyed bank, which is left
 * invalid with a fresh age.
 */
static int reserve_em_resident_output_bank(StepContext &ctx,
                                           double **host_key,
                                           size_t all,
                                           int input_bank)
{
    if (!resident_key_usable_count(host_key, BSSN_EM_STATE_COUNT)) {
        const int fallback = (ctx.current_bank >= 0) ? ctx.current_bank : 0;
        return fallback;
    }
    if (resident_key_matches_count(ctx, input_bank, host_key, BSSN_EM_STATE_COUNT))
        return input_bank;

    const int hit = find_resident_bank_count(ctx, host_key, BSSN_EM_STATE_COUNT);
    const int bank = (hit >= 0)
        ? hit
        : choose_em_resident_bank_for_reuse(ctx, input_bank, all);

    assign_resident_key_count(ctx, bank, host_key, BSSN_EM_STATE_COUNT);
    ctx.resident_valid[bank] = false;
    ctx.resident_age[bank] = ++ctx.resident_clock;
    update_state_ready(ctx);
    return bank;
}
|
|
|
|
/*
 * Forward declarations for helpers that are defined further down in this
 * file but are already needed by the *_output_bank_avoiding functions below.
 */
static bool bank_is_avoided(int bank, int avoid_a, int avoid_b, int avoid_c);

static int choose_escalar_resident_bank_for_reuse_avoiding(
    StepContext &ctx, int avoid_a, int avoid_b, int avoid_c, size_t all);

static int choose_em_resident_bank_for_reuse_avoiding(
    StepContext &ctx, int avoid_a, int avoid_b, int avoid_c, size_t all);
|
|
|
|
/*
 * Reserve an output bank for an escalar key (BSSN_ESCALAR_STATE_COUNT
 * states) while steering clear of up to three banks.
 *
 * - Unusable key: returns ctx.current_bank when it is non-negative, else 0.
 * - If avoid_a, avoid_b or avoid_c (checked in that order) already carries
 *   the key, that bank is returned unchanged.
 * - Otherwise the bank already keyed by host_key is used; failing that, the
 *   first invalid non-avoided bank; failing that, the result of
 *   choose_escalar_resident_bank_for_reuse_avoiding().  The bank is
 *   (re)keyed, marked invalid, given a fresh age, and update_state_ready()
 *   is called.
 */
static int reserve_escalar_resident_output_bank_avoiding(StepContext &ctx,
                                                         double **host_key,
                                                         size_t all,
                                                         int avoid_a,
                                                         int avoid_b,
                                                         int avoid_c)
{
    if (!resident_key_usable_count(host_key, BSSN_ESCALAR_STATE_COUNT)) {
        const int fallback = (ctx.current_bank >= 0) ? ctx.current_bank : 0;
        return fallback;
    }

    const int avoided[3] = {avoid_a, avoid_b, avoid_c};
    for (int k = 0; k < 3; ++k) {
        if (resident_key_matches_count(ctx, avoided[k], host_key, BSSN_ESCALAR_STATE_COUNT))
            return avoided[k];
    }

    int bank = find_resident_bank_count(ctx, host_key, BSSN_ESCALAR_STATE_COUNT);
    /* No keyed bank: look for a free one that is not on the avoid list. */
    for (int cand = 0; bank < 0 && cand < BSSN_RESIDENT_BANK_COUNT; ++cand) {
        if (!ctx.resident_valid[cand] && !bank_is_avoided(cand, avoid_a, avoid_b, avoid_c))
            bank = cand;
    }
    if (bank < 0)
        bank = choose_escalar_resident_bank_for_reuse_avoiding(ctx, avoid_a, avoid_b, avoid_c, all);

    assign_resident_key_count(ctx, bank, host_key, BSSN_ESCALAR_STATE_COUNT);
    ctx.resident_valid[bank] = false;
    ctx.resident_age[bank] = ++ctx.resident_clock;
    update_state_ready(ctx);
    return bank;
}
|
|
|
|
/*
 * Reserve an output bank for an EM key (BSSN_EM_STATE_COUNT states) while
 * steering clear of up to three banks.
 *
 * - Unusable key: returns ctx.current_bank when it is non-negative, else 0.
 * - If avoid_a, avoid_b or avoid_c (checked in that order) already carries
 *   the key, that bank is returned unchanged.
 * - Otherwise the bank already keyed by host_key is used; failing that, the
 *   first invalid non-avoided bank; failing that, the result of
 *   choose_em_resident_bank_for_reuse_avoiding().  The bank is (re)keyed,
 *   marked invalid, given a fresh age, and update_state_ready() is called.
 */
static int reserve_em_resident_output_bank_avoiding(StepContext &ctx,
                                                    double **host_key,
                                                    size_t all,
                                                    int avoid_a,
                                                    int avoid_b,
                                                    int avoid_c)
{
    if (!resident_key_usable_count(host_key, BSSN_EM_STATE_COUNT)) {
        const int fallback = (ctx.current_bank >= 0) ? ctx.current_bank : 0;
        return fallback;
    }

    const int avoided[3] = {avoid_a, avoid_b, avoid_c};
    for (int k = 0; k < 3; ++k) {
        if (resident_key_matches_count(ctx, avoided[k], host_key, BSSN_EM_STATE_COUNT))
            return avoided[k];
    }

    int bank = find_resident_bank_count(ctx, host_key, BSSN_EM_STATE_COUNT);
    /* No keyed bank: look for a free one that is not on the avoid list. */
    for (int cand = 0; bank < 0 && cand < BSSN_RESIDENT_BANK_COUNT; ++cand) {
        if (!ctx.resident_valid[cand] && !bank_is_avoided(cand, avoid_a, avoid_b, avoid_c))
            bank = cand;
    }
    if (bank < 0)
        bank = choose_em_resident_bank_for_reuse_avoiding(ctx, avoid_a, avoid_b, avoid_c, all);

    assign_resident_key_count(ctx, bank, host_key, BSSN_EM_STATE_COUNT);
    ctx.resident_valid[bank] = false;
    ctx.resident_age[bank] = ++ctx.resident_clock;
    update_state_ready(ctx);
    return bank;
}
|
|
|
|
/* Return true when `bank` equals any of the three avoid indices. */
static bool bank_is_avoided(int bank, int avoid_a, int avoid_b, int avoid_c)
{
    if (bank == avoid_a)
        return true;
    if (bank == avoid_b)
        return true;
    return bank == avoid_c;
}
|
|
|
|
/*
 * Pick a resident bank that may be re-keyed while skipping up to three
 * banks (avoid_a, avoid_b, avoid_c).
 *
 * An invalid, non-avoided bank is returned as-is.  Otherwise the non-avoided
 * bank with the smallest resident_age (lowest index on ties) is evicted:
 * written back through writeback_resident_bank(), marked invalid, un-keyed,
 * age reset to 0, and the current-bank handles cleared if it was current.
 * When every bank is avoided the decision is delegated to
 * choose_resident_bank_for_reuse() with only avoid_a excluded.
 *
 * Returns the index of the chosen bank.
 */
static int choose_resident_bank_for_reuse_avoiding(StepContext &ctx,
                                                   int avoid_a,
                                                   int avoid_b,
                                                   int avoid_c,
                                                   size_t all)
{
    for (int cand = 0; cand < BSSN_RESIDENT_BANK_COUNT; ++cand) {
        const bool is_free = !ctx.resident_valid[cand];
        if (is_free && !bank_is_avoided(cand, avoid_a, avoid_b, avoid_c))
            return cand;
    }

    int victim = -1;
    unsigned long long victim_age = 0;
    for (int cand = 0; cand < BSSN_RESIDENT_BANK_COUNT; ++cand) {
        if (bank_is_avoided(cand, avoid_a, avoid_b, avoid_c))
            continue;
        const bool takes_over =
            (victim < 0) || (ctx.resident_age[cand] < victim_age);
        if (!takes_over)
            continue;
        victim = cand;
        victim_age = ctx.resident_age[cand];
    }
    /* Nothing outside the avoid list: relax to avoiding avoid_a only. */
    if (victim < 0)
        return choose_resident_bank_for_reuse(ctx, avoid_a, all);

    writeback_resident_bank(ctx, victim, all);
    ctx.resident_valid[victim] = false;
    ctx.resident_host[victim].fill(nullptr);
    ctx.resident_host_clean[victim].fill(0);
    ctx.resident_age[victim] = 0;

    const bool was_current = (ctx.current_bank == victim);
    if (was_current) {
        ctx.d_state_curr.fill(nullptr);
        ctx.d_state_curr_mem = nullptr;
        ctx.current_bank = -1;
    }
    update_state_ready(ctx);
    return victim;
}
|
|
|
|
/*
 * Reserve an output bank for host_key (BSSN_STATE_COUNT states) while
 * steering clear of up to three banks.
 *
 * - Unusable key: returns ctx.current_bank when it is non-negative, else 0.
 * - If avoid_a, avoid_b or avoid_c (checked in that order) already carries
 *   the key, that bank is returned unchanged.
 * - Otherwise the bank already keyed by host_key is reused, or one is taken
 *   from choose_resident_bank_for_reuse_avoiding().  The bank is (re)keyed,
 *   marked invalid, given a fresh age, and update_state_ready() is called.
 */
static int reserve_resident_output_bank_avoiding(StepContext &ctx,
                                                 double **host_key,
                                                 size_t all,
                                                 int avoid_a,
                                                 int avoid_b,
                                                 int avoid_c)
{
    if (!resident_key_usable(host_key)) {
        const int fallback = (ctx.current_bank >= 0) ? ctx.current_bank : 0;
        return fallback;
    }

    const int avoided[3] = {avoid_a, avoid_b, avoid_c};
    for (int k = 0; k < 3; ++k) {
        if (resident_key_matches(ctx, avoided[k], host_key))
            return avoided[k];
    }

    const int hit = find_resident_bank(ctx, host_key);
    const int bank = (hit >= 0)
        ? hit
        : choose_resident_bank_for_reuse_avoiding(ctx, avoid_a, avoid_b, avoid_c, all);

    assign_resident_key(ctx, bank, host_key);
    ctx.resident_valid[bank] = false;
    ctx.resident_age[bank] = ++ctx.resident_clock;
    update_state_ready(ctx);
    return bank;
}
|
|
|
|
/*
 * Escalar variant of choose_resident_bank_for_reuse_avoiding(): same
 * selection, but the evicted bank is written back with
 * BSSN_ESCALAR_STATE_COUNT states and the all-avoided fallback is
 * choose_escalar_resident_bank_for_reuse() with only avoid_a excluded.
 *
 * Returns an invalid non-avoided bank untouched when one exists; otherwise
 * evicts and returns the non-avoided bank with the smallest resident_age.
 */
static int choose_escalar_resident_bank_for_reuse_avoiding(StepContext &ctx,
                                                           int avoid_a,
                                                           int avoid_b,
                                                           int avoid_c,
                                                           size_t all)
{
    for (int cand = 0; cand < BSSN_RESIDENT_BANK_COUNT; ++cand) {
        const bool is_free = !ctx.resident_valid[cand];
        if (is_free && !bank_is_avoided(cand, avoid_a, avoid_b, avoid_c))
            return cand;
    }

    int victim = -1;
    unsigned long long victim_age = 0;
    for (int cand = 0; cand < BSSN_RESIDENT_BANK_COUNT; ++cand) {
        if (bank_is_avoided(cand, avoid_a, avoid_b, avoid_c))
            continue;
        const bool takes_over =
            (victim < 0) || (ctx.resident_age[cand] < victim_age);
        if (!takes_over)
            continue;
        victim = cand;
        victim_age = ctx.resident_age[cand];
    }
    if (victim < 0)
        return choose_escalar_resident_bank_for_reuse(ctx, avoid_a, all);

    writeback_resident_bank_count(ctx, victim, all, BSSN_ESCALAR_STATE_COUNT);
    ctx.resident_valid[victim] = false;
    ctx.resident_host[victim].fill(nullptr);
    ctx.resident_host_clean[victim].fill(0);
    ctx.resident_age[victim] = 0;

    /* Drop the "current" handles if they referred to the evicted bank. */
    const bool was_current = (ctx.current_bank == victim);
    if (was_current) {
        ctx.d_state_curr.fill(nullptr);
        ctx.d_state_curr_mem = nullptr;
        ctx.current_bank = -1;
    }
    update_state_ready(ctx);
    return victim;
}
|
|
|
|
/*
 * EM variant of choose_resident_bank_for_reuse_avoiding(): same selection,
 * but the evicted bank is written back with BSSN_EM_STATE_COUNT states and
 * the all-avoided fallback is choose_em_resident_bank_for_reuse() with only
 * avoid_a excluded.
 *
 * Returns an invalid non-avoided bank untouched when one exists; otherwise
 * evicts and returns the non-avoided bank with the smallest resident_age.
 */
static int choose_em_resident_bank_for_reuse_avoiding(StepContext &ctx,
                                                      int avoid_a,
                                                      int avoid_b,
                                                      int avoid_c,
                                                      size_t all)
{
    for (int cand = 0; cand < BSSN_RESIDENT_BANK_COUNT; ++cand) {
        const bool is_free = !ctx.resident_valid[cand];
        if (is_free && !bank_is_avoided(cand, avoid_a, avoid_b, avoid_c))
            return cand;
    }

    int victim = -1;
    unsigned long long victim_age = 0;
    for (int cand = 0; cand < BSSN_RESIDENT_BANK_COUNT; ++cand) {
        if (bank_is_avoided(cand, avoid_a, avoid_b, avoid_c))
            continue;
        const bool takes_over =
            (victim < 0) || (ctx.resident_age[cand] < victim_age);
        if (!takes_over)
            continue;
        victim = cand;
        victim_age = ctx.resident_age[cand];
    }
    if (victim < 0)
        return choose_em_resident_bank_for_reuse(ctx, avoid_a, all);

    writeback_resident_bank_count(ctx, victim, all, BSSN_EM_STATE_COUNT);
    ctx.resident_valid[victim] = false;
    ctx.resident_host[victim].fill(nullptr);
    ctx.resident_host_clean[victim].fill(0);
    ctx.resident_age[victim] = 0;

    /* Drop the "current" handles if they referred to the evicted bank. */
    const bool was_current = (ctx.current_bank == victim);
    if (was_current) {
        ctx.d_state_curr.fill(nullptr);
        ctx.d_state_curr_mem = nullptr;
        ctx.current_bank = -1;
    }
    update_state_ready(ctx);
    return victim;
}
|
|
|
|
/*
 * Like ensure_resident_bank(), but when a bank has to be chosen for a new
 * key, up to three banks (avoid_a, avoid_b, avoid_c) are skipped via
 * choose_resident_bank_for_reuse_avoiding().
 *
 * - Unusable key: returns ctx.current_bank when it is non-negative, else 0.
 * - Key already assigned: refresh the age and, if the bank is invalid and
 *   upload_if_missing is set, upload and validate it; return the bank.
 * - Otherwise choose a bank, key it, upload-and-validate or leave it
 *   invalid, then call update_state_ready().
 */
static int ensure_resident_bank_avoiding(StepContext &ctx,
                                         double **host_key,
                                         size_t all,
                                         bool upload_if_missing,
                                         int avoid_a,
                                         int avoid_b,
                                         int avoid_c)
{
    if (!resident_key_usable(host_key))
        return (ctx.current_bank >= 0) ? ctx.current_bank : 0;

    auto upload_and_validate = [&](int target) {
        bind_state_input_slots(ctx.d_resident[target]);
        upload_state_inputs(host_key, all);
        ctx.resident_valid[target] = true;
        set_resident_host_clean(ctx, target, true);
    };

    const int hit = find_resident_bank(ctx, host_key);
    if (hit >= 0) {
        ctx.resident_age[hit] = ++ctx.resident_clock;
        if (upload_if_missing && !ctx.resident_valid[hit])
            upload_and_validate(hit);
        return hit;
    }

    const int fresh =
        choose_resident_bank_for_reuse_avoiding(ctx, avoid_a, avoid_b, avoid_c, all);
    assign_resident_key(ctx, fresh, host_key);
    if (upload_if_missing) {
        upload_and_validate(fresh);
    } else {
        ctx.resident_valid[fresh] = false;
        set_resident_host_clean(ctx, fresh, false);
    }
    update_state_ready(ctx);
    return fresh;
}
|
|
|
|
/*
 * Like ensure_em_resident_bank(), but a bank chosen for a new key comes from
 * choose_em_resident_bank_for_reuse_avoiding(), which skips avoid_a, avoid_b
 * and avoid_c.  Keys span BSSN_EM_STATE_COUNT states and every upload is
 * followed by cudaDeviceSynchronize().
 *
 * - Unusable key: returns ctx.current_bank when it is non-negative, else 0.
 * - Key already assigned: refresh the age and, if the bank is invalid and
 *   upload_if_missing is set, upload and validate it; return the bank.
 * - Otherwise choose a bank, key it, upload-and-validate or leave it
 *   invalid, then call update_state_ready().
 */
static int ensure_em_resident_bank_avoiding(StepContext &ctx,
                                            double **host_key,
                                            size_t all,
                                            bool upload_if_missing,
                                            int avoid_a,
                                            int avoid_b,
                                            int avoid_c)
{
    if (!resident_key_usable_count(host_key, BSSN_EM_STATE_COUNT))
        return (ctx.current_bank >= 0) ? ctx.current_bank : 0;

    auto upload_and_validate = [&](int target) {
        bind_em_state_input_slots(ctx.d_resident[target]);
        upload_em_state_inputs(host_key, all);
        CUDA_CHECK(cudaDeviceSynchronize());
        ctx.resident_valid[target] = true;
        set_resident_host_clean(ctx, target, true);
    };

    const int hit = find_resident_bank_count(ctx, host_key, BSSN_EM_STATE_COUNT);
    if (hit >= 0) {
        ctx.resident_age[hit] = ++ctx.resident_clock;
        if (upload_if_missing && !ctx.resident_valid[hit])
            upload_and_validate(hit);
        return hit;
    }

    const int fresh =
        choose_em_resident_bank_for_reuse_avoiding(ctx, avoid_a, avoid_b, avoid_c, all);
    assign_resident_key_count(ctx, fresh, host_key, BSSN_EM_STATE_COUNT);
    if (upload_if_missing) {
        upload_and_validate(fresh);
    } else {
        ctx.resident_valid[fresh] = false;
        set_resident_host_clean(ctx, fresh, false);
    }
    update_state_ready(ctx);
    return fresh;
}
|
|
|
|
/*
 * Like ensure_escalar_resident_bank(), but a bank chosen for a new key comes
 * from choose_escalar_resident_bank_for_reuse_avoiding(), which skips
 * avoid_a, avoid_b and avoid_c.  Keys span BSSN_ESCALAR_STATE_COUNT states
 * and every upload is followed by cudaDeviceSynchronize().
 *
 * - Unusable key: returns ctx.current_bank when it is non-negative, else 0.
 * - Key already assigned: refresh the age and, if the bank is invalid and
 *   upload_if_missing is set, upload and validate it; return the bank.
 * - Otherwise choose a bank, key it, upload-and-validate or leave it
 *   invalid, then call update_state_ready().
 */
static int ensure_escalar_resident_bank_avoiding(StepContext &ctx,
                                                 double **host_key,
                                                 size_t all,
                                                 bool upload_if_missing,
                                                 int avoid_a,
                                                 int avoid_b,
                                                 int avoid_c)
{
    if (!resident_key_usable_count(host_key, BSSN_ESCALAR_STATE_COUNT))
        return (ctx.current_bank >= 0) ? ctx.current_bank : 0;

    auto upload_and_validate = [&](int target) {
        bind_escalar_state_input_slots(ctx.d_resident[target]);
        upload_escalar_state_inputs(host_key, all);
        CUDA_CHECK(cudaDeviceSynchronize());
        ctx.resident_valid[target] = true;
        set_resident_host_clean(ctx, target, true);
    };

    const int hit = find_resident_bank_count(ctx, host_key, BSSN_ESCALAR_STATE_COUNT);
    if (hit >= 0) {
        ctx.resident_age[hit] = ++ctx.resident_clock;
        if (upload_if_missing && !ctx.resident_valid[hit])
            upload_and_validate(hit);
        return hit;
    }

    const int fresh =
        choose_escalar_resident_bank_for_reuse_avoiding(ctx, avoid_a, avoid_b, avoid_c, all);
    assign_resident_key_count(ctx, fresh, host_key, BSSN_ESCALAR_STATE_COUNT);
    if (upload_if_missing) {
        upload_and_validate(fresh);
    } else {
        ctx.resident_valid[fresh] = false;
        set_resident_host_clean(ctx, fresh, false);
    }
    update_state_ready(ctx);
    return fresh;
}
|
|
|
|
/*
 * Resolve the bank to operate on and make it the current bank where a key
 * lookup succeeds.
 *
 * state_count selects the key family: BSSN_ESCALAR_STATE_COUNT,
 * BSSN_EM_STATE_COUNT or BSSN_STATE_COUNT (tested in that order).  When the
 * matching family's key is usable, the corresponding ensure_*_resident_bank()
 * is called, its result is passed to mark_resident_current_bank(), and that
 * bank is returned.
 *
 * Without a usable key the existing ctx.current_bank is returned if it is
 * non-negative; otherwise the first valid bank is made current and returned;
 * otherwise 0 is returned without touching the context.
 */
static int active_or_keyed_bank(StepContext &ctx,
                                double **host_key,
                                size_t all,
                                bool upload_if_missing,
                                int state_count = BSSN_STATE_COUNT)
{
    /* Make the resolved bank current and hand its index back. */
    auto adopt = [&ctx](int bank) {
        mark_resident_current_bank(ctx, bank);
        return bank;
    };

    if (state_count == BSSN_ESCALAR_STATE_COUNT &&
        resident_key_usable_count(host_key, BSSN_ESCALAR_STATE_COUNT))
        return adopt(ensure_escalar_resident_bank(ctx, host_key, all, upload_if_missing));

    if (state_count == BSSN_EM_STATE_COUNT &&
        resident_key_usable_count(host_key, BSSN_EM_STATE_COUNT))
        return adopt(ensure_em_resident_bank(ctx, host_key, all, upload_if_missing));

    if (state_count == BSSN_STATE_COUNT && resident_key_usable(host_key))
        return adopt(ensure_resident_bank(ctx, host_key, all, upload_if_missing));

    if (ctx.current_bank >= 0)
        return ctx.current_bank;

    for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
        if (!ctx.resident_valid[b])
            continue;
        return adopt(b);
    }
    return 0;
}
|
|
|
|
static void launch_rhs_pipeline(int all, double eps, int co, bool compute_escalar = false,
|
|
double escalar_a2 = 3.0)
|
|
{
|
|
const double SYM = 1.0;
|
|
const double ANTI = -1.0;
|
|
const bool stage_timing = rhs_stage_timing_enabled();
|
|
double stage_ms[RHS_STAGE_COUNT] = {};
|
|
double stage_t0 = stage_timing ? cuda_profile_now_ms() : 0.0;
|
|
|
|
#define D(s) g_buf.slot[s]
|
|
#define MARK_RHS_STAGE(stage_id) do { \
|
|
if (stage_timing) { \
|
|
cuda_profile_sync(); \
|
|
const double stage_t1 = cuda_profile_now_ms(); \
|
|
stage_ms[(stage_id)] += stage_t1 - stage_t0; \
|
|
stage_t0 = stage_t1; \
|
|
} \
|
|
} while (0)
|
|
|
|
kern_phase1_prep<<<grid(all),BLK>>>(
|
|
D(S_Lap), D(S_chi), D(S_dxx), D(S_dyy), D(S_dzz),
|
|
D(S_alpn1), D(S_chin1), D(S_gxx), D(S_gyy), D(S_gzz));
|
|
MARK_RHS_STAGE(RHS_STAGE_PREP);
|
|
|
|
{
|
|
double *src_fields[] = {
|
|
D(S_betax), D(S_betay), D(S_betaz), D(S_chi),
|
|
D(S_dxx), D(S_gxy), D(S_gxz), D(S_dyy),
|
|
D(S_gyz), D(S_dzz), D(S_Lap), D(S_trK)
|
|
};
|
|
double *fx_fields[] = {
|
|
D(S_betaxx), D(S_betayx), D(S_betazx), D(S_chix),
|
|
D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx),
|
|
D(S_gyzx), D(S_gzzx), D(S_Lapx), D(S_Kx)
|
|
};
|
|
double *fy_fields[] = {
|
|
D(S_betaxy), D(S_betayy), D(S_betazy), D(S_chiy),
|
|
D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy),
|
|
D(S_gyzy), D(S_gzzy), D(S_Lapy), D(S_Ky)
|
|
};
|
|
double *fz_fields[] = {
|
|
D(S_betaxz), D(S_betayz), D(S_betazz), D(S_chiz),
|
|
D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz),
|
|
D(S_gyzz), D(S_gzzz), D(S_Lapz), D(S_Kz)
|
|
};
|
|
const int soa_signs[] = {
|
|
(int)ANTI, (int)SYM, (int)SYM,
|
|
(int)SYM, (int)ANTI, (int)SYM,
|
|
(int)SYM, (int)SYM, (int)ANTI,
|
|
(int)SYM, (int)SYM, (int)SYM,
|
|
(int)SYM, (int)SYM, (int)SYM,
|
|
(int)ANTI, (int)ANTI, (int)SYM,
|
|
(int)ANTI, (int)SYM, (int)ANTI,
|
|
(int)SYM, (int)SYM, (int)SYM,
|
|
(int)SYM, (int)ANTI, (int)ANTI,
|
|
(int)SYM, (int)SYM, (int)SYM,
|
|
(int)SYM, (int)SYM, (int)SYM,
|
|
(int)SYM, (int)SYM, (int)SYM
|
|
};
|
|
gpu_fderivs_batch((int)(sizeof(src_fields) / sizeof(src_fields[0])),
|
|
src_fields, fx_fields, fy_fields, fz_fields,
|
|
soa_signs, all);
|
|
}
|
|
MARK_RHS_STAGE(RHS_STAGE_DERIV1);
|
|
|
|
kern_phase2_metric_rhs<<<grid(all),BLK>>>(
|
|
D(S_alpn1), D(S_chin1),
|
|
D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
|
|
D(S_trK),
|
|
D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz),
|
|
D(S_betaxx), D(S_betaxy), D(S_betaxz),
|
|
D(S_betayx), D(S_betayy), D(S_betayz),
|
|
D(S_betazx), D(S_betazy), D(S_betazz),
|
|
D(S_chi_rhs), D(S_gxx_rhs), D(S_gyy_rhs), D(S_gzz_rhs),
|
|
D(S_gxy_rhs), D(S_gyz_rhs), D(S_gxz_rhs));
|
|
|
|
kern_phase2_inverse<<<grid(all),BLK>>>(
|
|
D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
|
|
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
|
D(S_gupyy), D(S_gupyz), D(S_gupzz));
|
|
|
|
if (compute_escalar) {
|
|
gpu_escalar_sources(all, escalar_a2);
|
|
gpu_fderivs(D(S_trK), D(S_trK_x), D(S_trK_y), D(S_trK_z), SYM, SYM, SYM, all);
|
|
}
|
|
|
|
if (co == 0) {
|
|
kern_phase3_gamma_constraint<<<grid(all),BLK>>>(
|
|
D(S_Gamx), D(S_Gamy), D(S_Gamz),
|
|
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
|
D(S_gupyy), D(S_gupyz), D(S_gupzz),
|
|
D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), D(S_gyzx), D(S_gzzx),
|
|
D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), D(S_gyzy), D(S_gzzy),
|
|
D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), D(S_gyzz), D(S_gzzz),
|
|
D(S_Gmx_Res), D(S_Gmy_Res), D(S_Gmz_Res));
|
|
}
|
|
|
|
kern_phase4_christoffel<<<grid(all),BLK>>>(
|
|
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
|
D(S_gupyy), D(S_gupyz), D(S_gupzz),
|
|
D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), D(S_gyzx), D(S_gzzx),
|
|
D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), D(S_gyzy), D(S_gzzy),
|
|
D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), D(S_gyzz), D(S_gzzz),
|
|
D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
|
|
D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
|
|
D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
|
|
D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
|
|
D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
|
|
D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz));
|
|
|
|
kern_phase5_6_gamma_rhs_part1_fused<<<grid(all),BLK>>>(
|
|
D(S_Lapx), D(S_Lapy), D(S_Lapz),
|
|
D(S_alpn1), D(S_chin1),
|
|
D(S_chix), D(S_chiy), D(S_chiz),
|
|
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
|
D(S_gupyy), D(S_gupyz), D(S_gupzz),
|
|
D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz),
|
|
compute_escalar ? D(S_trK_x) : D(S_Kx),
|
|
compute_escalar ? D(S_trK_y) : D(S_Ky),
|
|
compute_escalar ? D(S_trK_z) : D(S_Kz),
|
|
D(S_Sx), D(S_Sy), D(S_Sz),
|
|
D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
|
|
D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
|
|
D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
|
|
D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
|
|
D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
|
|
D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
|
|
D(S_Gamx_rhs), D(S_Gamy_rhs), D(S_Gamz_rhs));
|
|
MARK_RHS_STAGE(RHS_STAGE_METRIC);
|
|
|
|
{
|
|
double *src_fields[] = {D(S_betax), D(S_betay), D(S_betaz)};
|
|
double *fxx_fields[] = {D(S_gxxx), D(S_gxxy), D(S_gxxz)};
|
|
double *fxy_fields[] = {D(S_gxyx), D(S_gxyy), D(S_gxyz)};
|
|
double *fxz_fields[] = {D(S_gxzx), D(S_gxzy), D(S_gxzz)};
|
|
double *fyy_fields[] = {D(S_gyyx), D(S_gyyy), D(S_gyyz)};
|
|
double *fyz_fields[] = {D(S_gyzx), D(S_gyzy), D(S_gyzz)};
|
|
double *fzz_fields[] = {D(S_gzzx), D(S_gzzy), D(S_gzzz)};
|
|
const int soa_signs[] = {
|
|
(int)ANTI, (int)SYM, (int)SYM,
|
|
(int)SYM, (int)ANTI, (int)SYM,
|
|
(int)SYM, (int)SYM, (int)ANTI
|
|
};
|
|
gpu_fdderivs_batch((int)(sizeof(src_fields) / sizeof(src_fields[0])),
|
|
src_fields, fxx_fields, fxy_fields, fxz_fields,
|
|
fyy_fields, fyz_fields, fzz_fields,
|
|
soa_signs, all);
|
|
}
|
|
{
|
|
double *src_fields[] = {D(S_Gamx), D(S_Gamy), D(S_Gamz)};
|
|
double *fx_fields[] = {D(S_Gamxx), D(S_Gamyx), D(S_Gamzx)};
|
|
double *fy_fields[] = {D(S_Gamxy), D(S_Gamyy_t), D(S_Gamzy)};
|
|
double *fz_fields[] = {D(S_Gamxz), D(S_Gamyz_t), D(S_Gamzz_t)};
|
|
const int soa_signs[] = {
|
|
(int)ANTI, (int)SYM, (int)SYM,
|
|
(int)SYM, (int)ANTI, (int)SYM,
|
|
(int)SYM, (int)SYM, (int)ANTI
|
|
};
|
|
gpu_fderivs_batch((int)(sizeof(src_fields) / sizeof(src_fields[0])),
|
|
src_fields, fx_fields, fy_fields, fz_fields,
|
|
soa_signs, all);
|
|
}
|
|
MARK_RHS_STAGE(RHS_STAGE_GAUGE_DERIV);
|
|
|
|
kern_phase8_9_gamma_rhs_contract_fused<<<grid(all),BLK>>>(
|
|
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
|
D(S_gupyy), D(S_gupyz), D(S_gupzz),
|
|
D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx),
|
|
D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy),
|
|
D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz),
|
|
D(S_Gamxxx),D(S_Gamxxy),D(S_Gamxxz),
|
|
D(S_Gamxyy),D(S_Gamxyz),D(S_Gamxzz),
|
|
D(S_Gamyxx),D(S_Gamyxy),D(S_Gamyxz),
|
|
D(S_Gamyyy),D(S_Gamyyz),D(S_Gamyzz),
|
|
D(S_Gamzxx),D(S_Gamzxy),D(S_Gamzxz),
|
|
D(S_Gamzyy),D(S_Gamzyz),D(S_Gamzzz),
|
|
D(S_betaxx),D(S_betaxy),D(S_betaxz),
|
|
D(S_betayx),D(S_betayy),D(S_betayz),
|
|
D(S_betazx),D(S_betazy),D(S_betazz),
|
|
D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz),
|
|
D(S_Gamx_rhs),D(S_Gamy_rhs),D(S_Gamz_rhs),
|
|
D(S_Gamxa),D(S_Gamya),D(S_Gamza),
|
|
D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx),
|
|
D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy),
|
|
D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz));
|
|
MARK_RHS_STAGE(RHS_STAGE_GAMMA_CONTRACT);
|
|
|
|
{
|
|
double *src_fields[] = {D(S_dxx), D(S_dyy), D(S_dzz), D(S_gxy), D(S_gxz), D(S_gyz)};
|
|
double *dst_fields[] = {D(S_Rxx), D(S_Ryy), D(S_Rzz), D(S_Rxy), D(S_Rxz), D(S_Ryz)};
|
|
const int soa_signs[] = {
|
|
(int)SYM, (int)SYM, (int)SYM,
|
|
(int)SYM, (int)SYM, (int)SYM,
|
|
(int)SYM, (int)SYM, (int)SYM,
|
|
(int)ANTI, (int)ANTI, (int)SYM,
|
|
(int)ANTI, (int)SYM, (int)ANTI,
|
|
(int)SYM, (int)ANTI, (int)ANTI
|
|
};
|
|
gpu_phase10_ricci_batch(D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
|
D(S_gupyy), D(S_gupyz), D(S_gupzz),
|
|
src_fields, dst_fields, soa_signs, all);
|
|
}
|
|
MARK_RHS_STAGE(RHS_STAGE_RICCI_DIFF);
|
|
|
|
kern_phase11_ricci_fused<<<grid(all),BLK>>>(
|
|
D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz),
|
|
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
|
D(S_Gamxa),D(S_Gamya),D(S_Gamza),
|
|
D(S_Gamxx),D(S_Gamxy),D(S_Gamxz),
|
|
D(S_Gamyx),D(S_Gamyy_t),D(S_Gamyz_t),
|
|
D(S_Gamzx),D(S_Gamzy),D(S_Gamzz_t),
|
|
D(S_Gamxxx),D(S_Gamxxy),D(S_Gamxxz),
|
|
D(S_Gamxyy),D(S_Gamxyz),D(S_Gamxzz),
|
|
D(S_Gamyxx),D(S_Gamyxy),D(S_Gamyxz),
|
|
D(S_Gamyyy),D(S_Gamyyz),D(S_Gamyzz),
|
|
D(S_Gamzxx),D(S_Gamzxy),D(S_Gamzxz),
|
|
D(S_Gamzyy),D(S_Gamzyz),D(S_Gamzzz),
|
|
D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx),
|
|
D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy),
|
|
D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz),
|
|
D(S_Rxx),D(S_Rxy),D(S_Rxz),
|
|
D(S_Ryy),D(S_Ryz),D(S_Rzz));
|
|
MARK_RHS_STAGE(RHS_STAGE_RICCI_FUSED);
|
|
|
|
kern_phase12_13_chi_correction_fused<<<grid((size_t)all),BLK>>>(
|
|
D(S_chi), D(S_chin1),
|
|
D(S_chix), D(S_chiy), D(S_chiz),
|
|
D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
|
|
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
|
D(S_gupyy), D(S_gupyz), D(S_gupzz),
|
|
D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
|
|
D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
|
|
D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
|
|
D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
|
|
D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
|
|
D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
|
|
D(S_Rxx), D(S_Rxy), D(S_Rxz),
|
|
D(S_Ryy), D(S_Ryz), D(S_Rzz));
|
|
MARK_RHS_STAGE(RHS_STAGE_CHI);
|
|
|
|
kern_phase15_trK_Aij_gauge<<<grid(all),BLK>>>(
|
|
D(S_alpn1), D(S_chin1),
|
|
D(S_chix), D(S_chiy), D(S_chiz),
|
|
D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
|
|
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
|
D(S_gupyy), D(S_gupyz), D(S_gupzz),
|
|
D(S_trK),
|
|
D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz),
|
|
D(S_Lapx), D(S_Lapy), D(S_Lapz),
|
|
D(S_betaxx), D(S_betaxy), D(S_betaxz),
|
|
D(S_betayx), D(S_betayy), D(S_betayz),
|
|
D(S_betazx), D(S_betazy), D(S_betazz),
|
|
D(S_rho),
|
|
D(S_Sx), D(S_Sy), D(S_Sz),
|
|
D(S_Sxx), D(S_Sxy), D(S_Sxz), D(S_Syy), D(S_Syz), D(S_Szz),
|
|
D(S_dtSfx), D(S_dtSfy), D(S_dtSfz),
|
|
D(S_Rxx), D(S_Rxy), D(S_Rxz), D(S_Ryy), D(S_Ryz), D(S_Rzz),
|
|
D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
|
|
D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
|
|
D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
|
|
D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
|
|
D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
|
|
D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
|
|
D(S_dtSfx_rhs), D(S_dtSfy_rhs), D(S_dtSfz_rhs),
|
|
D(S_trK_rhs),
|
|
D(S_Axx_rhs), D(S_Axy_rhs), D(S_Axz_rhs),
|
|
D(S_Ayy_rhs), D(S_Ayz_rhs), D(S_Azz_rhs),
|
|
D(S_Lap_rhs),
|
|
D(S_betax_rhs), D(S_betay_rhs), D(S_betaz_rhs),
|
|
D(S_Gamx_rhs), D(S_Gamy_rhs), D(S_Gamz_rhs),
|
|
D(S_f_arr), D(S_S_arr));
|
|
MARK_RHS_STAGE(RHS_STAGE_GAUGE_RHS);
|
|
|
|
gpu_lopsided_kodis_state_batch(eps, all, compute_escalar);
|
|
MARK_RHS_STAGE(RHS_STAGE_KODIS);
|
|
|
|
if (co == 0) {
|
|
{
|
|
double *src_fields[] = {D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz)};
|
|
double *fx_fields[] = {D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), D(S_gyzx), D(S_gzzx)};
|
|
double *fy_fields[] = {D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), D(S_gyzy), D(S_gzzy)};
|
|
double *fz_fields[] = {D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), D(S_gyzz), D(S_gzzz)};
|
|
const int soa_signs[] = {
|
|
(int)SYM, (int)SYM, (int)SYM,
|
|
(int)ANTI, (int)ANTI, (int)SYM,
|
|
(int)ANTI, (int)SYM, (int)ANTI,
|
|
(int)SYM, (int)SYM, (int)SYM,
|
|
(int)SYM, (int)ANTI, (int)ANTI,
|
|
(int)SYM, (int)SYM, (int)SYM
|
|
};
|
|
gpu_fderivs_batch((int)(sizeof(src_fields) / sizeof(src_fields[0])),
|
|
src_fields, fx_fields, fy_fields, fz_fields,
|
|
soa_signs, all);
|
|
}
|
|
|
|
kern_phase18_constraints<<<grid(all),BLK>>>(
|
|
D(S_chin1),
|
|
D(S_chix), D(S_chiy), D(S_chiz),
|
|
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
|
D(S_gupyy), D(S_gupyz), D(S_gupzz),
|
|
D(S_trK),
|
|
D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz),
|
|
D(S_Rxx), D(S_Rxy), D(S_Rxz), D(S_Ryy), D(S_Ryz), D(S_Rzz),
|
|
D(S_rho), D(S_Sx), D(S_Sy), D(S_Sz),
|
|
compute_escalar ? D(S_trK_x) : D(S_Kx),
|
|
compute_escalar ? D(S_trK_y) : D(S_Ky),
|
|
compute_escalar ? D(S_trK_z) : D(S_Kz),
|
|
D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
|
|
D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
|
|
D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
|
|
D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
|
|
D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
|
|
D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
|
|
D(S_gxxx), D(S_gxxy), D(S_gxxz),
|
|
D(S_gxyx), D(S_gxyy), D(S_gxyz),
|
|
D(S_gxzx), D(S_gxzy), D(S_gxzz),
|
|
D(S_gyyx), D(S_gyyy), D(S_gyyz),
|
|
D(S_gyzx), D(S_gyzy), D(S_gyzz),
|
|
D(S_gzzx), D(S_gzzy), D(S_gzzz),
|
|
D(S_ham_Res), D(S_movx_Res), D(S_movy_Res), D(S_movz_Res));
|
|
}
|
|
MARK_RHS_STAGE(RHS_STAGE_CONSTRAINTS);
|
|
|
|
rhs_stage_profile_accumulate(stage_ms);
|
|
#undef MARK_RHS_STAGE
|
|
#undef D
|
|
}
|
|
|
|
/*
 * Copy the BSSN RHS outputs from the device back to the caller's host arrays.
 *
 * A single blocking device-to-host transfer moves BSSN_STATE_COUNT * all
 * doubles, starting at slot S_chi_rhs, into g_buf.h_stage; every field is then
 * copied out of the staging buffer into state_host_out[field].
 *
 *   state_host_out  BSSN_STATE_COUNT host pointers, each receiving `all` doubles
 *   all             number of grid points per field
 */
static void download_state_outputs(double **state_host_out, size_t all)
{
    const size_t field_bytes = sizeof(double) * all;
    const size_t total_bytes = (size_t)BSSN_STATE_COUNT * field_bytes;

    CUDA_CHECK(cudaMemcpy(g_buf.h_stage, g_buf.slot[S_chi_rhs],
                          total_bytes, cudaMemcpyDeviceToHost));

    int field = 0;
    while (field < BSSN_STATE_COUNT) {
        std::memcpy(state_host_out[field],
                    g_buf.h_stage + (size_t)field * all,
                    field_bytes);
        ++field;
    }
}
|
|
|
|
/*
 * Copy the BSSN-EScalar RHS outputs from the device to the host.
 *
 * One asynchronous device-to-host copy is queued per field (the device slot
 * is looked up through k_escalar_state_rhs_slots), then the whole device is
 * synchronised so the host arrays are complete on return.
 *
 *   state_host_out  BSSN_ESCALAR_STATE_COUNT host pointers, `all` doubles each
 *   all             number of grid points per field
 */
static void download_escalar_state_outputs(double **state_host_out, size_t all)
{
    const size_t field_bytes = sizeof(double) * all;

    int field = 0;
    while (field < BSSN_ESCALAR_STATE_COUNT) {
        CUDA_CHECK(cudaMemcpyAsync(state_host_out[field],
                                   g_buf.slot[k_escalar_state_rhs_slots[field]],
                                   field_bytes,
                                   cudaMemcpyDeviceToHost));
        ++field;
    }

    /* Wait for every queued copy before the caller touches the host data. */
    CUDA_CHECK(cudaDeviceSynchronize());
}
|
|
|
|
/*
 * Copy the BSSN-EM RHS outputs from the device to the host.
 *
 * Queues one asynchronous device-to-host copy per field (device slot taken
 * from k_em_state_rhs_slots) and then synchronises the device, so all host
 * arrays are filled when the function returns.
 *
 *   state_host_out  BSSN_EM_STATE_COUNT host pointers, `all` doubles each
 *   all             number of grid points per field
 */
static void download_em_state_outputs(double **state_host_out, size_t all)
{
    const size_t field_bytes = sizeof(double) * all;

    int field = 0;
    while (field < BSSN_EM_STATE_COUNT) {
        CUDA_CHECK(cudaMemcpyAsync(state_host_out[field],
                                   g_buf.slot[k_em_state_rhs_slots[field]],
                                   field_bytes,
                                   cudaMemcpyDeviceToHost));
        ++field;
    }

    /* Wait for every queued copy before the caller touches the host data. */
    CUDA_CHECK(cudaDeviceSynchronize());
}
|
|
|
|
/*
 * Queue host-to-device copies of the BSSN-EScalar input state.
 *
 * Field `i` of state_host is copied into the device slot named by
 * k_escalar_state_input_slots[i].  The copies are issued with
 * cudaMemcpyAsync and this function does not synchronise afterwards.
 *
 *   state_host  BSSN_ESCALAR_STATE_COUNT host pointers, `all` doubles each
 *   all         number of grid points per field
 */
static void upload_escalar_state_inputs(double **state_host, size_t all)
{
    const size_t field_bytes = sizeof(double) * all;

    int field = 0;
    while (field < BSSN_ESCALAR_STATE_COUNT) {
        CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[k_escalar_state_input_slots[field]],
                                   state_host[field],
                                   field_bytes,
                                   cudaMemcpyHostToDevice));
        ++field;
    }
}
|
|
|
|
/*
 * Queue host-to-device copies of the BSSN-EM input state.
 *
 * Field `i` of state_host is copied into the device slot named by
 * k_em_state_input_slots[i].  The copies are issued with cudaMemcpyAsync and
 * this function does not synchronise afterwards.
 *
 *   state_host  BSSN_EM_STATE_COUNT host pointers, `all` doubles each
 *   all         number of grid points per field
 */
static void upload_em_state_inputs(double **state_host, size_t all)
{
    const size_t field_bytes = sizeof(double) * all;

    int field = 0;
    while (field < BSSN_EM_STATE_COUNT) {
        CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[k_em_state_input_slots[field]],
                                   state_host[field],
                                   field_bytes,
                                   cudaMemcpyHostToDevice));
        ++field;
    }
}
|
|
|
|
/*
 * Report whether caching of the fixed EM source uploads is enabled.
 *
 * The environment variable AMSS_CUDA_EM_CACHE_SOURCES is read once, on the
 * first call; the answer is remembered in a function-local static.  An unset
 * variable means "enabled"; otherwise the value is parsed with atoi() and any
 * nonzero result enables the cache.
 */
static bool em_source_cache_enabled()
{
    static int cached = -1; /* -1: not read yet, 0: disabled, 1: enabled */

    if (cached >= 0)
        return cached == 1;

    const char *text = getenv("AMSS_CUDA_EM_CACHE_SOURCES");
    if (!text)
        cached = 1;
    else
        cached = (atoi(text) == 0) ? 0 : 1;

    return cached == 1;
}
|
|
|
|
/*
 * Point the EM source slots of g_buf at this context's source buffers.
 *
 * Slots S_EM_Jx, S_EM_Jy, S_EM_Jz and S_EM_qchar are set, in that order, to
 * ctx.d_em_source[0..BSSN_EM_SOURCE_COUNT-1].  Only pointers are rebound; no
 * data is copied.
 */
static void bind_em_fixed_source_slots(StepContext &ctx)
{
    const int source_slot[BSSN_EM_SOURCE_COUNT] = {
        S_EM_Jx, S_EM_Jy, S_EM_Jz, S_EM_qchar
    };

    int src = 0;
    while (src < BSSN_EM_SOURCE_COUNT) {
        g_buf.slot[source_slot[src]] = ctx.d_em_source[src];
        ++src;
    }
}
|
|
|
|
/*
 * Decide whether the cached EM sources in ctx belong to `source_host`.
 *
 * Returns true only when the context has a ready cache, `source_host` is
 * non-null, and every entry of source_host is non-null and equal to the host
 * pointer remembered in ctx.em_source_host.  Only the pointers are compared;
 * the data they point to is not inspected.
 */
static bool em_fixed_source_cache_matches(const StepContext &ctx,
                                          double **source_host)
{
    if (!source_host)
        return false;
    if (!ctx.em_source_ready)
        return false;

    bool same = true;
    for (int src = 0; same && src < BSSN_EM_SOURCE_COUNT; ++src) {
        same = (source_host[src] != nullptr) &&
               (ctx.em_source_host[src] == source_host[src]);
    }
    return same;
}
|
|
|
|
/*
 * Make the fixed EM source fields available on the device.
 *
 * The EM source slots are always rebound to ctx.d_em_source first.  When the
 * source cache is enabled and the cached host pointers match `source_host`,
 * nothing is uploaded.  Otherwise each source array is queued for an
 * asynchronous host-to-device copy, and the cache bookkeeping in ctx is
 * updated: the host pointers are recorded when caching is enabled, or the
 * record is cleared when it is not.
 *
 *   ctx          per-block step context owning the device source buffers
 *   source_host  BSSN_EM_SOURCE_COUNT host pointers, `all` doubles each
 *   all          number of grid points per field
 */
static void upload_em_fixed_sources(StepContext &ctx,
                                    double **source_host,
                                    size_t all)
{
    const size_t field_bytes = sizeof(double) * all;
    const bool caching = em_source_cache_enabled();

    bind_em_fixed_source_slots(ctx);

    if (caching && em_fixed_source_cache_matches(ctx, source_host))
        return;

    for (int src = 0; src < BSSN_EM_SOURCE_COUNT; ++src) {
        CUDA_CHECK(cudaMemcpyAsync(ctx.d_em_source[src],
                                   source_host[src],
                                   field_bytes,
                                   cudaMemcpyHostToDevice));
    }

    if (!caching) {
        ctx.em_source_host.fill(nullptr);
        ctx.em_source_ready = false;
        return;
    }

    /* Remember which host arrays the device copies were taken from. */
    for (int src = 0; src < BSSN_EM_SOURCE_COUNT; ++src)
        ctx.em_source_host[src] = source_host[src];
    ctx.em_source_ready = true;
}
|
|
|
|
/*
 * Clear *flag if any of the twelve input fields is nonzero anywhere.
 *
 * Each thread walks the index range [0, all) with a stride of
 * blockDim.x * gridDim.x.  For every visited index the fields are tested in
 * the order Kpsi, Kphi, Ex, Ey, Ez, Bx, By, Bz, Jx, Jy, Jz, qchar; on the
 * first nonzero value the thread writes 0 to *flag with atomicExch.  The
 * kernel never writes a nonzero value, so the caller must preset *flag.
 */
__global__ void kern_check_em_zero_fields(int *flag,
                                          const double * __restrict__ Kpsi,
                                          const double * __restrict__ Kphi,
                                          const double * __restrict__ Ex,
                                          const double * __restrict__ Ey,
                                          const double * __restrict__ Ez,
                                          const double * __restrict__ Bx,
                                          const double * __restrict__ By,
                                          const double * __restrict__ Bz,
                                          const double * __restrict__ Jx,
                                          const double * __restrict__ Jy,
                                          const double * __restrict__ Jz,
                                          const double * __restrict__ qchar,
                                          int all)
{
    int cell = blockIdx.x * blockDim.x + threadIdx.x;
    while (cell < all) {
        /* Short-circuit chain: later fields are only read while all earlier ones are zero. */
        bool dirty = (Kpsi[cell] != 0.0) || (Kphi[cell] != 0.0);
        dirty = dirty || (Ex[cell] != 0.0) || (Ey[cell] != 0.0) || (Ez[cell] != 0.0);
        dirty = dirty || (Bx[cell] != 0.0) || (By[cell] != 0.0) || (Bz[cell] != 0.0);
        dirty = dirty || (Jx[cell] != 0.0) || (Jy[cell] != 0.0) || (Jz[cell] != 0.0);
        dirty = dirty || (qchar[cell] != 0.0);

        if (dirty)
            atomicExch(flag, 0);

        cell += blockDim.x * gridDim.x;
    }
}
|
|
|
|
/*
 * Report whether the all-zero EM fast path may be used.
 *
 * The environment variable AMSS_CUDA_EM_ZERO_FASTPATH is read once, on the
 * first call; the answer is remembered in a function-local static.  An unset
 * variable means "enabled"; otherwise the value is parsed with atoi() and any
 * nonzero result enables the fast path.
 */
static bool em_zero_fast_path_enabled()
{
    static int cached = -1; /* -1: not read yet, 0: disabled, 1: enabled */

    if (cached >= 0)
        return cached == 1;

    const char *text = getenv("AMSS_CUDA_EM_ZERO_FASTPATH");
    if (!text)
        cached = 1;
    else
        cached = (atoi(text) == 0) ? 0 : 1;

    return cached == 1;
}
|
|
|
|
/*
 * Determine (once per context) whether the EM fields and sources are all zero.
 *
 * Returns false immediately when the fast path is disabled.  If the context
 * already holds an answer (ctx.em_zero_fast_known) that answer is returned
 * without touching the device.  An out-of-range input_bank yields false and
 * leaves the context untouched.
 *
 * Otherwise the device flag g_em_zero_flag (allocated on first use) is preset
 * to 1, kern_check_em_zero_fields is run over resident fields 24..31 of the
 * input bank and the four EM source buffers, and the flag is read back with a
 * blocking copy.  The result is stored in ctx and returned.
 */
static bool em_detect_zero_fast_path(StepContext &ctx,
                                     int input_bank,
                                     size_t all)
{
    if (!em_zero_fast_path_enabled())
        return false;
    if (ctx.em_zero_fast_known)
        return ctx.em_zero_fast;

    const bool bank_in_range = (input_bank >= 0) && (input_bank < BSSN_RESIDENT_BANK_COUNT);
    if (!bank_in_range)
        return false;

    if (!g_em_zero_flag) {
        CUDA_CHECK(cudaMalloc(&g_em_zero_flag, sizeof(int)));
    }

    /* Start from "all zero"; the kernel can only clear the flag. */
    int host_flag = 1;
    CUDA_CHECK(cudaMemcpy(g_em_zero_flag, &host_flag, sizeof(int), cudaMemcpyHostToDevice));

    auto &bank = ctx.d_resident[input_bank];
    kern_check_em_zero_fields<<<grid(all), BLK>>>(
        g_em_zero_flag,
        bank[24], bank[25],
        bank[26], bank[27],
        bank[28], bank[29],
        bank[30], bank[31],
        ctx.d_em_source[0], ctx.d_em_source[1], ctx.d_em_source[2], ctx.d_em_source[3],
        (int)all);

    CUDA_CHECK(cudaMemcpy(&host_flag, g_em_zero_flag, sizeof(int), cudaMemcpyDeviceToHost));

    const bool all_zero = (host_flag != 0);
    ctx.em_zero_fast = all_zero;
    ctx.em_zero_fast_known = true;
    return ctx.em_zero_fast;
}
|
|
|
|
/*
 * Queue a zero-fill of the EM RHS output slots.
 *
 * Only entries BSSN_STATE_COUNT .. BSSN_EM_STATE_COUNT-1 of
 * k_em_state_rhs_slots are cleared; the first BSSN_STATE_COUNT entries are
 * left alone.  The fills are issued with cudaMemsetAsync and are not
 * synchronised here.
 *
 *   all  number of doubles in each slot
 */
static void zero_em_output_slots_async(size_t all)
{
    const size_t field_bytes = all * sizeof(double);

    int field = BSSN_STATE_COUNT;
    while (field < BSSN_EM_STATE_COUNT) {
        CUDA_CHECK(cudaMemsetAsync(g_buf.slot[k_em_state_rhs_slots[field]], 0, field_bytes));
        ++field;
    }
}
|
|
|
|
/*
 * Copy the constraint residuals from the device back to the host.
 *
 * A single blocking device-to-host transfer moves
 * D2H_CONSTRAINT_SLOT_COUNT * all doubles, starting at slot S_ham_Res, into
 * g_buf.h_stage; each field is then copied from the staging buffer into
 * constraint_host_out[field].
 *
 *   constraint_host_out  D2H_CONSTRAINT_SLOT_COUNT host pointers, `all` doubles each
 *   all                  number of grid points per field
 */
static void download_constraint_outputs(double **constraint_host_out, size_t all)
{
    const size_t field_bytes = sizeof(double) * all;
    const size_t total_bytes = (size_t)D2H_CONSTRAINT_SLOT_COUNT * field_bytes;

    CUDA_CHECK(cudaMemcpy(g_buf.h_stage, g_buf.slot[S_ham_Res],
                          total_bytes, cudaMemcpyDeviceToHost));

    int field = 0;
    while (field < D2H_CONSTRAINT_SLOT_COUNT) {
        std::memcpy(constraint_host_out[field],
                    g_buf.h_stage + (size_t)field * all,
                    field_bytes);
        ++field;
    }
}
|
|
|
|
/*
 * Run one RK4 substep of the BSSN-EScalar system on the GPU.
 *
 * Sequence, as implemented below:
 *   1. select the device and fetch the scalar-field parameters from Fortran;
 *   2. bind resident device banks (keep_resident_state != 0) or upload the
 *      host input state;
 *   3. optionally run kern_enforce_ga_cuda on the metric / A_ij inputs;
 *   4. on substep 0, snapshot the input state into ctx.d_state0*;
 *   5. evaluate the RHS pipeline, optionally apply the Sommerfeld boundary
 *      routine per field, and finalise the RK4 update;
 *   6. for Lev > 0, restore the patch-boundary cells flagged earlier;
 *   7. publish the output bank (resident mode) or download to state_host_out.
 *
 * Parameters are passed by reference because the routine is called through a
 * Fortran-style interface.  `T` is unused.
 *
 * Returns 0 on success.  Returns 1 when RK4 is outside 0..3 or when the FR
 * parameter a2 is (numerically) zero.  The a2 rejection applies to every MPI
 * rank; only rank 0 prints the diagnostic so the message appears once.
 */
extern "C"
int bssn_escalar_cuda_rk4_substep(void *block_tag,
                                  int *ex, double *X, double *Y, double *Z,
                                  double **state_host_in,
                                  double **state_host_out,
                                  const double *propspeed,
                                  const double *soa_flat,
                                  const double *bbox,
                                  double &dT,
                                  double &T,
                                  int &RK4,
                                  int &apply_bam_bc,
                                  int &Symmetry,
                                  int &Lev,
                                  double &eps,
                                  int &co,
                                  int &keep_resident_state,
                                  int &apply_enforce_ga,
                                  double &chitiny)
{
    (void)T;
    if (RK4 < 0 || RK4 > 3) return 1;

    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    double escalar_a2 = 3.0, escalar_phi0 = 0.0, escalar_r0 = 0.0, escalar_sigma0 = 0.0, escalar_l2 = 0.0;
#ifdef fortran1
    set_escalar_parameter(escalar_a2, escalar_phi0, escalar_r0, escalar_sigma0, escalar_l2);
#endif
#ifdef fortran2
    SET_ESCALAR_PARAMETER(escalar_a2, escalar_phi0, escalar_r0, escalar_sigma0, escalar_l2);
#endif
#ifdef fortran3
    set_escalar_parameter_(escalar_a2, escalar_phi0, escalar_r0, escalar_sigma0, escalar_l2);
#endif
    /*
     * Reject a vanishing a2 on every rank.  Previously the rank test was part
     * of the condition, so only rank 0 bailed out and the remaining ranks ran
     * the pipeline with the rejected parameter.
     */
    if (fabs(escalar_a2) <= 1.0e-300) {
        if (g_dispatch.my_rank == 0)
            fprintf(stderr, "CUDA BSSN-EScalar requires nonzero FR a2; got %.17g\n", escalar_a2);
        return 1;
    }

    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
    const size_t bytes = all * sizeof(double);
    int touch_xmin = 0, touch_xmax = 0;
    int touch_ymin = 0, touch_ymax = 0;
    int touch_zmin = 0, touch_zmax = 0;

    setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);
    if (Lev > 0) {
        /* The flags are consumed by the boundary restore after the RK4 update. */
        compute_patch_boundary_flags(ex, X, Y, Z, bbox, Symmetry,
                                     touch_xmin, touch_xmax,
                                     touch_ymin, touch_ymax,
                                     touch_zmin, touch_zmax);
    }

    StepContext &ctx = ensure_step_ctx(block_tag, all);
    const bool use_resident_state = (keep_resident_state != 0);
    int input_bank = -1;
    int output_bank = -1;
    if (use_resident_state) {
        input_bank = ensure_escalar_resident_bank(ctx, state_host_in, all, true);
        output_bank = reserve_escalar_resident_output_bank(ctx, state_host_out, all, input_bank);
        mark_resident_current_bank(ctx, input_bank);
        mark_resident_next_bank(ctx, output_bank);
        bind_escalar_state_input_slots(ctx.d_resident[input_bank]);
        bind_escalar_state_output_slots(ctx.d_resident[output_bank]);
    } else {
        upload_escalar_state_inputs(state_host_in, all);
    }

    if (apply_enforce_ga) {
        kern_enforce_ga_cuda<<<grid(all), BLK>>>(g_buf.slot[S_dxx], g_buf.slot[S_gxy], g_buf.slot[S_gxz],
                                                 g_buf.slot[S_dyy], g_buf.slot[S_gyz], g_buf.slot[S_dzz],
                                                 g_buf.slot[S_Axx], g_buf.slot[S_Axy], g_buf.slot[S_Axz],
                                                 g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]);
        /* The kernel was handed the input-bank fields, so mark that bank as no longer host-clean. */
        if (use_resident_state && input_bank >= 0)
            set_resident_host_clean(ctx, input_bank, false);
    }

    if (RK4 == 0) {
        /* First substep: keep a copy of the starting state for the later substeps. */
        if (use_resident_state) {
            CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, ctx.d_resident_mem[input_bank],
                                  (size_t)BSSN_ESCALAR_STATE_COUNT * bytes,
                                  cudaMemcpyDeviceToDevice));
        } else {
            /* One copy for the BSSN fields starting at S_chi, then Sphi and Spi separately. */
            CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi],
                                  (size_t)BSSN_STATE_COUNT * bytes,
                                  cudaMemcpyDeviceToDevice));
            CUDA_CHECK(cudaMemcpy(ctx.d_state0[24], g_buf.slot[S_Sphi],
                                  bytes, cudaMemcpyDeviceToDevice));
            CUDA_CHECK(cudaMemcpy(ctx.d_state0[25], g_buf.slot[S_Spi],
                                  bytes, cudaMemcpyDeviceToDevice));
        }
    }

    launch_rhs_pipeline((int)all, eps, co, true, escalar_a2);

    if (apply_bam_bc) {
        /* soa_flat holds three entries per field; propspeed holds one. */
        for (int i = 0; i < BSSN_ESCALAR_STATE_COUNT; ++i) {
            gpu_sommerfeld_routbam(g_buf.slot[k_escalar_state_input_slots[i]],
                                   g_buf.slot[k_escalar_state_rhs_slots[i]],
                                   propspeed[i],
                                   soa_flat[3 * i + 0],
                                   soa_flat[3 * i + 1],
                                   soa_flat[3 * i + 2],
                                   X, Y, Z, bbox, Symmetry);
        }
    }

    gpu_escalar_rk4_finalize_batch(ctx, all, dT, RK4, chitiny);
    if (Lev > 0) {
        gpu_escalar_restore_patch_boundary_batch(ctx, (int)all,
                                                 touch_xmin, touch_xmax,
                                                 touch_ymin, touch_ymax,
                                                 touch_zmin, touch_zmax);
    }
    if (use_resident_state) {
        /* The result stays on the device; the output bank becomes the current one. */
        ctx.resident_valid[output_bank] = true;
        ctx.resident_age[output_bank] = ++ctx.resident_clock;
        set_resident_host_clean(ctx, output_bank, false);
        mark_resident_current_bank(ctx, output_bank);
        update_state_ready(ctx);
    } else {
        download_escalar_state_outputs(state_host_out, all);
    }
    if (RK4 == 3)
        ctx.matter_ready = false;
    return 0;
}
|
|
|
|
/*
 * Evaluate the BSSN-EScalar constraints on the GPU for one grid block.
 *
 * The host state is uploaded, the RHS pipeline is run with co = 0, and
 * kern_escalar_constraint_fr is launched with S_f_arr as its last argument.
 * D2H_CONSTRAINT_SLOT_COUNT constraint fields are then downloaded into
 * constraint_host_out, followed by the contents of slot S_f_arr into
 * constraint_host_out[7].
 *
 * `Lev` is unused.  Returns 1 when either host pointer table is null,
 * otherwise 0.
 */
extern "C"
int bssn_escalar_cuda_compute_constraints(int *ex, double *X, double *Y, double *Z,
                                          double **state_host_in,
                                          double **constraint_host_out,
                                          int &Symmetry,
                                          int &Lev,
                                          double &eps)
{
    (void)Lev;

    if (state_host_in == nullptr || constraint_host_out == nullptr)
        return 1;

    init_gpu_dispatch();
    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

    /* Scalar-field parameters; the Fortran setter (if compiled in) overwrites these defaults. */
    double escalar_a2 = 3.0;
    double escalar_phi0 = 0.0;
    double escalar_r0 = 0.0;
    double escalar_sigma0 = 0.0;
    double escalar_l2 = 0.0;
#ifdef fortran1
    set_escalar_parameter(escalar_a2, escalar_phi0, escalar_r0, escalar_sigma0, escalar_l2);
#endif
#ifdef fortran2
    SET_ESCALAR_PARAMETER(escalar_a2, escalar_phi0, escalar_r0, escalar_sigma0, escalar_l2);
#endif
#ifdef fortran3
    set_escalar_parameter_(escalar_a2, escalar_phi0, escalar_r0, escalar_sigma0, escalar_l2);
#endif

    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
    const size_t bytes = all * sizeof(double);

    setup_grid_params(ex, X, Y, Z, Symmetry, eps, 0);
    upload_escalar_state_inputs(state_host_in, all);
    launch_rhs_pipeline((int)all, eps, 0, true, escalar_a2);

    kern_escalar_constraint_fr<<<grid(all), BLK>>>(
        g_buf.slot[S_chin1],
        g_buf.slot[S_gupxx], g_buf.slot[S_gupxy], g_buf.slot[S_gupxz],
        g_buf.slot[S_gupyy], g_buf.slot[S_gupyz], g_buf.slot[S_gupzz],
        g_buf.slot[S_trK],
        g_buf.slot[S_Axx], g_buf.slot[S_Axy], g_buf.slot[S_Axz],
        g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz],
        g_buf.slot[S_Rxx], g_buf.slot[S_Rxy], g_buf.slot[S_Rxz],
        g_buf.slot[S_Ryy], g_buf.slot[S_Ryz], g_buf.slot[S_Rzz],
        g_buf.slot[S_rho],
        g_buf.slot[S_Sxx], g_buf.slot[S_Sxy], g_buf.slot[S_Sxz],
        g_buf.slot[S_Syy], g_buf.slot[S_Syz], g_buf.slot[S_Szz],
        g_buf.slot[S_Sphi],
        escalar_a2,
        g_buf.slot[S_f_arr]);

    download_constraint_outputs(constraint_host_out, all);

    /* The S_f_arr field is fetched separately from the batched constraint download. */
    CUDA_CHECK(cudaMemcpy(constraint_host_out[7], g_buf.slot[S_f_arr],
                          bytes, cudaMemcpyDeviceToHost));
    return 0;
}
|
|
|
|
/*
 * Form a linear combination of up to three state buffers:
 *
 *     dst = c1 * src1 + c2 * src2 + c3 * src3
 *
 * Launch layout: blockIdx.y selects the state (blocks with
 * blockIdx.y >= state_count exit at once); within a state, threads stride
 * over the `all` cells by blockDim.x * gridDim.x.  State s occupies
 * [s * all, (s + 1) * all) in every buffer.  src3 may be null, in which case
 * its value is taken as 0.0.
 */
__global__ void kern_prepare_inter_time_level(const double * __restrict__ src1,
                                              const double * __restrict__ src2,
                                              const double * __restrict__ src3,
                                              double * __restrict__ dst,
                                              double c1,
                                              double c2,
                                              double c3,
                                              int state_count,
                                              int all)
{
    const int field = blockIdx.y;
    if (field < state_count) {
        const size_t base = (size_t)field * all;
        const bool has_third = (src3 != nullptr);

        int cell = blockIdx.x * blockDim.x + threadIdx.x;
        while (cell < all) {
            const size_t at = base + cell;
            double third = 0.0;
            if (has_third)
                third = src3[at];
            dst[at] = c1 * src1[at] + c2 * src2[at] + c3 * third;
            cell += blockDim.x * gridDim.x;
        }
    }
}
|
|
|
|
/*
 * Lagrange basis weight for node `idx` on the integer nodes 0 .. ordn-1,
 * evaluated at position x (in node units):
 *
 *     prod over j != idx of (x - j) / (idx - j)
 *
 * Returns 1.0 when there are no other nodes (ordn <= 0, or ordn == 1 with
 * idx == 0).
 */
__device__ double interp_lagrange_weight(int idx, double x, int ordn)
{
    const double node = (double)idx;
    double weight = 1.0;

    int other = 0;
    while (other < ordn) {
        if (other != idx) {
            const double other_node = (double)other;
            weight *= (x - other_node) / (node - other_node);
        }
        ++other;
    }
    return weight;
}
|
|
|
|
/*
 * Choose the ordn-wide interpolation window along one axis.
 *
 * Inputs: point coordinate p, coordinate x0 of grid index 1, spacing dx,
 * number of points n, stencil width ordn, the symmetry flag and the axis
 * number (0, 1 or 2).
 *
 * Outputs:
 *   base     1-based index of the first window node.  It may be <= 0 when the
 *            window is allowed to reach below index 1 (see below).
 *   local_x  position of p relative to the first window node, in units of dx.
 *
 * The window is centred from the index nearest to p and then clamped to
 * [cmin, n].  cmin is 1, except that it drops to -ordn/2 + 1 when
 * |x0| < dx and either (symmetry == 2 and axis is 0 or 1) or
 * (symmetry != 0 and axis is 2).  For base <= 0 the first node's coordinate is
 * taken as the negative of the coordinate at index 1 - base.
 */
__device__ void interp_axis_window(double p,
                                   double x0,
                                   double dx,
                                   int n,
                                   int ordn,
                                   int symmetry,
                                   int axis,
                                   int &base,
                                   double &local_x)
{
    const int nearest = (int)((p - x0) / dx + 0.4) + 1;
    int lo = nearest - ordn / 2 + 1;
    int hi = lo + ordn - 1;

    /* Lowest admissible start index; extended below 1 across a symmetry plane. */
    const bool near_origin = fabs(x0) < dx;
    const bool mirrored_axis = (symmetry == 2 && axis < 2) || (symmetry != 0 && axis == 2);
    const int lowest = (near_origin && mirrored_axis) ? (-ordn / 2 + 1) : 1;

    if (lo < lowest) {
        lo = lowest;
        hi = lo + ordn - 1;
    }
    if (hi > n) {
        hi = n;
        lo = hi + 1 - ordn;
    }

    base = lo;
    if (lo > 0) {
        const double first = x0 + (double)(lo - 1) * dx;
        local_x = (p - first) / dx;
    } else {
        /* First node lies at minus the coordinate of its mirror index. */
        const int mirror = 1 - lo;
        const double mirror_coord = x0 + (double)(mirror - 1) * dx;
        local_x = (p + mirror_coord) / dx;
    }
}
|
|
|
|
/*
 * Fetch one stencil value for interpolation, with mirror handling.
 *
 * (fi, fj, fk) are 1-based grid indices.  Any index <= 0 is mirrored to
 * 1 - index and the result is multiplied by the matching entry of soa
 * (soa[0] for i, soa[1] for j, soa[2] for k).  If, after mirroring, an index
 * is still outside [1, nx] x [1, ny] x [1, nz], 0.0 is returned.  Otherwise
 * the value is read from state `state` of `mem`, where each state holds `all`
 * doubles with i varying fastest.
 */
__device__ double load_interp_value(const double * __restrict__ mem,
                                    int nx,
                                    int ny,
                                    int nz,
                                    int all,
                                    int state,
                                    int fi,
                                    int fj,
                                    int fk,
                                    const double * __restrict__ soa)
{
    int ci = fi;
    int cj = fj;
    int ck = fk;
    double parity = 1.0;

    if (ci < 1) { parity *= soa[0]; ci = 1 - ci; }
    if (cj < 1) { parity *= soa[1]; cj = 1 - cj; }
    if (ck < 1) { parity *= soa[2]; ck = 1 - ck; }

    const bool inside = (ci >= 1 && ci <= nx) &&
                        (cj >= 1 && cj <= ny) &&
                        (ck >= 1 && ck <= nz);
    if (!inside)
        return 0.0;

    const int cell = (ci - 1) + (cj - 1) * nx + (ck - 1) * nx * ny;
    return parity * mem[(size_t)state * (size_t)all + (size_t)cell];
}
|
|
|
|
/*
 * Interpolate three state fields to a single point (px, py, pz).
 *
 * Thread f (threadIdx.x = 0, 1, 2) handles field f: it reads state index
 * state0/state1/state2 from `mem`, uses the parity triple soaf0..soaf2 for
 * mirrored indices, and writes its result to out[f].  Threads with
 * threadIdx.x >= 3 do nothing, and the kernel is a no-op unless
 * 1 <= ordn <= 8 (the weight arrays hold 8 entries).
 *
 * The value is the tensor-product Lagrange interpolant over an ordn^3 window
 * chosen per axis by interp_axis_window.
 */
__global__ void kern_interp_state_point3(const double * __restrict__ mem,
                                         double * __restrict__ out,
                                         int nx,
                                         int ny,
                                         int nz,
                                         int all,
                                         int state0,
                                         int state1,
                                         int state2,
                                         double x0,
                                         double y0,
                                         double z0,
                                         double dx,
                                         double dy,
                                         double dz,
                                         double px,
                                         double py,
                                         double pz,
                                         int ordn,
                                         int symmetry,
                                         double soa00, double soa01, double soa02,
                                         double soa10, double soa11, double soa12,
                                         double soa20, double soa21, double soa22)
{
    const int field = threadIdx.x;
    if (field >= 3)
        return;
    if (ordn <= 0 || ordn > 8)
        return;

    const int state_of[3] = {state0, state1, state2};
    const double parity[3][3] = {
        {soa00, soa01, soa02},
        {soa10, soa11, soa12},
        {soa20, soa21, soa22}
    };

    /* Window start index and local coordinate along each axis. */
    int base_i, base_j, base_k;
    double loc_x, loc_y, loc_z;
    interp_axis_window(px, x0, dx, nx, ordn, symmetry, 0, base_i, loc_x);
    interp_axis_window(py, y0, dy, ny, ordn, symmetry, 1, base_j, loc_y);
    interp_axis_window(pz, z0, dz, nz, ordn, symmetry, 2, base_k, loc_z);

    double weight_x[8], weight_y[8], weight_z[8];
    for (int node = 0; node < ordn; ++node) {
        weight_x[node] = interp_lagrange_weight(node, loc_x, ordn);
        weight_y[node] = interp_lagrange_weight(node, loc_y, ordn);
        weight_z[node] = interp_lagrange_weight(node, loc_z, ordn);
    }

    double acc = 0.0;
    for (int c = 0; c < ordn; ++c) {
        for (int b = 0; b < ordn; ++b) {
            for (int a = 0; a < ordn; ++a) {
                const double coeff = weight_x[a] * weight_y[b] * weight_z[c];
                acc += coeff * load_interp_value(mem, nx, ny, nz, all,
                                                 state_of[field],
                                                 base_i + a, base_j + b, base_k + c,
                                                 parity[field]);
            }
        }
    }
    out[field] = acc;
}
|
|
|
|
/*
 * Interpolate two fields to a list of points, one thread per point.
 *
 * Thread p = blockIdx.x * blockDim.x + threadIdx.x handles the point
 * (px[p], py[p], pz[p]); threads with p >= npoints exit.  The kernel is a
 * no-op unless 1 <= ordn <= 8 (the weight arrays hold 8 entries).
 *
 * Both fields share the same ordn^3 window and Lagrange weights; field0 uses
 * the parity triple soa00..soa02 and field1 uses soa10..soa12 for mirrored
 * indices.  Results are interleaved: out[2p] for field0, out[2p + 1] for
 * field1.
 */
__global__ void kern_interp_host_two_fields(const double * __restrict__ field0,
                                            const double * __restrict__ field1,
                                            const double * __restrict__ px,
                                            const double * __restrict__ py,
                                            const double * __restrict__ pz,
                                            double * __restrict__ out,
                                            int nx,
                                            int ny,
                                            int nz,
                                            int all,
                                            double x0,
                                            double y0,
                                            double z0,
                                            double dx,
                                            double dy,
                                            double dz,
                                            int npoints,
                                            int ordn,
                                            int symmetry,
                                            double soa00, double soa01, double soa02,
                                            double soa10, double soa11, double soa12)
{
    const int p = blockIdx.x * blockDim.x + threadIdx.x;
    if (p >= npoints)
        return;
    if (ordn <= 0 || ordn > 8)
        return;

    /* Window start index and local coordinate along each axis. */
    int base_i, base_j, base_k;
    double loc_x, loc_y, loc_z;
    interp_axis_window(px[p], x0, dx, nx, ordn, symmetry, 0, base_i, loc_x);
    interp_axis_window(py[p], y0, dy, ny, ordn, symmetry, 1, base_j, loc_y);
    interp_axis_window(pz[p], z0, dz, nz, ordn, symmetry, 2, base_k, loc_z);

    double weight_x[8], weight_y[8], weight_z[8];
    for (int node = 0; node < ordn; ++node) {
        weight_x[node] = interp_lagrange_weight(node, loc_x, ordn);
        weight_y[node] = interp_lagrange_weight(node, loc_y, ordn);
        weight_z[node] = interp_lagrange_weight(node, loc_z, ordn);
    }

    const double parity0[3] = {soa00, soa01, soa02};
    const double parity1[3] = {soa10, soa11, soa12};

    double acc0 = 0.0;
    double acc1 = 0.0;
    for (int c = 0; c < ordn; ++c) {
        const int k = base_k + c;
        for (int b = 0; b < ordn; ++b) {
            const int j = base_j + b;
            const double w_plane = weight_y[b] * weight_z[c];
            for (int a = 0; a < ordn; ++a) {
                const int i = base_i + a;
                const double coeff = weight_x[a] * w_plane;
                acc0 += coeff * load_interp_value(field0, nx, ny, nz, all, 0, i, j, k, parity0);
                acc1 += coeff * load_interp_value(field1, nx, ny, nz, all, 0, i, j, k, parity1);
            }
        }
    }

    out[2 * p] = acc0;
    out[2 * p + 1] = acc1;
}
|
|
|
|
/*
 * Gather a box-shaped sub-region of several state fields into a packed buffer.
 *
 * blockIdx.y selects the state (blocks with blockIdx.y >= state_count exit);
 * threads stride over the region_all cells of the region by
 * blockDim.x * gridDim.x.  Local cell `local` decomposes as
 * ii = local % sx, jj = (local / sx) % sy, kk = local / (sx * sy) and maps to
 * source cell (i0 + ii, j0 + jj, k0 + kk) on an nx-by-ny-by-* grid.
 *
 * Layout: state s occupies [s * all, ...) in src_mem and
 * [s * region_all, ...) in dst.  The parameter sz is not used by the kernel.
 */
__global__ void kern_pack_state_region_batch(const double * __restrict__ src_mem,
                                             double * __restrict__ dst,
                                             int nx, int ny,
                                             int i0, int j0, int k0,
                                             int sx, int sy, int sz,
                                             int region_all,
                                             int state_count,
                                             int all)
{
    const int state_index = blockIdx.y;
    if (state_index >= state_count)
        return;

    const int plane = sx * sy;
    int local = blockIdx.x * blockDim.x + threadIdx.x;
    while (local < region_all) {
        const int gi = i0 + local % sx;
        const int gj = j0 + (local / sx) % sy;
        const int gk = k0 + local / plane;
        const int cell = gi + gj * nx + gk * nx * ny;

        dst[(size_t)state_index * region_all + local] =
            src_mem[(size_t)state_index * all + cell];

        local += blockDim.x * gridDim.x;
    }
}
|
|
|
|
/*
 * Scatter a packed buffer back into a box-shaped sub-region of several state
 * fields (the inverse data movement of kern_pack_state_region_batch).
 *
 * blockIdx.y selects the state (blocks with blockIdx.y >= state_count exit);
 * threads stride over the region_all cells of the region by
 * blockDim.x * gridDim.x.  Local cell `local` decomposes as
 * ii = local % sx, jj = (local / sx) % sy, kk = local / (sx * sy) and maps to
 * destination cell (i0 + ii, j0 + jj, k0 + kk) on an nx-by-ny-by-* grid.
 *
 * Layout: state s occupies [s * all, ...) in dst_mem and
 * [s * region_all, ...) in src.  The parameter sz is not used by the kernel.
 */
__global__ void kern_unpack_state_region_batch(double * __restrict__ dst_mem,
                                               const double * __restrict__ src,
                                               int nx, int ny,
                                               int i0, int j0, int k0,
                                               int sx, int sy, int sz,
                                               int region_all,
                                               int state_count,
                                               int all)
{
    const int state_index = blockIdx.y;
    if (state_index >= state_count)
        return;

    const int plane = sx * sy;
    int local = blockIdx.x * blockDim.x + threadIdx.x;
    while (local < region_all) {
        const int gi = i0 + local % sx;
        const int gj = j0 + (local / sx) % sy;
        const int gk = k0 + local / plane;
        const int cell = gi + gj * nx + gk * nx * ny;

        dst_mem[(size_t)state_index * all + cell] =
            src[(size_t)state_index * region_all + local];

        local += blockDim.x * gridDim.x;
    }
}
|
|
|
|
/*
 * Gather several box-shaped regions ("segments") of several state fields into
 * one packed buffer in a single launch.
 *
 * blockIdx.z selects the segment, blockIdx.y the state (blocks with
 * blockIdx.y >= state_count exit), and threads stride over the segment's
 * cells by blockDim.x * gridDim.x.
 *
 * Each segment has 8 ints in `meta`:
 *   [0..2] i0, j0, k0   first source cell of the box
 *   [3..4] sx, sy       box extents used to decompose the local index
 *   [5]    not read by this kernel
 *   [6]    region_all   number of cells in the box
 *   [7]    offset       start of this segment inside dst
 *
 * Within a segment, state s is stored at dst[offset + s * region_all + local].
 */
__global__ void kern_pack_state_segments_batch(const double * __restrict__ src_mem,
                                               double * __restrict__ dst,
                                               int nx, int ny,
                                               const int * __restrict__ meta,
                                               int state_count,
                                               int all)
{
    const int state_index = blockIdx.y;
    if (state_index >= state_count)
        return;

    const int *seg = meta + (int)blockIdx.z * 8;
    const int i0 = seg[0];
    const int j0 = seg[1];
    const int k0 = seg[2];
    const int sx = seg[3];
    const int sy = seg[4];
    const int region_all = seg[6];
    const int offset = seg[7];

    const int plane = sx * sy;
    int local = blockIdx.x * blockDim.x + threadIdx.x;
    while (local < region_all) {
        const int gi = i0 + local % sx;
        const int gj = j0 + (local / sx) % sy;
        const int gk = k0 + local / plane;
        const int cell = gi + gj * nx + gk * nx * ny;

        dst[(size_t)offset + (size_t)state_index * region_all + local] =
            src_mem[(size_t)state_index * all + cell];

        local += blockDim.x * gridDim.x;
    }
}
|
|
|
|
/*
 * Scatter one packed buffer back into several box-shaped regions ("segments")
 * of several state fields in a single launch (the inverse data movement of
 * kern_pack_state_segments_batch).
 *
 * blockIdx.z selects the segment, blockIdx.y the state (blocks with
 * blockIdx.y >= state_count exit), and threads stride over the segment's
 * cells by blockDim.x * gridDim.x.
 *
 * Each segment has 8 ints in `meta`:
 *   [0..2] i0, j0, k0   first destination cell of the box
 *   [3..4] sx, sy       box extents used to decompose the local index
 *   [5]    not read by this kernel
 *   [6]    region_all   number of cells in the box
 *   [7]    offset       start of this segment inside src
 *
 * Within a segment, state s is read from src[offset + s * region_all + local].
 */
__global__ void kern_unpack_state_segments_batch(double * __restrict__ dst_mem,
                                                 const double * __restrict__ src,
                                                 int nx, int ny,
                                                 const int * __restrict__ meta,
                                                 int state_count,
                                                 int all)
{
    const int state_index = blockIdx.y;
    if (state_index >= state_count)
        return;

    const int *seg = meta + (int)blockIdx.z * 8;
    const int i0 = seg[0];
    const int j0 = seg[1];
    const int k0 = seg[2];
    const int sx = seg[3];
    const int sy = seg[4];
    const int region_all = seg[6];
    const int offset = seg[7];

    const int plane = sx * sy;
    int local = blockIdx.x * blockDim.x + threadIdx.x;
    while (local < region_all) {
        const int gi = i0 + local % sx;
        const int gj = j0 + (local / sx) % sy;
        const int gk = k0 + local / plane;
        const int cell = gi + gj * nx + gk * nx * ny;

        dst_mem[(size_t)state_index * all + cell] =
            src[(size_t)offset + (size_t)state_index * region_all + local];

        local += blockDim.x * gridDim.x;
    }
}
|
|
|
|
/* Declared here because the restriction kernel below uses it before its definition. */
__device__ __forceinline__ double load_comm_state_cell_sym(const double * __restrict__ src_mem,
                                                           int state_index,
                                                           int x, int y, int z,
                                                           int nx, int ny,
                                                           int all);

/*
 * For every cell of an sx-by-sy-by-* output region, compute a weighted sum
 * over an nst^3 block of source cells and store it in a packed buffer
 * (presumably the fine-to-coarse restriction used between refinement levels —
 * confirm against the callers).
 *
 * blockIdx.y selects the state (blocks with blockIdx.y >= state_count exit);
 * threads stride over the region_all output cells by blockDim.x * gridDim.x.
 * Output cell (ii, jj, kk) is centred on source index
 * (fi0 + 2*ii, fj0 + 2*jj, fk0 + 2*kk); the stencil covers offsets
 * 1 - nst/2 .. nst/2 from that index along each axis, and source values are
 * read through load_comm_state_cell_sym.
 *
 * The 1-D weights and nst are selected at compile time from ghost_width
 * (10, 8, 6 or 4 points).  Results go to dst[state * region_all + local].
 * The parameter sz is not used by the kernel.
 */
__global__ void kern_restrict_state_region_batch(const double * __restrict__ src_mem,
                                                 double * __restrict__ dst,
                                                 int nx, int ny,
                                                 int sx, int sy, int sz,
                                                 int fi0, int fj0, int fk0,
                                                 int region_all,
                                                 int state_count,
                                                 int all)
{
    const int state_index = blockIdx.y;
    if (state_index >= state_count)
        return;

#if ghost_width == 5
    const int nst = 10;
    const double w[10] = {
        35.0 / 65536.0, -405.0 / 65536.0, 567.0 / 16384.0, -2205.0 / 16384.0, 19845.0 / 32768.0,
        19845.0 / 32768.0, -2205.0 / 16384.0, 567.0 / 16384.0, -405.0 / 65536.0, 35.0 / 65536.0
    };
#elif ghost_width == 4
    const int nst = 8;
    const double w[8] = {
        -5.0 / 2048.0, 49.0 / 2048.0, -245.0 / 2048.0, 1225.0 / 2048.0,
        1225.0 / 2048.0, -245.0 / 2048.0, 49.0 / 2048.0, -5.0 / 2048.0
    };
#elif ghost_width == 3
    const int nst = 6;
    const double w[6] = {
        3.0 / 256.0, -25.0 / 256.0, 75.0 / 128.0,
        75.0 / 128.0, -25.0 / 256.0, 3.0 / 256.0
    };
#else
    const int nst = 4;
    const double w[4] = {-1.0 / 16.0, 9.0 / 16.0, 9.0 / 16.0, -1.0 / 16.0};
#endif
    /* Offset of stencil point 0 from the centre index: -4, -3, -2 or -1. */
    const int first_off = 1 - nst / 2;

    const int plane = sx * sy;
    int local = blockIdx.x * blockDim.x + threadIdx.x;
    while (local < region_all) {
        const int cx = fi0 + 2 * (local % sx);
        const int cy = fj0 + 2 * ((local / sx) % sy);
        const int cz = fk0 + 2 * (local / plane);

        double sum = 0.0;
        for (int oz = 0; oz < nst; ++oz) {
            const int z = cz + (first_off + oz);
            for (int oy = 0; oy < nst; ++oy) {
                const int y = cy + (first_off + oy);
                const double wyz = w[oz] * w[oy];
                for (int ox = 0; ox < nst; ++ox) {
                    const int x = cx + (first_off + ox);
                    sum += wyz * w[ox] *
                           load_comm_state_cell_sym(src_mem, state_index, x, y, z, nx, ny, all);
                }
            }
        }
        dst[(size_t)state_index * region_all + local] = sum;

        local += blockDim.x * gridDim.x;
    }
}
|
|
|
|
/*
 * Read one cell of a state field using 0-based indices, mirroring negative
 * indices.
 *
 * A negative x, y or z is replaced by -index - 1 and the result is multiplied
 * by d_comm_state_soa[3 * state_index + axis] for that axis (axis 0 = x,
 * 1 = y, 2 = z).  Indices are not checked against an upper bound.  State s
 * occupies [s * all, ...) in src_mem, with x varying fastest.
 */
__device__ __forceinline__ double load_comm_state_cell_sym(const double * __restrict__ src_mem,
                                                           int state_index,
                                                           int x, int y, int z,
                                                           int nx, int ny,
                                                           int all)
{
    const int soa_base = 3 * state_index;
    double parity = 1.0;

    if (x < 0) { parity *= d_comm_state_soa[soa_base + 0]; x = -x - 1; }
    if (y < 0) { parity *= d_comm_state_soa[soa_base + 1]; y = -y - 1; }
    if (z < 0) { parity *= d_comm_state_soa[soa_base + 2]; z = -z - 1; }

    const int cell = x + y * nx + z * nx * ny;
    return parity * src_mem[(size_t)state_index * all + cell];
}
|
|
|
|
/*
 * Multi-segment form of the weighted nst^3 stencil sum: for several output
 * regions at once, compute each output cell from a block of source cells and
 * store it in one packed buffer (presumably fine-to-coarse restriction —
 * confirm against the callers).
 *
 * blockIdx.z selects the segment, blockIdx.y the state (blocks with
 * blockIdx.y >= state_count exit), and threads stride over the segment's
 * output cells by blockDim.x * gridDim.x.
 *
 * Each segment has 8 ints in `meta`:
 *   [0..1] sx, sy          output extents used to decompose the local index
 *   [2]    not read by this kernel
 *   [3]    region_all      number of output cells
 *   [4]    offset          start of this segment inside dst
 *   [5..7] fi0, fj0, fk0   source index under output cell (0, 0, 0)
 *
 * Output cell (ii, jj, kk) is centred on source index
 * (fi0 + 2*ii, fj0 + 2*jj, fk0 + 2*kk); the stencil covers offsets
 * 1 - nst/2 .. nst/2 along each axis and reads through
 * load_comm_state_cell_sym.  Weights and nst are selected at compile time
 * from ghost_width.  Results go to dst[offset + state * region_all + local].
 */
__global__ void kern_restrict_state_segments_batch(const double * __restrict__ src_mem,
                                                   double * __restrict__ dst,
                                                   int nx, int ny,
                                                   const int * __restrict__ meta,
                                                   int state_count,
                                                   int all)
{
    const int state_index = blockIdx.y;
    if (state_index >= state_count)
        return;

    const int *seg = meta + (int)blockIdx.z * 8;
    const int sx = seg[0];
    const int sy = seg[1];
    const int region_all = seg[3];
    const int offset = seg[4];
    const int fi0 = seg[5];
    const int fj0 = seg[6];
    const int fk0 = seg[7];

#if ghost_width == 5
    const int nst = 10;
    const double w[10] = {
        35.0 / 65536.0, -405.0 / 65536.0, 567.0 / 16384.0, -2205.0 / 16384.0, 19845.0 / 32768.0,
        19845.0 / 32768.0, -2205.0 / 16384.0, 567.0 / 16384.0, -405.0 / 65536.0, 35.0 / 65536.0
    };
#elif ghost_width == 4
    const int nst = 8;
    const double w[8] = {
        -5.0 / 2048.0, 49.0 / 2048.0, -245.0 / 2048.0, 1225.0 / 2048.0,
        1225.0 / 2048.0, -245.0 / 2048.0, 49.0 / 2048.0, -5.0 / 2048.0
    };
#elif ghost_width == 3
    const int nst = 6;
    const double w[6] = {
        3.0 / 256.0, -25.0 / 256.0, 75.0 / 128.0,
        75.0 / 128.0, -25.0 / 256.0, 3.0 / 256.0
    };
#else
    const int nst = 4;
    const double w[4] = {-1.0 / 16.0, 9.0 / 16.0, 9.0 / 16.0, -1.0 / 16.0};
#endif
    /* Offset of stencil point 0 from the centre index: -4, -3, -2 or -1. */
    const int first_off = 1 - nst / 2;

    const int plane = sx * sy;
    int local = blockIdx.x * blockDim.x + threadIdx.x;
    while (local < region_all) {
        const int cx = fi0 + 2 * (local % sx);
        const int cy = fj0 + 2 * ((local / sx) % sy);
        const int cz = fk0 + 2 * (local / plane);

        double sum = 0.0;
        for (int oz = 0; oz < nst; ++oz) {
            const int z = cz + (first_off + oz);
            for (int oy = 0; oy < nst; ++oy) {
                const int y = cy + (first_off + oy);
                const double wyz = w[oz] * w[oy];
                for (int ox = 0; ox < nst; ++ox) {
                    const int x = cx + (first_off + ox);
                    sum += wyz * w[ox] *
                           load_comm_state_cell_sym(src_mem, state_index, x, y, z, nx, ny, all);
                }
            }
        }
        dst[(size_t)offset + (size_t)state_index * region_all + local] = sum;

        local += blockDim.x * gridDim.x;
    }
}
|
|
|
|
/*
 * kern_prolong_state_region_batch
 *
 * Prolongation stencil for one sx*sy*sz output box, all states at once.
 * blockIdx.y selects the state; blockIdx.x/threadIdx.x walk the box cells.
 *
 * Output cell (ii,jj,kk) corresponds to index (ii0+ii, jj0+jj, kk0+kk); the
 * source anchor on each axis is index/2 - lbc_*. An even index uses the
 * forward weight table, an odd index uses the same weights reversed. The
 * 1-D filter has nst taps (10/8/6/4, selected by ghost_width) covering
 * anchor-(nst/2-1) .. anchor+nst/2, and the 3-D stencil is its tensor
 * product. Source values are fetched with load_comm_state_cell_sym.
 *
 * Output layout: dst[state_index * region_all + cell]. sz is not read.
 */
__global__ void kern_prolong_state_region_batch(const double * __restrict__ src_mem,
                                                double * __restrict__ dst,
                                                int nx, int ny,
                                                int sx, int sy, int sz,
                                                int ii0, int jj0, int kk0,
                                                int lbc_i, int lbc_j, int lbc_k,
                                                int region_all,
                                                int state_count,
                                                int all)
{
    const int state_index = blockIdx.y;
    if (state_index >= state_count) return;

    /* tN is the weight of tap N in the forward (even-index) ordering. */
#if ghost_width == 5
    const int nst = 10;
    const double t0 = 13585.0 / 33554432.0;
    const double t1 = -159885.0 / 33554432.0;
    const double t2 = 230945.0 / 8388608.0;
    const double t3 = -969969.0 / 8388608.0;
    const double t4 = 14549535.0 / 16777216.0;
    const double t5 = 4849845.0 / 16777216.0;
    const double t6 = -692835.0 / 8388608.0;
    const double t7 = 188955.0 / 8388608.0;
    const double t8 = -138567.0 / 33554432.0;
    const double t9 = 12155.0 / 33554432.0;
    const double w_even[10] = {t0, t1, t2, t3, t4, t5, t6, t7, t8, t9};
    const double w_odd[10] = {t9, t8, t7, t6, t5, t4, t3, t2, t1, t0};
#elif ghost_width == 4
    const int nst = 8;
    const double t0 = -495.0 / 262144.0;
    const double t1 = 5005.0 / 262144.0;
    const double t2 = -27027.0 / 262144.0;
    const double t3 = 225225.0 / 262144.0;
    const double t4 = 75075.0 / 262144.0;
    const double t5 = -19305.0 / 262144.0;
    const double t6 = 4095.0 / 262144.0;
    const double t7 = -429.0 / 262144.0;
    const double w_even[8] = {t0, t1, t2, t3, t4, t5, t6, t7};
    const double w_odd[8] = {t7, t6, t5, t4, t3, t2, t1, t0};
#elif ghost_width == 3
    const int nst = 6;
    const double t0 = 77.0 / 8192.0;
    const double t1 = -693.0 / 8192.0;
    const double t2 = 3465.0 / 4096.0;
    const double t3 = 1155.0 / 4096.0;
    const double t4 = -495.0 / 8192.0;
    const double t5 = 63.0 / 8192.0;
    const double w_even[6] = {t0, t1, t2, t3, t4, t5};
    const double w_odd[6] = {t5, t4, t3, t2, t1, t0};
#else
    const int nst = 4;
    const double t0 = -7.0 / 128.0;
    const double t1 = 105.0 / 128.0;
    const double t2 = 35.0 / 128.0;
    const double t3 = -5.0 / 128.0;
    const double w_even[4] = {t0, t1, t2, t3};
    const double w_odd[4] = {t3, t2, t1, t0};
#endif
    /* Offset of the first tap relative to the anchor: -4, -3, -2 or -1. */
    const int first_tap = 1 - nst / 2;

    for (int cell = blockIdx.x * blockDim.x + threadIdx.x;
         cell < region_all;
         cell += blockDim.x * gridDim.x)
    {
        /* Unflatten the box-local cell index (x fastest). */
        const int fine_i = ii0 + cell % sx;
        const int fine_j = jj0 + (cell / sx) % sy;
        const int fine_k = kk0 + cell / (sx * sy);

        /* Lowest source index touched by the stencil on each axis. */
        const int x_lo = fine_i / 2 - lbc_i + first_tap;
        const int y_lo = fine_j / 2 - lbc_j + first_tap;
        const int z_lo = fine_k / 2 - lbc_k + first_tap;

        /* Index parity selects forward or reversed weights per axis. */
        const double *wx = (fine_i % 2 == 0) ? w_even : w_odd;
        const double *wy = (fine_j % 2 == 0) ? w_even : w_odd;
        const double *wz = (fine_k % 2 == 0) ? w_even : w_odd;

        double acc = 0.0;
        for (int tz = 0; tz < nst; ++tz) {
            const double wzv = wz[tz];
            for (int ty = 0; ty < nst; ++ty) {
                const double wyz = wzv * wy[ty];
                for (int tx = 0; tx < nst; ++tx) {
                    acc += wyz * wx[tx] *
                           load_comm_state_cell_sym(src_mem, state_index,
                                                    x_lo + tx, y_lo + ty, z_lo + tz,
                                                    nx, ny, all);
                }
            }
        }
        dst[(size_t)state_index * region_all + cell] = acc;
    }
}
|
|
|
|
/*
 * kern_prolong_state_segments_batch
 *
 * Prolongation stencil applied to several packed segments in one launch.
 * blockIdx.z picks the segment, blockIdx.y the state, and
 * blockIdx.x/threadIdx.x walk the cells of the segment.
 *
 * meta carries 11 ints per segment:
 *   [0] sx, [1] sy              extents used to unflatten the cell index
 *   [2]                         not read by this kernel
 *   [3] region_all              number of cells in the segment
 *   [4] offset                  first double of this segment inside dst
 *   [5] ii0 [6] jj0 [7] kk0     index of segment cell (0,0,0)
 *   [8] lbc_i [9] lbc_j [10] lbc_k  subtracted from index/2 to get the
 *                               source anchor
 *
 * An even index uses the forward weight table, an odd index the same weights
 * reversed. The 1-D filter has nst taps (10/8/6/4, selected by ghost_width)
 * covering anchor-(nst/2-1) .. anchor+nst/2; the 3-D stencil is its tensor
 * product. Source values are fetched with load_comm_state_cell_sym.
 *
 * Output layout: dst[offset + state_index * region_all + cell].
 */
__global__ void kern_prolong_state_segments_batch(const double * __restrict__ src_mem,
                                                  double * __restrict__ dst,
                                                  int nx, int ny,
                                                  const int * __restrict__ meta,
                                                  int state_count,
                                                  int all)
{
    const int state_index = blockIdx.y;
    if (state_index >= state_count) return;

    const int segment = blockIdx.z;
    const int *seg = meta + segment * 11;
    const int sx = seg[0];
    const int sy = seg[1];
    const int region_all = seg[3];
    const int offset = seg[4];
    const int ii0 = seg[5];
    const int jj0 = seg[6];
    const int kk0 = seg[7];
    const int lbc_i = seg[8];
    const int lbc_j = seg[9];
    const int lbc_k = seg[10];

    /* tN is the weight of tap N in the forward (even-index) ordering. */
#if ghost_width == 5
    const int nst = 10;
    const double t0 = 13585.0 / 33554432.0;
    const double t1 = -159885.0 / 33554432.0;
    const double t2 = 230945.0 / 8388608.0;
    const double t3 = -969969.0 / 8388608.0;
    const double t4 = 14549535.0 / 16777216.0;
    const double t5 = 4849845.0 / 16777216.0;
    const double t6 = -692835.0 / 8388608.0;
    const double t7 = 188955.0 / 8388608.0;
    const double t8 = -138567.0 / 33554432.0;
    const double t9 = 12155.0 / 33554432.0;
    const double w_even[10] = {t0, t1, t2, t3, t4, t5, t6, t7, t8, t9};
    const double w_odd[10] = {t9, t8, t7, t6, t5, t4, t3, t2, t1, t0};
#elif ghost_width == 4
    const int nst = 8;
    const double t0 = -495.0 / 262144.0;
    const double t1 = 5005.0 / 262144.0;
    const double t2 = -27027.0 / 262144.0;
    const double t3 = 225225.0 / 262144.0;
    const double t4 = 75075.0 / 262144.0;
    const double t5 = -19305.0 / 262144.0;
    const double t6 = 4095.0 / 262144.0;
    const double t7 = -429.0 / 262144.0;
    const double w_even[8] = {t0, t1, t2, t3, t4, t5, t6, t7};
    const double w_odd[8] = {t7, t6, t5, t4, t3, t2, t1, t0};
#elif ghost_width == 3
    const int nst = 6;
    const double t0 = 77.0 / 8192.0;
    const double t1 = -693.0 / 8192.0;
    const double t2 = 3465.0 / 4096.0;
    const double t3 = 1155.0 / 4096.0;
    const double t4 = -495.0 / 8192.0;
    const double t5 = 63.0 / 8192.0;
    const double w_even[6] = {t0, t1, t2, t3, t4, t5};
    const double w_odd[6] = {t5, t4, t3, t2, t1, t0};
#else
    const int nst = 4;
    const double t0 = -7.0 / 128.0;
    const double t1 = 105.0 / 128.0;
    const double t2 = 35.0 / 128.0;
    const double t3 = -5.0 / 128.0;
    const double w_even[4] = {t0, t1, t2, t3};
    const double w_odd[4] = {t3, t2, t1, t0};
#endif
    /* Offset of the first tap relative to the anchor: -4, -3, -2 or -1. */
    const int first_tap = 1 - nst / 2;

    for (int cell = blockIdx.x * blockDim.x + threadIdx.x;
         cell < region_all;
         cell += blockDim.x * gridDim.x)
    {
        /* Unflatten the segment-local cell index (x fastest). */
        const int fine_i = ii0 + cell % sx;
        const int fine_j = jj0 + (cell / sx) % sy;
        const int fine_k = kk0 + cell / (sx * sy);

        /* Lowest source index touched by the stencil on each axis. */
        const int x_lo = fine_i / 2 - lbc_i + first_tap;
        const int y_lo = fine_j / 2 - lbc_j + first_tap;
        const int z_lo = fine_k / 2 - lbc_k + first_tap;

        /* Index parity selects forward or reversed weights per axis. */
        const double *wx = (fine_i % 2 == 0) ? w_even : w_odd;
        const double *wy = (fine_j % 2 == 0) ? w_even : w_odd;
        const double *wz = (fine_k % 2 == 0) ? w_even : w_odd;

        double acc = 0.0;
        for (int tz = 0; tz < nst; ++tz) {
            const double wzv = wz[tz];
            for (int ty = 0; ty < nst; ++ty) {
                const double wyz = wzv * wy[ty];
                for (int tx = 0; tx < nst; ++tx) {
                    acc += wyz * wx[tx] *
                           load_comm_state_cell_sym(src_mem, state_index,
                                                    x_lo + tx, y_lo + ty, z_lo + tz,
                                                    nx, ny, all);
                }
            }
        }
        dst[(size_t)offset + (size_t)state_index * region_all + cell] = acc;
    }
}
|
|
|
|
/*
 * kern_pack_state_subset
 *
 * Gathers selected states into a dense buffer. blockIdx.y is the output slot;
 * the state copied into that slot is d_subset_state_indices[slot]. Each state
 * is `all` doubles long on both sides:
 *   dst[slot * all + c] = src_mem[state * all + c]
 * blockIdx.x/threadIdx.x stride over c.
 */
__global__ void kern_pack_state_subset(const double * __restrict__ src_mem,
                                       double * __restrict__ dst,
                                       int subset_count,
                                       int all)
{
    const int slot = blockIdx.y;
    if (slot < subset_count) {
        const int state_index = d_subset_state_indices[slot];
        const size_t from_base = (size_t)state_index * all;
        const size_t to_base = (size_t)slot * all;
        for (int c = blockIdx.x * blockDim.x + threadIdx.x;
             c < all;
             c += blockDim.x * gridDim.x)
        {
            dst[to_base + c] = src_mem[from_base + c];
        }
    }
}
|
|
|
|
/*
 * kern_unpack_state_subset
 *
 * Scatters a dense buffer back into selected states; inverse of the pack
 * kernel's layout. blockIdx.y is the input slot and the destination state is
 * d_subset_state_indices[slot]:
 *   dst_mem[state * all + c] = src[slot * all + c]
 * blockIdx.x/threadIdx.x stride over c.
 */
__global__ void kern_unpack_state_subset(double * __restrict__ dst_mem,
                                         const double * __restrict__ src,
                                         int subset_count,
                                         int all)
{
    const int slot = blockIdx.y;
    if (slot < subset_count) {
        const int state_index = d_subset_state_indices[slot];
        const size_t to_base = (size_t)state_index * all;
        const size_t from_base = (size_t)slot * all;
        for (int c = blockIdx.x * blockDim.x + threadIdx.x;
             c < all;
             c += blockDim.x * gridDim.x)
        {
            dst_mem[to_base + c] = src[from_base + c];
        }
    }
}
|
|
|
|
/*
 * Copy the sub-box starting at (i0,j0,k0) with extent (sx,sy,sz) of one
 * resident state between the device bank and a host array. Both sides use the
 * full ex[0] x ex[1] x ex[2] layout and the box keeps its position.
 *
 *   block_tag       key of the StepContext to use (created on demand by
 *                   ensure_step_ctx)
 *   state_index     state slot inside the bank; out-of-range is a silent no-op
 *   host_state      host array of ex[0]*ex[1]*ex[2] doubles
 *   kind            cudaMemcpyDeviceToHost or cudaMemcpyHostToDevice
 *   state_host_key  optional key forwarded to active_or_keyed_bank
 *
 * Empty boxes (any extent <= 0) are a silent no-op.
 */
static void copy_state_region_cuda(void *block_tag,
                                   int state_index,
                                   double *host_state,
                                   const int *ex,
                                   int i0, int j0, int k0,
                                   int sx, int sy, int sz,
                                   cudaMemcpyKind kind,
                                   double **state_host_key = nullptr)
{
    if (state_index < 0 || state_index >= BSSN_RESIDENT_STATE_CAPACITY) return;
    if (sx <= 0 || sy <= 0 || sz <= 0) return;

    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
    const size_t pitch = (size_t)ex[0] * sizeof(double);
    StepContext &ctx = ensure_step_ctx(block_tag, all);
    const int bank = active_or_keyed_bank(ctx, state_host_key, all,
                                          kind == cudaMemcpyHostToDevice);
    double *base_mem = ctx.d_resident_mem[bank];

    const cudaPitchedPtr dev_ptr =
        make_cudaPitchedPtr((void *)(base_mem + (size_t)state_index * all), pitch, ex[0], ex[1]);
    const cudaPitchedPtr host_ptr =
        make_cudaPitchedPtr((void *)host_state, pitch, ex[0], ex[1]);

    /* No CUDA array takes part, so x positions/extents are given in bytes. */
    cudaMemcpy3DParms p = {};
    p.extent = make_cudaExtent((size_t)sx * sizeof(double), (size_t)sy, (size_t)sz);
    p.srcPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0);
    p.dstPos = make_cudaPos((size_t)i0 * sizeof(double), j0, k0);
    /* Fix: the direction was never stored, so the zero-initialised value
     * (cudaMemcpyHostToHost) was handed to cudaMemcpy3D for every call. */
    p.kind = kind;

    if (kind == cudaMemcpyDeviceToHost) {
        p.srcPtr = dev_ptr;
        p.dstPtr = host_ptr;
    } else {
        p.srcPtr = host_ptr;
        p.dstPtr = dev_ptr;
    }
    CUDA_CHECK(cudaMemcpy3D(&p));

    if (kind == cudaMemcpyHostToDevice) {
        ctx.resident_valid[bank] = true;
        ctx.resident_age[bank] = ++ctx.resident_clock;
        mark_resident_current_bank(ctx, bank);
        mark_resident_host_state_clean(ctx, bank, state_index, false);
        update_state_ready(ctx);
    } else {
        /* NOTE(review): the state is flagged host-clean even when only a
         * sub-box was downloaded; confirm callers never rely on the flag
         * after a partial copy. */
        mark_resident_host_state_clean(ctx, bank, state_index, true);
    }
}
|
|
|
|
/*
 * Copy the sub-box starting at (i0,j0,k0) with extent (sx,sy,sz) of one
 * resident state between the device bank (full ex[0] x ex[1] x ex[2] layout)
 * and a densely packed host buffer of sx*sy*sz doubles.
 *
 *   block_tag       key of the StepContext to use (created on demand by
 *                   ensure_step_ctx)
 *   state_index     state slot inside the bank; out-of-range is a silent no-op
 *   host_buffer     packed host buffer, row pitch sx doubles
 *   kind            cudaMemcpyDeviceToHost or cudaMemcpyHostToDevice
 *   state_host_key  optional key forwarded to active_or_keyed_bank
 *   state_count     state count forwarded to active_or_keyed_bank; values
 *                   outside 1..BSSN_RESIDENT_STATE_CAPACITY are a no-op
 *
 * Empty boxes (any extent <= 0) are a silent no-op.
 */
static void copy_state_region_packed_cuda(void *block_tag,
                                          int state_index,
                                          double *host_buffer,
                                          const int *ex,
                                          int i0, int j0, int k0,
                                          int sx, int sy, int sz,
                                          cudaMemcpyKind kind,
                                          double **state_host_key = nullptr,
                                          int state_count = BSSN_STATE_COUNT)
{
    if (state_index < 0 || state_index >= BSSN_RESIDENT_STATE_CAPACITY) return;
    if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return;
    if (sx <= 0 || sy <= 0 || sz <= 0) return;

    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
    const size_t grid_pitch = (size_t)ex[0] * sizeof(double);
    const size_t packed_pitch = (size_t)sx * sizeof(double);
    StepContext &ctx = ensure_step_ctx(block_tag, all);
    const int bank = active_or_keyed_bank(ctx, state_host_key, all,
                                          kind == cudaMemcpyHostToDevice,
                                          state_count);
    double *base_mem = ctx.d_resident_mem[bank];

    const cudaPitchedPtr dev_ptr =
        make_cudaPitchedPtr((void *)(base_mem + (size_t)state_index * all), grid_pitch, ex[0], ex[1]);
    const cudaPitchedPtr packed_ptr =
        make_cudaPitchedPtr((void *)host_buffer, packed_pitch, sx, sy);
    /* No CUDA array takes part, so x positions/extents are given in bytes. */
    const cudaPos box_origin = make_cudaPos((size_t)i0 * sizeof(double), j0, k0);
    const cudaPos packed_origin = make_cudaPos(0, 0, 0);

    cudaMemcpy3DParms p = {};
    p.extent = make_cudaExtent((size_t)sx * sizeof(double), (size_t)sy, (size_t)sz);
    /* Fix: the direction was never stored, so the zero-initialised value
     * (cudaMemcpyHostToHost) was handed to cudaMemcpy3D for every call. */
    p.kind = kind;

    if (kind == cudaMemcpyDeviceToHost) {
        p.srcPtr = dev_ptr;
        p.srcPos = box_origin;
        p.dstPtr = packed_ptr;
        p.dstPos = packed_origin;
    } else {
        p.srcPtr = packed_ptr;
        p.srcPos = packed_origin;
        p.dstPtr = dev_ptr;
        p.dstPos = box_origin;
    }

    CUDA_CHECK(cudaMemcpy3D(&p));

    if (kind == cudaMemcpyHostToDevice) {
        ctx.resident_valid[bank] = true;
        ctx.resident_age[bank] = ++ctx.resident_clock;
        mark_resident_current_bank(ctx, bank);
        mark_resident_host_state_clean(ctx, bank, state_index, false);
        update_state_ready(ctx);
    } else {
        /* NOTE(review): the state is flagged host-clean after any download
         * into the packed buffer, while copy_state_region_packed_batch_cuda
         * only does so when the box covers the whole grid; confirm which
         * behaviour is intended. */
        mark_resident_host_state_clean(ctx, bank, state_index, true);
    }
}
|
|
|
|
/*
 * Copy the sub-box starting at (i0,j0,k0) with extent (sx,sy,sz) of the first
 * state_count resident states between the device bank and one packed host
 * buffer of state_count * sx*sy*sz doubles, using a device staging buffer and
 * a pack/unpack kernel.
 *
 *   block_tag       key of the StepContext to use (created on demand by
 *                   ensure_step_ctx)
 *   state_count     number of states; values outside
 *                   1..BSSN_RESIDENT_STATE_CAPACITY are a silent no-op
 *   host_buffer     packed host buffer
 *   kind            cudaMemcpyDeviceToHost packs then downloads; any other
 *                   value uploads then unpacks
 *   state_host_key  optional key forwarded to active_or_keyed_bank
 *
 * Download only flags the states host-clean when the box is the whole grid.
 * Upload always flags them not clean and marks the bank valid and current.
 */
static void copy_state_region_packed_batch_cuda(void *block_tag,
                                                int state_count,
                                                double *host_buffer,
                                                const int *ex,
                                                int i0, int j0, int k0,
                                                int sx, int sy, int sz,
                                                cudaMemcpyKind kind,
                                                double **state_host_key = nullptr)
{
    if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return;
    if (sx <= 0 || sy <= 0 || sz <= 0) return;

    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
    StepContext &ctx = ensure_step_ctx(block_tag, all);
    const int bank = active_or_keyed_bank(ctx, state_host_key, all,
                                          kind == cudaMemcpyHostToDevice,
                                          state_count);
    double *base_mem = ctx.d_resident_mem[bank];
    const int region_all = sx * sy * sz;
    const size_t total_doubles = (size_t)state_count * (size_t)region_all;
    double *d_comm = ensure_step_comm_buffer(ctx, total_doubles);

    /* x covers the cells of the box, y enumerates the states. */
    const dim3 launch_grid((unsigned int)grid((size_t)region_all),
                           (unsigned int)state_count);

    if (kind == cudaMemcpyDeviceToHost) {
        kern_pack_state_region_batch<<<launch_grid, BLK>>>(
            base_mem, d_comm, ex[0], ex[1],
            i0, j0, k0, sx, sy, sz, region_all, state_count,
            ex[0] * ex[1] * ex[2]);
        /* A rejected launch is only reported through cudaGetLastError();
         * without this check the copy below would return whatever the
         * staging buffer held before. */
        CUDA_CHECK(cudaGetLastError());
        CUDA_CHECK(cudaMemcpy(host_buffer, d_comm,
                              total_doubles * sizeof(double),
                              cudaMemcpyDeviceToHost));
        const bool whole_grid = sx == ex[0] && sy == ex[1] && sz == ex[2] &&
                                i0 == 0 && j0 == 0 && k0 == 0;
        if (whole_grid) {
            mark_resident_host_subset_clean(ctx, bank, state_count, nullptr, true);
        }
    } else {
        CUDA_CHECK(cudaMemcpy(d_comm, host_buffer,
                              total_doubles * sizeof(double),
                              cudaMemcpyHostToDevice));
        kern_unpack_state_region_batch<<<launch_grid, BLK>>>(
            base_mem, d_comm, ex[0], ex[1],
            i0, j0, k0, sx, sy, sz, region_all, state_count,
            ex[0] * ex[1] * ex[2]);
        /* Do not mark the bank valid if the unpack launch was rejected. */
        CUDA_CHECK(cudaGetLastError());
        ctx.resident_valid[bank] = true;
        ctx.resident_age[bank] = ++ctx.resident_clock;
        mark_resident_current_bank(ctx, bank);
        mark_resident_host_subset_clean(ctx, bank, state_count, nullptr, false);
        update_state_ready(ctx);
    }
}
|
|
|
|
/*
 * Download the first state_count resident states of a block into the host
 * arrays state_host_out[0..state_count).
 *
 * Bank selection: the bank keyed by state_host_out if one is found, else the
 * context's current bank, else whatever active_or_keyed_bank returns. When no
 * keyed bank was found and resident_key_usable_count accepts the pointers,
 * the chosen bank is re-keyed to state_host_out.
 *
 * Nothing is copied when the states are already flagged host-clean.
 *
 * AMSS_CUDA_DIRECT_STATE_DOWNLOAD (read once, cached): unset or a non-zero
 * integer copies each state straight into its host array; a value that atoi
 * reads as 0 downloads everything into g_buf.h_stage and memcpy's from there.
 */
static void download_resident_state_count(void *block_tag, int *ex, double **state_host_out, int state_count)
{
    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
    const size_t bytes = all * sizeof(double);
    StepContext &ctx = ensure_step_ctx(block_tag, all);

    const int keyed_bank = find_resident_bank_count(ctx, state_host_out, state_count);
    int bank = keyed_bank;
    if (keyed_bank < 0) {
        if (ctx.current_bank >= 0) {
            bank = ctx.current_bank;
        } else {
            bank = active_or_keyed_bank(ctx, nullptr, all, false);
        }
    }
    mark_resident_current_bank(ctx, bank);
    if (keyed_bank < 0 && resident_key_usable_count(state_host_out, state_count)) {
        assign_resident_key_count(ctx, bank, state_host_out, state_count);
    }

    const bool profile = cuda_profile_enabled();
    const double t0 = profile ? cuda_profile_now_ms() : 0.0;

    static int direct_download = -1;
    if (direct_download < 0) {
        const char *env = getenv("AMSS_CUDA_DIRECT_STATE_DOWNLOAD");
        if (!env) {
            direct_download = 1;
        } else {
            direct_download = (atoi(env) != 0) ? 1 : 0;
        }
    }

    /* Host copies already match the device: nothing to transfer. */
    if (resident_host_subset_clean(ctx, bank, state_count, nullptr))
        return;

    if (direct_download) {
        for (int s = 0; s < state_count; ++s) {
            CUDA_CHECK(cudaMemcpyAsync(state_host_out[s], ctx.d_resident[bank][s],
                                       bytes, cudaMemcpyDeviceToHost));
        }
        CUDA_CHECK(cudaDeviceSynchronize());
    } else {
        CUDA_CHECK(cudaMemcpy(g_buf.h_stage, ctx.d_resident_mem[bank],
                              (size_t)state_count * bytes,
                              cudaMemcpyDeviceToHost));
        for (int s = 0; s < state_count; ++s) {
            std::memcpy(state_host_out[s], g_buf.h_stage + (size_t)s * all, bytes);
        }
    }
    set_resident_host_clean(ctx, bank, true);

    if (profile) {
        CudaProfileStats &stats = cuda_profile_stats();
        stats.resident_download_calls++;
        stats.resident_download_ms += cuda_profile_now_ms() - t0;
        stats.resident_download_gb += (double)((size_t)state_count * bytes) / 1.0e9;
    }
}
|
|
|
|
/* Download BSSN_STATE_COUNT resident states; see download_resident_state_count. */
static void download_resident_state(void *block_tag, int *ex, double **state_host_out)
{
    const int state_count = BSSN_STATE_COUNT;
    download_resident_state_count(block_tag, ex, state_host_out, state_count);
}
|
|
|
|
/*
 * Upload host states into a resident bank. state_count selects the variant:
 * BSSN_ESCALAR_STATE_COUNT, BSSN_EM_STATE_COUNT or BSSN_STATE_COUNT (tested
 * in that order); any other value returns after the context lookup without
 * uploading. Each variant obtains its bank, binds the input slots to that
 * bank and uploads state_host_in. After a device synchronize the bank is
 * marked valid, host-clean and current.
 */
static void upload_resident_state_count(void *block_tag, int *ex, double **state_host_in, int state_count)
{
    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
    StepContext &ctx = ensure_step_ctx(block_tag, all);

    const bool is_escalar = (state_count == BSSN_ESCALAR_STATE_COUNT);
    const bool is_em = !is_escalar && (state_count == BSSN_EM_STATE_COUNT);
    const bool is_bssn = !is_escalar && !is_em && (state_count == BSSN_STATE_COUNT);

    int bank = -1;
    if (is_escalar) {
        bank = ensure_escalar_resident_bank(ctx, state_host_in, all, false);
        bind_escalar_state_input_slots(ctx.d_resident[bank]);
        upload_escalar_state_inputs(state_host_in, all);
    } else if (is_em) {
        bank = ensure_em_resident_bank(ctx, state_host_in, all, false);
        bind_em_state_input_slots(ctx.d_resident[bank]);
        upload_em_state_inputs(state_host_in, all);
    } else if (is_bssn) {
        bank = ensure_resident_bank(ctx, state_host_in, all, false);
        bind_state_input_slots(ctx.d_resident[bank]);
        upload_state_inputs(state_host_in, all);
    } else {
        return;
    }

    /* Wait for the upload before publishing the bank. */
    CUDA_CHECK(cudaDeviceSynchronize());

    ctx.resident_valid[bank] = true;
    ctx.resident_age[bank] = ++ctx.resident_clock;
    set_resident_host_clean(ctx, bank, true);
    mark_resident_current_bank(ctx, bank);
    update_state_ready(ctx);
}
|
|
|
|
/*
 * Invalidate every resident bank of a block except the one keyed by
 * state_host_key. No-op when state_count is out of range, the block has no
 * context, or no valid bank matches the key.
 *
 * All banks are reset (valid flag, host keys, host-clean flags, age), the
 * curr/next state pointers, current bank, clock and matter_ready are cleared,
 * and then the surviving bank gets its first state_count keys and host-clean
 * flags back and is marked valid and current. ex is unused.
 */
static void keep_only_resident_state_count(void *block_tag,
                                           int *ex,
                                           double **state_host_key,
                                           int state_count)
{
    (void)ex;

    const bool count_ok = state_count > 0 && state_count <= BSSN_RESIDENT_STATE_CAPACITY;
    if (!count_ok) return;

    auto entry = g_step_ctx.find(block_tag);
    if (entry == g_step_ctx.end()) return;
    StepContext &ctx = entry->second;

    const int survivor = find_resident_bank_count(ctx, state_host_key, state_count);
    if (survivor < 0) return;
    if (!ctx.resident_valid[survivor]) return;

    /* Snapshot the survivor's clean flags before everything is wiped. */
    auto saved_clean = ctx.resident_host_clean[survivor];

    for (int bank = 0; bank < BSSN_RESIDENT_BANK_COUNT; ++bank) {
        ctx.resident_valid[bank] = false;
        ctx.resident_host[bank].fill(nullptr);
        ctx.resident_host_clean[bank].fill(0);
        ctx.resident_age[bank] = 0;
    }
    ctx.d_state_curr_mem = nullptr;
    ctx.d_state_next_mem = nullptr;
    ctx.d_state_curr.fill(nullptr);
    ctx.d_state_next.fill(nullptr);
    ctx.current_bank = -1;
    ctx.resident_clock = 0;
    ctx.matter_ready = false;

    /* Re-install the surviving bank. */
    for (int s = 0; s < state_count; ++s) {
        ctx.resident_host[survivor][s] = state_host_key[s];
        ctx.resident_host_clean[survivor][s] = saved_clean[s] ? 1 : 0;
    }
    ctx.resident_valid[survivor] = true;
    ctx.resident_age[survivor] = ++ctx.resident_clock;
    mark_resident_current_bank(ctx, survivor);
    update_state_ready(ctx);
}
|
|
|
|
/* Defined further down; declared here for the fixed-count wrapper below. */
static bool download_resident_state_count_if_present(void *block_tag,
                                                     int *ex,
                                                     double **state_host_out,
                                                     int state_count);

/*
 * BSSN_STATE_COUNT variant of download_resident_state_count_if_present;
 * returns that function's result unchanged.
 */
static bool download_resident_state_if_present(void *block_tag, int *ex, double **state_host_out)
{
    const bool downloaded =
        download_resident_state_count_if_present(block_tag, ex, state_host_out, BSSN_STATE_COUNT);
    return downloaded;
}
|
|
|
|
/*
 * Download the first state_count resident states into state_host_out, but
 * only if the block already has a context and a valid bank keyed by
 * state_host_out. Never creates a context or a bank.
 *
 * Returns false when state_count is out of range, the block is unknown, or no
 * valid keyed bank exists. Returns true otherwise, including when the states
 * were already flagged host-clean and nothing had to be copied.
 *
 * AMSS_CUDA_DIRECT_STATE_DOWNLOAD (read once, cached): unset or a non-zero
 * integer copies each state straight into its host array; a value that atoi
 * reads as 0 downloads everything into g_buf.h_stage and memcpy's from there.
 */
static bool download_resident_state_count_if_present(void *block_tag,
                                                     int *ex,
                                                     double **state_host_out,
                                                     int state_count)
{
    const bool count_ok = state_count > 0 && state_count <= BSSN_RESIDENT_STATE_CAPACITY;
    if (!count_ok) return false;

    auto entry = g_step_ctx.find(block_tag);
    if (entry == g_step_ctx.end()) return false;
    StepContext &ctx = entry->second;

    const int bank = find_resident_bank_count(ctx, state_host_out, state_count);
    if (bank < 0) return false;
    if (!ctx.resident_valid[bank]) return false;

    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
    const size_t bytes = all * sizeof(double);
    mark_resident_current_bank(ctx, bank);

    /* Host copies already match the device: nothing to transfer. */
    if (resident_host_subset_clean(ctx, bank, state_count, nullptr))
        return true;

    static int direct_download = -1;
    if (direct_download < 0) {
        const char *env = getenv("AMSS_CUDA_DIRECT_STATE_DOWNLOAD");
        if (!env) {
            direct_download = 1;
        } else {
            direct_download = (atoi(env) != 0) ? 1 : 0;
        }
    }

    if (!direct_download) {
        CUDA_CHECK(cudaMemcpy(g_buf.h_stage, ctx.d_resident_mem[bank],
                              (size_t)state_count * bytes,
                              cudaMemcpyDeviceToHost));
        for (int s = 0; s < state_count; ++s) {
            std::memcpy(state_host_out[s], g_buf.h_stage + (size_t)s * all, bytes);
        }
    } else {
        for (int s = 0; s < state_count; ++s) {
            CUDA_CHECK(cudaMemcpyAsync(state_host_out[s], ctx.d_resident[bank][s],
                                       bytes, cudaMemcpyDeviceToHost));
        }
        CUDA_CHECK(cudaDeviceSynchronize());
    }

    set_resident_host_clean(ctx, bank, true);
    return true;
}
|
|
|
|
/*
 * Copy a chosen subset of full states between a resident bank and host
 * arrays, going through the step's device and host comm buffers.
 *
 *   block_tag      key of the StepContext to use (created on demand by
 *                  ensure_step_ctx)
 *   ex             grid extents; every state is ex[0]*ex[1]*ex[2] doubles
 *   subset_count   number of entries in state_indices / state_host
 *   state_indices  state slot for each entry; out-of-range slots are skipped
 *   state_host     host array for each entry; null entries are skipped
 *   kind           cudaMemcpyDeviceToHost downloads; any other value uploads
 *
 * Bank selection: when state_host is given, a bank keyed by it is looked up
 * (full-key lookup if subset_count equals BSSN_RESIDENT_STATE_CAPACITY,
 * subset lookup otherwise). A download ignores a bank that is missing or not
 * valid. Without a match, active_or_keyed_bank picks the bank.
 *
 * Downloads skip states already flagged host-clean. Every state that is
 * actually copied is flagged host-clean afterwards, in both directions; an
 * upload also marks the bank valid and current.
 */
static void copy_state_subset(void *block_tag,
                              int *ex,
                              int subset_count,
                              const int *state_indices,
                              double **state_host,
                              cudaMemcpyKind kind)
{
    if (subset_count <= 0) return;
    /* Without an index list nothing can be selected (this used to be a null
     * dereference in the gather loop). */
    if (!state_indices) return;

    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
    const size_t bytes = all * sizeof(double);
    StepContext &ctx = ensure_step_ctx(block_tag, all);
    double **full_key = (subset_count == BSSN_RESIDENT_STATE_CAPACITY) ? state_host : nullptr;

    int bank = -1;
    if (state_host) {
        if (full_key) {
            bank = (subset_count == BSSN_ESCALAR_STATE_COUNT)
                       ? find_resident_bank_count(ctx, full_key, BSSN_ESCALAR_STATE_COUNT)
                       : find_resident_bank(ctx, full_key);
        } else {
            bank = find_resident_bank_subset(ctx, state_host, state_indices, subset_count);
        }
        if (kind == cudaMemcpyDeviceToHost &&
            (bank < 0 || !ctx.resident_valid[bank])) {
            bank = -1;
        }
    }
    if (bank < 0) {
        bank = active_or_keyed_bank(ctx, full_key, all,
                                    kind == cudaMemcpyHostToDevice,
                                    subset_count);
    } else {
        mark_resident_current_bank(ctx, bank);
    }
    double *base_mem = ctx.d_resident_mem[bank];

    /* Gather the entries that really need a transfer. */
    int active_state_indices[BSSN_RESIDENT_STATE_CAPACITY];
    double *active_state_host[BSSN_RESIDENT_STATE_CAPACITY];
    int active_count = 0;

    for (int i = 0; i < subset_count; ++i) {
        const int state_index = state_indices[i];
        if (state_index < 0 || state_index >= BSSN_RESIDENT_STATE_CAPACITY) continue;
        if (kind == cudaMemcpyDeviceToHost &&
            ctx.resident_host_clean[bank][state_index])
            continue;
        /* state_host itself may be null (see the bank lookup above). */
        if (!state_host || !state_host[i]) continue;
        /* The scratch arrays hold at most CAPACITY entries; a longer list
         * (only possible with repeated indices) used to overflow the stack. */
        if (active_count >= BSSN_RESIDENT_STATE_CAPACITY) break;
        active_state_indices[active_count] = state_index;
        active_state_host[active_count] = state_host[i];
        ++active_count;
    }
    if (active_count <= 0) return;

    const size_t total_doubles = (size_t)active_count * all;
    double *d_comm = ensure_step_comm_buffer(ctx, total_doubles);
    double *h_comm = ensure_step_host_comm_buffer(ctx, total_doubles);

    /* The kernels read the slot -> state mapping from d_subset_state_indices. */
    CUDA_CHECK(cudaMemcpyToSymbol(d_subset_state_indices, active_state_indices,
                                  (size_t)active_count * sizeof(int),
                                  0, cudaMemcpyHostToDevice));

    /* x covers the cells of one state, y enumerates the active slots. */
    const dim3 launch_grid((unsigned int)grid(all), (unsigned int)active_count);

    if (kind == cudaMemcpyDeviceToHost) {
        kern_pack_state_subset<<<launch_grid, BLK>>>(
            base_mem, d_comm, active_count, (int)all);
        /* A rejected launch is only reported through cudaGetLastError();
         * without this check stale buffer contents would be copied out and
         * flagged clean. */
        CUDA_CHECK(cudaGetLastError());
        CUDA_CHECK(cudaMemcpy(h_comm, d_comm,
                              total_doubles * sizeof(double),
                              cudaMemcpyDeviceToHost));
        for (int i = 0; i < active_count; ++i) {
            std::memcpy(active_state_host[i],
                        h_comm + (size_t)i * all,
                        bytes);
        }
        mark_resident_host_subset_clean(ctx, bank, active_count,
                                        active_state_indices, true);
    } else {
        for (int i = 0; i < active_count; ++i) {
            std::memcpy(h_comm + (size_t)i * all,
                        active_state_host[i],
                        bytes);
        }
        CUDA_CHECK(cudaMemcpy(d_comm, h_comm,
                              total_doubles * sizeof(double),
                              cudaMemcpyHostToDevice));
        kern_unpack_state_subset<<<launch_grid, BLK>>>(
            base_mem, d_comm, active_count, (int)all);
        /* Do not mark the bank valid if the unpack launch was rejected. */
        CUDA_CHECK(cudaGetLastError());
        ctx.resident_valid[bank] = true;
        ctx.resident_age[bank] = ++ctx.resident_clock;
        mark_resident_current_bank(ctx, bank);
        mark_resident_host_subset_clean(ctx, bank, active_count,
                                        active_state_indices, true);
        update_state_ready(ctx);
    }
}
|
|
|
|
/*
 * True when the block has a step context and any_resident_bank_valid accepts
 * it; false when the block is unknown. Does not create a context.
 */
static bool has_resident_state(void *block_tag)
{
    const auto entry = g_step_ctx.find(block_tag);
    if (entry == g_step_ctx.end()) {
        return false;
    }
    return any_resident_bank_valid(entry->second);
}
|
|
|
|
/* ================================================================== */
|
|
/* Main host function — drop-in replacement for bssn_rhs_c.C */
|
|
/* ================================================================== */
|
|
|
|
extern "C"
|
|
int f_compute_rhs_bssn(int *ex, double &T,
|
|
double *X, double *Y, double *Z,
|
|
double *chi, double *trK,
|
|
double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
|
|
double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
|
|
double *Gamx, double *Gamy, double *Gamz,
|
|
double *Lap, double *betax, double *betay, double *betaz,
|
|
double *dtSfx, double *dtSfy, double *dtSfz,
|
|
double *chi_rhs, double *trK_rhs,
|
|
double *gxx_rhs, double *gxy_rhs, double *gxz_rhs,
|
|
double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
|
|
double *Axx_rhs, double *Axy_rhs, double *Axz_rhs,
|
|
double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
|
|
double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
|
|
double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
|
|
double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
|
|
double *rho, double *Sx, double *Sy, double *Sz,
|
|
double *Sxx, double *Sxy_m, double *Sxz, double *Syy, double *Syz_m, double *Szz,
|
|
double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy,
|
|
double *Gamxyz, double *Gamxzz,
|
|
double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy,
|
|
double *Gamyyz, double *Gamyzz,
|
|
double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy,
|
|
double *Gamzyz, double *Gamzzz,
|
|
double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
|
|
double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
|
|
double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
|
|
int &Symmetry, int &Lev, double &eps, int &co)
|
|
{
|
|
/* --- Multi-GPU: select device --- */
|
|
init_gpu_dispatch();
|
|
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
|
|
|
|
const int nx = ex[0], ny = ex[1], nz = ex[2];
|
|
const int all = nx * ny * nz;
|
|
const double SYM = 1.0, ANTI = -1.0;
|
|
|
|
setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);
|
|
|
|
/* --- Shorthand for device slot pointers --- */
|
|
#define D(s) g_buf.slot[s]
|
|
const size_t bytes = (size_t)all * sizeof(double);
|
|
|
|
/* --- H2D: stage all inputs, then one bulk copy --- */
|
|
double *h2d_src[] = {
|
|
chi, trK, dxx, gxy, gxz, dyy, gyz, dzz,
|
|
Axx, Axy, Axz, Ayy, Ayz, Azz,
|
|
Gamx, Gamy, Gamz,
|
|
Lap, betax, betay, betaz,
|
|
dtSfx, dtSfy, dtSfz,
|
|
rho, Sx, Sy, Sz,
|
|
Sxx, Sxy_m, Sxz, Syy, Syz_m, Szz
|
|
};
|
|
static_assert((int)(sizeof(h2d_src) / sizeof(h2d_src[0])) == H2D_INPUT_SLOT_COUNT,
|
|
"h2d_src list must match H2D_INPUT_SLOT_COUNT");
|
|
for (int s = 0; s < H2D_INPUT_SLOT_COUNT; ++s) {
|
|
std::memcpy(g_buf.h_stage + (size_t)s * all, h2d_src[s], bytes);
|
|
}
|
|
CUDA_CHECK(cudaMemcpy(D(S_chi), g_buf.h_stage,
|
|
(size_t)H2D_INPUT_SLOT_COUNT * bytes,
|
|
cudaMemcpyHostToDevice));
|
|
|
|
/* ============================================================ */
|
|
/* Phase 1: prep — alpn1, chin1, gxx, gyy, gzz */
|
|
/* ============================================================ */
|
|
kern_phase1_prep<<<grid(all),BLK>>>(
|
|
D(S_Lap), D(S_chi), D(S_dxx), D(S_dyy), D(S_dzz),
|
|
D(S_alpn1), D(S_chin1), D(S_gxx), D(S_gyy), D(S_gzz));
|
|
|
|
/* 12x fderivs */
|
|
gpu_fderivs(D(S_betax), D(S_betaxx),D(S_betaxy),D(S_betaxz), ANTI,SYM,SYM, all);
|
|
gpu_fderivs(D(S_betay), D(S_betayx),D(S_betayy),D(S_betayz), SYM,ANTI,SYM, all);
|
|
gpu_fderivs(D(S_betaz), D(S_betazx),D(S_betazy),D(S_betazz), SYM,SYM,ANTI, all);
|
|
gpu_fderivs(D(S_chi), D(S_chix),D(S_chiy),D(S_chiz), SYM,SYM,SYM, all);
|
|
gpu_fderivs(D(S_dxx), D(S_gxxx),D(S_gxxy),D(S_gxxz), SYM,SYM,SYM, all);
|
|
gpu_fderivs(D(S_gxy), D(S_gxyx),D(S_gxyy),D(S_gxyz), ANTI,ANTI,SYM, all);
|
|
gpu_fderivs(D(S_gxz), D(S_gxzx),D(S_gxzy),D(S_gxzz), ANTI,SYM,ANTI, all);
|
|
gpu_fderivs(D(S_dyy), D(S_gyyx),D(S_gyyy),D(S_gyyz), SYM,SYM,SYM, all);
|
|
gpu_fderivs(D(S_gyz), D(S_gyzx),D(S_gyzy),D(S_gyzz), SYM,ANTI,ANTI, all);
|
|
gpu_fderivs(D(S_dzz), D(S_gzzx),D(S_gzzy),D(S_gzzz), SYM,SYM,SYM, all);
|
|
gpu_fderivs(D(S_Lap), D(S_Lapx),D(S_Lapy),D(S_Lapz), SYM,SYM,SYM, all);
|
|
gpu_fderivs(D(S_trK), D(S_Kx),D(S_Ky),D(S_Kz), SYM,SYM,SYM, all);
|
|
|
|
/* ============================================================ */
|
|
/* Phase 2: metric RHS + inverse */
|
|
/* ============================================================ */
|
|
kern_phase2_metric_rhs<<<grid(all),BLK>>>(
|
|
D(S_alpn1), D(S_chin1),
|
|
D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
|
|
D(S_trK),
|
|
D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz),
|
|
D(S_betaxx), D(S_betaxy), D(S_betaxz),
|
|
D(S_betayx), D(S_betayy), D(S_betayz),
|
|
D(S_betazx), D(S_betazy), D(S_betazz),
|
|
D(S_chi_rhs), D(S_gxx_rhs), D(S_gyy_rhs), D(S_gzz_rhs),
|
|
D(S_gxy_rhs), D(S_gyz_rhs), D(S_gxz_rhs));
|
|
|
|
kern_phase2_inverse<<<grid(all),BLK>>>(
|
|
D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
|
|
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
|
D(S_gupyy), D(S_gupyz), D(S_gupzz));
|
|
|
|
/* Phase 3: Gamma constraint (co==0) */
|
|
if (co == 0) {
|
|
kern_phase3_gamma_constraint<<<grid(all),BLK>>>(
|
|
D(S_Gamx), D(S_Gamy), D(S_Gamz),
|
|
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
|
D(S_gupyy), D(S_gupyz), D(S_gupzz),
|
|
D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), D(S_gyzx), D(S_gzzx),
|
|
D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), D(S_gyzy), D(S_gzzy),
|
|
D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), D(S_gyzz), D(S_gzzz),
|
|
D(S_Gmx_Res), D(S_Gmy_Res), D(S_Gmz_Res));
|
|
}
|
|
|
|
/* Phase 4: Christoffel symbols */
|
|
kern_phase4_christoffel<<<grid(all),BLK>>>(
|
|
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
|
D(S_gupyy), D(S_gupyz), D(S_gupzz),
|
|
D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), D(S_gyzx), D(S_gzzx),
|
|
D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), D(S_gyzy), D(S_gzzy),
|
|
D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), D(S_gyzz), D(S_gzzz),
|
|
D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
|
|
D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
|
|
D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
|
|
D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
|
|
D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
|
|
D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz));
|
|
|
|
/* Phase 5+6: raise A in registers, then build Gamma_rhs part 1 */
|
|
kern_phase5_6_gamma_rhs_part1_fused<<<grid(all),BLK>>>(
|
|
D(S_Lapx), D(S_Lapy), D(S_Lapz),
|
|
D(S_alpn1), D(S_chin1),
|
|
D(S_chix), D(S_chiy), D(S_chiz),
|
|
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
|
D(S_gupyy), D(S_gupyz), D(S_gupzz),
|
|
D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz),
|
|
D(S_Kx), D(S_Ky), D(S_Kz),
|
|
D(S_Sx), D(S_Sy), D(S_Sz),
|
|
D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
|
|
D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
|
|
D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
|
|
D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
|
|
D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
|
|
D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
|
|
D(S_Gamx_rhs), D(S_Gamy_rhs), D(S_Gamz_rhs));
|
|
|
|
/* Phase 7: fdderivs(beta) + fderivs(Gamma) */
|
|
gpu_fdderivs(D(S_betax), D(S_gxxx),D(S_gxyx),D(S_gxzx),
|
|
D(S_gyyx),D(S_gyzx),D(S_gzzx), ANTI,SYM,SYM, all);
|
|
gpu_fdderivs(D(S_betay), D(S_gxxy),D(S_gxyy),D(S_gxzy),
|
|
D(S_gyyy),D(S_gyzy),D(S_gzzy), SYM,ANTI,SYM, all);
|
|
gpu_fdderivs(D(S_betaz), D(S_gxxz),D(S_gxyz),D(S_gxzz),
|
|
D(S_gyyz),D(S_gyzz),D(S_gzzz), SYM,SYM,ANTI, all);
|
|
gpu_fderivs(D(S_Gamx), D(S_Gamxx),D(S_Gamxy),D(S_Gamxz), ANTI,SYM,SYM, all);
|
|
gpu_fderivs(D(S_Gamy), D(S_Gamyx),D(S_Gamyy_t),D(S_Gamyz_t), SYM,ANTI,SYM, all);
|
|
gpu_fderivs(D(S_Gamz), D(S_Gamzx),D(S_Gamzy),D(S_Gamzz_t), SYM,SYM,ANTI, all);
|
|
|
|
/* Phase 8: Gamma_rhs part 2 */
|
|
kern_phase8_9_gamma_rhs_contract_fused<<<grid(all),BLK>>>(
|
|
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
|
D(S_gupyy), D(S_gupyz), D(S_gupzz),
|
|
D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx),
|
|
D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy),
|
|
D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz),
|
|
D(S_Gamxxx),D(S_Gamxxy),D(S_Gamxxz),
|
|
D(S_Gamxyy),D(S_Gamxyz),D(S_Gamxzz),
|
|
D(S_Gamyxx),D(S_Gamyxy),D(S_Gamyxz),
|
|
D(S_Gamyyy),D(S_Gamyyz),D(S_Gamyzz),
|
|
D(S_Gamzxx),D(S_Gamzxy),D(S_Gamzxz),
|
|
D(S_Gamzyy),D(S_Gamzyz),D(S_Gamzzz),
|
|
D(S_betaxx),D(S_betaxy),D(S_betaxz),
|
|
D(S_betayx),D(S_betayy),D(S_betayz),
|
|
D(S_betazx),D(S_betazy),D(S_betazz),
|
|
D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz),
|
|
D(S_Gamx_rhs),D(S_Gamy_rhs),D(S_Gamz_rhs),
|
|
D(S_Gamxa),D(S_Gamya),D(S_Gamza),
|
|
D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx),
|
|
D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy),
|
|
D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz));
|
|
|
|
/* Phase 10: 6x fdderivs(metric) + Ricci contract */
|
|
{
|
|
double *src_fields[] = {D(S_dxx), D(S_dyy), D(S_dzz), D(S_gxy), D(S_gxz), D(S_gyz)};
|
|
double *dst_fields[] = {D(S_Rxx), D(S_Ryy), D(S_Rzz), D(S_Rxy), D(S_Rxz), D(S_Ryz)};
|
|
const int soa_signs[] = {
|
|
(int)SYM, (int)SYM, (int)SYM,
|
|
(int)SYM, (int)SYM, (int)SYM,
|
|
(int)SYM, (int)SYM, (int)SYM,
|
|
(int)ANTI, (int)ANTI, (int)SYM,
|
|
(int)ANTI, (int)SYM, (int)ANTI,
|
|
(int)SYM, (int)ANTI, (int)ANTI
|
|
};
|
|
gpu_phase10_ricci_batch(D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
|
D(S_gupyy), D(S_gupyz), D(S_gupzz),
|
|
src_fields, dst_fields, soa_signs, all);
|
|
}
|
|
|
|
/* Phase 11: fused Ricci assembly */
|
|
kern_phase11_ricci_fused<<<grid(all),BLK>>>(
|
|
D(S_gxx),D(S_gxy),D(S_gxz),D(S_gyy),D(S_gyz),D(S_gzz),
|
|
D(S_gupxx),D(S_gupxy),D(S_gupxz),D(S_gupyy),D(S_gupyz),D(S_gupzz),
|
|
D(S_Gamxa),D(S_Gamya),D(S_Gamza),
|
|
D(S_Gamxx),D(S_Gamxy),D(S_Gamxz),
|
|
D(S_Gamyx),D(S_Gamyy_t),D(S_Gamyz_t),
|
|
D(S_Gamzx),D(S_Gamzy),D(S_Gamzz_t),
|
|
D(S_Gamxxx),D(S_Gamxxy),D(S_Gamxxz),
|
|
D(S_Gamxyy),D(S_Gamxyz),D(S_Gamxzz),
|
|
D(S_Gamyxx),D(S_Gamyxy),D(S_Gamyxz),
|
|
D(S_Gamyyy),D(S_Gamyyz),D(S_Gamyzz),
|
|
D(S_Gamzxx),D(S_Gamzxy),D(S_Gamzxz),
|
|
D(S_Gamzyy),D(S_Gamzyz),D(S_Gamzzz),
|
|
D(S_gxxx),D(S_gxyx),D(S_gxzx),D(S_gyyx),D(S_gyzx),D(S_gzzx),
|
|
D(S_gxxy),D(S_gxyy),D(S_gxzy),D(S_gyyy),D(S_gyzy),D(S_gzzy),
|
|
D(S_gxxz),D(S_gxyz),D(S_gxzz),D(S_gyyz),D(S_gyzz),D(S_gzzz),
|
|
D(S_Rxx),D(S_Rxy),D(S_Rxz),
|
|
D(S_Ryy),D(S_Ryz),D(S_Rzz));
|
|
|
|
/* ============================================================ */
|
|
/* Phase 12/13: chi fdderivs + chi correction */
|
|
/* ============================================================ */
|
|
kern_phase12_13_chi_correction_fused<<<grid((size_t)all),BLK>>>(
|
|
D(S_chi), D(S_chin1),
|
|
D(S_chix), D(S_chiy), D(S_chiz),
|
|
D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
|
|
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
|
D(S_gupyy), D(S_gupyz), D(S_gupzz),
|
|
D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
|
|
D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
|
|
D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
|
|
D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
|
|
D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
|
|
D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
|
|
D(S_Rxx), D(S_Rxy), D(S_Rxz),
|
|
D(S_Ryy), D(S_Ryz), D(S_Rzz));
|
|
|
|
/* ============================================================ */
|
|
/* Phase 14/15: fused trK_rhs, Aij_rhs, gauge */
|
|
/* ============================================================ */
|
|
kern_phase15_trK_Aij_gauge<<<grid(all),BLK>>>(
|
|
D(S_alpn1), D(S_chin1),
|
|
D(S_chix), D(S_chiy), D(S_chiz),
|
|
D(S_gxx), D(S_gxy), D(S_gxz), D(S_gyy), D(S_gyz), D(S_gzz),
|
|
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
|
D(S_gupyy), D(S_gupyz), D(S_gupzz),
|
|
D(S_trK),
|
|
D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz),
|
|
D(S_Lapx), D(S_Lapy), D(S_Lapz),
|
|
D(S_betaxx), D(S_betaxy), D(S_betaxz),
|
|
D(S_betayx), D(S_betayy), D(S_betayz),
|
|
D(S_betazx), D(S_betazy), D(S_betazz),
|
|
D(S_rho),
|
|
D(S_Sx), D(S_Sy), D(S_Sz),
|
|
D(S_Sxx), D(S_Sxy), D(S_Sxz), D(S_Syy), D(S_Syz), D(S_Szz),
|
|
D(S_dtSfx), D(S_dtSfy), D(S_dtSfz),
|
|
D(S_Rxx), D(S_Rxy), D(S_Rxz), D(S_Ryy), D(S_Ryz), D(S_Rzz),
|
|
D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
|
|
D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
|
|
D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
|
|
D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
|
|
D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
|
|
D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
|
|
D(S_dtSfx_rhs), D(S_dtSfy_rhs), D(S_dtSfz_rhs),
|
|
D(S_trK_rhs),
|
|
D(S_Axx_rhs), D(S_Axy_rhs), D(S_Axz_rhs),
|
|
D(S_Ayy_rhs), D(S_Ayz_rhs), D(S_Azz_rhs),
|
|
D(S_Lap_rhs),
|
|
D(S_betax_rhs), D(S_betay_rhs), D(S_betaz_rhs),
|
|
D(S_Gamx_rhs), D(S_Gamy_rhs), D(S_Gamz_rhs),
|
|
D(S_f_arr), D(S_S_arr));
|
|
|
|
/* ============================================================ */
|
|
/* Phase 16/17: advection + KO dissipation (shared ord=3 pack) */
|
|
/* ============================================================ */
|
|
gpu_lopsided_kodis_state_batch(eps, all);
|
|
|
|
/* ============================================================ */
|
|
/* Phase 18: Hamilton & momentum constraints (co==0) */
|
|
/* ============================================================ */
|
|
if (co == 0) {
|
|
/* 6x fderivs on Aij — reuse gxxx..gzzz slots for dA/dx output */
|
|
gpu_fderivs(D(S_Axx), D(S_gxxx),D(S_gxxy),D(S_gxxz), SYM,SYM,SYM, all);
|
|
gpu_fderivs(D(S_Axy), D(S_gxyx),D(S_gxyy),D(S_gxyz), ANTI,ANTI,SYM, all);
|
|
gpu_fderivs(D(S_Axz), D(S_gxzx),D(S_gxzy),D(S_gxzz), ANTI,SYM,ANTI, all);
|
|
gpu_fderivs(D(S_Ayy), D(S_gyyx),D(S_gyyy),D(S_gyyz), SYM,SYM,SYM, all);
|
|
gpu_fderivs(D(S_Ayz), D(S_gyzx),D(S_gyzy),D(S_gyzz), SYM,ANTI,ANTI, all);
|
|
gpu_fderivs(D(S_Azz), D(S_gzzx),D(S_gzzy),D(S_gzzz), SYM,SYM,SYM, all);
|
|
|
|
kern_phase18_constraints<<<grid(all),BLK>>>(
|
|
D(S_chin1),
|
|
D(S_chix), D(S_chiy), D(S_chiz),
|
|
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
|
D(S_gupyy), D(S_gupyz), D(S_gupzz),
|
|
D(S_trK),
|
|
D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz),
|
|
D(S_Rxx), D(S_Rxy), D(S_Rxz), D(S_Ryy), D(S_Ryz), D(S_Rzz),
|
|
D(S_rho), D(S_Sx), D(S_Sy), D(S_Sz),
|
|
D(S_Kx), D(S_Ky), D(S_Kz),
|
|
D(S_Gamxxx), D(S_Gamxxy), D(S_Gamxxz),
|
|
D(S_Gamxyy), D(S_Gamxyz), D(S_Gamxzz),
|
|
D(S_Gamyxx), D(S_Gamyxy), D(S_Gamyxz),
|
|
D(S_Gamyyy), D(S_Gamyyz), D(S_Gamyzz),
|
|
D(S_Gamzxx), D(S_Gamzxy), D(S_Gamzxz),
|
|
D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
|
|
/* dA/dx arrays */
|
|
D(S_gxxx), D(S_gxxy), D(S_gxxz),
|
|
D(S_gxyx), D(S_gxyy), D(S_gxyz),
|
|
D(S_gxzx), D(S_gxzy), D(S_gxzz),
|
|
D(S_gyyx), D(S_gyyy), D(S_gyyz),
|
|
D(S_gyzx), D(S_gyzy), D(S_gyzz),
|
|
D(S_gzzx), D(S_gzzy), D(S_gzzz),
|
|
D(S_ham_Res), D(S_movx_Res), D(S_movy_Res), D(S_movz_Res));
|
|
}
|
|
|
|
/* ============================================================ */
|
|
/* D2H: copy all output arrays back to host */
|
|
/* ============================================================ */
|
|
const int d2h_slot_count = D2H_BASE_SLOT_COUNT +
|
|
((co == 0) ? D2H_CONSTRAINT_SLOT_COUNT : 0);
|
|
CUDA_CHECK(cudaMemcpy(g_buf.h_stage, D(S_chi_rhs),
|
|
(size_t)d2h_slot_count * bytes,
|
|
cudaMemcpyDeviceToHost));
|
|
|
|
double *d2h_dst[] = {
|
|
chi_rhs, trK_rhs,
|
|
gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs,
|
|
Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs,
|
|
Gamx_rhs, Gamy_rhs, Gamz_rhs,
|
|
Lap_rhs, betax_rhs, betay_rhs, betaz_rhs,
|
|
dtSfx_rhs, dtSfy_rhs, dtSfz_rhs,
|
|
Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
|
|
Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
|
|
Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
|
|
Rxx, Rxy, Rxz, Ryy, Ryz, Rzz
|
|
};
|
|
static_assert((int)(sizeof(d2h_dst) / sizeof(d2h_dst[0])) == D2H_BASE_SLOT_COUNT,
|
|
"d2h_dst list must match D2H_BASE_SLOT_COUNT");
|
|
for (int s = 0; s < D2H_BASE_SLOT_COUNT; ++s) {
|
|
std::memcpy(d2h_dst[s], g_buf.h_stage + (size_t)s * all, bytes);
|
|
}
|
|
if (co == 0) {
|
|
double *d2h_dst_co[] = {
|
|
ham_Res, movx_Res, movy_Res, movz_Res, Gmx_Res, Gmy_Res, Gmz_Res
|
|
};
|
|
static_assert((int)(sizeof(d2h_dst_co) / sizeof(d2h_dst_co[0])) ==
|
|
D2H_CONSTRAINT_SLOT_COUNT,
|
|
"d2h_dst_co list must match D2H_CONSTRAINT_SLOT_COUNT");
|
|
for (int s = 0; s < D2H_CONSTRAINT_SLOT_COUNT; ++s) {
|
|
std::memcpy(d2h_dst_co[s],
|
|
g_buf.h_stage + (size_t)(D2H_BASE_SLOT_COUNT + s) * all,
|
|
bytes);
|
|
}
|
|
}
|
|
|
|
#undef D
|
|
return 0;
|
|
}
|
|
|
|
/*
 * bssn_cuda_rk4_substep — advance one block by a single RK4 stage on the GPU.
 *
 * Stages, in order:
 *   1. state    : bind resident banks or upload state_host_in; optionally run
 *                 kern_enforce_ga_cuda on the metric / Aij input slots
 *   2. matter   : refresh the matter cache; when RK4 == 0 also snapshot the
 *                 input state into ctx.d_state0_mem
 *   3. rhs      : launch_rhs_pipeline
 *   4. bc       : gpu_sommerfeld_routbam per state variable (apply_bam_bc != 0)
 *   5. finalize : gpu_rk4_finalize_batch, plus gpu_copy_patch_boundary_batch
 *                 when Lev > 0
 *   6. output   : promote the output bank (resident mode) or download the
 *                 result into state_host_out
 *
 * When profiling is enabled each stage is synchronised and its wall time is
 * accumulated into the global CudaProfileStats.
 *
 * Returns 1 when RK4 is outside [0, 3], otherwise 0.  T is accepted for
 * interface compatibility but is not used.
 */
extern "C"
int bssn_cuda_rk4_substep(void *block_tag,
                          int *ex, double *X, double *Y, double *Z,
                          double **state_host_in,
                          double **state_host_out,
                          double **matter_host,
                          const double *propspeed,
                          const double *soa_flat,
                          const double *bbox,
                          double &dT,
                          double &T,
                          int &RK4,
                          int &apply_bam_bc,
                          int &Symmetry,
                          int &Lev,
                          double &eps,
                          int &co,
                          int &use_zero_matter,
                          int &keep_resident_state,
                          int &apply_enforce_ga,
                          double &chitiny)
{
  (void)T;

  if (RK4 < 0 || RK4 > 3) return 1;

  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
  const bool profile = cuda_profile_enabled();
  const double t_total0 = profile ? cuda_profile_now_ms() : 0.0;
  double state_ms = 0.0;
  double matter_ms = 0.0;
  double rhs_ms = 0.0;
  double bc_ms = 0.0;
  double finalize_ms = 0.0;
  double output_ms = 0.0;

  /* Per-stage timing helpers; both are no-ops unless profiling is enabled. */
  double t0 = 0.0;
  auto phase_begin = [&]() {
    t0 = profile ? cuda_profile_now_ms() : 0.0;
  };
  auto phase_end = [&](double &bucket) {
    if (!profile) return;
    cuda_profile_sync();
    bucket += cuda_profile_now_ms() - t0;
  };

  const size_t all = (size_t)ex[0] * ex[1] * ex[2];
  const size_t bytes = all * sizeof(double);
  int touch_xmin = 0, touch_xmax = 0;
  int touch_ymin = 0, touch_ymax = 0;
  int touch_zmin = 0, touch_zmax = 0;

  setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);
  if (Lev > 0) {
    compute_patch_boundary_flags(ex, X, Y, Z, bbox, Symmetry,
                                 touch_xmin, touch_xmax,
                                 touch_ymin, touch_ymax,
                                 touch_zmin, touch_zmax);
  }

  StepContext &ctx = ensure_step_ctx(block_tag, all);
  const bool use_resident_state = (keep_resident_state != 0);
  int input_bank = -1;
  int output_bank = -1;
  if (use_resident_state) {
    input_bank = ensure_resident_bank(ctx, state_host_in, all, true);
    output_bank = reserve_resident_output_bank(ctx, state_host_out, all, input_bank);
    mark_resident_current_bank(ctx, input_bank);
    mark_resident_next_bank(ctx, output_bank);
    bind_state_input_slots(ctx.d_resident[input_bank]);
    bind_state_output_slots(ctx.d_resident[output_bank]);
  }

  /* ---- Stage 1: state upload / constraint enforcement ---- */
  phase_begin();
  if (!use_resident_state) {
    upload_state_inputs(state_host_in, all);
  }
  if (apply_enforce_ga) {
    kern_enforce_ga_cuda<<<grid(all), BLK>>>(
        g_buf.slot[S_dxx], g_buf.slot[S_gxy], g_buf.slot[S_gxz],
        g_buf.slot[S_dyy], g_buf.slot[S_gyz], g_buf.slot[S_dzz],
        g_buf.slot[S_Axx], g_buf.slot[S_Axy], g_buf.slot[S_Axz],
        g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]);
    /* A kernel launch reports configuration errors only through the
       runtime's last-error slot, so query it explicitly. */
    CUDA_CHECK(cudaGetLastError());
    /* The device copy was modified in place, so the host mirror of the
       input bank is no longer clean. */
    if (use_resident_state && input_bank >= 0)
      set_resident_host_clean(ctx, input_bank, false);
  }
  phase_end(state_ms);

  /* ---- Stage 2: matter cache (+ state0 snapshot on the first stage) ---- */
  phase_begin();
  if (!use_zero_matter) {
    upload_matter_cache(ctx, matter_host, all);
  } else if (!ctx.matter_ready) {
    zero_matter_cache(ctx, all);
  }
  if (RK4 == 0) {
    const double *state0_src = use_resident_state
                                   ? ctx.d_resident_mem[input_bank]
                                   : g_buf.slot[S_chi];
    CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, state0_src,
                          (size_t)BSSN_STATE_COUNT * bytes,
                          cudaMemcpyDeviceToDevice));
  }
  bind_matter_slots(ctx);
  phase_end(matter_ms);

  /* ---- Stage 3: right-hand side ---- */
  phase_begin();
  launch_rhs_pipeline((int)all, eps, co);
  phase_end(rhs_ms);

  /* ---- Stage 4: Sommerfeld boundary condition ---- */
  phase_begin();
  if (apply_bam_bc) {
    for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
      gpu_sommerfeld_routbam(g_buf.slot[k_state_input_slots[i]],
                             g_buf.slot[k_state_rhs_slots[i]],
                             propspeed[i],
                             soa_flat[3 * i + 0],
                             soa_flat[3 * i + 1],
                             soa_flat[3 * i + 2],
                             X, Y, Z, bbox, Symmetry);
    }
  }
  phase_end(bc_ms);

  /* ---- Stage 5: RK4 combine + patch boundary copy ---- */
  phase_begin();
  gpu_rk4_finalize_batch(ctx, all, dT, RK4, chitiny);
  if (Lev > 0) {
    gpu_copy_patch_boundary_batch((int)all,
                                  touch_xmin, touch_xmax,
                                  touch_ymin, touch_ymax,
                                  touch_zmin, touch_zmax);
  }
  phase_end(finalize_ms);

  /* ---- Stage 6: publish the result ---- */
  phase_begin();
  if (use_resident_state) {
    ctx.resident_valid[output_bank] = true;
    ctx.resident_age[output_bank] = ++ctx.resident_clock;
    set_resident_host_clean(ctx, output_bank, false);
    mark_resident_current_bank(ctx, output_bank);
    update_state_ready(ctx);
  } else {
    download_state_outputs(state_host_out, all);
  }
  if (RK4 == 3) {
    ctx.matter_ready = false;  /* invalidate matter cache for next timestep */
  }
  phase_end(output_ms);

  if (profile) {
    CudaProfileStats &stats = cuda_profile_stats();
    stats.calls++;
    stats.total_ms += cuda_profile_now_ms() - t_total0;
    stats.state_ms += state_ms;
    stats.matter_ms += matter_ms;
    stats.rhs_ms += rhs_ms;
    stats.bc_ms += bc_ms;
    stats.finalize_ms += finalize_ms;
    stats.output_ms += output_ms;
    cuda_profile_maybe_log();
  }
  return 0;
}
|
|
|
|
/*
 * bssn_em_cuda_rk4_substep — one RK4 stage of the coupled BSSN + EM system.
 *
 * Stages, in order:
 *   1. state    : bind resident banks or upload state_host_in, upload the
 *                 fixed EM sources, optionally run kern_enforce_ga_cuda;
 *                 when RK4 == 0 snapshot the input state into ctx.d_state0
 *   2. em       : either the "zero fast" path (zeroed matter cache and EM
 *                 output slots) or gpu_em_rhs_sources
 *   3. rhs      : launch_rhs_pipeline
 *   4. bc       : gpu_sommerfeld_routbam per EM-state variable
 *   5. finalize : RK4 combine and, when Lev > 0, patch boundary handling;
 *                 the zero-fast path uses the plain BSSN variants
 *   6. output   : promote the output bank or download to state_host_out
 *
 * The zero-fast path is only considered in resident mode and is selected by
 * em_detect_zero_fast_path.  EM source time is reported in the profile's
 * matter_ms bucket.
 *
 * Returns 1 when RK4 is outside [0, 3], otherwise 0.  T is not used.
 */
extern "C"
int bssn_em_cuda_rk4_substep(void *block_tag,
                             int *ex, double *X, double *Y, double *Z,
                             double **state_host_in,
                             double **state_host_out,
                             double **source_host,
                             const double *propspeed,
                             const double *soa_flat,
                             const double *bbox,
                             double &dT,
                             double &T,
                             int &RK4,
                             int &apply_bam_bc,
                             int &Symmetry,
                             int &Lev,
                             double &eps,
                             int &co,
                             int &keep_resident_state,
                             int &apply_enforce_ga,
                             double &chitiny)
{
  (void)T;
  if (RK4 < 0 || RK4 > 3) return 1;

  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
  const bool profile = cuda_profile_enabled();
  const double t_total0 = profile ? cuda_profile_now_ms() : 0.0;
  double state_ms = 0.0;
  double em_ms = 0.0;
  double rhs_ms = 0.0;
  double bc_ms = 0.0;
  double finalize_ms = 0.0;
  double output_ms = 0.0;

  /* Per-stage timing helpers; both are no-ops unless profiling is enabled. */
  double t0 = 0.0;
  auto phase_begin = [&]() {
    t0 = profile ? cuda_profile_now_ms() : 0.0;
  };
  auto phase_end = [&](double &bucket) {
    if (!profile) return;
    cuda_profile_sync();
    bucket += cuda_profile_now_ms() - t0;
  };

  const size_t all = (size_t)ex[0] * ex[1] * ex[2];
  const size_t bytes = all * sizeof(double);
  int touch_xmin = 0, touch_xmax = 0;
  int touch_ymin = 0, touch_ymax = 0;
  int touch_zmin = 0, touch_zmax = 0;

  setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);
  if (Lev > 0) {
    compute_patch_boundary_flags(ex, X, Y, Z, bbox, Symmetry,
                                 touch_xmin, touch_xmax,
                                 touch_ymin, touch_ymax,
                                 touch_zmin, touch_zmax);
  }

  StepContext &ctx = ensure_step_ctx(block_tag, all);

  /* ---- Stage 1a: state binding / upload, EM sources, enforcement ---- */
  phase_begin();
  const bool use_resident_state = (keep_resident_state != 0);
  int input_bank = -1;
  int output_bank = -1;
  if (use_resident_state) {
    input_bank = ensure_em_resident_bank(ctx, state_host_in, all, true);
    output_bank = reserve_em_resident_output_bank(ctx, state_host_out, all, input_bank);
    mark_resident_current_bank(ctx, input_bank);
    mark_resident_next_bank(ctx, output_bank);
    bind_em_state_input_slots(ctx.d_resident[input_bank]);
    bind_em_state_output_slots(ctx.d_resident[output_bank]);
  } else {
    upload_em_state_inputs(state_host_in, all);
  }
  upload_em_fixed_sources(ctx, source_host, all);
  const bool use_em_zero_fast =
      use_resident_state && em_detect_zero_fast_path(ctx, input_bank, all);

  if (apply_enforce_ga) {
    kern_enforce_ga_cuda<<<grid(all), BLK>>>(
        g_buf.slot[S_dxx], g_buf.slot[S_gxy], g_buf.slot[S_gxz],
        g_buf.slot[S_dyy], g_buf.slot[S_gyz], g_buf.slot[S_dzz],
        g_buf.slot[S_Axx], g_buf.slot[S_Axy], g_buf.slot[S_Axz],
        g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]);
    /* A kernel launch reports configuration errors only through the
       runtime's last-error slot, so query it explicitly. */
    CUDA_CHECK(cudaGetLastError());
    /* The device copy was modified in place, so the host mirror of the
       input bank is no longer clean. */
    if (use_resident_state && input_bank >= 0)
      set_resident_host_clean(ctx, input_bank, false);
  }
  phase_end(state_ms);

  /* ---- Stage 1b: snapshot the stage-0 state (also billed to state_ms) ---- */
  phase_begin();
  if (RK4 == 0) {
    if (use_resident_state) {
      CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, ctx.d_resident_mem[input_bank],
                            (size_t)BSSN_EM_STATE_COUNT * bytes,
                            cudaMemcpyDeviceToDevice));
    } else {
      for (int i = 0; i < BSSN_EM_STATE_COUNT; ++i) {
        CUDA_CHECK(cudaMemcpy(ctx.d_state0[i], g_buf.slot[k_em_state_input_slots[i]],
                              bytes, cudaMemcpyDeviceToDevice));
      }
    }
  }
  phase_end(state_ms);

  /* ---- Stage 2: EM sources ---- */
  phase_begin();
  if (use_em_zero_fast) {
    zero_matter_cache(ctx, all);
    bind_matter_slots(ctx);
    zero_em_output_slots_async(all);
  } else {
    gpu_em_rhs_sources((int)all, eps);
  }
  phase_end(em_ms);

  /* ---- Stage 3: right-hand side ---- */
  phase_begin();
  launch_rhs_pipeline((int)all, eps, co);
  phase_end(rhs_ms);

  /* ---- Stage 4: Sommerfeld boundary condition ---- */
  phase_begin();
  if (apply_bam_bc) {
    for (int i = 0; i < BSSN_EM_STATE_COUNT; ++i) {
      gpu_sommerfeld_routbam(g_buf.slot[k_em_state_input_slots[i]],
                             g_buf.slot[k_em_state_rhs_slots[i]],
                             propspeed[i],
                             soa_flat[3 * i + 0],
                             soa_flat[3 * i + 1],
                             soa_flat[3 * i + 2],
                             X, Y, Z, bbox, Symmetry);
    }
  }
  phase_end(bc_ms);

  /* ---- Stage 5: RK4 combine + patch boundary handling ---- */
  phase_begin();
  if (use_em_zero_fast)
    gpu_rk4_finalize_batch(ctx, all, dT, RK4, chitiny);
  else
    gpu_em_rk4_finalize_batch(ctx, all, dT, RK4, chitiny);
  if (Lev > 0) {
    if (use_em_zero_fast)
      gpu_copy_patch_boundary_batch((int)all,
                                    touch_xmin, touch_xmax,
                                    touch_ymin, touch_ymax,
                                    touch_zmin, touch_zmax);
    else
      gpu_em_restore_patch_boundary_batch(ctx, (int)all,
                                          touch_xmin, touch_xmax,
                                          touch_ymin, touch_ymax,
                                          touch_zmin, touch_zmax);
  }
  phase_end(finalize_ms);

  /* ---- Stage 6: publish the result ---- */
  phase_begin();
  if (use_resident_state) {
    ctx.resident_valid[output_bank] = true;
    ctx.resident_age[output_bank] = ++ctx.resident_clock;
    set_resident_host_clean(ctx, output_bank, false);
    mark_resident_current_bank(ctx, output_bank);
    update_state_ready(ctx);
  } else {
    download_em_state_outputs(state_host_out, all);
  }
  phase_end(output_ms);

  if (profile) {
    CudaProfileStats &stats = cuda_profile_stats();
    stats.calls++;
    stats.total_ms += cuda_profile_now_ms() - t_total0;
    stats.state_ms += state_ms;
    stats.matter_ms += em_ms;
    stats.rhs_ms += rhs_ms;
    stats.bc_ms += bc_ms;
    stats.finalize_ms += finalize_ms;
    stats.output_ms += output_ms;
    cuda_profile_maybe_log();
  }
  if (RK4 == 3)
    ctx.matter_ready = false;  /* matter cache must be rebuilt next timestep */
  return 0;
}
|
|
|
|
/*
 * Report whether the step context registered under block_tag has a known,
 * positive EM "zero fast" state.
 *
 * Returns 1 only when a context exists for block_tag and both
 * em_zero_fast_known and em_zero_fast are set; returns 0 in every other case,
 * including an unknown tag.
 */
extern "C"
int bssn_em_cuda_resident_zero_fast_state(void *block_tag)
{
  const auto entry = g_step_ctx.find(block_tag);
  if (entry != g_step_ctx.end()) {
    const StepContext &step = entry->second;
    if (step.em_zero_fast_known && step.em_zero_fast)
      return 1;
  }
  return 0;
}
|
|
|
|
/*
 * Copy a sub-region of one state variable from the device to host_state.
 *
 * Selects this rank's GPU and forwards every argument to
 * copy_state_region_cuda with the device-to-host direction.  Always returns 0.
 */
extern "C"
int bssn_cuda_copy_state_region_to_host(void *block_tag,
                                        int state_index,
                                        double *host_state,
                                        int *ex,
                                        int i0, int j0, int k0,
                                        int sx, int sy, int sz)
{
  const cudaMemcpyKind direction = cudaMemcpyDeviceToHost;

  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  copy_state_region_cuda(block_tag, state_index, host_state, ex,
                         i0, j0, k0,
                         sx, sy, sz,
                         direction);
  return 0;
}
|
|
|
|
/*
 * Copy a sub-region of one state variable from host_state to the device.
 *
 * Selects this rank's GPU and forwards every argument to
 * copy_state_region_cuda with the host-to-device direction.  Always returns 0.
 */
extern "C"
int bssn_cuda_copy_state_region_from_host(void *block_tag,
                                          int state_index,
                                          double *host_state,
                                          int *ex,
                                          int i0, int j0, int k0,
                                          int sx, int sy, int sz)
{
  const cudaMemcpyKind direction = cudaMemcpyHostToDevice;

  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  copy_state_region_cuda(block_tag, state_index, host_state, ex,
                         i0, j0, k0,
                         sx, sy, sz,
                         direction);
  return 0;
}
|
|
|
|
/*
 * C entry point around download_resident_state: selects this rank's GPU,
 * then hands block_tag, ex and state_host_out to the helper unchanged.
 * Always returns 0.
 */
extern "C"
int bssn_cuda_download_resident_state(void *block_tag,
                                      int *ex,
                                      double **state_host_out)
{
  /* Make sure the calling thread targets the GPU assigned to this rank. */
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  download_resident_state(block_tag, ex, state_host_out);

  const int status = 0;
  return status;
}
|
|
|
|
/*
 * Variant of the resident-state download that requests
 * BSSN_ESCALAR_STATE_COUNT variables via download_resident_state_count.
 * Always returns 0.
 */
extern "C"
int bssn_escalar_cuda_download_resident_state(void *block_tag,
                                              int *ex,
                                              double **state_host_out)
{
  const int variable_count = BSSN_ESCALAR_STATE_COUNT;

  /* Make sure the calling thread targets the GPU assigned to this rank. */
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  download_resident_state_count(block_tag, ex, state_host_out, variable_count);
  return 0;
}
|
|
|
|
/*
 * Upload state_count host arrays into the resident device state of block_tag.
 *
 * Returns 1 without uploading when state_count is not in
 * [1, BSSN_RESIDENT_STATE_CAPACITY]; otherwise calls
 * upload_resident_state_count and returns 0.
 */
extern "C"
int bssn_cuda_upload_resident_state_count(void *block_tag,
                                          int *ex,
                                          double **state_host_in,
                                          int state_count)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  const bool count_in_range =
      state_count > 0 && state_count <= BSSN_RESIDENT_STATE_CAPACITY;
  if (count_in_range) {
    upload_resident_state_count(block_tag, ex, state_host_in, state_count);
    return 0;
  }
  return 1;
}
|
|
|
|
/*
 * Upload the resident state with a fixed count of BSSN_ESCALAR_STATE_COUNT
 * variables.  The return code is that of
 * bssn_cuda_upload_resident_state_count.
 */
extern "C"
int bssn_escalar_cuda_upload_resident_state(void *block_tag,
                                            int *ex,
                                            double **state_host_in)
{
  const int status = bssn_cuda_upload_resident_state_count(
      block_tag, ex, state_host_in, BSSN_ESCALAR_STATE_COUNT);
  return status;
}
|
|
|
|
/*
 * C entry point around keep_only_resident_state_count.
 *
 * Returns 1 without calling the helper when state_count is not in
 * [1, BSSN_RESIDENT_STATE_CAPACITY]; otherwise forwards all arguments and
 * returns 0.
 */
extern "C"
int bssn_cuda_keep_only_resident_state_count(void *block_tag,
                                             int *ex,
                                             double **state_host_key,
                                             int state_count)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  const bool count_in_range =
      state_count > 0 && state_count <= BSSN_RESIDENT_STATE_CAPACITY;
  if (count_in_range) {
    keep_only_resident_state_count(block_tag, ex, state_host_key, state_count);
    return 0;
  }
  return 1;
}
|
|
|
|
/*
 * Same as bssn_cuda_keep_only_resident_state_count with the count fixed to
 * BSSN_ESCALAR_STATE_COUNT; the return code is passed through.
 */
extern "C"
int bssn_escalar_cuda_keep_only_resident_state(void *block_tag,
                                               int *ex,
                                               double **state_host_key)
{
  const int status = bssn_cuda_keep_only_resident_state_count(
      block_tag, ex, state_host_key, BSSN_ESCALAR_STATE_COUNT);
  return status;
}
|
|
|
|
/*
 * C entry point around download_resident_state_count_if_present.
 *
 * Returns 1 without calling the helper when state_count is not in
 * [1, BSSN_RESIDENT_STATE_CAPACITY]; otherwise forwards all arguments and
 * returns 0.
 */
extern "C"
int bssn_cuda_download_resident_state_count_if_present(void *block_tag,
                                                       int *ex,
                                                       double **state_host_out,
                                                       int state_count)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  const bool count_in_range =
      state_count > 0 && state_count <= BSSN_RESIDENT_STATE_CAPACITY;
  if (count_in_range) {
    download_resident_state_count_if_present(block_tag, ex, state_host_out,
                                             state_count);
    return 0;
  }
  return 1;
}
|
|
|
|
/*
 * C entry point around download_resident_state_if_present.
 *
 * Returns 1 when any of block_tag, ex or state_host_out is null; otherwise
 * forwards the arguments to the helper and returns 0.
 */
extern "C"
int bssn_cuda_download_resident_state_if_present(void *block_tag,
                                                 int *ex,
                                                 double **state_host_out)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  const bool have_all_args =
      block_tag != nullptr && ex != nullptr && state_host_out != nullptr;
  if (have_all_args) {
    download_resident_state_if_present(block_tag, ex, state_host_out);
    return 0;
  }
  return 1;
}
|
|
|
|
/*
 * Download the constraint output arrays for a block of extent ex[0..2].
 *
 * Returns 1 when ex or constraint_host_out is null (ex is dereferenced here,
 * so a null pointer would otherwise crash); otherwise calls
 * download_constraint_outputs with the total point count and returns 0.
 */
extern "C"
int bssn_cuda_download_constraint_outputs(int *ex,
                                          double **constraint_host_out)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  /* Reject null arguments the same way the sibling entry points do. */
  if (!ex || !constraint_host_out)
    return 1;

  const size_t all = (size_t)ex[0] * ex[1] * ex[2];
  download_constraint_outputs(constraint_host_out, all);
  return 0;
}
|
|
|
|
/*
 * Copy a sub-region of one state variable from the device into host_buffer
 * through copy_state_region_packed_cuda (device-to-host direction).
 * Always returns 0.
 */
extern "C"
int bssn_cuda_pack_state_region_to_host_buffer(void *block_tag,
                                               int state_index,
                                               double *host_buffer,
                                               int *ex,
                                               int i0, int j0, int k0,
                                               int sx, int sy, int sz)
{
  const cudaMemcpyKind direction = cudaMemcpyDeviceToHost;

  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  copy_state_region_packed_cuda(block_tag, state_index, host_buffer, ex,
                                i0, j0, k0,
                                sx, sy, sz,
                                direction);
  return 0;
}
|
|
|
|
/*
 * Interpolate three resident state variables at one point (px, py, pz) and
 * write the three results to out3.
 *
 * The grid is described by its extent ex[0..2], origin (x0, y0, z0) and
 * spacing (dx, dy, dz).  soa3 supplies nine values that are passed to the
 * kernel in order (presumably three symmetry factors per state — confirm
 * against kern_interp_state_point3).
 *
 * Returns 1 when:
 *   - block_tag, ex, out3 or soa3 is null,
 *   - a state index is outside [0, BSSN_RESIDENT_STATE_CAPACITY),
 *   - an extent is non-positive or smaller than ordn, or ordn is outside
 *     [1, 8],
 *   - the total point count does not fit the kernel's int parameter,
 *   - no valid resident bank holds the requested states.
 * Returns 0 on success.
 */
extern "C"
int bssn_cuda_interp_state_point3(void *block_tag,
                                  int *ex,
                                  int state0,
                                  int state1,
                                  int state2,
                                  double x0,
                                  double y0,
                                  double z0,
                                  double dx,
                                  double dy,
                                  double dz,
                                  double px,
                                  double py,
                                  double pz,
                                  int ordn,
                                  int symmetry,
                                  double **state_host_key,
                                  const double *soa3,
                                  double *out3)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
  if (!block_tag || !ex || !out3 || !soa3)
    return 1;
  if (state0 < 0 || state0 >= BSSN_RESIDENT_STATE_CAPACITY ||
      state1 < 0 || state1 >= BSSN_RESIDENT_STATE_CAPACITY ||
      state2 < 0 || state2 >= BSSN_RESIDENT_STATE_CAPACITY)
    return 1;
  if (ex[0] <= 0 || ex[1] <= 0 || ex[2] <= 0 ||
      ordn <= 0 || ordn > 8 ||
      ex[0] < ordn || ex[1] < ordn || ex[2] < ordn)
    return 1;

  /* The kernel receives the point count as an int; refuse sizes that would
     be truncated by the cast below (0x7fffffff == INT_MAX for 32-bit int). */
  const size_t all = (size_t)ex[0] * ex[1] * ex[2];
  if (all > (size_t)0x7fffffff)
    return 1;

  StepContext &ctx = ensure_step_ctx(block_tag, all);
  const int interp_states[3] = {state0, state1, state2};
  const int bank = find_resident_bank_subset(ctx, state_host_key, interp_states, 3);
  if (bank < 0 || !ctx.resident_valid[bank])
    return 1;

  double *d_out = ensure_step_comm_buffer(ctx, 3);
  /* One block of three threads (presumably one per requested state). */
  kern_interp_state_point3<<<1, 3>>>(
      ctx.d_resident_mem[bank], d_out,
      ex[0], ex[1], ex[2], (int)all,
      state0, state1, state2,
      x0, y0, z0, dx, dy, dz,
      px, py, pz, ordn, symmetry,
      soa3[0], soa3[1], soa3[2],
      soa3[3], soa3[4], soa3[5],
      soa3[6], soa3[7], soa3[8]);
  /* Surface launch-configuration errors, matching
     bssn_cuda_interp_host_two_fields. */
  CUDA_CHECK(cudaGetLastError());
  CUDA_CHECK(cudaMemcpy(out3, d_out, 3 * sizeof(double), cudaMemcpyDeviceToHost));
  return 0;
}
|
|
|
|
/*
 * Interpolate two host-resident fields at npoints locations on the GPU.
 *
 * field0 and field1 each hold ex[0]*ex[1]*ex[2] doubles and are uploaded for
 * the duration of the call, together with the point coordinates px/py/pz.
 * out_interleaved receives 2 * npoints doubles (the name suggests the two
 * fields' values alternate per point — confirm against
 * kern_interp_host_two_fields).  soa6 supplies six values passed to the
 * kernel in order.  block_tag is unused.
 *
 * Returns 1 when a pointer argument is null, npoints <= 0, an extent is
 * non-positive or smaller than ordn, ordn is outside [1, 8], or the total
 * point count does not fit the kernel's int parameter.  Returns 0 on success.
 *
 * All device buffers are allocated and released inside this call.
 */
extern "C"
int bssn_cuda_interp_host_two_fields(void *block_tag,
                                     int *ex,
                                     double *field0,
                                     double *field1,
                                     double x0,
                                     double y0,
                                     double z0,
                                     double dx,
                                     double dy,
                                     double dz,
                                     const double *px,
                                     const double *py,
                                     const double *pz,
                                     int npoints,
                                     int ordn,
                                     int symmetry,
                                     const double *soa6,
                                     double *out_interleaved)
{
  (void)block_tag;
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
  if (!ex || !field0 || !field1 || !px || !py || !pz || !soa6 ||
      !out_interleaved || npoints <= 0)
    return 1;
  if (ex[0] <= 0 || ex[1] <= 0 || ex[2] <= 0 ||
      ordn <= 0 || ordn > 8 ||
      ex[0] < ordn || ex[1] < ordn || ex[2] < ordn)
    return 1;

  /* Multiply in size_t: the product of three ints can overflow int, which is
     undefined behaviour.  The kernel takes the count as an int, so reject
     anything above INT_MAX (0x7fffffff for 32-bit int). */
  const size_t all_points = (size_t)ex[0] * ex[1] * ex[2];
  if (all_points > (size_t)0x7fffffff)
    return 1;
  const int all = (int)all_points;

  const size_t field_bytes = all_points * sizeof(double);
  const size_t point_bytes = (size_t)npoints * sizeof(double);
  const size_t out_bytes = (size_t)2 * npoints * sizeof(double);

  double *d_field0 = nullptr;
  double *d_field1 = nullptr;
  double *d_px = nullptr;
  double *d_py = nullptr;
  double *d_pz = nullptr;
  double *d_out = nullptr;
  CUDA_CHECK(cudaMalloc(&d_field0, field_bytes));
  CUDA_CHECK(cudaMalloc(&d_field1, field_bytes));
  CUDA_CHECK(cudaMalloc(&d_px, point_bytes));
  CUDA_CHECK(cudaMalloc(&d_py, point_bytes));
  CUDA_CHECK(cudaMalloc(&d_pz, point_bytes));
  CUDA_CHECK(cudaMalloc(&d_out, out_bytes));

  CUDA_CHECK(cudaMemcpy(d_field0, field0, field_bytes, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_field1, field1, field_bytes, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_px, px, point_bytes, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_py, py, point_bytes, cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_pz, pz, point_bytes, cudaMemcpyHostToDevice));

  /* Ceil-divide so that the grid covers every point. */
  const int threads = 256;
  const int blocks = (npoints + threads - 1) / threads;
  kern_interp_host_two_fields<<<blocks, threads>>>(
      d_field0, d_field1, d_px, d_py, d_pz, d_out,
      ex[0], ex[1], ex[2], all,
      x0, y0, z0, dx, dy, dz,
      npoints, ordn, symmetry,
      soa6[0], soa6[1], soa6[2],
      soa6[3], soa6[4], soa6[5]);
  CUDA_CHECK(cudaGetLastError());
  /* The blocking copy also waits for the kernel to finish. */
  CUDA_CHECK(cudaMemcpy(out_interleaved, d_out, out_bytes, cudaMemcpyDeviceToHost));

  cudaFree(d_out);
  cudaFree(d_pz);
  cudaFree(d_py);
  cudaFree(d_px);
  cudaFree(d_field1);
  cudaFree(d_field0);
  return 0;
}
|
|
|
|
/*
 * kern_shell_pack_host_fields — weighted-stencil gather of nvars fields at
 * npoints locations.
 *
 * Launch layout: 1-D grid, one thread per (point, variable) pair; thread
 * `gid` handles point gid / nvars and variable gid % nvars and writes
 * out[gid].  Threads with gid >= npoints * nvars exit immediately.
 *
 * Per point p:
 *   point_block[p]         block index b; the field is fields[b * nvars + v]
 *   block_shapes[3b..3b+2] extents of block b (only the first two are used
 *                          for indexing)
 *   point_sind[3p..3p+2]   stencil start indices
 *   point_coef[3*ordn*p..] ordn weights per direction
 *   point_dimh / dumyd     select the stencil shape:
 *       dimh == 3               full ordn^3 tensor-product sum
 *       dimh == 1, dumyd == 1   1-D sum stepping the fastest index
 *       dimh == 1, dumyd == 0   1-D sum stepping the second index
 *       anything else           result is 0.0
 */
__global__ void kern_shell_pack_host_fields(double **fields,
                                            const int *block_shapes,
                                            const int *point_block,
                                            const int *point_dimh,
                                            const int *point_dumyd,
                                            const int *point_sind,
                                            const double *point_coef,
                                            double *out,
                                            int npoints,
                                            int nvars,
                                            int ordn)
{
  const int gid = blockIdx.x * blockDim.x + threadIdx.x;
  if (gid >= npoints * nvars) return;

  /* Decode the (point, variable) pair handled by this thread. */
  const int pt = gid / nvars;
  const int var = gid % nvars;

  const int blk = point_block[pt];
  const int nx = block_shapes[3 * blk];
  const int ny = block_shapes[3 * blk + 1];
  const int *start = point_sind + 3 * pt;
  const double *weights = point_coef + 3 * ordn * pt;
  const double *src = fields[blk * nvars + var];

  const int mode = point_dimh[pt];
  const int axis_flag = point_dumyd[pt];

  double acc = 0.0;
  if (mode == 3) {
    /* Tensor-product stencil: weights are stored as [x | y | z] runs. */
    const double *wx = weights;
    const double *wy = weights + ordn;
    const double *wz = weights + 2 * ordn;
    for (int c = 0; c < ordn; ++c) {
      for (int b = 0; b < ordn; ++b) {
        const int row = start[0] + nx * ((start[1] + b) + ny * (start[2] + c));
        for (int a = 0; a < ordn; ++a)
          acc += wx[a] * wy[b] * wz[c] * src[row + a];
      }
    }
  } else if (mode == 1) {
    if (axis_flag == 1) {
      /* Line stencil along the fastest index. */
      const int base = start[0] + nx * (start[1] + ny * start[2]);
      for (int a = 0; a < ordn; ++a)
        acc += weights[a] * src[base + a];
    } else if (axis_flag == 0) {
      /* Line stencil along the second index; here start[1] is the fixed
         fastest-index position and start[0] the stencil start. */
      const int base = start[1] + nx * (start[0] + ny * start[2]);
      for (int b = 0; b < ordn; ++b)
        acc += weights[b] * src[base + nx * b];
    }
  }

  out[gid] = acc;
}
|
|
|
|
/*
 * One entry of the shell-pack upload cache: a device copy of a host field.
 * Field order matters: entries are created with brace initialisation as
 * {device, bytes, generation}.
 */
struct ShellPackCachedField {
  double *device;   /* device allocation holding the uploaded field */
  size_t bytes;     /* size of that allocation in bytes */
  int generation;   /* value of g_shell_pack_generation at upload time */
};

/* Cache of uploaded fields, keyed by the host pointer they were copied from. */
static std::unordered_map<const double *, ShellPackCachedField> g_shell_pack_cache;

/* Incremented by bssn_cuda_shell_pack_cache_begin. */
static int g_shell_pack_generation = 0;
|
|
|
|
/*
 * Start a new shell-pack caching window: free every cached device field,
 * empty the cache and advance g_shell_pack_generation.
 */
extern "C"
void bssn_cuda_shell_pack_cache_begin()
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  /* Release the device copies before dropping their bookkeeping entries. */
  for (auto it = g_shell_pack_cache.begin(); it != g_shell_pack_cache.end(); ++it) {
    cudaFree(it->second.device);
  }
  g_shell_pack_cache.clear();

  g_shell_pack_generation += 1;
}
|
|
|
|
/*
 * Close the shell-pack caching window: free every cached device field and
 * empty the cache.  The generation counter is left unchanged.
 */
extern "C"
void bssn_cuda_shell_pack_cache_end()
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  /* Release the device copies before dropping their bookkeeping entries. */
  for (auto it = g_shell_pack_cache.begin(); it != g_shell_pack_cache.end(); ++it) {
    cudaFree(it->second.device);
  }
  g_shell_pack_cache.clear();
}
|
|
|
|
/*
 * Run kern_shell_pack_host_fields over host-resident block fields.
 *
 * Each of the nblocks * nvars host arrays in block_var_fields (indexed as
 * b * nvars + v) is mirrored on the device through g_shell_pack_cache, keyed
 * by host pointer; a cached copy is reused only when its byte size and
 * generation match the current request. The per-point tables are uploaded to
 * temporary device buffers, the kernel is launched with
 * ceil(npoints * nvars / 256) blocks of 256 threads, and npoints * nvars
 * doubles are copied back into `out`.
 *
 * Host table sizes read here (in elements): block_shapes 3 * nblocks;
 * point_block, point_dimh and point_dumyd npoints each; point_sind
 * 3 * npoints; point_coef 3 * ordn * npoints.
 *
 * Returns 0 on success and 1 when an argument is rejected: a non-positive
 * count, ordn outside 1..8, a null pointer (including any entry of
 * block_var_fields), a negative block extent, or an element count that does
 * not fit in an int. CUDA API failures are routed through CUDA_CHECK.
 */
extern "C"
int bssn_cuda_shell_pack_host_fields(int npoints,
                                     int nvars,
                                     int nblocks,
                                     int ordn,
                                     double **block_var_fields,
                                     int *block_shapes,
                                     int *point_block,
                                     int *point_dimh,
                                     int *point_dumyd,
                                     int *point_sind,
                                     double *point_coef,
                                     double *out)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
  if (npoints <= 0 || nvars <= 0 || nblocks <= 0 || ordn <= 0 || ordn > 8 ||
      !block_var_fields || !block_shapes || !point_block || !point_dimh ||
      !point_dumyd || !point_sind || !point_coef || !out)
    return 1;

  /* The counts are handed to the kernel as int, so do the products in 64 bits
   * and refuse anything that would wrap instead of launching with a bogus
   * size. */
  const long long int_max = 2147483647LL;
  const long long field_count64 = (long long)nblocks * (long long)nvars;
  const long long total64 = (long long)npoints * (long long)nvars;
  if (field_count64 > int_max || total64 > int_max)
    return 1;
  const int field_count = (int)field_count64;
  const int total = (int)total64;

  /* Validate every block before touching the cache so that no error return
   * can happen once device memory has been allocated below. */
  for (int b = 0; b < nblocks; ++b) {
    if (block_shapes[3 * b] < 0 || block_shapes[3 * b + 1] < 0 ||
        block_shapes[3 * b + 2] < 0)
      return 1;
    for (int v = 0; v < nvars; ++v) {
      if (!block_var_fields[b * nvars + v]) return 1;
    }
  }

  std::vector<double *> h_device_fields((size_t)field_count, nullptr);
  /* Device buffers displaced from the cache by this call. They are released
   * only after the kernel has finished, in case an earlier slot of
   * h_device_fields still refers to one of them. */
  std::vector<double *> stale_device;
  double **d_fields = nullptr;
  int *d_block_shapes = nullptr;
  int *d_point_block = nullptr;
  int *d_point_dimh = nullptr;
  int *d_point_dumyd = nullptr;
  int *d_point_sind = nullptr;
  double *d_point_coef = nullptr;
  double *d_out = nullptr;

  for (int b = 0; b < nblocks; ++b) {
    const size_t all = (size_t)block_shapes[3 * b] *
                       (size_t)block_shapes[3 * b + 1] *
                       (size_t)block_shapes[3 * b + 2];
    const size_t bytes = all * sizeof(double);
    for (int v = 0; v < nvars; ++v) {
      const int idx = b * nvars + v;
      double *host_ptr = block_var_fields[idx];
      auto it = g_shell_pack_cache.find(host_ptr);
      const bool present = (it != g_shell_pack_cache.end());
      if (present &&
          it->second.bytes == bytes &&
          it->second.generation == g_shell_pack_generation) {
        h_device_fields[idx] = it->second.device;
        continue;
      }
      /* A mismatching entry is about to be overwritten: remember its buffer
       * so it is freed instead of leaked. */
      if (present)
        stale_device.push_back(it->second.device);
      double *device_ptr = nullptr;
      CUDA_CHECK(cudaMalloc(&device_ptr, bytes));
      CUDA_CHECK(cudaMemcpy(device_ptr, host_ptr, bytes, cudaMemcpyHostToDevice));
      g_shell_pack_cache[host_ptr] = {device_ptr, bytes, g_shell_pack_generation};
      h_device_fields[idx] = device_ptr;
    }
  }

  CUDA_CHECK(cudaMalloc(&d_fields, (size_t)field_count * sizeof(double *)));
  CUDA_CHECK(cudaMemcpy(d_fields, h_device_fields.data(),
                        (size_t)field_count * sizeof(double *),
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMalloc(&d_block_shapes, (size_t)nblocks * 3 * sizeof(int)));
  CUDA_CHECK(cudaMemcpy(d_block_shapes, block_shapes,
                        (size_t)nblocks * 3 * sizeof(int),
                        cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMalloc(&d_point_block, (size_t)npoints * sizeof(int)));
  CUDA_CHECK(cudaMalloc(&d_point_dimh, (size_t)npoints * sizeof(int)));
  CUDA_CHECK(cudaMalloc(&d_point_dumyd, (size_t)npoints * sizeof(int)));
  CUDA_CHECK(cudaMalloc(&d_point_sind, (size_t)npoints * 3 * sizeof(int)));
  CUDA_CHECK(cudaMalloc(&d_point_coef, (size_t)npoints * 3 * ordn * sizeof(double)));
  CUDA_CHECK(cudaMalloc(&d_out, (size_t)total * sizeof(double)));

  CUDA_CHECK(cudaMemcpy(d_point_block, point_block,
                        (size_t)npoints * sizeof(int), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_point_dimh, point_dimh,
                        (size_t)npoints * sizeof(int), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_point_dumyd, point_dumyd,
                        (size_t)npoints * sizeof(int), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_point_sind, point_sind,
                        (size_t)npoints * 3 * sizeof(int), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_point_coef, point_coef,
                        (size_t)npoints * 3 * ordn * sizeof(double),
                        cudaMemcpyHostToDevice));

  const int threads = 256;
  /* Ceil-divide in 64 bits: total + threads - 1 may exceed INT_MAX. */
  const int blocks = (int)((total64 + threads - 1) / threads);
  kern_shell_pack_host_fields<<<blocks, threads>>>(
      d_fields, d_block_shapes, d_point_block, d_point_dimh,
      d_point_dumyd, d_point_sind, d_point_coef, d_out,
      npoints, nvars, ordn);
  CUDA_CHECK(cudaGetLastError());
  /* Blocking copy: the kernel has completed once this returns. */
  CUDA_CHECK(cudaMemcpy(out, d_out, (size_t)total * sizeof(double),
                        cudaMemcpyDeviceToHost));

  cudaFree(d_out);
  cudaFree(d_point_coef);
  cudaFree(d_point_sind);
  cudaFree(d_point_dumyd);
  cudaFree(d_point_dimh);
  cudaFree(d_point_block);
  cudaFree(d_block_shapes);
  cudaFree(d_fields);
  for (size_t s = 0; s < stale_device.size(); ++s)
    cudaFree(stale_device[s]);
  return 0;
}
|
|
|
|
/*
 * C entry point: forward one state's sub-region from a packed host buffer to
 * copy_state_region_packed_cuda with direction cudaMemcpyHostToDevice, after
 * selecting this rank's device. Always returns 0.
 */
extern "C" int bssn_cuda_unpack_state_region_from_host_buffer(
    void *block_tag, int state_index, double *host_buffer, int *ex,
    int i0, int j0, int k0,
    int sx, int sy, int sz)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  const cudaMemcpyKind direction = cudaMemcpyHostToDevice;
  copy_state_region_packed_cuda(block_tag, state_index, host_buffer, ex,
                                i0, j0, k0, sx, sy, sz, direction);
  return 0;
}
|
|
|
|
/*
 * Keyed variant of the single-state region unpack: same host-to-device
 * forwarding to copy_state_region_packed_cuda, with state_host_key and
 * state_count passed along.
 *
 * Returns 1 (after the device has been selected) when state_host_key is null
 * or state_count lies outside 1..BSSN_RESIDENT_STATE_CAPACITY; otherwise 0.
 */
extern "C" int bssn_cuda_unpack_state_region_from_host_buffer_for_host_views(
    void *block_tag, double **state_host_key, int state_count, int state_index,
    double *host_buffer, int *ex,
    int i0, int j0, int k0,
    int sx, int sy, int sz)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  const bool key_usable = state_host_key != nullptr &&
                          state_count > 0 &&
                          state_count <= BSSN_RESIDENT_STATE_CAPACITY;
  if (!key_usable) {
    return 1;
  }

  const cudaMemcpyKind direction = cudaMemcpyHostToDevice;
  copy_state_region_packed_cuda(block_tag, state_index, host_buffer, ex,
                                i0, j0, k0, sx, sy, sz,
                                direction, state_host_key, state_count);
  return 0;
}
|
|
|
|
/*
 * C entry point: select this rank's device and forward to
 * copy_state_region_packed_batch_cuda with direction cudaMemcpyDeviceToHost
 * (no host key). Always returns 0.
 */
extern "C" int bssn_cuda_pack_state_batch_to_host_buffer(
    void *block_tag, int state_count, double *host_buffer, int *ex,
    int i0, int j0, int k0,
    int sx, int sy, int sz)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  const cudaMemcpyKind direction = cudaMemcpyDeviceToHost;
  copy_state_region_packed_batch_cuda(block_tag, state_count, host_buffer, ex,
                                      i0, j0, k0, sx, sy, sz, direction);
  return 0;
}
|
|
|
|
/*
 * Keyed variant of the batch pack to a host buffer: forwards to
 * copy_state_region_packed_batch_cuda with direction cudaMemcpyDeviceToHost
 * and state_host_key as the trailing argument. Always returns 0.
 */
extern "C" int bssn_cuda_pack_state_batch_to_host_buffer_for_host_views(
    void *block_tag, double **state_host_key, int state_count,
    double *host_buffer, int *ex,
    int i0, int j0, int k0,
    int sx, int sy, int sz)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  const cudaMemcpyKind direction = cudaMemcpyDeviceToHost;
  copy_state_region_packed_batch_cuda(block_tag, state_count, host_buffer, ex,
                                      i0, j0, k0, sx, sy, sz,
                                      direction, state_host_key);
  return 0;
}
|
|
|
|
/*
 * C entry point: select this rank's device and forward to
 * copy_state_region_packed_batch_cuda with direction cudaMemcpyHostToDevice
 * (no host key). Always returns 0.
 */
extern "C" int bssn_cuda_unpack_state_batch_from_host_buffer(
    void *block_tag, int state_count, double *host_buffer, int *ex,
    int i0, int j0, int k0,
    int sx, int sy, int sz)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  const cudaMemcpyKind direction = cudaMemcpyHostToDevice;
  copy_state_region_packed_batch_cuda(block_tag, state_count, host_buffer, ex,
                                      i0, j0, k0, sx, sy, sz, direction);
  return 0;
}
|
|
|
|
/*
 * Keyed variant of the batch unpack from a host buffer: forwards to
 * copy_state_region_packed_batch_cuda with direction cudaMemcpyHostToDevice
 * and state_host_key as the trailing argument. Always returns 0.
 */
extern "C" int bssn_cuda_unpack_state_batch_from_host_buffer_for_host_views(
    void *block_tag, double **state_host_key, int state_count,
    double *host_buffer, int *ex,
    int i0, int j0, int k0,
    int sx, int sy, int sz)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  const cudaMemcpyKind direction = cudaMemcpyHostToDevice;
  copy_state_region_packed_batch_cuda(block_tag, state_count, host_buffer, ex,
                                      i0, j0, k0, sx, sy, sz,
                                      direction, state_host_key);
  return 0;
}
|
|
|
|
/*
 * Launch the batched region pack/unpack kernel between a resident state bank
 * and device_buffer.
 *
 * pack_not_unpack != 0 launches kern_pack_state_region_batch; 0 launches
 * kern_unpack_state_region_batch and afterwards marks the chosen bank as
 * valid, current and not host-clean. The bank comes from
 * active_or_keyed_bank; its boolean argument is true for unpack calls and
 * for any call that supplies state_host_key (meaning defined by that helper).
 *
 * The launch grid is grid(sx * sy * sz) in x and state_count in y, with BLK
 * threads per block.
 *
 * Returns silently, doing nothing, when state_count is outside
 * 1..BSSN_RESIDENT_STATE_CAPACITY, any region extent is non-positive, or ex
 * or device_buffer is null.
 */
static void copy_state_device_batch(void *block_tag,
                                    int state_count,
                                    double *device_buffer,
                                    const int *ex,
                                    int i0, int j0, int k0,
                                    int sx, int sy, int sz,
                                    int pack_not_unpack,
                                    double **state_host_key = nullptr)
{
  if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return;
  if (sx <= 0 || sy <= 0 || sz <= 0) return;
  /* Both pointers are dereferenced (ex here, device_buffer in the kernel). */
  if (!ex || !device_buffer) return;

  const size_t all = (size_t)ex[0] * ex[1] * ex[2];
  StepContext &ctx = ensure_step_ctx(block_tag, all);
  const int bank = active_or_keyed_bank(ctx, state_host_key, all,
                                        pack_not_unpack == 0 || state_host_key != nullptr,
                                        state_count);
  double *base_mem = ctx.d_resident_mem[bank];
  const int region_all = sx * sy * sz;
  dim3 launch_grid((unsigned int)grid((size_t)region_all),
                   (unsigned int)state_count);

  if (pack_not_unpack) {
    kern_pack_state_region_batch<<<launch_grid, BLK>>>(
        base_mem, device_buffer,
        ex[0], ex[1], i0, j0, k0, sx, sy, sz,
        region_all, state_count,
        ex[0] * ex[1] * ex[2]);
    CUDA_CHECK(cudaGetLastError());
  } else {
    kern_unpack_state_region_batch<<<launch_grid, BLK>>>(
        base_mem, device_buffer,
        ex[0], ex[1], i0, j0, k0, sx, sy, sz,
        region_all, state_count,
        ex[0] * ex[1] * ex[2]);
    /* Surface a rejected launch before the bank is flagged as holding
     * valid data. */
    CUDA_CHECK(cudaGetLastError());
    ctx.resident_valid[bank] = true;
    ctx.resident_age[bank] = ++ctx.resident_clock;
    mark_resident_current_bank(ctx, bank);
    set_resident_host_clean(ctx, bank, false);
    update_state_ready(ctx);
  }
}
|
|
|
|
/*
 * Launch the batched multi-segment pack/unpack kernel between a resident
 * state bank and device_buffer.
 *
 * segment_meta holds 8 ints per segment. Entries 3..6 of every segment must
 * be positive; entry 6 is used as the segment's element count when sizing
 * the launch (grid x = grid(max over segments), y = state_count,
 * z = segment_count, BLK threads per block). The table is uploaded through
 * ensure_comm_segment_meta_buffer before the launch.
 *
 * pack_not_unpack != 0 launches kern_pack_state_segments_batch; 0 launches
 * kern_unpack_state_segments_batch and afterwards marks the chosen bank as
 * valid, current and not host-clean.
 *
 * Returns silently, doing nothing, on an out-of-range state_count, an empty
 * or null segment table, a segment failing the positivity check, or a null
 * ex / device_buffer.
 */
static void copy_state_device_segments(void *block_tag,
                                       int state_count,
                                       double *device_buffer,
                                       const int *ex,
                                       int segment_count,
                                       const int *segment_meta,
                                       int pack_not_unpack,
                                       double **state_host_key = nullptr)
{
  if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return;
  if (segment_count <= 0 || !segment_meta) return;
  /* Matches the device_buffer guard in the restrict/prolong segment helpers;
   * ex is dereferenced below. */
  if (!ex || !device_buffer) return;

  int max_region_all = 0;
  for (int s = 0; s < segment_count; ++s) {
    const int *m = segment_meta + s * 8;
    if (m[3] <= 0 || m[4] <= 0 || m[5] <= 0 || m[6] <= 0) return;
    if (m[6] > max_region_all) max_region_all = m[6];
  }
  if (max_region_all <= 0) return;

  const size_t all = (size_t)ex[0] * ex[1] * ex[2];
  StepContext &ctx = ensure_step_ctx(block_tag, all);
  const int bank = active_or_keyed_bank(ctx, state_host_key, all,
                                        pack_not_unpack == 0 || state_host_key != nullptr,
                                        state_count);
  double *base_mem = ctx.d_resident_mem[bank];
  int *d_meta = ensure_comm_segment_meta_buffer((size_t)segment_count * 8);
  CUDA_CHECK(cudaMemcpy(d_meta, segment_meta,
                        (size_t)segment_count * 8 * sizeof(int),
                        cudaMemcpyHostToDevice));

  dim3 launch_grid((unsigned int)grid((size_t)max_region_all),
                   (unsigned int)state_count,
                   (unsigned int)segment_count);
  if (pack_not_unpack) {
    kern_pack_state_segments_batch<<<launch_grid, BLK>>>(
        base_mem, device_buffer,
        ex[0], ex[1], d_meta, state_count,
        ex[0] * ex[1] * ex[2]);
    CUDA_CHECK(cudaGetLastError());
  } else {
    kern_unpack_state_segments_batch<<<launch_grid, BLK>>>(
        base_mem, device_buffer,
        ex[0], ex[1], d_meta, state_count,
        ex[0] * ex[1] * ex[2]);
    /* Surface a rejected launch before the bank is flagged as holding
     * valid data. */
    CUDA_CHECK(cudaGetLastError());
    ctx.resident_valid[bank] = true;
    ctx.resident_age[bank] = ++ctx.resident_clock;
    mark_resident_current_bank(ctx, bank);
    set_resident_host_clean(ctx, bank, false);
    update_state_ready(ctx);
  }
}
|
|
|
|
/*
 * Launch kern_restrict_state_segments_batch from a resident state bank into
 * device_buffer.
 *
 * segment_meta holds 8 ints per segment. Entries 0..3 of every segment must
 * be positive; entry 3 is used as the segment's element count when sizing
 * the launch (grid x = grid(max over segments), y = state_count,
 * z = segment_count, BLK threads per block). The table is uploaded through
 * ensure_comm_segment_meta_buffer, and state_soa is handed to
 * upload_comm_state_soa (it may be null; handling is up to that helper).
 *
 * Resident-bank bookkeeping is not modified. Returns silently, doing
 * nothing, on an out-of-range state_count, an empty or null segment table,
 * a segment failing the positivity check, or a null ex / device_buffer.
 */
static void restrict_state_device_segments(void *block_tag,
                                           int state_count,
                                           double *device_buffer,
                                           const int *ex,
                                           int segment_count,
                                           const int *segment_meta,
                                           double **state_host_key = nullptr,
                                           const double *state_soa = nullptr)
{
  if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return;
  if (segment_count <= 0 || !segment_meta || !device_buffer) return;
  if (!ex) return;  /* dereferenced below */

  int max_region_all = 0;
  for (int s = 0; s < segment_count; ++s) {
    const int *m = segment_meta + s * 8;
    if (m[0] <= 0 || m[1] <= 0 || m[2] <= 0 || m[3] <= 0) return;
    if (m[3] > max_region_all) max_region_all = m[3];
  }
  if (max_region_all <= 0) return;

  const size_t all = (size_t)ex[0] * ex[1] * ex[2];
  StepContext &ctx = ensure_step_ctx(block_tag, all);
  const int bank = active_or_keyed_bank(ctx, state_host_key, all,
                                        state_host_key != nullptr,
                                        state_count);
  int *d_meta = ensure_comm_segment_meta_buffer((size_t)segment_count * 8);
  CUDA_CHECK(cudaMemcpy(d_meta, segment_meta,
                        (size_t)segment_count * 8 * sizeof(int),
                        cudaMemcpyHostToDevice));
  upload_comm_state_soa(state_soa, state_count);

  dim3 launch_grid((unsigned int)grid((size_t)max_region_all),
                   (unsigned int)state_count,
                   (unsigned int)segment_count);
  kern_restrict_state_segments_batch<<<launch_grid, BLK>>>(
      ctx.d_resident_mem[bank], device_buffer,
      ex[0], ex[1], d_meta, state_count,
      ex[0] * ex[1] * ex[2]);
  /* Launches report configuration errors only through the last-error slot. */
  CUDA_CHECK(cudaGetLastError());
}
|
|
|
|
/*
 * Launch kern_prolong_state_segments_batch from a resident state bank into
 * device_buffer.
 *
 * segment_meta holds 11 ints per segment. Entries 0..3 of every segment must
 * be positive; entry 3 is used as the segment's element count when sizing
 * the launch (grid x = grid(max over segments), y = state_count,
 * z = segment_count, BLK threads per block). The table is uploaded through
 * ensure_comm_segment_meta_buffer, and state_soa is handed to
 * upload_comm_state_soa (it may be null; handling is up to that helper).
 *
 * Resident-bank bookkeeping is not modified. Returns silently, doing
 * nothing, on an out-of-range state_count, an empty or null segment table,
 * a segment failing the positivity check, or a null ex / device_buffer.
 */
static void prolong_state_device_segments(void *block_tag,
                                          int state_count,
                                          double *device_buffer,
                                          const int *ex,
                                          int segment_count,
                                          const int *segment_meta,
                                          double **state_host_key = nullptr,
                                          const double *state_soa = nullptr)
{
  if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return;
  if (segment_count <= 0 || !segment_meta || !device_buffer) return;
  if (!ex) return;  /* dereferenced below */

  int max_region_all = 0;
  for (int s = 0; s < segment_count; ++s) {
    const int *m = segment_meta + s * 11;
    if (m[0] <= 0 || m[1] <= 0 || m[2] <= 0 || m[3] <= 0) return;
    if (m[3] > max_region_all) max_region_all = m[3];
  }
  if (max_region_all <= 0) return;

  const size_t all = (size_t)ex[0] * ex[1] * ex[2];
  StepContext &ctx = ensure_step_ctx(block_tag, all);
  const int bank = active_or_keyed_bank(ctx, state_host_key, all,
                                        state_host_key != nullptr,
                                        state_count);
  int *d_meta = ensure_comm_segment_meta_buffer((size_t)segment_count * 11);
  CUDA_CHECK(cudaMemcpy(d_meta, segment_meta,
                        (size_t)segment_count * 11 * sizeof(int),
                        cudaMemcpyHostToDevice));
  upload_comm_state_soa(state_soa, state_count);

  dim3 launch_grid((unsigned int)grid((size_t)max_region_all),
                   (unsigned int)state_count,
                   (unsigned int)segment_count);
  kern_prolong_state_segments_batch<<<launch_grid, BLK>>>(
      ctx.d_resident_mem[bank], device_buffer,
      ex[0], ex[1], d_meta, state_count,
      ex[0] * ex[1] * ex[2]);
  /* Launches report configuration errors only through the last-error slot. */
  CUDA_CHECK(cudaGetLastError());
}
|
|
|
|
/*
 * C entry point: select this rank's device and call copy_state_device_batch
 * with pack_not_unpack = 1 and no host key. Always returns 0, including when
 * the helper rejects its arguments.
 */
extern "C" int bssn_cuda_pack_state_batch_to_device_buffer(
    void *block_tag, int state_count, double *device_buffer, int *ex,
    int i0, int j0, int k0,
    int sx, int sy, int sz)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  const int pack_not_unpack = 1;
  copy_state_device_batch(block_tag, state_count, device_buffer, ex,
                          i0, j0, k0, sx, sy, sz, pack_not_unpack);
  return 0;
}
|
|
|
|
/*
 * Keyed variant of the batch pack to a device buffer: calls
 * copy_state_device_batch with pack_not_unpack = 1 and state_host_key.
 * Always returns 0, including when the helper rejects its arguments.
 */
extern "C" int bssn_cuda_pack_state_batch_to_device_buffer_for_host_views(
    void *block_tag, double **state_host_key, int state_count,
    double *device_buffer, int *ex,
    int i0, int j0, int k0,
    int sx, int sy, int sz)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  const int pack_not_unpack = 1;
  copy_state_device_batch(block_tag, state_count, device_buffer, ex,
                          i0, j0, k0, sx, sy, sz,
                          pack_not_unpack, state_host_key);
  return 0;
}
|
|
|
|
/*
 * C entry point: select this rank's device and call copy_state_device_batch
 * with pack_not_unpack = 0 and no host key. Always returns 0, including when
 * the helper rejects its arguments.
 */
extern "C" int bssn_cuda_unpack_state_batch_from_device_buffer(
    void *block_tag, int state_count, double *device_buffer, int *ex,
    int i0, int j0, int k0,
    int sx, int sy, int sz)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  const int pack_not_unpack = 0;
  copy_state_device_batch(block_tag, state_count, device_buffer, ex,
                          i0, j0, k0, sx, sy, sz, pack_not_unpack);
  return 0;
}
|
|
|
|
/*
 * Keyed variant of the batch unpack from a device buffer: calls
 * copy_state_device_batch with pack_not_unpack = 0 and state_host_key.
 * Always returns 0, including when the helper rejects its arguments.
 */
extern "C" int bssn_cuda_unpack_state_batch_from_device_buffer_for_host_views(
    void *block_tag, double **state_host_key, int state_count,
    double *device_buffer, int *ex,
    int i0, int j0, int k0,
    int sx, int sy, int sz)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  const int pack_not_unpack = 0;
  copy_state_device_batch(block_tag, state_count, device_buffer, ex,
                          i0, j0, k0, sx, sy, sz,
                          pack_not_unpack, state_host_key);
  return 0;
}
|
|
|
|
/*
 * C entry point: select this rank's device and call
 * copy_state_device_segments with pack_not_unpack = 1 and no host key.
 * Always returns 0, including when the helper rejects its arguments.
 */
extern "C" int bssn_cuda_pack_state_segments_to_device_buffer(
    void *block_tag, int state_count, double *device_buffer, int *ex,
    int segment_count, const int *segment_meta)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  const int pack_not_unpack = 1;
  copy_state_device_segments(block_tag, state_count, device_buffer, ex,
                             segment_count, segment_meta, pack_not_unpack);
  return 0;
}
|
|
|
|
/*
 * Keyed variant of the segment pack: calls copy_state_device_segments with
 * pack_not_unpack = 1 and state_host_key. Always returns 0, including when
 * the helper rejects its arguments.
 */
extern "C" int bssn_cuda_pack_state_segments_to_device_buffer_for_host_views(
    void *block_tag, double **state_host_key, int state_count,
    double *device_buffer, int *ex,
    int segment_count, const int *segment_meta)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  const int pack_not_unpack = 1;
  copy_state_device_segments(block_tag, state_count, device_buffer, ex,
                             segment_count, segment_meta,
                             pack_not_unpack, state_host_key);
  return 0;
}
|
|
|
|
/*
 * C entry point: select this rank's device and call
 * copy_state_device_segments with pack_not_unpack = 0 and no host key.
 * Always returns 0, including when the helper rejects its arguments.
 */
extern "C" int bssn_cuda_unpack_state_segments_from_device_buffer(
    void *block_tag, int state_count, double *device_buffer, int *ex,
    int segment_count, const int *segment_meta)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  const int pack_not_unpack = 0;
  copy_state_device_segments(block_tag, state_count, device_buffer, ex,
                             segment_count, segment_meta, pack_not_unpack);
  return 0;
}
|
|
|
|
/*
 * Keyed variant of the segment unpack: calls copy_state_device_segments with
 * pack_not_unpack = 0 and state_host_key. Always returns 0, including when
 * the helper rejects its arguments.
 */
extern "C" int bssn_cuda_unpack_state_segments_from_device_buffer_for_host_views(
    void *block_tag, double **state_host_key, int state_count,
    double *device_buffer, int *ex,
    int segment_count, const int *segment_meta)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  const int pack_not_unpack = 0;
  copy_state_device_segments(block_tag, state_count, device_buffer, ex,
                             segment_count, segment_meta,
                             pack_not_unpack, state_host_key);
  return 0;
}
|
|
|
|
/*
 * C entry point: select this rank's device and call
 * restrict_state_device_segments with its default (null) host key and
 * state_soa. Always returns 0, including when the helper rejects its
 * arguments.
 */
extern "C" int bssn_cuda_restrict_state_segments_to_device_buffer(
    void *block_tag, int state_count, double *device_buffer, int *ex,
    int segment_count, const int *segment_meta)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  restrict_state_device_segments(block_tag, state_count, device_buffer, ex,
                                 segment_count, segment_meta);
  return 0;
}
|
|
|
|
/*
 * Keyed variant of the segment restriction: calls
 * restrict_state_device_segments with state_host_key and state_soa.
 * Always returns 0, including when the helper rejects its arguments.
 */
extern "C" int bssn_cuda_restrict_state_segments_to_device_buffer_for_host_views(
    void *block_tag, double **state_host_key, int state_count,
    double *device_buffer, int *ex,
    int segment_count, const int *segment_meta,
    const double *state_soa)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  restrict_state_device_segments(block_tag, state_count, device_buffer, ex,
                                 segment_count, segment_meta,
                                 state_host_key, state_soa);
  return 0;
}
|
|
|
|
/*
 * C entry point: select this rank's device and call
 * prolong_state_device_segments with its default (null) host key and
 * state_soa. Always returns 0, including when the helper rejects its
 * arguments.
 */
extern "C" int bssn_cuda_prolong_state_segments_to_device_buffer(
    void *block_tag, int state_count, double *device_buffer, int *ex,
    int segment_count, const int *segment_meta)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  prolong_state_device_segments(block_tag, state_count, device_buffer, ex,
                                segment_count, segment_meta);
  return 0;
}
|
|
|
|
/*
 * Keyed variant of the segment prolongation: calls
 * prolong_state_device_segments with state_host_key and state_soa.
 * Always returns 0, including when the helper rejects its arguments.
 */
extern "C" int bssn_cuda_prolong_state_segments_to_device_buffer_for_host_views(
    void *block_tag, double **state_host_key, int state_count,
    double *device_buffer, int *ex,
    int segment_count, const int *segment_meta,
    const double *state_soa)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  prolong_state_device_segments(block_tag, state_count, device_buffer, ex,
                                segment_count, segment_meta,
                                state_host_key, state_soa);
  return 0;
}
|
|
|
|
/*
 * Launch kern_restrict_state_region_batch reading from ctx.d_state_curr_mem
 * and writing to device_buffer, for an sx * sy * sz region and state_count
 * states (grid x = grid(sx * sy * sz), y = state_count, BLK threads).
 * upload_comm_state_soa is called with a null table before the launch.
 *
 * Returns 1 when state_count is outside 1..BSSN_RESIDENT_STATE_CAPACITY,
 * device_buffer or ex is null, or a region extent is non-positive;
 * otherwise 0.
 */
extern "C"
int bssn_cuda_restrict_state_batch_to_device_buffer(void *block_tag,
                                                    int state_count,
                                                    double *device_buffer,
                                                    int *ex,
                                                    int sx, int sy, int sz,
                                                    int fi0, int fj0, int fk0)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
  if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return 1;
  /* ex is dereferenced below, so it is validated with the other inputs. */
  if (!device_buffer || !ex || sx <= 0 || sy <= 0 || sz <= 0) return 1;
  StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
  const int region_all = sx * sy * sz;
  upload_comm_state_soa(nullptr, state_count);
  dim3 launch_grid((unsigned int)grid((size_t)region_all),
                   (unsigned int)state_count);
  kern_restrict_state_region_batch<<<launch_grid, BLK>>>(
      ctx.d_state_curr_mem, device_buffer,
      ex[0], ex[1], sx, sy, sz,
      fi0, fj0, fk0, region_all, state_count,
      ex[0] * ex[1] * ex[2]);
  /* A rejected launch must not be reported to the caller as success. */
  CUDA_CHECK(cudaGetLastError());
  return 0;
}
|
|
|
|
/*
 * Keyed variant of the batch restriction: the source is the resident bank
 * returned by active_or_keyed_bank(ctx, state_host_key, all, true,
 * state_count), and state_soa is handed to upload_comm_state_soa before
 * kern_restrict_state_region_batch is launched (grid x = grid(sx * sy * sz),
 * y = state_count, BLK threads).
 *
 * Returns 1 when state_count is outside 1..BSSN_RESIDENT_STATE_CAPACITY,
 * device_buffer or ex is null, or a region extent is non-positive;
 * otherwise 0.
 */
extern "C"
int bssn_cuda_restrict_state_batch_to_device_buffer_for_host_views(void *block_tag,
                                                                   double **state_host_key,
                                                                   int state_count,
                                                                   double *device_buffer,
                                                                   int *ex,
                                                                   int sx, int sy, int sz,
                                                                   int fi0, int fj0, int fk0,
                                                                   const double *state_soa)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
  if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return 1;
  /* ex is dereferenced below, so it is validated with the other inputs. */
  if (!device_buffer || !ex || sx <= 0 || sy <= 0 || sz <= 0) return 1;
  const size_t all = (size_t)ex[0] * ex[1] * ex[2];
  StepContext &ctx = ensure_step_ctx(block_tag, all);
  const int bank = active_or_keyed_bank(ctx, state_host_key, all, true, state_count);
  const int region_all = sx * sy * sz;
  upload_comm_state_soa(state_soa, state_count);
  dim3 launch_grid((unsigned int)grid((size_t)region_all),
                   (unsigned int)state_count);
  kern_restrict_state_region_batch<<<launch_grid, BLK>>>(
      ctx.d_resident_mem[bank], device_buffer,
      ex[0], ex[1], sx, sy, sz,
      fi0, fj0, fk0, region_all, state_count,
      ex[0] * ex[1] * ex[2]);
  /* A rejected launch must not be reported to the caller as success. */
  CUDA_CHECK(cudaGetLastError());
  return 0;
}
|
|
|
|
/*
 * Launch kern_prolong_state_region_batch reading from ctx.d_state_curr_mem
 * and writing to device_buffer, for an sx * sy * sz region and state_count
 * states (grid x = grid(sx * sy * sz), y = state_count, BLK threads).
 * ii0/jj0/kk0 and lbc_i/lbc_j/lbc_k are passed to the kernel unchanged, and
 * upload_comm_state_soa is called with a null table before the launch.
 *
 * Returns 1 when state_count is outside 1..BSSN_RESIDENT_STATE_CAPACITY,
 * device_buffer or ex is null, or a region extent is non-positive;
 * otherwise 0.
 */
extern "C"
int bssn_cuda_prolong_state_batch_to_device_buffer(void *block_tag,
                                                   int state_count,
                                                   double *device_buffer,
                                                   int *ex,
                                                   int sx, int sy, int sz,
                                                   int ii0, int jj0, int kk0,
                                                   int lbc_i, int lbc_j, int lbc_k)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
  if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return 1;
  /* ex is dereferenced below, so it is validated with the other inputs. */
  if (!device_buffer || !ex || sx <= 0 || sy <= 0 || sz <= 0) return 1;
  StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
  const int region_all = sx * sy * sz;
  upload_comm_state_soa(nullptr, state_count);
  dim3 launch_grid((unsigned int)grid((size_t)region_all),
                   (unsigned int)state_count);
  kern_prolong_state_region_batch<<<launch_grid, BLK>>>(
      ctx.d_state_curr_mem, device_buffer,
      ex[0], ex[1], sx, sy, sz,
      ii0, jj0, kk0, lbc_i, lbc_j, lbc_k,
      region_all, state_count,
      ex[0] * ex[1] * ex[2]);
  /* A rejected launch must not be reported to the caller as success. */
  CUDA_CHECK(cudaGetLastError());
  return 0;
}
|
|
|
|
/*
 * Keyed variant of the batch prolongation: the source is the resident bank
 * returned by active_or_keyed_bank(ctx, state_host_key, all, true,
 * state_count), and state_soa is handed to upload_comm_state_soa before
 * kern_prolong_state_region_batch is launched (grid x = grid(sx * sy * sz),
 * y = state_count, BLK threads).
 *
 * Returns 1 when state_count is outside 1..BSSN_RESIDENT_STATE_CAPACITY,
 * device_buffer or ex is null, or a region extent is non-positive;
 * otherwise 0.
 */
extern "C"
int bssn_cuda_prolong_state_batch_to_device_buffer_for_host_views(void *block_tag,
                                                                  double **state_host_key,
                                                                  int state_count,
                                                                  double *device_buffer,
                                                                  int *ex,
                                                                  int sx, int sy, int sz,
                                                                  int ii0, int jj0, int kk0,
                                                                  int lbc_i, int lbc_j, int lbc_k,
                                                                  const double *state_soa)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
  if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY) return 1;
  /* ex is dereferenced below, so it is validated with the other inputs. */
  if (!device_buffer || !ex || sx <= 0 || sy <= 0 || sz <= 0) return 1;
  const size_t all = (size_t)ex[0] * ex[1] * ex[2];
  StepContext &ctx = ensure_step_ctx(block_tag, all);
  const int bank = active_or_keyed_bank(ctx, state_host_key, all, true, state_count);
  const int region_all = sx * sy * sz;
  upload_comm_state_soa(state_soa, state_count);
  dim3 launch_grid((unsigned int)grid((size_t)region_all),
                   (unsigned int)state_count);
  kern_prolong_state_region_batch<<<launch_grid, BLK>>>(
      ctx.d_resident_mem[bank], device_buffer,
      ex[0], ex[1], sx, sy, sz,
      ii0, jj0, kk0, lbc_i, lbc_j, lbc_k,
      region_all, state_count,
      ex[0] * ex[1] * ex[2]);
  /* A rejected launch must not be reported to the caller as success. */
  CUDA_CHECK(cudaGetLastError());
  return 0;
}
|
|
|
|
/*
 * C entry point: select this rank's device and forward to copy_state_subset
 * with direction cudaMemcpyDeviceToHost, using state_host_out as the host
 * pointer table. Always returns 0.
 */
extern "C" int bssn_cuda_download_state_subset(
    void *block_tag, int *ex, int subset_count,
    const int *state_indices, double **state_host_out)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  const cudaMemcpyKind direction = cudaMemcpyDeviceToHost;
  copy_state_subset(block_tag, ex, subset_count, state_indices,
                    state_host_out, direction);
  return 0;
}
|
|
|
|
/*
 * C entry point: select this rank's device and forward to copy_state_subset
 * with direction cudaMemcpyHostToDevice, using state_host_in as the host
 * pointer table. Always returns 0.
 */
extern "C" int bssn_cuda_upload_state_subset(
    void *block_tag, int *ex, int subset_count,
    const int *state_indices, double **state_host_in)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  const cudaMemcpyKind direction = cudaMemcpyHostToDevice;
  copy_state_subset(block_tag, ex, subset_count, state_indices,
                    state_host_in, direction);
  return 0;
}
|
|
|
|
/*
 * Build an intermediate time level on the device as a fixed linear
 * combination of two or three resident source banks.
 *
 * Weights (c1, c2, c3) applied to (src1, src2, src3):
 *   source_count == 2: tindex  0 -> (1/2, 1/2)
 *                      tindex  1 -> (3/4, 1/4)
 *                      tindex -1 -> (1/4, 3/4)
 *   source_count == 3: tindex  0       -> (3/8, 3/4, -1/8)
 *                      tindex  1 or -1 -> (5/32, 15/16, -3/32)
 *
 * The source and destination banks are resolved through the escalar, EM or
 * generic resident-bank helpers depending on whether state_count equals
 * BSSN_ESCALAR_STATE_COUNT, BSSN_EM_STATE_COUNT or neither. After the launch
 * of kern_prepare_inter_time_level (grid x = grid(all), y = state_count, BLK
 * threads) the destination bank is marked valid, current and not
 * host-clean. When auxiliary profiling is enabled the call is synchronized
 * and timed.
 *
 * Returns 1 when ex is null, state_count is outside
 * 1..BSSN_RESIDENT_STATE_CAPACITY, source_count is not 2 or 3, a required
 * key fails resident_key_usable_count, tindex is not one of -1/0/1, or the
 * grid point count does not fit in an int; otherwise 0.
 */
extern "C"
int bssn_cuda_prepare_inter_time_level(void *block_tag,
                                       int *ex,
                                       int state_count,
                                       double **src1_host_key,
                                       double **src2_host_key,
                                       double **src3_host_key,
                                       double **dst_host_key,
                                       int source_count,
                                       int tindex)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
  const bool profile = cuda_aux_profile_enabled();
  const double t0 = profile ? cuda_profile_now_ms() : 0.0;
  if (!ex) return 1;  /* dereferenced below */
  if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY)
    return 1;
  if (source_count != 2 && source_count != 3) return 1;
  if (!resident_key_usable_count(src1_host_key, state_count) ||
      !resident_key_usable_count(src2_host_key, state_count) ||
      !resident_key_usable_count(dst_host_key, state_count))
    return 1;
  if (source_count == 3 && !resident_key_usable_count(src3_host_key, state_count))
    return 1;

  double c1 = 0.0, c2 = 0.0, c3 = 0.0;
  if (source_count == 2) {
    if (tindex == 0) {
      c1 = 0.5; c2 = 0.5;
    } else if (tindex == 1) {
      c1 = 0.75; c2 = 0.25;
    } else if (tindex == -1) {
      c1 = 0.25; c2 = 0.75;
    } else {
      return 1;
    }
  } else {
    if (tindex == 0) {
      c1 = 3.0 / 8.0; c2 = 3.0 / 4.0; c3 = -1.0 / 8.0;
    } else if (tindex == 1 || tindex == -1) {
      c1 = 5.0 / 32.0; c2 = 15.0 / 16.0; c3 = -3.0 / 32.0;
    } else {
      return 1;
    }
  }

  const size_t all = (size_t)ex[0] * ex[1] * ex[2];
  /* The kernel receives the point count as int; refuse sizes that the cast
   * below would truncate. */
  if (all > (size_t)2147483647) return 1;
  StepContext &ctx = ensure_step_ctx(block_tag, all);
  int src1_bank, src2_bank, src3_bank, dst_bank;
  if (state_count == BSSN_ESCALAR_STATE_COUNT) {
    src1_bank = ensure_escalar_resident_bank(ctx, src1_host_key, all, true);
    src2_bank = ensure_escalar_resident_bank(ctx, src2_host_key, all, true, src1_bank);
    src3_bank = (source_count == 3)
                    ? ensure_escalar_resident_bank_avoiding(ctx, src3_host_key, all, true,
                                                            src1_bank, src2_bank, -1)
                    : -1;
    dst_bank = reserve_escalar_resident_output_bank_avoiding(ctx, dst_host_key, all,
                                                             src1_bank, src2_bank, src3_bank);
  } else if (state_count == BSSN_EM_STATE_COUNT) {
    src1_bank = ensure_em_resident_bank(ctx, src1_host_key, all, true);
    src2_bank = ensure_em_resident_bank(ctx, src2_host_key, all, true, src1_bank);
    src3_bank = (source_count == 3)
                    ? ensure_em_resident_bank_avoiding(ctx, src3_host_key, all, true,
                                                       src1_bank, src2_bank, -1)
                    : -1;
    dst_bank = reserve_em_resident_output_bank_avoiding(ctx, dst_host_key, all,
                                                        src1_bank, src2_bank, src3_bank);
  } else {
    src1_bank = ensure_resident_bank(ctx, src1_host_key, all, true);
    src2_bank = ensure_resident_bank(ctx, src2_host_key, all, true, src1_bank);
    src3_bank = (source_count == 3)
                    ? ensure_resident_bank_avoiding(ctx, src3_host_key, all, true,
                                                    src1_bank, src2_bank, -1)
                    : -1;
    dst_bank = reserve_resident_output_bank_avoiding(ctx, dst_host_key, all,
                                                     src1_bank, src2_bank, src3_bank);
  }

  dim3 launch_grid((unsigned int)grid(all), (unsigned int)state_count);
  kern_prepare_inter_time_level<<<launch_grid, BLK>>>(
      ctx.d_resident_mem[src1_bank],
      ctx.d_resident_mem[src2_bank],
      /* src3_bank is -1 in the two-source case, so it must not index. */
      (source_count == 3) ? ctx.d_resident_mem[src3_bank] : nullptr,
      ctx.d_resident_mem[dst_bank],
      c1, c2, c3, state_count, (int)all);
  /* Surface a rejected launch before the destination bank is flagged valid. */
  CUDA_CHECK(cudaGetLastError());
  if (profile)
    cuda_profile_sync();
  ctx.resident_valid[dst_bank] = true;
  ctx.resident_age[dst_bank] = ++ctx.resident_clock;
  set_resident_host_clean(ctx, dst_bank, false);
  mark_resident_current_bank(ctx, dst_bank);
  update_state_ready(ctx);
  if (profile) {
    CudaAuxProfileStats &stats = cuda_aux_profile_stats();
    stats.prepare_calls++;
    stats.prepare_ms += cuda_profile_now_ms() - t0;
    cuda_aux_profile_maybe_log();
  }
  return 0;
}
|
|
|
|
/*
 * C entry point: select this rank's device and report the result of
 * has_resident_state(block_tag) as an int, 1 when it is true and 0 otherwise.
 */
extern "C" int bssn_cuda_has_resident_state(void *block_tag)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  if (has_resident_state(block_tag)) {
    return 1;
  }
  return 0;
}
|
|
|
|
/*
 * C entry point: select this rank's device, then hand block_tag to
 * release_step_ctx. No value is returned.
 */
extern "C" void bssn_cuda_release_step_ctx(void *block_tag)
{
  init_gpu_dispatch();
  CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));

  release_step_ctx(block_tag);
}
|