Pass pointer tables as kernel args and skip redundant symbol uploads
This commit is contained in:
@@ -179,6 +179,8 @@ struct GridParams {
|
||||
};
|
||||
|
||||
__constant__ GridParams d_gp;
|
||||
static GridParams g_gp_host_cache = {};
|
||||
static bool g_gp_host_cache_valid = false;
|
||||
|
||||
/* ------------------------------------------------------------------ */
|
||||
/* Device indexing helpers */
|
||||
@@ -587,6 +589,16 @@ static void release_step_ctx(void *block_tag)
|
||||
g_step_ctx.erase(it);
|
||||
}
|
||||
|
||||
static void upload_grid_params_if_needed(const GridParams &gp)
|
||||
{
|
||||
if (!g_gp_host_cache_valid ||
|
||||
std::memcmp(&g_gp_host_cache, &gp, sizeof(GridParams)) != 0) {
|
||||
CUDA_CHECK(cudaMemcpyToSymbol(d_gp, &gp, sizeof(GridParams)));
|
||||
g_gp_host_cache = gp;
|
||||
g_gp_host_cache_valid = true;
|
||||
}
|
||||
}
|
||||
|
||||
/* ================================================================== */
|
||||
/* A. Symmetry boundary kernels (ord=2 and ord=3) */
|
||||
/* ================================================================== */
|
||||
@@ -1191,14 +1203,18 @@ void kern_kodis(const double * __restrict__ fh,
|
||||
/* ================================================================== */
|
||||
/* Host wrapper helpers */
|
||||
/* ================================================================== */
|
||||
__constant__ const double *d_lk_adv_fields[BSSN_LK_FIELD_COUNT];
|
||||
__constant__ const double *d_lk_ko_fields[BSSN_LK_FIELD_COUNT];
|
||||
__constant__ double *d_lk_rhs_fields[BSSN_LK_FIELD_COUNT];
|
||||
__constant__ int d_lk_soa_signs[3 * BSSN_LK_FIELD_COUNT];
|
||||
struct LopsidedKodisTables {
|
||||
const double *adv_fields[BSSN_LK_FIELD_COUNT];
|
||||
const double *ko_fields[BSSN_LK_FIELD_COUNT];
|
||||
double *rhs_fields[BSSN_LK_FIELD_COUNT];
|
||||
int soa_signs[3 * BSSN_LK_FIELD_COUNT];
|
||||
};
|
||||
|
||||
__constant__ const double *d_rk4_f0_fields[BSSN_STATE_COUNT];
|
||||
__constant__ double *d_rk4_rhs_fields[BSSN_STATE_COUNT];
|
||||
__constant__ double *d_rk4_accum_fields[BSSN_STATE_COUNT];
|
||||
struct Rk4FinalizeTables {
|
||||
const double *f0_fields[BSSN_STATE_COUNT];
|
||||
double *rhs_fields[BSSN_STATE_COUNT];
|
||||
double *accum_fields[BSSN_STATE_COUNT];
|
||||
};
|
||||
|
||||
static const int BLK = 128;
|
||||
static inline int grid(size_t n) {
|
||||
@@ -1268,6 +1284,7 @@ __global__ __launch_bounds__(128, 4)
|
||||
void kern_lopsided_kodis_batched(const double * __restrict__ Sfx,
|
||||
const double * __restrict__ Sfy,
|
||||
const double * __restrict__ Sfz,
|
||||
LopsidedKodisTables tables,
|
||||
double eps_val)
|
||||
{
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
@@ -1277,12 +1294,12 @@ void kern_lopsided_kodis_batched(const double * __restrict__ Sfx,
|
||||
const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
|
||||
const int iminF = d_gp.iminF3, jminF = d_gp.jminF3, kminF = d_gp.kminF3;
|
||||
const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
|
||||
const int SoA0 = d_lk_soa_signs[3 * field + 0];
|
||||
const int SoA1 = d_lk_soa_signs[3 * field + 1];
|
||||
const int SoA2 = d_lk_soa_signs[3 * field + 2];
|
||||
const double *adv_src = d_lk_adv_fields[field];
|
||||
const double *ko_src = d_lk_ko_fields[field];
|
||||
double *rhs = d_lk_rhs_fields[field];
|
||||
const int SoA0 = tables.soa_signs[3 * field + 0];
|
||||
const int SoA1 = tables.soa_signs[3 * field + 1];
|
||||
const int SoA2 = tables.soa_signs[3 * field + 2];
|
||||
const double *adv_src = tables.adv_fields[field];
|
||||
const double *ko_src = tables.ko_fields[field];
|
||||
double *rhs = tables.rhs_fields[field];
|
||||
|
||||
const int i0 = tid % nx;
|
||||
const int j0 = (tid / nx) % ny;
|
||||
@@ -1477,24 +1494,17 @@ void kern_lopsided_kodis_batched(const double * __restrict__ Sfx,
|
||||
|
||||
static void gpu_lopsided_kodis_state_batch(double eps_val, int all)
|
||||
{
|
||||
const double *adv_fields[BSSN_LK_FIELD_COUNT];
|
||||
const double *ko_fields[BSSN_LK_FIELD_COUNT];
|
||||
double *rhs_fields[BSSN_LK_FIELD_COUNT];
|
||||
|
||||
LopsidedKodisTables tables = {};
|
||||
for (int i = 0; i < BSSN_LK_FIELD_COUNT; ++i) {
|
||||
adv_fields[i] = g_buf.slot[k_lk_adv_slots[i]];
|
||||
ko_fields[i] = g_buf.slot[k_lk_ko_slots[i]];
|
||||
rhs_fields[i] = g_buf.slot[k_lk_rhs_slots[i]];
|
||||
tables.adv_fields[i] = g_buf.slot[k_lk_adv_slots[i]];
|
||||
tables.ko_fields[i] = g_buf.slot[k_lk_ko_slots[i]];
|
||||
tables.rhs_fields[i] = g_buf.slot[k_lk_rhs_slots[i]];
|
||||
}
|
||||
|
||||
CUDA_CHECK(cudaMemcpyToSymbol(d_lk_adv_fields, adv_fields, sizeof(adv_fields)));
|
||||
CUDA_CHECK(cudaMemcpyToSymbol(d_lk_ko_fields, ko_fields, sizeof(ko_fields)));
|
||||
CUDA_CHECK(cudaMemcpyToSymbol(d_lk_rhs_fields, rhs_fields, sizeof(rhs_fields)));
|
||||
CUDA_CHECK(cudaMemcpyToSymbol(d_lk_soa_signs, k_lk_soa_signs, sizeof(k_lk_soa_signs)));
|
||||
std::memcpy(tables.soa_signs, k_lk_soa_signs, sizeof(k_lk_soa_signs));
|
||||
|
||||
dim3 launch_grid((unsigned int)grid((size_t)all), (unsigned int)BSSN_LK_FIELD_COUNT);
|
||||
kern_lopsided_kodis_batched<<<launch_grid, BLK>>>(
|
||||
g_buf.slot[S_betax], g_buf.slot[S_betay], g_buf.slot[S_betaz], eps_val);
|
||||
g_buf.slot[S_betax], g_buf.slot[S_betay], g_buf.slot[S_betaz], tables, eps_val);
|
||||
}
|
||||
|
||||
__global__ void kern_rk4_finalize(const double * __restrict__ f0,
|
||||
@@ -1529,15 +1539,18 @@ __global__ void kern_rk4_finalize(const double * __restrict__ f0,
|
||||
}
|
||||
|
||||
__global__ __launch_bounds__(128, 4)
|
||||
void kern_rk4_finalize_batched(double dT, int rk4_stage, double chitiny)
|
||||
void kern_rk4_finalize_batched(Rk4FinalizeTables tables,
|
||||
double dT,
|
||||
int rk4_stage,
|
||||
double chitiny)
|
||||
{
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (tid >= d_gp.all) return;
|
||||
|
||||
const int field = blockIdx.y;
|
||||
const double *f0 = d_rk4_f0_fields[field];
|
||||
double *frhs = d_rk4_rhs_fields[field];
|
||||
double *accum = d_rk4_accum_fields[field];
|
||||
const double *f0 = tables.f0_fields[field];
|
||||
double *frhs = tables.rhs_fields[field];
|
||||
double *accum = tables.accum_fields[field];
|
||||
|
||||
const double rhs = frhs[tid];
|
||||
switch (rk4_stage) {
|
||||
@@ -1567,22 +1580,15 @@ static void gpu_rk4_finalize_batch(const StepContext &ctx,
|
||||
int rk4_stage,
|
||||
double chitiny)
|
||||
{
|
||||
const double *f0_fields[BSSN_STATE_COUNT];
|
||||
double *rhs_fields[BSSN_STATE_COUNT];
|
||||
double *accum_fields[BSSN_STATE_COUNT];
|
||||
|
||||
Rk4FinalizeTables tables = {};
|
||||
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
|
||||
f0_fields[i] = ctx.d_state0[i];
|
||||
rhs_fields[i] = g_buf.slot[k_state_rhs_slots[i]];
|
||||
accum_fields[i] = ctx.d_accum[i];
|
||||
tables.f0_fields[i] = ctx.d_state0[i];
|
||||
tables.rhs_fields[i] = g_buf.slot[k_state_rhs_slots[i]];
|
||||
tables.accum_fields[i] = ctx.d_accum[i];
|
||||
}
|
||||
|
||||
CUDA_CHECK(cudaMemcpyToSymbol(d_rk4_f0_fields, f0_fields, sizeof(f0_fields)));
|
||||
CUDA_CHECK(cudaMemcpyToSymbol(d_rk4_rhs_fields, rhs_fields, sizeof(rhs_fields)));
|
||||
CUDA_CHECK(cudaMemcpyToSymbol(d_rk4_accum_fields, accum_fields, sizeof(accum_fields)));
|
||||
|
||||
dim3 launch_grid((unsigned int)grid(all), (unsigned int)BSSN_STATE_COUNT);
|
||||
kern_rk4_finalize_batched<<<launch_grid, BLK>>>(dT, rk4_stage, chitiny);
|
||||
kern_rk4_finalize_batched<<<launch_grid, BLK>>>(tables, dT, rk4_stage, chitiny);
|
||||
}
|
||||
|
||||
__global__ void kern_enforce_ga_cuda(double * __restrict__ dxx,
|
||||
@@ -2999,7 +3005,7 @@ static void setup_grid_params(int *ex,
|
||||
gp.fh3_nx = nx + 3;
|
||||
gp.fh3_ny = ny + 3;
|
||||
gp.fh3_nz = nz + 3;
|
||||
CUDA_CHECK(cudaMemcpyToSymbol(d_gp, &gp, sizeof(GridParams)));
|
||||
upload_grid_params_if_needed(gp);
|
||||
}
|
||||
|
||||
static void upload_state_inputs(double **state_host, size_t all)
|
||||
@@ -3480,36 +3486,9 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
|
||||
const int nx = ex[0], ny = ex[1], nz = ex[2];
|
||||
const int all = nx * ny * nz;
|
||||
const double dX = X[1]-X[0], dY = Y[1]-Y[0], dZ = Z[1]-Z[0];
|
||||
const int NO_SYMM = 0, EQ_SYMM = 1;
|
||||
const double SYM = 1.0, ANTI = -1.0;
|
||||
|
||||
/* --- Allocate GPU buffers --- */
|
||||
ensure_gpu_buffers(nx, ny, nz);
|
||||
|
||||
/* --- Setup GridParams --- */
|
||||
GridParams gp;
|
||||
gp.ex[0]=nx; gp.ex[1]=ny; gp.ex[2]=nz;
|
||||
gp.all=all; gp.dX=dX; gp.dY=dY; gp.dZ=dZ;
|
||||
gp.d12dx=1.0/(12.0*dX); gp.d12dy=1.0/(12.0*dY); gp.d12dz=1.0/(12.0*dZ);
|
||||
gp.d2dx=1.0/(2.0*dX); gp.d2dy=1.0/(2.0*dY); gp.d2dz=1.0/(2.0*dZ);
|
||||
gp.Fdxdx=1.0/(12.0*dX*dX); gp.Fdydy=1.0/(12.0*dY*dY); gp.Fdzdz=1.0/(12.0*dZ*dZ);
|
||||
gp.Sdxdx=1.0/(dX*dX); gp.Sdydy=1.0/(dY*dY); gp.Sdzdz=1.0/(dZ*dZ);
|
||||
gp.Fdxdy=1.0/(144.0*dX*dY); gp.Fdxdz=1.0/(144.0*dX*dZ); gp.Fdydz=1.0/(144.0*dY*dZ);
|
||||
gp.Sdxdy=0.25/(dX*dY); gp.Sdxdz=0.25/(dX*dZ); gp.Sdydz=0.25/(dY*dZ);
|
||||
gp.imaxF=nx; gp.jmaxF=ny; gp.kmaxF=nz;
|
||||
gp.iminF=1; gp.jminF=1; gp.kminF=1;
|
||||
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) gp.kminF = -1;
|
||||
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) gp.iminF = -1;
|
||||
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) gp.jminF = -1;
|
||||
gp.iminF3=1; gp.jminF3=1; gp.kminF3=1;
|
||||
if (Symmetry > NO_SYMM && fabs(Z[0]) < dZ) gp.kminF3 = -2;
|
||||
if (Symmetry > EQ_SYMM && fabs(X[0]) < dX) gp.iminF3 = -2;
|
||||
if (Symmetry > EQ_SYMM && fabs(Y[0]) < dY) gp.jminF3 = -2;
|
||||
gp.Symmetry=Symmetry; gp.eps=eps; gp.co=co;
|
||||
gp.fh2_nx=nx+2; gp.fh2_ny=ny+2; gp.fh2_nz=nz+2;
|
||||
gp.fh3_nx=nx+3; gp.fh3_ny=ny+3; gp.fh3_nz=nz+3;
|
||||
CUDA_CHECK(cudaMemcpyToSymbol(d_gp, &gp, sizeof(GridParams)));
|
||||
setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);
|
||||
|
||||
/* --- Shorthand for device slot pointers --- */
|
||||
#define D(s) g_buf.slot[s]
|
||||
|
||||
Reference in New Issue
Block a user