Add resident BSSN GPU point interpolation
This commit is contained in:
@@ -439,38 +439,56 @@ bool bssn_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
|
||||
if (bssn_cuda_has_resident_state(block) &&
|
||||
block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn)
|
||||
{
|
||||
const int sx = ordn;
|
||||
const int sy = ordn;
|
||||
const int sz = ordn;
|
||||
const int region_all = sx * sy * sz;
|
||||
const int i0 = bssn_cuda_interp_tile_start(block->X[0], block->shape[0], x, DH[0], ordn);
|
||||
const int j0 = bssn_cuda_interp_tile_start(block->X[1], block->shape[1], y, DH[1], ordn);
|
||||
const int k0 = bssn_cuda_interp_tile_start(block->X[2], block->shape[2], z, DH[2], ordn);
|
||||
double packed_fields[3 * region_all];
|
||||
var *vars[3] = {forx, fory, forz};
|
||||
double soa3[9];
|
||||
for (int f = 0; f < 3; f++)
|
||||
{
|
||||
if (bssn_cuda_pack_state_region_to_host_buffer(block,
|
||||
k_bssn_cuda_bh_state_indices[f],
|
||||
packed_fields + f * region_all,
|
||||
block->shape,
|
||||
i0, j0, k0,
|
||||
sx, sy, sz) != 0)
|
||||
soa3[3 * f + 0] = vars[f]->SoA[0];
|
||||
soa3[3 * f + 1] = vars[f]->SoA[1];
|
||||
soa3[3 * f + 2] = vars[f]->SoA[2];
|
||||
}
|
||||
if (bssn_cuda_interp_state_point3(block, block->shape,
|
||||
k_bssn_cuda_bh_state_indices[0],
|
||||
k_bssn_cuda_bh_state_indices[1],
|
||||
k_bssn_cuda_bh_state_indices[2],
|
||||
block->X[0][0], block->X[1][0], block->X[2][0],
|
||||
DH[0], DH[1], DH[2],
|
||||
x, y, z,
|
||||
interp_ordn, interp_sym,
|
||||
soa3, shellf) != 0)
|
||||
{
|
||||
const int sx = ordn;
|
||||
const int sy = ordn;
|
||||
const int sz = ordn;
|
||||
const int region_all = sx * sy * sz;
|
||||
const int i0 = bssn_cuda_interp_tile_start(block->X[0], block->shape[0], x, DH[0], ordn);
|
||||
const int j0 = bssn_cuda_interp_tile_start(block->X[1], block->shape[1], y, DH[1], ordn);
|
||||
const int k0 = bssn_cuda_interp_tile_start(block->X[2], block->shape[2], z, DH[2], ordn);
|
||||
double packed_fields[3 * region_all];
|
||||
for (int f = 0; f < 3; f++)
|
||||
{
|
||||
cout << "CUDA BH tile download failed" << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
if (bssn_cuda_pack_state_region_to_host_buffer(block,
|
||||
k_bssn_cuda_bh_state_indices[f],
|
||||
packed_fields + f * region_all,
|
||||
block->shape,
|
||||
i0, j0, k0,
|
||||
sx, sy, sz) != 0)
|
||||
{
|
||||
cout << "CUDA BH tile download failed" << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
int tile_shape[3] = {sx, sy, sz};
|
||||
f_global_interp(tile_shape,
|
||||
block->X[0] + i0,
|
||||
block->X[1] + j0,
|
||||
block->X[2] + k0,
|
||||
packed_fields + f * region_all,
|
||||
shellf[f],
|
||||
x, y, z,
|
||||
interp_ordn,
|
||||
vars[f]->SoA,
|
||||
interp_sym);
|
||||
}
|
||||
int tile_shape[3] = {sx, sy, sz};
|
||||
f_global_interp(tile_shape,
|
||||
block->X[0] + i0,
|
||||
block->X[1] + j0,
|
||||
block->X[2] + k0,
|
||||
packed_fields + f * region_all,
|
||||
shellf[f],
|
||||
x, y, z,
|
||||
interp_ordn,
|
||||
vars[f]->SoA,
|
||||
interp_sym);
|
||||
}
|
||||
}
|
||||
else
|
||||
@@ -3692,13 +3710,16 @@ void bssn_class::Step(int lev, int YN)
|
||||
}
|
||||
}
|
||||
|
||||
STEP_TIMER_ADD(TB_BH_PREDICTOR, timer_bh_predictor);
|
||||
|
||||
// data analysis part
|
||||
// Warning NOTE: the variables1 are used as temp storege room
|
||||
if (lev == a_lev)
|
||||
{
|
||||
STEP_TIMER_DECL(timer_analysis_surface);
|
||||
AnalysisStuff(lev, dT_lev);
|
||||
STEP_TIMER_ADD(TB_ANALYSIS_SURFACE, timer_analysis_surface);
|
||||
}
|
||||
STEP_TIMER_ADD(TB_BH_PREDICTOR, timer_bh_predictor);
|
||||
#endif
|
||||
|
||||
#ifdef With_AHF
|
||||
|
||||
@@ -5618,6 +5618,153 @@ __global__ void kern_prepare_inter_time_level(const double * __restrict__ src1,
|
||||
}
|
||||
}
|
||||
|
||||
__device__ double interp_lagrange_weight(int idx, double x, int ordn)
|
||||
{
|
||||
double w = 1.0;
|
||||
const double xi = (double)idx;
|
||||
for (int j = 0; j < ordn; ++j) {
|
||||
if (j == idx) continue;
|
||||
w *= (x - (double)j) / (xi - (double)j);
|
||||
}
|
||||
return w;
|
||||
}
|
||||
|
||||
__device__ void interp_axis_window(double p,
|
||||
double x0,
|
||||
double dx,
|
||||
int n,
|
||||
int ordn,
|
||||
int symmetry,
|
||||
int axis,
|
||||
int &base,
|
||||
double &local_x)
|
||||
{
|
||||
int cx_i = (int)((p - x0) / dx + 0.4) + 1;
|
||||
int cx_b = cx_i - ordn / 2 + 1;
|
||||
int cx_t = cx_b + ordn - 1;
|
||||
int cmin = 1;
|
||||
if (symmetry == 2 && axis < 2 && fabs(x0) < dx)
|
||||
cmin = -ordn / 2 + 1;
|
||||
if (symmetry != 0 && axis == 2 && fabs(x0) < dx)
|
||||
cmin = -ordn / 2 + 1;
|
||||
|
||||
if (cx_b < cmin) {
|
||||
cx_b = cmin;
|
||||
cx_t = cx_b + ordn - 1;
|
||||
}
|
||||
if (cx_t > n) {
|
||||
cx_t = n;
|
||||
cx_b = cx_t + 1 - ordn;
|
||||
}
|
||||
|
||||
base = cx_b;
|
||||
if (cx_b > 0) {
|
||||
const double xb = x0 + (double)(cx_b - 1) * dx;
|
||||
local_x = (p - xb) / dx;
|
||||
} else {
|
||||
const int reflected = 1 - cx_b;
|
||||
const double xb = x0 + (double)(reflected - 1) * dx;
|
||||
local_x = (p + xb) / dx;
|
||||
}
|
||||
}
|
||||
|
||||
__device__ double load_interp_value(const double * __restrict__ mem,
|
||||
int nx,
|
||||
int ny,
|
||||
int nz,
|
||||
int all,
|
||||
int state,
|
||||
int fi,
|
||||
int fj,
|
||||
int fk,
|
||||
const double * __restrict__ soa)
|
||||
{
|
||||
double sign = 1.0;
|
||||
int ii = fi;
|
||||
int jj = fj;
|
||||
int kk = fk;
|
||||
if (ii <= 0) {
|
||||
ii = 1 - ii;
|
||||
sign *= soa[0];
|
||||
}
|
||||
if (jj <= 0) {
|
||||
jj = 1 - jj;
|
||||
sign *= soa[1];
|
||||
}
|
||||
if (kk <= 0) {
|
||||
kk = 1 - kk;
|
||||
sign *= soa[2];
|
||||
}
|
||||
if (ii < 1 || ii > nx || jj < 1 || jj > ny || kk < 1 || kk > nz)
|
||||
return 0.0;
|
||||
const int idx = (ii - 1) + (jj - 1) * nx + (kk - 1) * nx * ny;
|
||||
return sign * mem[(size_t)state * (size_t)all + (size_t)idx];
|
||||
}
|
||||
|
||||
__global__ void kern_interp_state_point3(const double * __restrict__ mem,
|
||||
double * __restrict__ out,
|
||||
int nx,
|
||||
int ny,
|
||||
int nz,
|
||||
int all,
|
||||
int state0,
|
||||
int state1,
|
||||
int state2,
|
||||
double x0,
|
||||
double y0,
|
||||
double z0,
|
||||
double dx,
|
||||
double dy,
|
||||
double dz,
|
||||
double px,
|
||||
double py,
|
||||
double pz,
|
||||
int ordn,
|
||||
int symmetry,
|
||||
double soa00, double soa01, double soa02,
|
||||
double soa10, double soa11, double soa12,
|
||||
double soa20, double soa21, double soa22)
|
||||
{
|
||||
const int f = threadIdx.x;
|
||||
if (f >= 3 || ordn <= 0 || ordn > 8)
|
||||
return;
|
||||
|
||||
const int states[3] = {state0, state1, state2};
|
||||
const double soa_all[9] = {
|
||||
soa00, soa01, soa02,
|
||||
soa10, soa11, soa12,
|
||||
soa20, soa21, soa22
|
||||
};
|
||||
const double *soa = soa_all + 3 * f;
|
||||
|
||||
int ib, jb, kb;
|
||||
double tx, ty, tz;
|
||||
interp_axis_window(px, x0, dx, nx, ordn, symmetry, 0, ib, tx);
|
||||
interp_axis_window(py, y0, dy, ny, ordn, symmetry, 1, jb, ty);
|
||||
interp_axis_window(pz, z0, dz, nz, ordn, symmetry, 2, kb, tz);
|
||||
|
||||
double wx[8], wy[8], wz[8];
|
||||
for (int i = 0; i < ordn; ++i) {
|
||||
wx[i] = interp_lagrange_weight(i, tx, ordn);
|
||||
wy[i] = interp_lagrange_weight(i, ty, ordn);
|
||||
wz[i] = interp_lagrange_weight(i, tz, ordn);
|
||||
}
|
||||
|
||||
double value = 0.0;
|
||||
for (int k = 0; k < ordn; ++k) {
|
||||
for (int j = 0; j < ordn; ++j) {
|
||||
for (int i = 0; i < ordn; ++i) {
|
||||
const double coeff = wx[i] * wy[j] * wz[k];
|
||||
value += coeff * load_interp_value(mem, nx, ny, nz, all,
|
||||
states[f],
|
||||
ib + i, jb + j, kb + k,
|
||||
soa);
|
||||
}
|
||||
}
|
||||
}
|
||||
out[f] = value;
|
||||
}
|
||||
|
||||
__global__ void kern_pack_state_region_batch(const double * __restrict__ src_mem,
|
||||
double * __restrict__ dst,
|
||||
int nx, int ny,
|
||||
@@ -6876,6 +7023,59 @@ int bssn_cuda_pack_state_region_to_host_buffer(void *block_tag,
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
int bssn_cuda_interp_state_point3(void *block_tag,
|
||||
int *ex,
|
||||
int state0,
|
||||
int state1,
|
||||
int state2,
|
||||
double x0,
|
||||
double y0,
|
||||
double z0,
|
||||
double dx,
|
||||
double dy,
|
||||
double dz,
|
||||
double px,
|
||||
double py,
|
||||
double pz,
|
||||
int ordn,
|
||||
int symmetry,
|
||||
const double *soa3,
|
||||
double *out3)
|
||||
{
|
||||
init_gpu_dispatch();
|
||||
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
|
||||
if (!block_tag || !ex || !out3 || !soa3)
|
||||
return 1;
|
||||
if (state0 < 0 || state0 >= BSSN_STATE_COUNT ||
|
||||
state1 < 0 || state1 >= BSSN_STATE_COUNT ||
|
||||
state2 < 0 || state2 >= BSSN_STATE_COUNT)
|
||||
return 1;
|
||||
if (ex[0] <= 0 || ex[1] <= 0 || ex[2] <= 0 ||
|
||||
ordn <= 0 || ordn > 8 ||
|
||||
ex[0] < ordn || ex[1] < ordn || ex[2] < ordn)
|
||||
return 1;
|
||||
|
||||
StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]);
|
||||
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
|
||||
const int bank = active_or_keyed_bank(ctx, nullptr, all, false);
|
||||
if (bank < 0 || !ctx.resident_valid[bank])
|
||||
return 1;
|
||||
|
||||
double *d_out = ensure_step_comm_buffer(ctx, 3);
|
||||
kern_interp_state_point3<<<1, 3>>>(
|
||||
ctx.d_resident_mem[bank], d_out,
|
||||
ex[0], ex[1], ex[2], (int)all,
|
||||
state0, state1, state2,
|
||||
x0, y0, z0, dx, dy, dz,
|
||||
px, py, pz, ordn, symmetry,
|
||||
soa3[0], soa3[1], soa3[2],
|
||||
soa3[3], soa3[4], soa3[5],
|
||||
soa3[6], soa3[7], soa3[8]);
|
||||
CUDA_CHECK(cudaMemcpy(out3, d_out, 3 * sizeof(double), cudaMemcpyDeviceToHost));
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
|
||||
int state_index,
|
||||
|
||||
@@ -83,6 +83,25 @@ int bssn_cuda_pack_state_region_to_host_buffer(void *block_tag,
|
||||
int i0, int j0, int k0,
|
||||
int sx, int sy, int sz);
|
||||
|
||||
int bssn_cuda_interp_state_point3(void *block_tag,
|
||||
int *ex,
|
||||
int state0,
|
||||
int state1,
|
||||
int state2,
|
||||
double x0,
|
||||
double y0,
|
||||
double z0,
|
||||
double dx,
|
||||
double dy,
|
||||
double dz,
|
||||
double px,
|
||||
double py,
|
||||
double pz,
|
||||
int ordn,
|
||||
int symmetry,
|
||||
const double *soa3,
|
||||
double *out3);
|
||||
|
||||
int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
|
||||
int state_index,
|
||||
double *host_buffer,
|
||||
|
||||
Reference in New Issue
Block a user