Batch patch-boundary copy and gate CPU BC in GPU substeps
This commit is contained in:
@@ -95,7 +95,7 @@ void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int
|
|||||||
while (BP)
|
while (BP)
|
||||||
{
|
{
|
||||||
Block *cg = BP->data;
|
Block *cg = BP->data;
|
||||||
if (myrank == cg->rank)
|
if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
|
||||||
{
|
{
|
||||||
double *state_out[BSSN_CUDA_STATE_COUNT];
|
double *state_out[BSSN_CUDA_STATE_COUNT];
|
||||||
if (!fill_bssn_cuda_views(cg, vars, state_out))
|
if (!fill_bssn_cuda_views(cg, vars, state_out))
|
||||||
@@ -3301,18 +3301,21 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
cg->fgfs[varlrhs->data->sgfn],
|
cg->fgfs[varlrhs->data->sgfn],
|
||||||
iter_count);
|
iter_count);
|
||||||
#endif
|
#endif
|
||||||
|
if (!used_gpu_substep)
|
||||||
|
{
|
||||||
#ifndef WithShell
|
#ifndef WithShell
|
||||||
if (lev > 0) // fix BD point
|
if (lev > 0) // fix BD point
|
||||||
#endif
|
#endif
|
||||||
f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2],
|
f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2],
|
||||||
Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2],
|
Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2],
|
||||||
Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
|
Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
|
||||||
dT_lev,
|
dT_lev,
|
||||||
cg->fgfs[phi0->sgfn],
|
cg->fgfs[phi0->sgfn],
|
||||||
cg->fgfs[Lap0->sgfn],
|
cg->fgfs[Lap0->sgfn],
|
||||||
cg->fgfs[varl0->data->sgfn], cg->fgfs[varl->data->sgfn],
|
cg->fgfs[varl0->data->sgfn], cg->fgfs[varl->data->sgfn],
|
||||||
varl0->data->SoA,
|
varl0->data->SoA,
|
||||||
Symmetry, cor);
|
Symmetry, cor);
|
||||||
|
}
|
||||||
|
|
||||||
#if (SommerType == 1)
|
#if (SommerType == 1)
|
||||||
#warning "shell part still bam type"
|
#warning "shell part still bam type"
|
||||||
@@ -3333,7 +3336,7 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
varlrhs = varlrhs->next;
|
varlrhs = varlrhs->next;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!used_gpu_resident_state)
|
if (!used_gpu_substep)
|
||||||
f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny);
|
f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny);
|
||||||
}
|
}
|
||||||
if (BP == Pp->data->ble)
|
if (BP == Pp->data->ble)
|
||||||
@@ -3722,18 +3725,21 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
iter_count);
|
iter_count);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
if (!used_gpu_substep)
|
||||||
|
{
|
||||||
#ifndef WithShell
|
#ifndef WithShell
|
||||||
if (lev > 0) // fix BD point
|
if (lev > 0) // fix BD point
|
||||||
#endif
|
#endif
|
||||||
f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2],
|
f_sommerfeld_rout(cg->shape, cg->X[0], cg->X[1], cg->X[2],
|
||||||
Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2],
|
Pp->data->bbox[0], Pp->data->bbox[1], Pp->data->bbox[2],
|
||||||
Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
|
Pp->data->bbox[3], Pp->data->bbox[4], Pp->data->bbox[5],
|
||||||
dT_lev,
|
dT_lev,
|
||||||
cg->fgfs[phi0->sgfn],
|
cg->fgfs[phi0->sgfn],
|
||||||
cg->fgfs[Lap0->sgfn],
|
cg->fgfs[Lap0->sgfn],
|
||||||
cg->fgfs[varl0->data->sgfn], cg->fgfs[varl1->data->sgfn],
|
cg->fgfs[varl0->data->sgfn], cg->fgfs[varl1->data->sgfn],
|
||||||
varl0->data->SoA,
|
varl0->data->SoA,
|
||||||
Symmetry, cor);
|
Symmetry, cor);
|
||||||
|
}
|
||||||
|
|
||||||
#if (SommerType == 1)
|
#if (SommerType == 1)
|
||||||
if (lev == 1) // shibata type sommerfeld
|
if (lev == 1) // shibata type sommerfeld
|
||||||
@@ -3754,7 +3760,7 @@ void bssn_class::Step(int lev, int YN)
|
|||||||
varlrhs = varlrhs->next;
|
varlrhs = varlrhs->next;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!used_gpu_resident_state)
|
if (!used_gpu_substep)
|
||||||
f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny);
|
f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny);
|
||||||
}
|
}
|
||||||
if (BP == Pp->data->ble)
|
if (BP == Pp->data->ble)
|
||||||
@@ -8636,4 +8642,3 @@ bool bssn_class::check_Stdin_Abort()
|
|||||||
}
|
}
|
||||||
|
|
||||||
//================================================================================================
|
//================================================================================================
|
||||||
|
|
||||||
|
|||||||
@@ -1216,6 +1216,11 @@ struct Rk4FinalizeTables {
|
|||||||
double *accum_fields[BSSN_STATE_COUNT];
|
double *accum_fields[BSSN_STATE_COUNT];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct PatchBoundaryTables {
|
||||||
|
const double *src_fields[BSSN_STATE_COUNT];
|
||||||
|
double *dst_fields[BSSN_STATE_COUNT];
|
||||||
|
};
|
||||||
|
|
||||||
static const int BLK = 128;
|
static const int BLK = 128;
|
||||||
static inline int grid(size_t n) {
|
static inline int grid(size_t n) {
|
||||||
if (n == 0) return 1;
|
if (n == 0) return 1;
|
||||||
@@ -1591,6 +1596,56 @@ static void gpu_rk4_finalize_batch(const StepContext &ctx,
|
|||||||
kern_rk4_finalize_batched<<<launch_grid, BLK>>>(tables, dT, rk4_stage, chitiny);
|
kern_rk4_finalize_batched<<<launch_grid, BLK>>>(tables, dT, rk4_stage, chitiny);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
__global__ __launch_bounds__(128, 4)
|
||||||
|
void kern_copy_patch_boundary_batched(PatchBoundaryTables tables,
|
||||||
|
int touch_xmin, int touch_xmax,
|
||||||
|
int touch_ymin, int touch_ymax,
|
||||||
|
int touch_zmin, int touch_zmax)
|
||||||
|
{
|
||||||
|
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||||
|
if (tid >= d_gp.all) return;
|
||||||
|
|
||||||
|
const int nx = d_gp.ex[0];
|
||||||
|
const int ny = d_gp.ex[1];
|
||||||
|
const int i0 = tid % nx;
|
||||||
|
const int j0 = (tid / nx) % ny;
|
||||||
|
const int k0 = tid / (nx * ny);
|
||||||
|
|
||||||
|
const bool on_boundary =
|
||||||
|
(touch_xmin && i0 == 0) ||
|
||||||
|
(touch_xmax && i0 == nx - 1) ||
|
||||||
|
(touch_ymin && j0 == 0) ||
|
||||||
|
(touch_ymax && j0 == ny - 1) ||
|
||||||
|
(touch_zmin && k0 == 0) ||
|
||||||
|
(touch_zmax && k0 == d_gp.ex[2] - 1);
|
||||||
|
if (!on_boundary) return;
|
||||||
|
|
||||||
|
const int field = blockIdx.y;
|
||||||
|
tables.dst_fields[field][tid] = tables.src_fields[field][tid];
|
||||||
|
}
|
||||||
|
|
||||||
|
static void gpu_copy_patch_boundary_batch(int all,
|
||||||
|
int touch_xmin, int touch_xmax,
|
||||||
|
int touch_ymin, int touch_ymax,
|
||||||
|
int touch_zmin, int touch_zmax)
|
||||||
|
{
|
||||||
|
if (!(touch_xmin || touch_xmax || touch_ymin || touch_ymax || touch_zmin || touch_zmax))
|
||||||
|
return;
|
||||||
|
|
||||||
|
PatchBoundaryTables tables = {};
|
||||||
|
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
|
||||||
|
tables.src_fields[i] = g_buf.slot[k_state_input_slots[i]];
|
||||||
|
tables.dst_fields[i] = g_buf.slot[k_state_rhs_slots[i]];
|
||||||
|
}
|
||||||
|
|
||||||
|
dim3 launch_grid((unsigned int)grid((size_t)all), (unsigned int)BSSN_STATE_COUNT);
|
||||||
|
kern_copy_patch_boundary_batched<<<launch_grid, BLK>>>(
|
||||||
|
tables,
|
||||||
|
touch_xmin, touch_xmax,
|
||||||
|
touch_ymin, touch_ymax,
|
||||||
|
touch_zmin, touch_zmax);
|
||||||
|
}
|
||||||
|
|
||||||
__global__ void kern_enforce_ga_cuda(double * __restrict__ dxx,
|
__global__ void kern_enforce_ga_cuda(double * __restrict__ dxx,
|
||||||
double * __restrict__ gxy,
|
double * __restrict__ gxy,
|
||||||
double * __restrict__ gxz,
|
double * __restrict__ gxz,
|
||||||
@@ -3008,6 +3063,32 @@ static void setup_grid_params(int *ex,
|
|||||||
upload_grid_params_if_needed(gp);
|
upload_grid_params_if_needed(gp);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void compute_patch_boundary_flags(int *ex,
|
||||||
|
double *X, double *Y, double *Z,
|
||||||
|
const double *bbox,
|
||||||
|
int Symmetry,
|
||||||
|
int &touch_xmin, int &touch_xmax,
|
||||||
|
int &touch_ymin, int &touch_ymax,
|
||||||
|
int &touch_zmin, int &touch_zmax)
|
||||||
|
{
|
||||||
|
const double dX = X[1] - X[0];
|
||||||
|
const double dY = Y[1] - Y[0];
|
||||||
|
const double dZ = Z[1] - Z[0];
|
||||||
|
const int NO_SYMM = 0;
|
||||||
|
const int OCTANT = 2;
|
||||||
|
|
||||||
|
touch_xmax = (std::fabs(X[ex[0] - 1] - bbox[3]) < dX) ? 1 : 0;
|
||||||
|
touch_ymax = (std::fabs(Y[ex[1] - 1] - bbox[4]) < dY) ? 1 : 0;
|
||||||
|
touch_zmax = (std::fabs(Z[ex[2] - 1] - bbox[5]) < dZ) ? 1 : 0;
|
||||||
|
|
||||||
|
touch_xmin = (std::fabs(X[0] - bbox[0]) < dX &&
|
||||||
|
!(Symmetry == OCTANT && std::fabs(bbox[0]) < dX / 2.0)) ? 1 : 0;
|
||||||
|
touch_ymin = (std::fabs(Y[0] - bbox[1]) < dY &&
|
||||||
|
!(Symmetry == OCTANT && std::fabs(bbox[1]) < dY / 2.0)) ? 1 : 0;
|
||||||
|
touch_zmin = (std::fabs(Z[0] - bbox[2]) < dZ &&
|
||||||
|
!(Symmetry > NO_SYMM && std::fabs(bbox[2]) < dZ / 2.0)) ? 1 : 0;
|
||||||
|
}
|
||||||
|
|
||||||
static void upload_state_inputs(double **state_host, size_t all)
|
static void upload_state_inputs(double **state_host, size_t all)
|
||||||
{
|
{
|
||||||
const size_t bytes = all * sizeof(double);
|
const size_t bytes = all * sizeof(double);
|
||||||
@@ -3922,8 +4003,17 @@ int bssn_cuda_rk4_substep(void *block_tag,
|
|||||||
|
|
||||||
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
|
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
|
||||||
const size_t bytes = all * sizeof(double);
|
const size_t bytes = all * sizeof(double);
|
||||||
|
int touch_xmin = 0, touch_xmax = 0;
|
||||||
|
int touch_ymin = 0, touch_ymax = 0;
|
||||||
|
int touch_zmin = 0, touch_zmax = 0;
|
||||||
|
|
||||||
setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);
|
setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);
|
||||||
|
if (Lev > 0) {
|
||||||
|
compute_patch_boundary_flags(ex, X, Y, Z, bbox, Symmetry,
|
||||||
|
touch_xmin, touch_xmax,
|
||||||
|
touch_ymin, touch_ymax,
|
||||||
|
touch_zmin, touch_zmax);
|
||||||
|
}
|
||||||
StepContext &ctx = ensure_step_ctx(block_tag, all);
|
StepContext &ctx = ensure_step_ctx(block_tag, all);
|
||||||
const bool use_resident_state = (keep_resident_state != 0);
|
const bool use_resident_state = (keep_resident_state != 0);
|
||||||
if (use_resident_state) {
|
if (use_resident_state) {
|
||||||
@@ -3986,6 +4076,12 @@ int bssn_cuda_rk4_substep(void *block_tag,
|
|||||||
|
|
||||||
t0 = profile ? cuda_profile_now_ms() : 0.0;
|
t0 = profile ? cuda_profile_now_ms() : 0.0;
|
||||||
gpu_rk4_finalize_batch(ctx, all, dT, RK4, chitiny);
|
gpu_rk4_finalize_batch(ctx, all, dT, RK4, chitiny);
|
||||||
|
if (Lev > 0) {
|
||||||
|
gpu_copy_patch_boundary_batch((int)all,
|
||||||
|
touch_xmin, touch_xmax,
|
||||||
|
touch_ymin, touch_ymax,
|
||||||
|
touch_zmin, touch_zmax);
|
||||||
|
}
|
||||||
if (profile) {
|
if (profile) {
|
||||||
cuda_profile_sync();
|
cuda_profile_sync();
|
||||||
finalize_ms += cuda_profile_now_ms() - t0;
|
finalize_ms += cuda_profile_now_ms() - t0;
|
||||||
|
|||||||
Reference in New Issue
Block a user