Batch patch-boundary copy and gate CPU BC in GPU substeps
This commit is contained in:
@@ -95,7 +95,7 @@ void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int
|
||||
while (BP)
|
||||
{
|
||||
Block *cg = BP->data;
|
||||
if (myrank == cg->rank)
|
||||
if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
|
||||
{
|
||||
double *state_out[BSSN_CUDA_STATE_COUNT];
|
||||
if (!fill_bssn_cuda_views(cg, vars, state_out))
|
||||
@@ -3301,6 +3301,8 @@ void bssn_class::Step(int lev, int YN)
|
||||
cg->fgfs[varlrhs->data->sgfn],
|
||||
iter_count);
|
||||
#endif
|
||||
if (!used_gpu_substep)
|
||||
{
|
||||
#ifndef WithShell
|
||||
if (lev > 0) // fix BD point
|
||||
#endif
|
||||
@@ -3313,6 +3315,7 @@ void bssn_class::Step(int lev, int YN)
|
||||
cg->fgfs[varl0->data->sgfn], cg->fgfs[varl->data->sgfn],
|
||||
varl0->data->SoA,
|
||||
Symmetry, cor);
|
||||
}
|
||||
|
||||
#if (SommerType == 1)
|
||||
#warning "shell part still bam type"
|
||||
@@ -3333,7 +3336,7 @@ void bssn_class::Step(int lev, int YN)
|
||||
varlrhs = varlrhs->next;
|
||||
}
|
||||
}
|
||||
if (!used_gpu_resident_state)
|
||||
if (!used_gpu_substep)
|
||||
f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny);
|
||||
}
|
||||
if (BP == Pp->data->ble)
|
||||
@@ -3722,6 +3725,8 @@ void bssn_class::Step(int lev, int YN)
|
||||
iter_count);
|
||||
#endif
|
||||
|
||||
if (!used_gpu_substep)
|
||||
{
|
||||
#ifndef WithShell
|
||||
if (lev > 0) // fix BD point
|
||||
#endif
|
||||
@@ -3734,6 +3739,7 @@ void bssn_class::Step(int lev, int YN)
|
||||
cg->fgfs[varl0->data->sgfn], cg->fgfs[varl1->data->sgfn],
|
||||
varl0->data->SoA,
|
||||
Symmetry, cor);
|
||||
}
|
||||
|
||||
#if (SommerType == 1)
|
||||
if (lev == 1) // shibata type sommerfeld
|
||||
@@ -3754,7 +3760,7 @@ void bssn_class::Step(int lev, int YN)
|
||||
varlrhs = varlrhs->next;
|
||||
}
|
||||
}
|
||||
if (!used_gpu_resident_state)
|
||||
if (!used_gpu_substep)
|
||||
f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny);
|
||||
}
|
||||
if (BP == Pp->data->ble)
|
||||
@@ -8636,4 +8642,3 @@ bool bssn_class::check_Stdin_Abort()
|
||||
}
|
||||
|
||||
//================================================================================================
|
||||
|
||||
|
||||
@@ -1216,6 +1216,11 @@ struct Rk4FinalizeTables {
|
||||
double *accum_fields[BSSN_STATE_COUNT];
|
||||
};
|
||||
|
||||
struct PatchBoundaryTables {
|
||||
const double *src_fields[BSSN_STATE_COUNT];
|
||||
double *dst_fields[BSSN_STATE_COUNT];
|
||||
};
|
||||
|
||||
static const int BLK = 128;
|
||||
static inline int grid(size_t n) {
|
||||
if (n == 0) return 1;
|
||||
@@ -1591,6 +1596,56 @@ static void gpu_rk4_finalize_batch(const StepContext &ctx,
|
||||
kern_rk4_finalize_batched<<<launch_grid, BLK>>>(tables, dT, rk4_stage, chitiny);
|
||||
}
|
||||
|
||||
__global__ __launch_bounds__(128, 4)
|
||||
void kern_copy_patch_boundary_batched(PatchBoundaryTables tables,
|
||||
int touch_xmin, int touch_xmax,
|
||||
int touch_ymin, int touch_ymax,
|
||||
int touch_zmin, int touch_zmax)
|
||||
{
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (tid >= d_gp.all) return;
|
||||
|
||||
const int nx = d_gp.ex[0];
|
||||
const int ny = d_gp.ex[1];
|
||||
const int i0 = tid % nx;
|
||||
const int j0 = (tid / nx) % ny;
|
||||
const int k0 = tid / (nx * ny);
|
||||
|
||||
const bool on_boundary =
|
||||
(touch_xmin && i0 == 0) ||
|
||||
(touch_xmax && i0 == nx - 1) ||
|
||||
(touch_ymin && j0 == 0) ||
|
||||
(touch_ymax && j0 == ny - 1) ||
|
||||
(touch_zmin && k0 == 0) ||
|
||||
(touch_zmax && k0 == d_gp.ex[2] - 1);
|
||||
if (!on_boundary) return;
|
||||
|
||||
const int field = blockIdx.y;
|
||||
tables.dst_fields[field][tid] = tables.src_fields[field][tid];
|
||||
}
|
||||
|
||||
static void gpu_copy_patch_boundary_batch(int all,
|
||||
int touch_xmin, int touch_xmax,
|
||||
int touch_ymin, int touch_ymax,
|
||||
int touch_zmin, int touch_zmax)
|
||||
{
|
||||
if (!(touch_xmin || touch_xmax || touch_ymin || touch_ymax || touch_zmin || touch_zmax))
|
||||
return;
|
||||
|
||||
PatchBoundaryTables tables = {};
|
||||
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
|
||||
tables.src_fields[i] = g_buf.slot[k_state_input_slots[i]];
|
||||
tables.dst_fields[i] = g_buf.slot[k_state_rhs_slots[i]];
|
||||
}
|
||||
|
||||
dim3 launch_grid((unsigned int)grid((size_t)all), (unsigned int)BSSN_STATE_COUNT);
|
||||
kern_copy_patch_boundary_batched<<<launch_grid, BLK>>>(
|
||||
tables,
|
||||
touch_xmin, touch_xmax,
|
||||
touch_ymin, touch_ymax,
|
||||
touch_zmin, touch_zmax);
|
||||
}
|
||||
|
||||
__global__ void kern_enforce_ga_cuda(double * __restrict__ dxx,
|
||||
double * __restrict__ gxy,
|
||||
double * __restrict__ gxz,
|
||||
@@ -3008,6 +3063,32 @@ static void setup_grid_params(int *ex,
|
||||
upload_grid_params_if_needed(gp);
|
||||
}
|
||||
|
||||
static void compute_patch_boundary_flags(int *ex,
|
||||
double *X, double *Y, double *Z,
|
||||
const double *bbox,
|
||||
int Symmetry,
|
||||
int &touch_xmin, int &touch_xmax,
|
||||
int &touch_ymin, int &touch_ymax,
|
||||
int &touch_zmin, int &touch_zmax)
|
||||
{
|
||||
const double dX = X[1] - X[0];
|
||||
const double dY = Y[1] - Y[0];
|
||||
const double dZ = Z[1] - Z[0];
|
||||
const int NO_SYMM = 0;
|
||||
const int OCTANT = 2;
|
||||
|
||||
touch_xmax = (std::fabs(X[ex[0] - 1] - bbox[3]) < dX) ? 1 : 0;
|
||||
touch_ymax = (std::fabs(Y[ex[1] - 1] - bbox[4]) < dY) ? 1 : 0;
|
||||
touch_zmax = (std::fabs(Z[ex[2] - 1] - bbox[5]) < dZ) ? 1 : 0;
|
||||
|
||||
touch_xmin = (std::fabs(X[0] - bbox[0]) < dX &&
|
||||
!(Symmetry == OCTANT && std::fabs(bbox[0]) < dX / 2.0)) ? 1 : 0;
|
||||
touch_ymin = (std::fabs(Y[0] - bbox[1]) < dY &&
|
||||
!(Symmetry == OCTANT && std::fabs(bbox[1]) < dY / 2.0)) ? 1 : 0;
|
||||
touch_zmin = (std::fabs(Z[0] - bbox[2]) < dZ &&
|
||||
!(Symmetry > NO_SYMM && std::fabs(bbox[2]) < dZ / 2.0)) ? 1 : 0;
|
||||
}
|
||||
|
||||
static void upload_state_inputs(double **state_host, size_t all)
|
||||
{
|
||||
const size_t bytes = all * sizeof(double);
|
||||
@@ -3922,8 +4003,17 @@ int bssn_cuda_rk4_substep(void *block_tag,
|
||||
|
||||
const size_t all = (size_t)ex[0] * ex[1] * ex[2];
|
||||
const size_t bytes = all * sizeof(double);
|
||||
int touch_xmin = 0, touch_xmax = 0;
|
||||
int touch_ymin = 0, touch_ymax = 0;
|
||||
int touch_zmin = 0, touch_zmax = 0;
|
||||
|
||||
setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);
|
||||
if (Lev > 0) {
|
||||
compute_patch_boundary_flags(ex, X, Y, Z, bbox, Symmetry,
|
||||
touch_xmin, touch_xmax,
|
||||
touch_ymin, touch_ymax,
|
||||
touch_zmin, touch_zmax);
|
||||
}
|
||||
StepContext &ctx = ensure_step_ctx(block_tag, all);
|
||||
const bool use_resident_state = (keep_resident_state != 0);
|
||||
if (use_resident_state) {
|
||||
@@ -3986,6 +4076,12 @@ int bssn_cuda_rk4_substep(void *block_tag,
|
||||
|
||||
t0 = profile ? cuda_profile_now_ms() : 0.0;
|
||||
gpu_rk4_finalize_batch(ctx, all, dT, RK4, chitiny);
|
||||
if (Lev > 0) {
|
||||
gpu_copy_patch_boundary_batch((int)all,
|
||||
touch_xmin, touch_xmax,
|
||||
touch_ymin, touch_ymax,
|
||||
touch_zmin, touch_zmax);
|
||||
}
|
||||
if (profile) {
|
||||
cuda_profile_sync();
|
||||
finalize_ms += cuda_profile_now_ms() - t0;
|
||||
|
||||
Reference in New Issue
Block a user