Fix GPU RK4 boundary and sync correctness
This commit is contained in:
@@ -83,6 +83,9 @@ struct CachedInterpPlanEntry
|
||||
{
|
||||
bool valid;
|
||||
InterpPlanKey key;
|
||||
vector<double> xvals;
|
||||
vector<double> yvals;
|
||||
vector<double> zvals;
|
||||
CachedInterpPlan plan;
|
||||
|
||||
CachedInterpPlanEntry() : valid(false) {}
|
||||
@@ -286,14 +289,28 @@ CachedInterpPlanEntry &interp_plan_cache_entry()
|
||||
bool same_interp_plan_key(const InterpPlanKey &lhs, const InterpPlanKey &rhs)
|
||||
{
|
||||
return lhs.patch == rhs.patch &&
|
||||
lhs.x == rhs.x &&
|
||||
lhs.y == rhs.y &&
|
||||
lhs.z == rhs.z &&
|
||||
lhs.NN == rhs.NN &&
|
||||
lhs.Symmetry == rhs.Symmetry &&
|
||||
lhs.myrank == rhs.myrank;
|
||||
}
|
||||
|
||||
bool same_interp_plan_points(const CachedInterpPlanEntry &cache, int NN, double **XX)
|
||||
{
|
||||
if (static_cast<int>(cache.xvals.size()) != NN ||
|
||||
static_cast<int>(cache.yvals.size()) != NN ||
|
||||
static_cast<int>(cache.zvals.size()) != NN)
|
||||
return false;
|
||||
|
||||
for (int j = 0; j < NN; ++j)
|
||||
{
|
||||
if (cache.xvals[j] != XX[0][j] ||
|
||||
cache.yvals[j] != XX[1][j] ||
|
||||
cache.zvals[j] != XX[2][j])
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
CachedInterpPlan &get_cached_interp_plan(Patch *patch,
|
||||
int NN, double **XX,
|
||||
int Symmetry, int myrank,
|
||||
@@ -314,11 +331,15 @@ CachedInterpPlan &get_cached_interp_plan(Patch *patch,
|
||||
CachedInterpPlanEntry &cache = interp_plan_cache_entry();
|
||||
if (cache.valid &&
|
||||
same_interp_plan_key(cache.key, key) &&
|
||||
same_interp_plan_points(cache, NN, XX) &&
|
||||
cache.plan.nblocks == static_cast<int>(block_index.views.size()))
|
||||
return cache.plan;
|
||||
|
||||
cache.valid = true;
|
||||
cache.key = key;
|
||||
cache.xvals.assign(XX[0], XX[0] + NN);
|
||||
cache.yvals.assign(XX[1], XX[1] + NN);
|
||||
cache.zvals.assign(XX[2], XX[2] + NN);
|
||||
cache.plan = CachedInterpPlan();
|
||||
CachedInterpPlan &plan = cache.plan;
|
||||
plan.nblocks = static_cast<int>(block_index.views.size());
|
||||
@@ -412,6 +433,9 @@ void release_interp_plan_cache_internal()
|
||||
{
|
||||
CachedInterpPlanEntry &cache = interp_plan_cache_entry();
|
||||
cache.valid = false;
|
||||
cache.xvals.clear();
|
||||
cache.yvals.clear();
|
||||
cache.zvals.clear();
|
||||
cache.plan = CachedInterpPlan();
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
#include "bssn_cuda_ops.h"
|
||||
#include "bssn_gpu.h"
|
||||
#include "rungekutta4_rout.h"
|
||||
#include "sommerfeld_rout.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
@@ -262,6 +264,23 @@ inline bool copy_to_device_preferring_device(CachedBuffer &dst, const double *sr
|
||||
return true;
|
||||
}
|
||||
|
||||
inline bool sync_host_from_mapped_device(double *host_ptr, int count, const char *label)
|
||||
{
|
||||
const double *device_ptr = bssn_gpu_find_device_buffer(host_ptr);
|
||||
if (!device_ptr)
|
||||
return true;
|
||||
|
||||
bssn_gpu_prepare_host_buffer(host_ptr, count);
|
||||
const size_t bytes = static_cast<size_t>(count) * sizeof(double);
|
||||
cudaError_t err = cudaMemcpy(host_ptr, device_ptr, bytes, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
report_cuda_error(label, err);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
inline bool copy_region_to_padded_stage(CachedBuffer &dst,
|
||||
const double *src,
|
||||
const int src_shape[3],
|
||||
@@ -1003,6 +1022,8 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
double xmin, double ymin, double zmin,
|
||||
double xmax, double ymax, double zmax,
|
||||
const double *state0,
|
||||
const double *phi_field,
|
||||
const double *lap_field,
|
||||
const double *boundary_src,
|
||||
double *stage_data,
|
||||
double *rhs_accum,
|
||||
@@ -1145,7 +1166,7 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
ok = launch_kernel(grid, block, (const void *)sommerfeld_bam_kernel, args);
|
||||
}
|
||||
|
||||
if (ok)
|
||||
if (ok && lev == 0)
|
||||
{
|
||||
double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr, *d_rhs = cache.rhs.ptr;
|
||||
void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage};
|
||||
@@ -1154,12 +1175,47 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
|
||||
if (ok && lev > 0)
|
||||
{
|
||||
double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr;
|
||||
void *args[] = {&nx, &ny, &nz,
|
||||
&has_xmin, &has_ymin, &has_zmin,
|
||||
&has_xmax, &has_ymax, &has_zmax,
|
||||
&d_state0, &d_stage};
|
||||
ok = launch_kernel(grid, block, (const void *)copy_physical_boundary_kernel, args);
|
||||
double *host_state0 = const_cast<double *>(state0);
|
||||
double *host_phi = const_cast<double *>(phi_field);
|
||||
double *host_lap = const_cast<double *>(lap_field);
|
||||
double *host_rhs = rhs_accum;
|
||||
|
||||
ok = sync_host_from_mapped_device(host_state0, n, "cudaMemcpy(D2H) state0") &&
|
||||
sync_host_from_mapped_device(host_phi, n, "cudaMemcpy(D2H) phi_field") &&
|
||||
sync_host_from_mapped_device(host_lap, n, "cudaMemcpy(D2H) lap_field") &&
|
||||
sync_host_from_mapped_device(host_rhs, n, "cudaMemcpy(D2H) rhs_accum");
|
||||
if (ok)
|
||||
{
|
||||
bssn_gpu_prepare_host_buffer(stage_data, n);
|
||||
cudaError_t err = cudaMemcpy(stage_data, stage_ptr, bytes, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
report_cuda_error("cudaMemcpy(D2H) stage_data", err);
|
||||
ok = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (ok)
|
||||
{
|
||||
int rk_stage_host = rk_stage;
|
||||
int cor = 1;
|
||||
f_rungekutta4_rout(ex, dT, host_state0, stage_data, host_rhs, rk_stage_host);
|
||||
f_sommerfeld_rout(ex,
|
||||
const_cast<double *>(X), const_cast<double *>(Y), const_cast<double *>(Z),
|
||||
xmin, ymin, zmin, xmax, ymax, zmax,
|
||||
dT,
|
||||
host_phi, host_lap,
|
||||
host_state0, stage_data,
|
||||
const_cast<double *>(SoA),
|
||||
symmetry, cor);
|
||||
|
||||
cudaError_t err = cudaMemcpy(stage_ptr, stage_data, bytes, cudaMemcpyHostToDevice);
|
||||
if (err != cudaSuccess)
|
||||
{
|
||||
report_cuda_error("cudaMemcpy(H2D) stage_data sommerfeld", err);
|
||||
ok = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (ok)
|
||||
|
||||
@@ -12,6 +12,8 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
||||
double xmin, double ymin, double zmin,
|
||||
double xmax, double ymax, double zmax,
|
||||
const double *state0,
|
||||
const double *phi_field,
|
||||
const double *lap_field,
|
||||
const double *boundary_src,
|
||||
double *stage_data,
|
||||
double *rhs_accum,
|
||||
|
||||
@@ -2,14 +2,15 @@
|
||||
|
||||
#ifdef USE_GPU
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <vector>
|
||||
|
||||
#include "bssn_class.h"
|
||||
#include "bssn_cuda_ops.h"
|
||||
#include "bssn_gpu.h"
|
||||
#include "bssn_macro.h"
|
||||
#include "rungekutta4_rout.h"
|
||||
|
||||
void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
||||
{
|
||||
@@ -56,11 +57,6 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
||||
|
||||
const bool BB = fgt(PhysTime, StartTime, dT_lev / 2);
|
||||
(void)BB;
|
||||
#if (MAPBH == 0)
|
||||
const bool need_host_stage_sync = (BH_num > 0 && lev == GH->levels - 1);
|
||||
#else
|
||||
const bool need_host_stage_sync = false;
|
||||
#endif
|
||||
double ndeps = (lev < GH->movls) ? numepsb : numepss;
|
||||
double TRK4 = PhysTime;
|
||||
int iter_count = 0;
|
||||
@@ -83,6 +79,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
||||
patch->bbox[0], patch->bbox[1], patch->bbox[2],
|
||||
patch->bbox[3], patch->bbox[4], patch->bbox[5],
|
||||
cg->fgfs[varl0->data->sgfn],
|
||||
cg->fgfs[phi0->sgfn],
|
||||
cg->fgfs[Lap0->sgfn],
|
||||
cg->fgfs[varlb->data->sgfn],
|
||||
cg->fgfs[varls->data->sgfn],
|
||||
cg->fgfs[varlr->data->sgfn],
|
||||
@@ -124,6 +122,28 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
||||
}
|
||||
};
|
||||
|
||||
auto stage_download_patch_list =
|
||||
[&](MyList<var> *var_list) {
|
||||
MyList<Patch> *patch_it = GH->PatL[lev];
|
||||
while (patch_it)
|
||||
{
|
||||
MyList<Block> *block_it = patch_it->data->blb;
|
||||
while (block_it)
|
||||
{
|
||||
Block *cg = block_it->data;
|
||||
if (myrank == cg->rank)
|
||||
stage_download_var_list(cg, var_list);
|
||||
|
||||
if (block_it == patch_it->data->ble)
|
||||
break;
|
||||
block_it = block_it->next;
|
||||
}
|
||||
if (ERROR)
|
||||
break;
|
||||
patch_it = patch_it->next;
|
||||
}
|
||||
};
|
||||
|
||||
auto ensure_stage_device_var_list =
|
||||
[&](Block *cg, MyList<var> *var_list) {
|
||||
const int n = cg->shape[0] * cg->shape[1] * cg->shape[2];
|
||||
@@ -336,8 +356,6 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
||||
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
||||
ERROR = 1;
|
||||
}
|
||||
if (!ERROR && !sync_cache_pre[lev].valid)
|
||||
stage_download_var_list(cg, SynchList_pre);
|
||||
}
|
||||
if (BP == Pp->data->ble)
|
||||
break;
|
||||
@@ -346,8 +364,12 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
||||
Pp = Pp->next;
|
||||
}
|
||||
|
||||
if (!ERROR && sync_cache_pre[lev].valid && !can_pack_sync_from_device(SynchList_pre, sync_cache_pre[lev]))
|
||||
refresh_stage_host_before_sync(SynchList_pre, sync_cache_pre[lev]);
|
||||
if (!ERROR)
|
||||
{
|
||||
stage_download_patch_list(SynchList_pre);
|
||||
if (!ERROR)
|
||||
bssn_gpu_clear_cached_device_buffers();
|
||||
}
|
||||
|
||||
MPI_Request err_req_pre;
|
||||
{
|
||||
@@ -357,8 +379,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
||||
|
||||
Parallel::AsyncSyncState async_pre;
|
||||
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
|
||||
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry, need_host_stage_sync);
|
||||
if (!ERROR && need_host_stage_sync)
|
||||
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry, true);
|
||||
if (!ERROR)
|
||||
refresh_stage_device_after_sync(SynchList_pre, sync_cache_pre[lev]);
|
||||
|
||||
MPI_Wait(&err_req_pre, MPI_STATUS_IGNORE);
|
||||
@@ -427,8 +449,6 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
||||
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
||||
ERROR = 1;
|
||||
}
|
||||
if (!ERROR && (!sync_cache_cor[lev].valid || iter_count == 3))
|
||||
stage_download_var_list(cg, SynchList_cor);
|
||||
}
|
||||
|
||||
if (BP == Pp->data->ble)
|
||||
@@ -438,9 +458,12 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
||||
Pp = Pp->next;
|
||||
}
|
||||
|
||||
if (!ERROR && sync_cache_cor[lev].valid && iter_count < 3 &&
|
||||
!can_pack_sync_from_device(SynchList_cor, sync_cache_cor[lev]))
|
||||
refresh_stage_host_before_sync(SynchList_cor, sync_cache_cor[lev]);
|
||||
if (!ERROR)
|
||||
{
|
||||
stage_download_patch_list(SynchList_cor);
|
||||
if (!ERROR)
|
||||
bssn_gpu_clear_cached_device_buffers();
|
||||
}
|
||||
|
||||
MPI_Request err_req_cor;
|
||||
{
|
||||
@@ -450,9 +473,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
||||
|
||||
Parallel::AsyncSyncState async_cor;
|
||||
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
|
||||
const bool unpack_cor_to_host = (iter_count == 3) || need_host_stage_sync;
|
||||
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry, unpack_cor_to_host);
|
||||
if (!ERROR && iter_count < 3 && unpack_cor_to_host)
|
||||
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry, true);
|
||||
if (!ERROR && iter_count < 3)
|
||||
refresh_stage_device_after_sync(SynchList_cor, sync_cache_cor[lev]);
|
||||
|
||||
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
|
||||
|
||||
Reference in New Issue
Block a user