937 lines
30 KiB
C
937 lines
30 KiB
C
#include "macrodef.h"
|
|
|
|
#ifdef USE_GPU
|
|
|
|
#include <algorithm>
|
|
#include <cmath>
|
|
#include <cstring>
|
|
#include <cstdlib>
|
|
#include <iomanip>
|
|
#include <vector>
|
|
|
|
#include "bssn_class.h"
|
|
#include "bssn_cuda_ops.h"
|
|
#include "bssn_gpu.h"
|
|
#include "bssn_macro.h"
|
|
|
|
namespace
{
// Timing buckets accumulated per grid level by the GPU stage profiler.
// Each enumerator is also the column index into StageProfileStore::metric,
// so the order here must match stage_profile_metric_name().
enum StageProfileMetric
{
    STAGE_PROFILE_TOTAL = 0,
    STAGE_PROFILE_RHS,
    STAGE_PROFILE_RUN_STAGE,
    STAGE_PROFILE_RUN_STAGE_DEVICE,
    STAGE_PROFILE_RUN_STAGE_HOST_FIX,
    STAGE_PROFILE_LOWERBOUND,
    STAGE_PROFILE_ENSURE,
    STAGE_PROFILE_DOWNLOAD,
    STAGE_PROFILE_CLEAR_CACHE,
    STAGE_PROFILE_SYNC_START,
    STAGE_PROFILE_SYNC_FINISH,
    STAGE_PROFILE_REFRESH,
    STAGE_PROFILE_COUNT // number of metrics; not a valid metric itself
};

// Number of level slots in the profile tables; samples for levels outside
// [0, kStageProfileMaxLevels) are silently dropped by the helpers below.
static const int kStageProfileMaxLevels = 32;

// Process-wide accumulator for the stage profiler.
struct StageProfileStore
{
    bool env_checked;                                           // true once the environment variable has been read
    bool enabled;                                               // cached result of the environment lookup
    int calls[kStageProfileMaxLevels];                          // number of noted calls per level
    double metric[kStageProfileMaxLevels][STAGE_PROFILE_COUNT]; // accumulated seconds per level and metric
};

// Returns the single zero-initialised profile store of this process.
StageProfileStore &stage_profile_store()
{
    static StageProfileStore store = {};
    return store;
}

// Reports whether stage timing is switched on.
// The environment variable AMSS_GPU_STAGE_TIMING is read once and cached:
// profiling is enabled when it is set to a non-empty value other than "0".
bool stage_profile_enabled()
{
    StageProfileStore &store = stage_profile_store();
    if (!store.env_checked)
    {
        // <cstdlib>/<cstring> only guarantee the std:: spellings.
        const char *env = std::getenv("AMSS_GPU_STAGE_TIMING");
        store.enabled = (env && env[0] && std::strcmp(env, "0") != 0);
        store.env_checked = true;
    }
    return store.enabled;
}

// Increments the call counter of level `lev`; out-of-range levels are ignored.
void stage_profile_note_call(int lev)
{
    if (lev >= 0 && lev < kStageProfileMaxLevels)
        stage_profile_store().calls[lev]++;
}

// Adds `seconds` to the accumulator of (`lev`, `metric`).
// Both indices are validated: an out-of-range level or a metric outside
// [0, STAGE_PROFILE_COUNT) is ignored instead of writing outside its table row.
void stage_profile_add(int lev, StageProfileMetric metric, double seconds)
{
    const int column = static_cast<int>(metric);
    if (lev < 0 || lev >= kStageProfileMaxLevels)
        return;
    if (column < 0 || column >= STAGE_PROFILE_COUNT)
        return;
    stage_profile_store().metric[lev][column] += seconds;
}

// Maps a metric to the short column label used in the timing summary.
// Values that are not a real metric (including STAGE_PROFILE_COUNT) yield "unknown".
const char *stage_profile_metric_name(StageProfileMetric metric)
{
    switch (metric)
    {
    case STAGE_PROFILE_TOTAL:
        return "total";
    case STAGE_PROFILE_RHS:
        return "rhs";
    case STAGE_PROFILE_RUN_STAGE:
        return "run_stage";
    case STAGE_PROFILE_RUN_STAGE_DEVICE:
        return "run_stage_dev";
    case STAGE_PROFILE_RUN_STAGE_HOST_FIX:
        return "run_stage_host";
    case STAGE_PROFILE_LOWERBOUND:
        return "lower";
    case STAGE_PROFILE_ENSURE:
        return "ensure";
    case STAGE_PROFILE_DOWNLOAD:
        return "download";
    case STAGE_PROFILE_CLEAR_CACHE:
        return "clear_cache";
    case STAGE_PROFILE_SYNC_START:
        return "sync_start";
    case STAGE_PROFILE_SYNC_FINISH:
        return "sync_finish";
    case STAGE_PROFILE_REFRESH:
        return "refresh";
    default:
        return "unknown";
    }
}
} // namespace
|
|
|
|
void bssn_cuda_dump_stage_profile()
|
|
{
|
|
if (!stage_profile_enabled())
|
|
return;
|
|
|
|
int myrank = 0;
|
|
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
|
|
|
|
StageProfileStore &store = stage_profile_store();
|
|
int global_calls_sum[kStageProfileMaxLevels] = {};
|
|
double global_metric_sum[kStageProfileMaxLevels][STAGE_PROFILE_COUNT] = {};
|
|
double global_metric_max[kStageProfileMaxLevels][STAGE_PROFILE_COUNT] = {};
|
|
|
|
MPI_Reduce(store.calls, global_calls_sum, kStageProfileMaxLevels, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
|
|
MPI_Reduce(store.metric[0], global_metric_sum[0],
|
|
kStageProfileMaxLevels * STAGE_PROFILE_COUNT,
|
|
MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
|
|
MPI_Reduce(store.metric[0], global_metric_max[0],
|
|
kStageProfileMaxLevels * STAGE_PROFILE_COUNT,
|
|
MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
|
|
|
|
if (myrank != 0)
|
|
return;
|
|
|
|
cout << endl;
|
|
cout << " GPU stage timing summary (sum/max over MPI ranks) " << endl;
|
|
cout << " lev calls";
|
|
for (int metric = 0; metric < STAGE_PROFILE_COUNT; ++metric)
|
|
cout << " " << setw(22) << stage_profile_metric_name(static_cast<StageProfileMetric>(metric));
|
|
cout << endl;
|
|
|
|
for (int lev = 0; lev < kStageProfileMaxLevels; ++lev)
|
|
{
|
|
if (global_calls_sum[lev] == 0)
|
|
continue;
|
|
|
|
cout << setw(4) << lev << " " << setw(5) << global_calls_sum[lev];
|
|
for (int metric = 0; metric < STAGE_PROFILE_COUNT; ++metric)
|
|
{
|
|
cout << " "
|
|
<< setw(10) << setprecision(6) << fixed << global_metric_sum[lev][metric]
|
|
<< "/"
|
|
<< setw(10) << setprecision(6) << fixed << global_metric_max[lev][metric];
|
|
}
|
|
cout << endl;
|
|
}
|
|
cout << endl;
|
|
}
|
|
|
|
void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
|
{
|
|
#ifdef WithShell
|
|
#error "Step_MainPath_GPU currently supports Patch grids only."
|
|
#endif
|
|
|
|
const bool profile_enabled = stage_profile_enabled();
|
|
const double step_total_begin = profile_enabled ? MPI_Wtime() : 0.0;
|
|
if (profile_enabled)
|
|
stage_profile_note_call(lev);
|
|
|
|
if (bssn_gpu_bind_process_device(myrank))
|
|
{
|
|
cerr << "GPU device bind failure on MPI rank " << myrank << endl;
|
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
|
}
|
|
if (profile_enabled)
|
|
{
|
|
const double t0 = MPI_Wtime();
|
|
bssn_gpu_clear_cached_device_buffers();
|
|
stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0);
|
|
}
|
|
else
|
|
bssn_gpu_clear_cached_device_buffers();
|
|
|
|
setpbh(BH_num, Porg0, Mass, BH_num_input);
|
|
|
|
const double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
|
|
|
|
#if (MAPBH == 1)
|
|
if (BH_num > 0 && lev == GH->levels - 1)
|
|
{
|
|
compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
|
|
for (int ithBH = 0; ithBH < BH_num; ithBH++)
|
|
{
|
|
for (int ith = 0; ith < 3; ith++)
|
|
Porg1[ithBH][ith] = Porg0[ithBH][ith] + Porg_rhs[ithBH][ith] * dT_lev;
|
|
if (Symmetry > 0)
|
|
Porg1[ithBH][2] = fabs(Porg1[ithBH][2]);
|
|
if (Symmetry == 2)
|
|
{
|
|
Porg1[ithBH][0] = fabs(Porg1[ithBH][0]);
|
|
Porg1[ithBH][1] = fabs(Porg1[ithBH][1]);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (lev == a_lev)
|
|
AnalysisStuff(lev, dT_lev);
|
|
#endif
|
|
|
|
#ifdef With_AHF
|
|
AH_Step_Find(lev, dT_lev);
|
|
#endif
|
|
|
|
const bool BB = fgt(PhysTime, StartTime, dT_lev / 2);
|
|
(void)BB;
|
|
double ndeps = (lev < GH->movls) ? numepsb : numepss;
|
|
double TRK4 = PhysTime;
|
|
int iter_count = 0;
|
|
int pre = 0, cor = 1;
|
|
int ERROR = 0;
|
|
const bool keep_stage_sync_on_device = (RPS == 1) && (MAPBH == 1) && (REGLEV == 0);
|
|
|
|
auto run_stage_on_block =
|
|
[&](Block *cg, Patch *patch, MyList<var> *state0_list,
|
|
MyList<var> *boundary_src_list, MyList<var> *stage_data_list,
|
|
MyList<var> *rhs_list, int rk_stage) {
|
|
MyList<var> *varl0 = state0_list;
|
|
MyList<var> *varlb = boundary_src_list;
|
|
MyList<var> *varls = stage_data_list;
|
|
MyList<var> *varlr = rhs_list;
|
|
std::vector<const double *> batch_state0;
|
|
std::vector<double *> batch_stage;
|
|
std::vector<double *> batch_rhs;
|
|
|
|
while (varl0)
|
|
{
|
|
const bool force_host_boundary_fix = false;
|
|
const bool can_batch_device_path = (lev > 0) && !force_host_boundary_fix;
|
|
if (can_batch_device_path)
|
|
{
|
|
batch_state0.push_back(cg->fgfs[varl0->data->sgfn]);
|
|
batch_stage.push_back(cg->fgfs[varls->data->sgfn]);
|
|
batch_rhs.push_back(cg->fgfs[varlr->data->sgfn]);
|
|
varl0 = varl0->next;
|
|
varlb = varlb->next;
|
|
varls = varls->next;
|
|
varlr = varlr->next;
|
|
continue;
|
|
}
|
|
|
|
const double var_begin = profile_enabled ? MPI_Wtime() : 0.0;
|
|
if (bssn_cuda_rk4_boundary_var(cg->shape, dT_lev,
|
|
cg->X[0], cg->X[1], cg->X[2],
|
|
patch->bbox[0], patch->bbox[1], patch->bbox[2],
|
|
patch->bbox[3], patch->bbox[4], patch->bbox[5],
|
|
cg->fgfs[varl0->data->sgfn],
|
|
cg->fgfs[phi0->sgfn],
|
|
cg->fgfs[Lap0->sgfn],
|
|
cg->fgfs[varlb->data->sgfn],
|
|
cg->fgfs[varls->data->sgfn],
|
|
cg->fgfs[varlr->data->sgfn],
|
|
varl0->data->propspeed,
|
|
varl0->data->SoA,
|
|
Symmetry, lev, rk_stage,
|
|
force_host_boundary_fix, false))
|
|
{
|
|
cerr << "GPU rk4/boundary failure: lev=" << lev
|
|
<< " rk_stage=" << rk_stage
|
|
<< " var=" << varl0->data->name
|
|
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
|
|
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
|
|
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
|
ERROR = 1;
|
|
break;
|
|
}
|
|
if (profile_enabled)
|
|
{
|
|
stage_profile_add(lev,
|
|
force_host_boundary_fix ? STAGE_PROFILE_RUN_STAGE_HOST_FIX
|
|
: STAGE_PROFILE_RUN_STAGE_DEVICE,
|
|
MPI_Wtime() - var_begin);
|
|
}
|
|
varl0 = varl0->next;
|
|
varlb = varlb->next;
|
|
varls = varls->next;
|
|
varlr = varlr->next;
|
|
}
|
|
|
|
if (!ERROR && !batch_state0.empty())
|
|
{
|
|
const double batch_begin = profile_enabled ? MPI_Wtime() : 0.0;
|
|
if (bssn_cuda_rk4_boundary_batch(cg->shape, dT_lev,
|
|
cg->X[0], cg->X[1], cg->X[2],
|
|
patch->bbox[0], patch->bbox[1], patch->bbox[2],
|
|
patch->bbox[3], patch->bbox[4], patch->bbox[5],
|
|
Symmetry,
|
|
&batch_state0[0],
|
|
&batch_stage[0],
|
|
&batch_rhs[0],
|
|
static_cast<int>(batch_state0.size()),
|
|
rk_stage, false))
|
|
{
|
|
cerr << "GPU rk4/boundary batch failure: lev=" << lev
|
|
<< " rk_stage=" << rk_stage
|
|
<< " vars=" << batch_state0.size()
|
|
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
|
|
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
|
|
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
|
ERROR = 1;
|
|
}
|
|
else if (profile_enabled)
|
|
{
|
|
stage_profile_add(lev, STAGE_PROFILE_RUN_STAGE_DEVICE, MPI_Wtime() - batch_begin);
|
|
}
|
|
}
|
|
};
|
|
|
|
auto stage_download_var_list =
|
|
[&](Block *cg, MyList<var> *var_list, bool skip_unmapped) {
|
|
std::vector<double *> batch_host_ptrs;
|
|
std::vector<MyList<var> *> batch_vars;
|
|
while (var_list)
|
|
{
|
|
double *host_ptr = cg->fgfs[var_list->data->sgfn];
|
|
if (skip_unmapped && !bssn_gpu_find_device_buffer(host_ptr))
|
|
{
|
|
var_list = var_list->next;
|
|
continue;
|
|
}
|
|
batch_host_ptrs.push_back(host_ptr);
|
|
batch_vars.push_back(var_list);
|
|
var_list = var_list->next;
|
|
}
|
|
if (!batch_host_ptrs.empty() &&
|
|
bssn_gpu_download_buffer_batch(cg->shape, &batch_host_ptrs[0],
|
|
static_cast<int>(batch_host_ptrs.size())))
|
|
{
|
|
for (size_t i = 0; i < batch_host_ptrs.size(); ++i)
|
|
{
|
|
if (bssn_cuda_download_buffer(cg->shape, batch_host_ptrs[i]))
|
|
{
|
|
cerr << "GPU stage download failure: lev=" << lev
|
|
<< " var=" << batch_vars[i]->data->name
|
|
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
|
|
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
|
|
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
|
ERROR = 1;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
};
|
|
|
|
auto stage_download_patch_list =
|
|
[&](MyList<var> *var_list, bool skip_unmapped) {
|
|
MyList<Patch> *patch_it = GH->PatL[lev];
|
|
while (patch_it)
|
|
{
|
|
MyList<Block> *block_it = patch_it->data->blb;
|
|
while (block_it)
|
|
{
|
|
Block *cg = block_it->data;
|
|
if (myrank == cg->rank)
|
|
stage_download_var_list(cg, var_list, skip_unmapped);
|
|
|
|
if (block_it == patch_it->data->ble)
|
|
break;
|
|
block_it = block_it->next;
|
|
}
|
|
if (ERROR)
|
|
break;
|
|
patch_it = patch_it->next;
|
|
}
|
|
};
|
|
|
|
auto ensure_stage_device_var_list =
|
|
[&](Block *cg, MyList<var> *var_list) {
|
|
const int n = cg->shape[0] * cg->shape[1] * cg->shape[2];
|
|
while (var_list)
|
|
{
|
|
double *host_ptr = cg->fgfs[var_list->data->sgfn];
|
|
if (!bssn_gpu_find_device_buffer(host_ptr) &&
|
|
bssn_gpu_stage_upload_buffer(host_ptr, n))
|
|
{
|
|
cerr << "GPU state ensure failure: lev=" << lev
|
|
<< " var=" << var_list->data->name
|
|
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
|
|
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
|
|
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
|
ERROR = 1;
|
|
break;
|
|
}
|
|
var_list = var_list->next;
|
|
}
|
|
};
|
|
|
|
auto refresh_synced_device_regions =
|
|
[&](Block *cg, MyList<var> *var_list, Parallel::SyncCache &cache) {
|
|
std::vector<Parallel::gridseg *> local_segments;
|
|
for (int node = 0; node < cache.cpusize; ++node)
|
|
{
|
|
MyList<Parallel::gridseg> *seg = cache.combined_dst[node];
|
|
while (seg)
|
|
{
|
|
if (seg->data && seg->data->Bg == cg)
|
|
local_segments.push_back(seg->data);
|
|
seg = seg->next;
|
|
}
|
|
}
|
|
|
|
if (local_segments.empty())
|
|
return;
|
|
|
|
const int n = cg->shape[0] * cg->shape[1] * cg->shape[2];
|
|
while (var_list)
|
|
{
|
|
double *host_ptr = cg->fgfs[var_list->data->sgfn];
|
|
if (!bssn_gpu_find_device_buffer(host_ptr))
|
|
{
|
|
if (bssn_gpu_stage_upload_buffer(host_ptr, n))
|
|
{
|
|
cerr << "GPU sync refresh upload failure: lev=" << lev
|
|
<< " var=" << var_list->data->name
|
|
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
|
|
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
|
|
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
|
ERROR = 1;
|
|
break;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
for (size_t i = 0; i < local_segments.size(); ++i)
|
|
{
|
|
Parallel::gridseg *seg = local_segments[i];
|
|
if (bssn_gpu_stage_upload_region(host_ptr,
|
|
cg->shape,
|
|
cg->bbox,
|
|
cg->bbox + dim,
|
|
seg->shape,
|
|
seg->llb))
|
|
{
|
|
cerr << "GPU sync region refresh failure: lev=" << lev
|
|
<< " var=" << var_list->data->name
|
|
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
|
|
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
|
|
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
|
ERROR = 1;
|
|
break;
|
|
}
|
|
}
|
|
if (ERROR)
|
|
break;
|
|
}
|
|
var_list = var_list->next;
|
|
}
|
|
};
|
|
|
|
auto refresh_stage_device_after_sync =
|
|
[&](MyList<var> *var_list, Parallel::SyncCache &cache) {
|
|
MyList<Patch> *patch_it = GH->PatL[lev];
|
|
while (patch_it)
|
|
{
|
|
MyList<Block> *block_it = patch_it->data->blb;
|
|
while (block_it)
|
|
{
|
|
Block *cg = block_it->data;
|
|
if (myrank == cg->rank)
|
|
refresh_synced_device_regions(cg, var_list, cache);
|
|
|
|
if (block_it == patch_it->data->ble)
|
|
break;
|
|
block_it = block_it->next;
|
|
}
|
|
if (ERROR)
|
|
break;
|
|
patch_it = patch_it->next;
|
|
}
|
|
};
|
|
|
|
auto refresh_stage_host_before_sync =
|
|
[&](MyList<var> *var_list, Parallel::SyncCache &cache) -> bool {
|
|
if (!cache.valid || !cache.combined_src || myrank < 0 || myrank >= cache.cpusize)
|
|
return false;
|
|
|
|
MyList<Patch> *patch_it = GH->PatL[lev];
|
|
while (patch_it)
|
|
{
|
|
MyList<Block> *block_it = patch_it->data->blb;
|
|
while (block_it)
|
|
{
|
|
Block *cg = block_it->data;
|
|
if (myrank == cg->rank)
|
|
{
|
|
std::vector<Parallel::gridseg *> local_segments;
|
|
MyList<Parallel::gridseg> *seg = cache.combined_src[myrank];
|
|
while (seg)
|
|
{
|
|
if (seg->data && seg->data->Bg == cg)
|
|
local_segments.push_back(seg->data);
|
|
seg = seg->next;
|
|
}
|
|
|
|
if (!local_segments.empty())
|
|
{
|
|
MyList<var> *var_it = var_list;
|
|
while (var_it)
|
|
{
|
|
double *host_ptr = cg->fgfs[var_it->data->sgfn];
|
|
for (size_t i = 0; i < local_segments.size(); ++i)
|
|
{
|
|
Parallel::gridseg *src_seg = local_segments[i];
|
|
if (bssn_gpu_stage_download_region(host_ptr,
|
|
cg->shape,
|
|
cg->bbox,
|
|
cg->bbox + dim,
|
|
src_seg->shape,
|
|
src_seg->llb))
|
|
{
|
|
cerr << "GPU sync region download failure: lev=" << lev
|
|
<< " var=" << var_it->data->name
|
|
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
|
|
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
|
|
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
|
ERROR = 1;
|
|
return true;
|
|
}
|
|
}
|
|
var_it = var_it->next;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (block_it == patch_it->data->ble)
|
|
break;
|
|
block_it = block_it->next;
|
|
}
|
|
patch_it = patch_it->next;
|
|
}
|
|
|
|
return true;
|
|
};
|
|
|
|
auto can_pack_sync_from_device =
|
|
[&](MyList<var> *var_list, Parallel::SyncCache &cache) -> bool {
|
|
if (!cache.valid || !cache.combined_src || myrank < 0 || myrank >= cache.cpusize)
|
|
return false;
|
|
|
|
MyList<Parallel::gridseg> *seg = cache.combined_src[myrank];
|
|
while (seg)
|
|
{
|
|
MyList<var> *var_it = var_list;
|
|
while (var_it)
|
|
{
|
|
if (!bssn_gpu_find_device_buffer(seg->data->Bg->fgfs[var_it->data->sgfn]))
|
|
return false;
|
|
var_it = var_it->next;
|
|
}
|
|
seg = seg->next;
|
|
}
|
|
return true;
|
|
};
|
|
|
|
MyList<Patch> *Pp = GH->PatL[lev];
|
|
while (Pp)
|
|
{
|
|
MyList<Block> *BP = Pp->data->blb;
|
|
while (BP)
|
|
{
|
|
Block *cg = BP->data;
|
|
if (myrank == cg->rank)
|
|
{
|
|
double t0 = 0.0;
|
|
if (profile_enabled)
|
|
t0 = MPI_Wtime();
|
|
if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_FIRST_TIME))
|
|
ERROR = 1;
|
|
if (profile_enabled)
|
|
stage_profile_add(lev, STAGE_PROFILE_RHS, MPI_Wtime() - t0);
|
|
|
|
if (profile_enabled)
|
|
t0 = MPI_Wtime();
|
|
run_stage_on_block(cg, Pp->data, StateList, StateList, SynchList_pre, RHSList, iter_count);
|
|
if (profile_enabled)
|
|
stage_profile_add(lev, STAGE_PROFILE_RUN_STAGE, MPI_Wtime() - t0);
|
|
|
|
if (profile_enabled)
|
|
t0 = MPI_Wtime();
|
|
if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi->sgfn], chitiny, false))
|
|
{
|
|
cerr << "GPU lowerbound failure: lev=" << lev
|
|
<< " rk_stage=" << iter_count
|
|
<< " var=" << phi->name
|
|
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
|
|
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
|
|
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
|
ERROR = 1;
|
|
}
|
|
if (profile_enabled)
|
|
stage_profile_add(lev, STAGE_PROFILE_LOWERBOUND, MPI_Wtime() - t0);
|
|
}
|
|
if (BP == Pp->data->ble)
|
|
break;
|
|
BP = BP->next;
|
|
}
|
|
Pp = Pp->next;
|
|
}
|
|
|
|
if (!ERROR)
|
|
{
|
|
if (!keep_stage_sync_on_device)
|
|
{
|
|
double t0 = 0.0;
|
|
if (profile_enabled)
|
|
t0 = MPI_Wtime();
|
|
stage_download_patch_list(SynchList_pre, false);
|
|
if (profile_enabled)
|
|
stage_profile_add(lev, STAGE_PROFILE_DOWNLOAD, MPI_Wtime() - t0);
|
|
if (!ERROR)
|
|
{
|
|
if (profile_enabled)
|
|
t0 = MPI_Wtime();
|
|
bssn_gpu_clear_cached_device_buffers();
|
|
if (profile_enabled)
|
|
stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0);
|
|
}
|
|
}
|
|
}
|
|
|
|
MPI_Request err_req_pre;
|
|
{
|
|
int erh = ERROR;
|
|
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_pre);
|
|
}
|
|
|
|
Parallel::AsyncSyncState async_pre;
|
|
if (profile_enabled)
|
|
{
|
|
const double t0 = MPI_Wtime();
|
|
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
|
|
stage_profile_add(lev, STAGE_PROFILE_SYNC_START, MPI_Wtime() - t0);
|
|
}
|
|
else
|
|
Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre);
|
|
if (profile_enabled)
|
|
{
|
|
const double t0 = MPI_Wtime();
|
|
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry,
|
|
!keep_stage_sync_on_device);
|
|
stage_profile_add(lev, STAGE_PROFILE_SYNC_FINISH, MPI_Wtime() - t0);
|
|
}
|
|
else
|
|
Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry,
|
|
!keep_stage_sync_on_device);
|
|
if (!ERROR && !keep_stage_sync_on_device)
|
|
{
|
|
if (profile_enabled)
|
|
{
|
|
const double t0 = MPI_Wtime();
|
|
refresh_stage_device_after_sync(SynchList_pre, sync_cache_pre[lev]);
|
|
stage_profile_add(lev, STAGE_PROFILE_REFRESH, MPI_Wtime() - t0);
|
|
}
|
|
else
|
|
refresh_stage_device_after_sync(SynchList_pre, sync_cache_pre[lev]);
|
|
}
|
|
|
|
MPI_Wait(&err_req_pre, MPI_STATUS_IGNORE);
|
|
if (ERROR)
|
|
{
|
|
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT_lev);
|
|
if (myrank == 0)
|
|
{
|
|
if (ErrorMonitor->outfile)
|
|
ErrorMonitor->outfile << "find NaN in state variables at t = " << PhysTime
|
|
<< ", lev = " << lev << endl;
|
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
|
}
|
|
}
|
|
|
|
#if (MAPBH == 0)
|
|
if (BH_num > 0 && lev == GH->levels - 1)
|
|
{
|
|
compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
|
|
for (int ithBH = 0; ithBH < BH_num; ithBH++)
|
|
{
|
|
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count);
|
|
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg[ithBH][1], Porg_rhs[ithBH][1], iter_count);
|
|
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg[ithBH][2], Porg_rhs[ithBH][2], iter_count);
|
|
if (Symmetry > 0)
|
|
Porg[ithBH][2] = fabs(Porg[ithBH][2]);
|
|
if (Symmetry == 2)
|
|
{
|
|
Porg[ithBH][0] = fabs(Porg[ithBH][0]);
|
|
Porg[ithBH][1] = fabs(Porg[ithBH][1]);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (lev == a_lev)
|
|
AnalysisStuff(lev, dT_lev);
|
|
#endif
|
|
|
|
for (iter_count = 1; iter_count < 4; iter_count++)
|
|
{
|
|
if (iter_count == 1 || iter_count == 3)
|
|
TRK4 += dT_lev / 2;
|
|
|
|
Pp = GH->PatL[lev];
|
|
while (Pp)
|
|
{
|
|
MyList<Block> *BP = Pp->data->blb;
|
|
while (BP)
|
|
{
|
|
Block *cg = BP->data;
|
|
if (myrank == cg->rank)
|
|
{
|
|
double t0 = 0.0;
|
|
if (profile_enabled)
|
|
t0 = MPI_Wtime();
|
|
ensure_stage_device_var_list(cg, SynchList_pre);
|
|
if (profile_enabled)
|
|
stage_profile_add(lev, STAGE_PROFILE_ENSURE, MPI_Wtime() - t0);
|
|
|
|
if (profile_enabled)
|
|
t0 = MPI_Wtime();
|
|
if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_THEN))
|
|
ERROR = 1;
|
|
if (profile_enabled)
|
|
stage_profile_add(lev, STAGE_PROFILE_RHS, MPI_Wtime() - t0);
|
|
|
|
if (profile_enabled)
|
|
t0 = MPI_Wtime();
|
|
run_stage_on_block(cg, Pp->data, StateList, SynchList_pre, SynchList_cor, RHSList, iter_count);
|
|
if (profile_enabled)
|
|
stage_profile_add(lev, STAGE_PROFILE_RUN_STAGE, MPI_Wtime() - t0);
|
|
|
|
if (profile_enabled)
|
|
t0 = MPI_Wtime();
|
|
if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi1->sgfn], chitiny, false))
|
|
{
|
|
cerr << "GPU lowerbound failure: lev=" << lev
|
|
<< " rk_stage=" << iter_count
|
|
<< " var=" << phi1->name
|
|
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
|
|
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
|
|
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
|
ERROR = 1;
|
|
}
|
|
if (profile_enabled)
|
|
stage_profile_add(lev, STAGE_PROFILE_LOWERBOUND, MPI_Wtime() - t0);
|
|
}
|
|
|
|
if (BP == Pp->data->ble)
|
|
break;
|
|
BP = BP->next;
|
|
}
|
|
Pp = Pp->next;
|
|
}
|
|
|
|
if (!ERROR)
|
|
{
|
|
if (!keep_stage_sync_on_device)
|
|
{
|
|
double t0 = 0.0;
|
|
if (profile_enabled)
|
|
t0 = MPI_Wtime();
|
|
stage_download_patch_list(SynchList_cor, false);
|
|
if (profile_enabled)
|
|
stage_profile_add(lev, STAGE_PROFILE_DOWNLOAD, MPI_Wtime() - t0);
|
|
if (!ERROR)
|
|
{
|
|
if (profile_enabled)
|
|
t0 = MPI_Wtime();
|
|
bssn_gpu_clear_cached_device_buffers();
|
|
if (profile_enabled)
|
|
stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0);
|
|
}
|
|
}
|
|
}
|
|
|
|
MPI_Request err_req_cor;
|
|
{
|
|
int erh = ERROR;
|
|
MPI_Iallreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD, &err_req_cor);
|
|
}
|
|
|
|
Parallel::AsyncSyncState async_cor;
|
|
if (profile_enabled)
|
|
{
|
|
const double t0 = MPI_Wtime();
|
|
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
|
|
stage_profile_add(lev, STAGE_PROFILE_SYNC_START, MPI_Wtime() - t0);
|
|
}
|
|
else
|
|
Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor);
|
|
if (profile_enabled)
|
|
{
|
|
const double t0 = MPI_Wtime();
|
|
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry,
|
|
!keep_stage_sync_on_device);
|
|
stage_profile_add(lev, STAGE_PROFILE_SYNC_FINISH, MPI_Wtime() - t0);
|
|
}
|
|
else
|
|
Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry,
|
|
!keep_stage_sync_on_device);
|
|
if (!ERROR && !keep_stage_sync_on_device && iter_count < 3)
|
|
{
|
|
if (profile_enabled)
|
|
{
|
|
const double t0 = MPI_Wtime();
|
|
refresh_stage_device_after_sync(SynchList_cor, sync_cache_cor[lev]);
|
|
stage_profile_add(lev, STAGE_PROFILE_REFRESH, MPI_Wtime() - t0);
|
|
}
|
|
else
|
|
refresh_stage_device_after_sync(SynchList_cor, sync_cache_cor[lev]);
|
|
}
|
|
|
|
MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);
|
|
if (ERROR)
|
|
{
|
|
Parallel::Dump_Data(GH->PatL[lev], SynchList_pre, 0, PhysTime, dT_lev);
|
|
if (myrank == 0)
|
|
{
|
|
if (ErrorMonitor->outfile)
|
|
ErrorMonitor->outfile << "find NaN in RK4 substep#" << iter_count
|
|
<< " variables at t = " << PhysTime
|
|
<< ", lev = " << lev << endl;
|
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
|
}
|
|
}
|
|
|
|
#if (MAPBH == 0)
|
|
if (BH_num > 0 && lev == GH->levels - 1)
|
|
{
|
|
compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev);
|
|
for (int ithBH = 0; ithBH < BH_num; ithBH++)
|
|
{
|
|
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg1[ithBH][0], Porg_rhs[ithBH][0], iter_count);
|
|
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg1[ithBH][1], Porg_rhs[ithBH][1], iter_count);
|
|
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg1[ithBH][2], Porg_rhs[ithBH][2], iter_count);
|
|
if (Symmetry > 0)
|
|
Porg1[ithBH][2] = fabs(Porg1[ithBH][2]);
|
|
if (Symmetry == 2)
|
|
{
|
|
Porg1[ithBH][0] = fabs(Porg1[ithBH][0]);
|
|
Porg1[ithBH][1] = fabs(Porg1[ithBH][1]);
|
|
}
|
|
}
|
|
}
|
|
#endif
|
|
|
|
if (iter_count < 3)
|
|
{
|
|
Pp = GH->PatL[lev];
|
|
while (Pp)
|
|
{
|
|
MyList<Block> *BP = Pp->data->blb;
|
|
while (BP)
|
|
{
|
|
BP->data->swapList(SynchList_pre, SynchList_cor, myrank);
|
|
if (BP == Pp->data->ble)
|
|
break;
|
|
BP = BP->next;
|
|
}
|
|
Pp = Pp->next;
|
|
}
|
|
|
|
#if (MAPBH == 0)
|
|
if (BH_num > 0 && lev == GH->levels - 1)
|
|
{
|
|
for (int ithBH = 0; ithBH < BH_num; ithBH++)
|
|
{
|
|
Porg[ithBH][0] = Porg1[ithBH][0];
|
|
Porg[ithBH][1] = Porg1[ithBH][1];
|
|
Porg[ithBH][2] = Porg1[ithBH][2];
|
|
}
|
|
}
|
|
#endif
|
|
}
|
|
}
|
|
|
|
#if (RPS == 0)
|
|
RestrictProlong(lev, YN, BB);
|
|
#endif
|
|
|
|
Pp = GH->PatL[lev];
|
|
while (Pp)
|
|
{
|
|
MyList<Block> *BP = Pp->data->blb;
|
|
while (BP)
|
|
{
|
|
Block *cg = BP->data;
|
|
cg->swapList(StateList, SynchList_cor, myrank);
|
|
cg->swapList(OldStateList, SynchList_cor, myrank);
|
|
if (BP == Pp->data->ble)
|
|
break;
|
|
BP = BP->next;
|
|
}
|
|
Pp = Pp->next;
|
|
}
|
|
|
|
if (!ERROR && keep_stage_sync_on_device)
|
|
{
|
|
// After the swaps above, only StateList points at arrays updated during this step.
|
|
// OldStateList/SynchList_cor remain valid on host because their backing arrays were
|
|
// read-only during the RK step, and SynchList_pre is reused only as scratch later.
|
|
const double t0 = profile_enabled ? MPI_Wtime() : 0.0;
|
|
stage_download_patch_list(StateList, true);
|
|
if (profile_enabled)
|
|
stage_profile_add(lev, STAGE_PROFILE_DOWNLOAD, MPI_Wtime() - t0);
|
|
}
|
|
|
|
if (profile_enabled)
|
|
{
|
|
const double t0 = MPI_Wtime();
|
|
bssn_gpu_clear_cached_device_buffers();
|
|
stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0);
|
|
}
|
|
else
|
|
bssn_gpu_clear_cached_device_buffers();
|
|
|
|
if (BH_num > 0 && lev == GH->levels - 1)
|
|
{
|
|
for (int ithBH = 0; ithBH < BH_num; ithBH++)
|
|
{
|
|
Porg0[ithBH][0] = Porg1[ithBH][0];
|
|
Porg0[ithBH][1] = Porg1[ithBH][1];
|
|
Porg0[ithBH][2] = Porg1[ithBH][2];
|
|
}
|
|
}
|
|
|
|
if (profile_enabled)
|
|
stage_profile_add(lev, STAGE_PROFILE_TOTAL, MPI_Wtime() - step_total_begin);
|
|
}
|
|
|
|
#endif
|