diff --git a/AMSS_NCKU_source/ABE.C b/AMSS_NCKU_source/ABE.C index 83d2032..6cc6f61 100644 --- a/AMSS_NCKU_source/ABE.C +++ b/AMSS_NCKU_source/ABE.C @@ -23,7 +23,10 @@ using namespace std; #include #include "misc.h" -#include "macrodef.h" +#include "macrodef.h" +#ifdef USE_GPU +extern void bssn_cuda_dump_stage_profile(); +#endif #ifndef ABEtype #error "not define ABEtype" @@ -469,10 +472,13 @@ int main(int argc, char *argv[]) cout << endl; } - ADM->Evolve(Steps); - - if (myrank == 0) - { + ADM->Evolve(Steps); +#ifdef USE_GPU + bssn_cuda_dump_stage_profile(); +#endif + + if (myrank == 0) + { cout << endl; cout << " Total Evolve Time: " << MPI_Wtime() - End_clock << " seconds!" << endl; cout << " Total Running Time: " << MPI_Wtime() - Begin_clock << " seconds!" << endl; diff --git a/AMSS_NCKU_source/bssn_cuda_ops.cu b/AMSS_NCKU_source/bssn_cuda_ops.cu index f61ddf4..59eefd2 100644 --- a/AMSS_NCKU_source/bssn_cuda_ops.cu +++ b/AMSS_NCKU_source/bssn_cuda_ops.cu @@ -50,6 +50,12 @@ struct CachedIntBuffer size_t capacity = 0; }; +struct CachedPtrBuffer +{ + void *ptr = nullptr; + size_t capacity = 0; +}; + inline void release_buffer(CachedBuffer &buffer) { if (buffer.ptr) @@ -74,6 +80,18 @@ inline void release_buffer(CachedIntBuffer &buffer) buffer.capacity = 0; } +inline void release_buffer(CachedPtrBuffer &buffer) +{ + if (buffer.ptr) + { + cudaError_t free_err = cudaFree(buffer.ptr); + if (free_err != cudaSuccess) + report_cuda_error("cudaFree", free_err); + buffer.ptr = nullptr; + } + buffer.capacity = 0; +} + inline bool ensure_capacity(CachedBuffer &buffer, size_t bytes) { if (bytes <= buffer.capacity && buffer.ptr) @@ -124,6 +142,31 @@ inline bool ensure_capacity(CachedIntBuffer &buffer, size_t bytes) return true; } +inline bool ensure_capacity(CachedPtrBuffer &buffer, size_t bytes) +{ + if (bytes <= buffer.capacity && buffer.ptr) + return true; + + if (buffer.ptr) + { + cudaError_t free_err = cudaFree(buffer.ptr); + if (free_err != cudaSuccess) + report_cuda_error("cudaFree", free_err); + buffer.ptr = nullptr; + buffer.capacity = 0; + } + + cudaError_t err = cudaMalloc(&buffer.ptr, bytes); + if (err != cudaSuccess) + { + report_cuda_error("cudaMalloc", err); + return false; + } + + buffer.capacity = bytes; + return true; +} + struct Rk4VarCache { CachedBuffer X, Y, Z; @@ -169,6 +212,13 @@ struct InterpBatchCache InterpStencilCacheEntry stencil_entry; }; +struct Rk4BatchCache +{ + CachedPtrBuffer state0_ptrs; + CachedPtrBuffer stage_ptrs; + CachedPtrBuffer rhs_ptrs; +}; + std::unordered_map &rk4_var_cache_map() { static thread_local std::unordered_map cache_map; @@ -181,6 +231,12 @@ InterpBatchCache &interp_batch_cache() return cache; } +Rk4BatchCache &rk4_batch_cache() +{ + static thread_local Rk4BatchCache cache; + return cache; +} + inline void release_interp_stencil_cache(InterpStencilCacheEntry &entry) { release_buffer(entry.weights); @@ -791,6 +847,63 @@ __global__ void copy_physical_boundary_kernel(int nx, int ny, int nz, } } +__global__ void rk4_boundary_batch_kernel(int n, int nx, int ny, int nz, + int has_xmin, int has_ymin, int has_zmin, + int has_xmax, int has_ymax, int has_zmax, + int num_var, double dT, + const double *const *state0_list, + double *const *stage_list, + double *const *rhs_list, + int stage) +{ + const double half = 0.5; + const double one_sixth = 1.0 / 6.0; + + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x) + { + const int plane = nx * ny; + const int k = idx / plane; + const int rem = idx - k * plane; + const int j = rem / nx; + const int i = rem - j * nx; + const bool is_boundary = + (has_xmin && i == 0) || (has_xmax && i == nx - 1) || + (has_ymin && j == 0) || (has_ymax && j == ny - 1) || + (has_zmin && k == 0) || (has_zmax && k == nz - 1); + + for (int v = 0; v < num_var; ++v) + { + const double *f0 = state0_list[v]; + double *f1 = stage_list[v]; + double *rhs = rhs_list[v]; + + double out; + if (stage == 0) + { + out = f0[idx] + half * dT * rhs[idx]; + } + else if (stage == 1) + { + rhs[idx] += 2.0 * f1[idx]; + out = f0[idx] + half * dT * f1[idx]; + } + else if (stage == 2) + { + rhs[idx] += 2.0 * f1[idx]; + out = f0[idx] + dT * f1[idx]; + } + else + { + out = f0[idx] + one_sixth * dT * (f1[idx] + rhs[idx]); + } + + if (is_boundary) + out = f0[idx]; + f1[idx] = out; + } + } +} + __global__ void sommerfeld_bam_kernel(int nx, int ny, int nz, const double *X, const double *Y, const double *Z, double xmin, double ymin, double zmin, @@ -1032,6 +1145,7 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, int symmetry, int lev, int rk_stage, + bool force_host_boundary_fix, bool download_to_host) { Rk4VarCache &cache = rk4_var_cache_map()[state0]; @@ -1166,24 +1280,34 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, ok = launch_kernel(grid, block, (const void *)sommerfeld_bam_kernel, args); } - if (ok && lev == 0) + if (ok && (lev == 0 || !force_host_boundary_fix)) { double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr, *d_rhs = cache.rhs.ptr; void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage}; ok = launch_kernel(grid, block, (const void *)rk4_kernel, args); } - if (ok && lev > 0) + if (ok && lev > 0 && !force_host_boundary_fix) + { + double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr; + void *args[] = {&nx, &ny, &nz, + &has_xmin, &has_ymin, &has_zmin, + &has_xmax, &has_ymax, &has_zmax, + &d_state0, &d_stage}; + ok = launch_kernel(grid, block, (const void *)copy_physical_boundary_kernel, args); + } + + if (ok && lev > 0 && force_host_boundary_fix) { double *host_state0 = const_cast(state0); double *host_phi = const_cast(phi_field); double *host_lap = const_cast(lap_field); double *host_rhs = rhs_accum; - ok = sync_host_from_mapped_device(host_state0, n, "cudaMemcpy(D2H) state0") && - sync_host_from_mapped_device(host_phi, n, "cudaMemcpy(D2H) phi_field") && - sync_host_from_mapped_device(host_lap, n, "cudaMemcpy(D2H) lap_field") && - sync_host_from_mapped_device(host_rhs, n, "cudaMemcpy(D2H) rhs_accum"); + // state0/phi/lap are read-only during the current RK step, so the host copies + // remain valid even if cached device mirrors exist. Only the RHS accumulator + // is updated on device and must be synchronized back for the CPU fallback. + ok = sync_host_from_mapped_device(host_rhs, n, "cudaMemcpy(D2H) rhs_accum"); if (ok) { bssn_gpu_prepare_host_buffer(stage_data, n); @@ -1232,6 +1356,176 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, return ok ? 0 : 1; } +int bssn_cuda_rk4_boundary_batch(int *ex, double dT, + const double *X, const double *Y, const double *Z, + double xmin, double ymin, double zmin, + double xmax, double ymax, double zmax, + int symmetry, + const double *const *state0_list, + double *const *stage_data_list, + double *const *rhs_accum_list, + int num_var, + int rk_stage, + bool download_to_host) +{ + if (!state0_list || !stage_data_list || !rhs_accum_list || num_var <= 0) + return 1; + + const int nx = ex[0]; + const int ny = ex[1]; + const int nz = ex[2]; + const int n = count_points(ex); + const size_t bytes = static_cast(n) * sizeof(double); + const size_t ptr_bytes = static_cast(num_var) * sizeof(double *); + dim3 block(256); + dim3 grid(div_up(n, static_cast(block.x))); + + std::vector host_state0_ptrs(num_var); + std::vector host_stage_ptrs(num_var); + std::vector host_rhs_ptrs(num_var); + + bool ok = true; + for (int v = 0; v < num_var && ok; ++v) + { + const double *state0 = state0_list[v]; + double *stage_data = stage_data_list[v]; + double *rhs_accum = rhs_accum_list[v]; + Rk4VarCache &cache = rk4_var_cache_map()[state0]; + + const bool refresh_state0 = + (rk_stage == 0) || cache.host_state0 != state0 || cache.nx != nx || cache.ny != ny || cache.nz != nz; + const bool refresh_rhs = + (rk_stage == 0) || !cache.rhs_resident || cache.host_rhs != rhs_accum; + const bool need_stage_input = (rk_stage != 0); + double *stage_ptr = nullptr; + const double *mapped_state0_ptr = refresh_state0 ? bssn_gpu_find_device_buffer(state0) : cache.state0.ptr; + const double *mapped_stage_ptr = need_stage_input ? bssn_gpu_find_device_buffer(stage_data) : nullptr; + const double *mapped_rhs_ptr = refresh_rhs ? bssn_gpu_find_device_buffer(rhs_accum) : cache.rhs.ptr; + + if (refresh_state0 && !mapped_state0_ptr) + bssn_gpu_prepare_host_buffer(state0, n); + if (need_stage_input && !mapped_stage_ptr) + bssn_gpu_prepare_host_buffer(stage_data, n); + if (refresh_rhs && !mapped_rhs_ptr) + bssn_gpu_prepare_host_buffer(rhs_accum, n); + + ok = (!refresh_state0 || copy_to_device_preferring_device(cache.state0, state0, bytes)) && + (!refresh_rhs || copy_to_device_preferring_device(cache.rhs, rhs_accum, bytes)); + if (!ok) + break; + + if (need_stage_input) + { + if (mapped_stage_ptr) + { + stage_ptr = const_cast(mapped_stage_ptr); + } + else + { + ok = copy_to_device_preferring_device(cache.stage, stage_data, bytes); + stage_ptr = cache.stage.ptr; + } + } + else + { + ok = ensure_capacity(cache.stage, bytes); + stage_ptr = cache.stage.ptr; + } + if (!ok) + break; + + if (refresh_state0) + { + cache.host_state0 = state0; + cache.nx = nx; + cache.ny = ny; + cache.nz = nz; + bssn_gpu_register_device_buffer(state0, cache.state0.ptr); + } + if (refresh_rhs) + { + cache.host_rhs = rhs_accum; + cache.rhs_resident = true; + bssn_gpu_register_device_buffer(rhs_accum, cache.rhs.ptr); + } + + host_state0_ptrs[v] = cache.state0.ptr; + host_stage_ptrs[v] = stage_ptr; + host_rhs_ptrs[v] = cache.rhs.ptr; + } + + if (!ok) + return 1; + + Rk4BatchCache &batch_cache = rk4_batch_cache(); + ok = ensure_capacity(batch_cache.state0_ptrs, ptr_bytes) && + ensure_capacity(batch_cache.stage_ptrs, ptr_bytes) && + ensure_capacity(batch_cache.rhs_ptrs, ptr_bytes); + if (!ok) + return 1; + + cudaError_t err = cudaMemcpy(batch_cache.state0_ptrs.ptr, &host_state0_ptrs[0], ptr_bytes, cudaMemcpyHostToDevice); + if (err != cudaSuccess) + { + report_cuda_error("cudaMemcpy(H2D) batch state0 ptrs", err); + return 1; + } + err = cudaMemcpy(batch_cache.stage_ptrs.ptr, &host_stage_ptrs[0], ptr_bytes, cudaMemcpyHostToDevice); + if (err != cudaSuccess) + { + report_cuda_error("cudaMemcpy(H2D) batch stage ptrs", err); + return 1; + } + err = cudaMemcpy(batch_cache.rhs_ptrs.ptr, &host_rhs_ptrs[0], ptr_bytes, cudaMemcpyHostToDevice); + if (err != cudaSuccess) + { + report_cuda_error("cudaMemcpy(H2D) batch rhs ptrs", err); + return 1; + } + + double dX = X[1] - X[0]; + double dY = Y[1] - Y[0]; + double dZ = Z[1] - Z[0]; + const int no_symm = 0, octant = 2; + int has_xmax = (std::fabs(X[nx - 1] - xmax) < dX); + int has_ymax = (std::fabs(Y[ny - 1] - ymax) < dY); + int has_zmax = (std::fabs(Z[nz - 1] - zmax) < dZ); + int has_xmin = (std::fabs(X[0] - xmin) < dX) && !(symmetry == octant && std::fabs(xmin) < dX / 2.0); + int has_ymin = (std::fabs(Y[0] - ymin) < dY) && !(symmetry == octant && std::fabs(ymin) < dY / 2.0); + int has_zmin = (std::fabs(Z[0] - zmin) < dZ) && !(symmetry > no_symm && std::fabs(zmin) < dZ / 2.0); + + int n_arg = n, nx_arg = nx, ny_arg = ny, nz_arg = nz; + int num_var_arg = num_var, rk_stage_arg = rk_stage; + void *args[] = {&n_arg, &nx_arg, &ny_arg, &nz_arg, + &has_xmin, &has_ymin, &has_zmin, + &has_xmax, &has_ymax, &has_zmax, + &num_var_arg, &dT, + &batch_cache.state0_ptrs.ptr, + &batch_cache.stage_ptrs.ptr, + &batch_cache.rhs_ptrs.ptr, + &rk_stage_arg}; + ok = launch_kernel(grid, block, (const void *)rk4_boundary_batch_kernel, args); + + if (!ok) + return 1; + + for (int v = 0; v < num_var; ++v) + { + bssn_gpu_register_device_buffer(stage_data_list[v], host_stage_ptrs[v]); + if (download_to_host) + { + err = cudaMemcpy(stage_data_list[v], host_stage_ptrs[v], bytes, cudaMemcpyDeviceToHost); + if (err != cudaSuccess) + { + report_cuda_error("cudaMemcpy(D2H) batch stage_data", err); + return 1; + } + } + } + + return 0; +} + void bssn_cuda_release_rk4_caches() { std::unordered_map &cache_map = rk4_var_cache_map(); @@ -1248,6 +1542,9 @@ void bssn_cuda_release_rk4_caches() release_buffer(cache.rhs); } cache_map.clear(); + release_buffer(rk4_batch_cache().state0_ptrs); + release_buffer(rk4_batch_cache().stage_ptrs); + release_buffer(rk4_batch_cache().rhs_ptrs); } void bssn_cuda_release_interp_caches() diff --git a/AMSS_NCKU_source/bssn_cuda_ops.h b/AMSS_NCKU_source/bssn_cuda_ops.h index 5d86f18..993f418 100644 --- a/AMSS_NCKU_source/bssn_cuda_ops.h +++ b/AMSS_NCKU_source/bssn_cuda_ops.h @@ -22,8 +22,21 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, int symmetry, int lev, int rk_stage, + bool force_host_boundary_fix, bool download_to_host = true); +int bssn_cuda_rk4_boundary_batch(int *ex, double dT, + const double *X, const double *Y, const double *Z, + double xmin, double ymin, double zmin, + double xmax, double ymax, double zmax, + int symmetry, + const double *const *state0_list, + double *const *stage_data_list, + double *const *rhs_accum_list, + int num_var, + int rk_stage, + bool download_to_host = false); + int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_host = true); int bssn_cuda_download_buffer(int *ex, double *host_ptr); void bssn_cuda_release_rk4_caches(); diff --git a/AMSS_NCKU_source/bssn_cuda_step.C b/AMSS_NCKU_source/bssn_cuda_step.C index 6ee1fec..37a9469 100644 --- a/AMSS_NCKU_source/bssn_cuda_step.C +++ b/AMSS_NCKU_source/bssn_cuda_step.C @@ -4,7 +4,9 @@ #include #include +#include #include +#include #include #include "bssn_class.h" @@ -12,18 +14,172 @@ #include "bssn_gpu.h" #include "bssn_macro.h" +namespace +{ +enum StageProfileMetric +{ + STAGE_PROFILE_TOTAL = 0, + STAGE_PROFILE_RHS, + STAGE_PROFILE_RUN_STAGE, + STAGE_PROFILE_RUN_STAGE_DEVICE, + STAGE_PROFILE_RUN_STAGE_HOST_FIX, + STAGE_PROFILE_LOWERBOUND, + STAGE_PROFILE_ENSURE, + STAGE_PROFILE_DOWNLOAD, + STAGE_PROFILE_CLEAR_CACHE, + STAGE_PROFILE_SYNC_START, + STAGE_PROFILE_SYNC_FINISH, + STAGE_PROFILE_REFRESH, + STAGE_PROFILE_COUNT +}; + +static const int kStageProfileMaxLevels = 32; + +struct StageProfileStore +{ + bool env_checked; + bool enabled; + int calls[kStageProfileMaxLevels]; + double metric[kStageProfileMaxLevels][STAGE_PROFILE_COUNT]; +}; + +StageProfileStore &stage_profile_store() +{ + static StageProfileStore store = {}; + return store; +} + +bool stage_profile_enabled() +{ + StageProfileStore &store = stage_profile_store(); + if (!store.env_checked) + { + const char *env = getenv("AMSS_GPU_STAGE_TIMING"); + store.enabled = (env && env[0] && strcmp(env, "0") != 0); + store.env_checked = true; + } + return store.enabled; +} + +void stage_profile_note_call(int lev) +{ + if (lev >= 0 && lev < kStageProfileMaxLevels) + stage_profile_store().calls[lev]++; +} + +void stage_profile_add(int lev, StageProfileMetric metric, double seconds) +{ + if (lev >= 0 && lev < kStageProfileMaxLevels) + stage_profile_store().metric[lev][metric] += seconds; +} + +const char *stage_profile_metric_name(StageProfileMetric metric) +{ + switch (metric) + { + case STAGE_PROFILE_TOTAL: + return "total"; + case STAGE_PROFILE_RHS: + return "rhs"; + case STAGE_PROFILE_RUN_STAGE: + return "run_stage"; + case STAGE_PROFILE_RUN_STAGE_DEVICE: + return "run_stage_dev"; + case STAGE_PROFILE_RUN_STAGE_HOST_FIX: + return "run_stage_host"; + case STAGE_PROFILE_LOWERBOUND: + return "lower"; + case STAGE_PROFILE_ENSURE: + return "ensure"; + case STAGE_PROFILE_DOWNLOAD: + return "download"; + case STAGE_PROFILE_CLEAR_CACHE: + return "clear_cache"; + case STAGE_PROFILE_SYNC_START: + return "sync_start"; + case STAGE_PROFILE_SYNC_FINISH: + return "sync_finish"; + case STAGE_PROFILE_REFRESH: + return "refresh"; + default: + return "unknown"; + } +} +} // namespace + +void bssn_cuda_dump_stage_profile() +{ + if (!stage_profile_enabled()) + return; + + int myrank = 0; + MPI_Comm_rank(MPI_COMM_WORLD, &myrank); + + StageProfileStore &store = stage_profile_store(); + int global_calls_sum[kStageProfileMaxLevels] = {}; + double global_metric_sum[kStageProfileMaxLevels][STAGE_PROFILE_COUNT] = {}; + double global_metric_max[kStageProfileMaxLevels][STAGE_PROFILE_COUNT] = {}; + + MPI_Reduce(store.calls, global_calls_sum, kStageProfileMaxLevels, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD); + MPI_Reduce(store.metric[0], global_metric_sum[0], + kStageProfileMaxLevels * STAGE_PROFILE_COUNT, + MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + MPI_Reduce(store.metric[0], global_metric_max[0], + kStageProfileMaxLevels * STAGE_PROFILE_COUNT, + MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + + if (myrank != 0) + return; + + cout << endl; + cout << " GPU stage timing summary (sum/max over MPI ranks) " << endl; + cout << " lev calls"; + for (int metric = 0; metric < STAGE_PROFILE_COUNT; ++metric) + cout << " " << setw(22) << stage_profile_metric_name(static_cast(metric)); + cout << endl; + + for (int lev = 0; lev < kStageProfileMaxLevels; ++lev) + { + if (global_calls_sum[lev] == 0) + continue; + + cout << setw(4) << lev << " " << setw(5) << global_calls_sum[lev]; + for (int metric = 0; metric < STAGE_PROFILE_COUNT; ++metric) + { + cout << " " + << setw(10) << setprecision(6) << fixed << global_metric_sum[lev][metric] + << "/" + << setw(10) << setprecision(6) << fixed << global_metric_max[lev][metric]; + } + cout << endl; + } + cout << endl; +} + void bssn_class::Step_MainPath_GPU(int lev, int YN) { #ifdef WithShell #error "Step_MainPath_GPU currently supports Patch grids only." #endif + const bool profile_enabled = stage_profile_enabled(); + const double step_total_begin = profile_enabled ? MPI_Wtime() : 0.0; + if (profile_enabled) + stage_profile_note_call(lev); + if (bssn_gpu_bind_process_device(myrank)) { cerr << "GPU device bind failure on MPI rank " << myrank << endl; MPI_Abort(MPI_COMM_WORLD, 1); } - bssn_gpu_clear_cached_device_buffers(); + if (profile_enabled) + { + const double t0 = MPI_Wtime(); + bssn_gpu_clear_cached_device_buffers(); + stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0); + } + else + bssn_gpu_clear_cached_device_buffers(); setpbh(BH_num, Porg0, Mass, BH_num_input); @@ -62,6 +218,7 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) int iter_count = 0; int pre = 0, cor = 1; int ERROR = 0; + const bool keep_stage_sync_on_device = (RPS == 1) && (MAPBH == 1) && (REGLEV == 0); auto run_stage_on_block = [&](Block *cg, Patch *patch, MyList *state0_list, @@ -71,9 +228,27 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) MyList *varlb = boundary_src_list; MyList *varls = stage_data_list; MyList *varlr = rhs_list; + std::vector batch_state0; + std::vector batch_stage; + std::vector batch_rhs; while (varl0) { + const bool force_host_boundary_fix = false; + const bool can_batch_device_path = (lev > 0) && !force_host_boundary_fix; + if (can_batch_device_path) + { + batch_state0.push_back(cg->fgfs[varl0->data->sgfn]); + batch_stage.push_back(cg->fgfs[varls->data->sgfn]); + batch_rhs.push_back(cg->fgfs[varlr->data->sgfn]); + varl0 = varl0->next; + varlb = varlb->next; + varls = varls->next; + varlr = varlr->next; + continue; + } + + const double var_begin = profile_enabled ? MPI_Wtime() : 0.0; if (bssn_cuda_rk4_boundary_var(cg->shape, dT_lev, cg->X[0], cg->X[1], cg->X[2], patch->bbox[0], patch->bbox[1], patch->bbox[2], @@ -86,7 +261,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) cg->fgfs[varlr->data->sgfn], varl0->data->propspeed, varl0->data->SoA, - Symmetry, lev, rk_stage, false)) + Symmetry, lev, rk_stage, + force_host_boundary_fix, false)) { cerr << "GPU rk4/boundary failure: lev=" << lev << " rk_stage=" << rk_stage @@ -97,18 +273,59 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) ERROR = 1; break; } + if (profile_enabled) + { + stage_profile_add(lev, + force_host_boundary_fix ? STAGE_PROFILE_RUN_STAGE_HOST_FIX + : STAGE_PROFILE_RUN_STAGE_DEVICE, + MPI_Wtime() - var_begin); + } varl0 = varl0->next; varlb = varlb->next; varls = varls->next; varlr = varlr->next; } + + if (!ERROR && !batch_state0.empty()) + { + const double batch_begin = profile_enabled ? MPI_Wtime() : 0.0; + if (bssn_cuda_rk4_boundary_batch(cg->shape, dT_lev, + cg->X[0], cg->X[1], cg->X[2], + patch->bbox[0], patch->bbox[1], patch->bbox[2], + patch->bbox[3], patch->bbox[4], patch->bbox[5], + Symmetry, + &batch_state0[0], + &batch_stage[0], + &batch_rhs[0], + static_cast(batch_state0.size()), + rk_stage, false)) + { + cerr << "GPU rk4/boundary batch failure: lev=" << lev + << " rk_stage=" << rk_stage + << " vars=" << batch_state0.size() + << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << "," + << cg->bbox[1] << ":" << cg->bbox[4] << "," + << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; + ERROR = 1; + } + else if (profile_enabled) + { + stage_profile_add(lev, STAGE_PROFILE_RUN_STAGE_DEVICE, MPI_Wtime() - batch_begin); + } + } }; auto stage_download_var_list = - [&](Block *cg, MyList *var_list) { + [&](Block *cg, MyList *var_list, bool skip_unmapped) { while (var_list) { - if (bssn_cuda_download_buffer(cg->shape, cg->fgfs[var_list->data->sgfn])) + double *host_ptr = cg->fgfs[var_list->data->sgfn]; + if (skip_unmapped && !bssn_gpu_find_device_buffer(host_ptr)) + { + var_list = var_list->next; + continue; + } + if (bssn_cuda_download_buffer(cg->shape, host_ptr)) { cerr << "GPU stage download failure: lev=" << lev << " var=" << var_list->data->name @@ -123,7 +340,7 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) }; auto stage_download_patch_list = - [&](MyList *var_list) { + [&](MyList *var_list, bool skip_unmapped) { MyList *patch_it = GH->PatL[lev]; while (patch_it) { @@ -132,7 +349,7 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) { Block *cg = block_it->data; if (myrank == cg->rank) - stage_download_var_list(cg, var_list); + stage_download_var_list(cg, var_list, skip_unmapped); if (block_it == patch_it->data->ble) break; @@ -341,11 +558,22 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) Block *cg = BP->data; if (myrank == cg->rank) { + double t0 = 0.0; + if (profile_enabled) + t0 = MPI_Wtime(); if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_FIRST_TIME)) ERROR = 1; + if (profile_enabled) + stage_profile_add(lev, STAGE_PROFILE_RHS, MPI_Wtime() - t0); + if (profile_enabled) + t0 = MPI_Wtime(); run_stage_on_block(cg, Pp->data, StateList, StateList, SynchList_pre, RHSList, iter_count); + if (profile_enabled) + stage_profile_add(lev, STAGE_PROFILE_RUN_STAGE, MPI_Wtime() - t0); + if (profile_enabled) + t0 = MPI_Wtime(); if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi->sgfn], chitiny, false)) { cerr << "GPU lowerbound failure: lev=" << lev @@ -356,6 +584,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; ERROR = 1; } + if (profile_enabled) + stage_profile_add(lev, STAGE_PROFILE_LOWERBOUND, MPI_Wtime() - t0); } if (BP == Pp->data->ble) break; @@ -366,9 +596,23 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) if (!ERROR) { - stage_download_patch_list(SynchList_pre); - if (!ERROR) - bssn_gpu_clear_cached_device_buffers(); + if (!keep_stage_sync_on_device) + { + double t0 = 0.0; + if (profile_enabled) + t0 = MPI_Wtime(); + stage_download_patch_list(SynchList_pre, false); + if (profile_enabled) + stage_profile_add(lev, STAGE_PROFILE_DOWNLOAD, MPI_Wtime() - t0); + if (!ERROR) + { + if (profile_enabled) + t0 = MPI_Wtime(); + bssn_gpu_clear_cached_device_buffers(); + if (profile_enabled) + stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0); + } + } } MPI_Request err_req_pre; @@ -378,10 +622,35 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) } Parallel::AsyncSyncState async_pre; - Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre); - Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry, true); - if (!ERROR) - refresh_stage_device_after_sync(SynchList_pre, sync_cache_pre[lev]); + if (profile_enabled) + { + const double t0 = MPI_Wtime(); + Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre); + stage_profile_add(lev, STAGE_PROFILE_SYNC_START, MPI_Wtime() - t0); + } + else + Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre); + if (profile_enabled) + { + const double t0 = MPI_Wtime(); + Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry, + !keep_stage_sync_on_device); + stage_profile_add(lev, STAGE_PROFILE_SYNC_FINISH, MPI_Wtime() - t0); + } + else + Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry, + !keep_stage_sync_on_device); + if (!ERROR && !keep_stage_sync_on_device) + { + if (profile_enabled) + { + const double t0 = MPI_Wtime(); + refresh_stage_device_after_sync(SynchList_pre, sync_cache_pre[lev]); + stage_profile_add(lev, STAGE_PROFILE_REFRESH, MPI_Wtime() - t0); + } + else + refresh_stage_device_after_sync(SynchList_pre, sync_cache_pre[lev]); + } MPI_Wait(&err_req_pre, MPI_STATUS_IGNORE); if (ERROR) @@ -433,12 +702,28 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) Block *cg = BP->data; if (myrank == cg->rank) { + double t0 = 0.0; + if (profile_enabled) + t0 = MPI_Wtime(); ensure_stage_device_var_list(cg, SynchList_pre); + if (profile_enabled) + stage_profile_add(lev, STAGE_PROFILE_ENSURE, MPI_Wtime() - t0); + + if (profile_enabled) + t0 = MPI_Wtime(); if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_THEN)) ERROR = 1; + if (profile_enabled) + stage_profile_add(lev, STAGE_PROFILE_RHS, MPI_Wtime() - t0); + if (profile_enabled) + t0 = MPI_Wtime(); run_stage_on_block(cg, Pp->data, StateList, SynchList_pre, SynchList_cor, RHSList, iter_count); + if (profile_enabled) + stage_profile_add(lev, STAGE_PROFILE_RUN_STAGE, MPI_Wtime() - t0); + if (profile_enabled) + t0 = MPI_Wtime(); if (bssn_cuda_lowerbound(cg->shape, cg->fgfs[phi1->sgfn], chitiny, false)) { cerr << "GPU lowerbound failure: lev=" << lev @@ -449,6 +734,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; ERROR = 1; } + if (profile_enabled) + stage_profile_add(lev, STAGE_PROFILE_LOWERBOUND, MPI_Wtime() - t0); } if (BP == Pp->data->ble) @@ -460,9 +747,23 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) if (!ERROR) { - stage_download_patch_list(SynchList_cor); - if (!ERROR) - bssn_gpu_clear_cached_device_buffers(); + if (!keep_stage_sync_on_device) + { + double t0 = 0.0; + if (profile_enabled) + t0 = MPI_Wtime(); + stage_download_patch_list(SynchList_cor, false); + if (profile_enabled) + stage_profile_add(lev, STAGE_PROFILE_DOWNLOAD, MPI_Wtime() - t0); + if (!ERROR) + { + if (profile_enabled) + t0 = MPI_Wtime(); + bssn_gpu_clear_cached_device_buffers(); + if (profile_enabled) + stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0); + } + } } MPI_Request err_req_cor; @@ -472,10 +773,35 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) } Parallel::AsyncSyncState async_cor; - Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor); - Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry, true); - if (!ERROR && iter_count < 3) - refresh_stage_device_after_sync(SynchList_cor, sync_cache_cor[lev]); + if (profile_enabled) + { + const double t0 = MPI_Wtime(); + Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor); + stage_profile_add(lev, STAGE_PROFILE_SYNC_START, MPI_Wtime() - t0); + } + else + Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor); + if (profile_enabled) + { + const double t0 = MPI_Wtime(); + Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry, + !keep_stage_sync_on_device); + stage_profile_add(lev, STAGE_PROFILE_SYNC_FINISH, MPI_Wtime() - t0); + } + else + Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry, + !keep_stage_sync_on_device); + if (!ERROR && !keep_stage_sync_on_device && iter_count < 3) + { + if (profile_enabled) + { + const double t0 = MPI_Wtime(); + refresh_stage_device_after_sync(SynchList_cor, sync_cache_cor[lev]); + stage_profile_add(lev, STAGE_PROFILE_REFRESH, MPI_Wtime() - t0); + } + else + refresh_stage_device_after_sync(SynchList_cor, sync_cache_cor[lev]); + } MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE); if (ERROR) @@ -545,8 +871,6 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) RestrictProlong(lev, YN, BB); #endif - bssn_gpu_clear_cached_device_buffers(); - Pp = GH->PatL[lev]; while (Pp) { @@ -563,6 +887,28 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) Pp = Pp->next; } + if (!ERROR && keep_stage_sync_on_device) + { + MyList *final_host_lists[] = {StateList, OldStateList, SynchList_cor, SynchList_pre}; + const int final_host_list_count = sizeof(final_host_lists) / sizeof(final_host_lists[0]); + for (int list_i = 0; list_i < final_host_list_count && !ERROR; ++list_i) + { + const double t0 = profile_enabled ? MPI_Wtime() : 0.0; + stage_download_patch_list(final_host_lists[list_i], true); + if (profile_enabled) + stage_profile_add(lev, STAGE_PROFILE_DOWNLOAD, MPI_Wtime() - t0); + } + } + + if (profile_enabled) + { + const double t0 = MPI_Wtime(); + bssn_gpu_clear_cached_device_buffers(); + stage_profile_add(lev, STAGE_PROFILE_CLEAR_CACHE, MPI_Wtime() - t0); + } + else + bssn_gpu_clear_cached_device_buffers(); + if (BH_num > 0 && lev == GH->levels - 1) { for (int ithBH = 0; ithBH < BH_num; ithBH++) @@ -572,6 +918,9 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) Porg0[ithBH][2] = Porg1[ithBH][2]; } } + + if (profile_enabled) + stage_profile_add(lev, STAGE_PROFILE_TOTAL, MPI_Wtime() - step_total_begin); } #endif