From d9287ea530df4e98d9354b4485e859b96c757823 Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Sun, 12 Apr 2026 12:13:47 +0800 Subject: [PATCH] Fix GPU RK4 boundary and sync correctness --- AMSS_NCKU_source/MPatch.C | 30 +++++++++++-- AMSS_NCKU_source/bssn_cuda_ops.cu | 70 +++++++++++++++++++++++++++---- AMSS_NCKU_source/bssn_cuda_ops.h | 2 + AMSS_NCKU_source/bssn_cuda_step.C | 62 ++++++++++++++++++--------- 4 files changed, 134 insertions(+), 30 deletions(-) diff --git a/AMSS_NCKU_source/MPatch.C b/AMSS_NCKU_source/MPatch.C index 29680d8..7f7242a 100644 --- a/AMSS_NCKU_source/MPatch.C +++ b/AMSS_NCKU_source/MPatch.C @@ -83,6 +83,9 @@ struct CachedInterpPlanEntry { bool valid; InterpPlanKey key; + vector xvals; + vector yvals; + vector zvals; CachedInterpPlan plan; CachedInterpPlanEntry() : valid(false) {} @@ -286,14 +289,28 @@ CachedInterpPlanEntry &interp_plan_cache_entry() bool same_interp_plan_key(const InterpPlanKey &lhs, const InterpPlanKey &rhs) { return lhs.patch == rhs.patch && - lhs.x == rhs.x && - lhs.y == rhs.y && - lhs.z == rhs.z && lhs.NN == rhs.NN && lhs.Symmetry == rhs.Symmetry && lhs.myrank == rhs.myrank; } +bool same_interp_plan_points(const CachedInterpPlanEntry &cache, int NN, double **XX) +{ + if (static_cast(cache.xvals.size()) != NN || + static_cast(cache.yvals.size()) != NN || + static_cast(cache.zvals.size()) != NN) + return false; + + for (int j = 0; j < NN; ++j) + { + if (cache.xvals[j] != XX[0][j] || + cache.yvals[j] != XX[1][j] || + cache.zvals[j] != XX[2][j]) + return false; + } + return true; +} + CachedInterpPlan &get_cached_interp_plan(Patch *patch, int NN, double **XX, int Symmetry, int myrank, @@ -314,11 +331,15 @@ CachedInterpPlan &get_cached_interp_plan(Patch *patch, CachedInterpPlanEntry &cache = interp_plan_cache_entry(); if (cache.valid && same_interp_plan_key(cache.key, key) && + same_interp_plan_points(cache, NN, XX) && cache.plan.nblocks == static_cast(block_index.views.size())) return cache.plan; cache.valid = true; cache.key = key; + cache.xvals.assign(XX[0], XX[0] + NN); + cache.yvals.assign(XX[1], XX[1] + NN); + cache.zvals.assign(XX[2], XX[2] + NN); cache.plan = CachedInterpPlan(); CachedInterpPlan &plan = cache.plan; plan.nblocks = static_cast(block_index.views.size()); @@ -412,6 +433,9 @@ void release_interp_plan_cache_internal() { CachedInterpPlanEntry &cache = interp_plan_cache_entry(); cache.valid = false; + cache.xvals.clear(); + cache.yvals.clear(); + cache.zvals.clear(); cache.plan = CachedInterpPlan(); } diff --git a/AMSS_NCKU_source/bssn_cuda_ops.cu b/AMSS_NCKU_source/bssn_cuda_ops.cu index fa4e9cf..f61ddf4 100644 --- a/AMSS_NCKU_source/bssn_cuda_ops.cu +++ b/AMSS_NCKU_source/bssn_cuda_ops.cu @@ -1,5 +1,7 @@ #include "bssn_cuda_ops.h" #include "bssn_gpu.h" +#include "rungekutta4_rout.h" +#include "sommerfeld_rout.h" #include #include @@ -262,6 +264,23 @@ inline bool copy_to_device_preferring_device(CachedBuffer &dst, const double *sr return true; } +inline bool sync_host_from_mapped_device(double *host_ptr, int count, const char *label) +{ + const double *device_ptr = bssn_gpu_find_device_buffer(host_ptr); + if (!device_ptr) + return true; + + bssn_gpu_prepare_host_buffer(host_ptr, count); + const size_t bytes = static_cast(count) * sizeof(double); + cudaError_t err = cudaMemcpy(host_ptr, device_ptr, bytes, cudaMemcpyDeviceToHost); + if (err != cudaSuccess) + { + report_cuda_error(label, err); + return false; + } + return true; +} + inline bool copy_region_to_padded_stage(CachedBuffer &dst, const double *src, const int src_shape[3], @@ -1003,6 +1022,8 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, double xmin, double ymin, double zmin, double xmax, double ymax, double zmax, const double *state0, + const double *phi_field, + const double *lap_field, const double *boundary_src, double *stage_data, double *rhs_accum, @@ -1145,7 +1166,7 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, ok = launch_kernel(grid, block, (const void *)sommerfeld_bam_kernel, args); } - if (ok) + if (ok && lev == 0) { double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr, *d_rhs = cache.rhs.ptr; void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage}; @@ -1154,12 +1175,47 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, if (ok && lev > 0) { - double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr; - void *args[] = {&nx, &ny, &nz, - &has_xmin, &has_ymin, &has_zmin, - &has_xmax, &has_ymax, &has_zmax, - &d_state0, &d_stage}; - ok = launch_kernel(grid, block, (const void *)copy_physical_boundary_kernel, args); + double *host_state0 = const_cast(state0); + double *host_phi = const_cast(phi_field); + double *host_lap = const_cast(lap_field); + double *host_rhs = rhs_accum; + + ok = sync_host_from_mapped_device(host_state0, n, "cudaMemcpy(D2H) state0") && + sync_host_from_mapped_device(host_phi, n, "cudaMemcpy(D2H) phi_field") && + sync_host_from_mapped_device(host_lap, n, "cudaMemcpy(D2H) lap_field") && + sync_host_from_mapped_device(host_rhs, n, "cudaMemcpy(D2H) rhs_accum"); + if (ok) + { + bssn_gpu_prepare_host_buffer(stage_data, n); + cudaError_t err = cudaMemcpy(stage_data, stage_ptr, bytes, cudaMemcpyDeviceToHost); + if (err != cudaSuccess) + { + report_cuda_error("cudaMemcpy(D2H) stage_data", err); + ok = false; + } + } + + if (ok) + { + int rk_stage_host = rk_stage; + int cor = 1; + f_rungekutta4_rout(ex, dT, host_state0, stage_data, host_rhs, rk_stage_host); + f_sommerfeld_rout(ex, + const_cast(X), const_cast(Y), const_cast(Z), + xmin, ymin, zmin, xmax, ymax, zmax, + dT, + host_phi, host_lap, + host_state0, stage_data, + const_cast(SoA), + symmetry, cor); + + cudaError_t err = cudaMemcpy(stage_ptr, stage_data, bytes, cudaMemcpyHostToDevice); + if (err != cudaSuccess) + { + report_cuda_error("cudaMemcpy(H2D) stage_data sommerfeld", err); + ok = false; + } + } } if (ok) diff --git a/AMSS_NCKU_source/bssn_cuda_ops.h b/AMSS_NCKU_source/bssn_cuda_ops.h index ac2dfab..5d86f18 100644 --- a/AMSS_NCKU_source/bssn_cuda_ops.h +++ b/AMSS_NCKU_source/bssn_cuda_ops.h @@ -12,6 +12,8 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, double xmin, double ymin, double zmin, double xmax, double ymax, double zmax, const double *state0, + const double *phi_field, + const double *lap_field, const double *boundary_src, double *stage_data, double *rhs_accum, diff --git a/AMSS_NCKU_source/bssn_cuda_step.C b/AMSS_NCKU_source/bssn_cuda_step.C index c5ea4c6..6ee1fec 100644 --- a/AMSS_NCKU_source/bssn_cuda_step.C +++ b/AMSS_NCKU_source/bssn_cuda_step.C @@ -2,14 +2,15 @@ #ifdef USE_GPU +#include #include +#include #include #include "bssn_class.h" #include "bssn_cuda_ops.h" #include "bssn_gpu.h" #include "bssn_macro.h" -#include "rungekutta4_rout.h" void bssn_class::Step_MainPath_GPU(int lev, int YN) { @@ -56,11 +57,6 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) const bool BB = fgt(PhysTime, StartTime, dT_lev / 2); (void)BB; -#if (MAPBH == 0) - const bool need_host_stage_sync = (BH_num > 0 && lev == GH->levels - 1); -#else - const bool need_host_stage_sync = false; -#endif double ndeps = (lev < GH->movls) ? numepsb : numepss; double TRK4 = PhysTime; int iter_count = 0; @@ -83,6 +79,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) patch->bbox[0], patch->bbox[1], patch->bbox[2], patch->bbox[3], patch->bbox[4], patch->bbox[5], cg->fgfs[varl0->data->sgfn], + cg->fgfs[phi0->sgfn], + cg->fgfs[Lap0->sgfn], cg->fgfs[varlb->data->sgfn], cg->fgfs[varls->data->sgfn], cg->fgfs[varlr->data->sgfn], @@ -124,6 +122,28 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) } }; + auto stage_download_patch_list = + [&](MyList *var_list) { + MyList *patch_it = GH->PatL[lev]; + while (patch_it) + { + MyList *block_it = patch_it->data->blb; + while (block_it) + { + Block *cg = block_it->data; + if (myrank == cg->rank) + stage_download_var_list(cg, var_list); + + if (block_it == patch_it->data->ble) + break; + block_it = block_it->next; + } + if (ERROR) + break; + patch_it = patch_it->next; + } + }; + auto ensure_stage_device_var_list = [&](Block *cg, MyList *var_list) { const int n = cg->shape[0] * cg->shape[1] * cg->shape[2]; @@ -336,8 +356,6 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; ERROR = 1; } - if (!ERROR && !sync_cache_pre[lev].valid) - stage_download_var_list(cg, SynchList_pre); } if (BP == Pp->data->ble) break; @@ -346,8 +364,12 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) Pp = Pp->next; } - if (!ERROR && sync_cache_pre[lev].valid && !can_pack_sync_from_device(SynchList_pre, sync_cache_pre[lev])) - refresh_stage_host_before_sync(SynchList_pre, sync_cache_pre[lev]); + if (!ERROR) + { + stage_download_patch_list(SynchList_pre); + if (!ERROR) + bssn_gpu_clear_cached_device_buffers(); + } MPI_Request err_req_pre; { @@ -357,8 +379,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) Parallel::AsyncSyncState async_pre; Parallel::Sync_start(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev], async_pre); - Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry, need_host_stage_sync); - if (!ERROR && need_host_stage_sync) + Parallel::Sync_finish(sync_cache_pre[lev], async_pre, SynchList_pre, Symmetry, true); + if (!ERROR) refresh_stage_device_after_sync(SynchList_pre, sync_cache_pre[lev]); MPI_Wait(&err_req_pre, MPI_STATUS_IGNORE); @@ -427,8 +449,6 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; ERROR = 1; } - if (!ERROR && (!sync_cache_cor[lev].valid || iter_count == 3)) - stage_download_var_list(cg, SynchList_cor); } if (BP == Pp->data->ble) @@ -438,9 +458,12 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) Pp = Pp->next; } - if (!ERROR && sync_cache_cor[lev].valid && iter_count < 3 && - !can_pack_sync_from_device(SynchList_cor, sync_cache_cor[lev])) - refresh_stage_host_before_sync(SynchList_cor, sync_cache_cor[lev]); + if (!ERROR) + { + stage_download_patch_list(SynchList_cor); + if (!ERROR) + bssn_gpu_clear_cached_device_buffers(); + } MPI_Request err_req_cor; { @@ -450,9 +473,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) Parallel::AsyncSyncState async_cor; Parallel::Sync_start(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev], async_cor); - const bool unpack_cor_to_host = (iter_count == 3) || need_host_stage_sync; - Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry, unpack_cor_to_host); - if (!ERROR && iter_count < 3 && unpack_cor_to_host) + Parallel::Sync_finish(sync_cache_cor[lev], async_cor, SynchList_cor, Symmetry, true); + if (!ERROR && iter_count < 3) refresh_stage_device_after_sync(SynchList_cor, sync_cache_cor[lev]); MPI_Wait(&err_req_cor, MPI_STATUS_IGNORE);