diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index 555acdd..9cabec9 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -546,6 +546,98 @@ bool cuda_direct_unpack_segment(double *buffer, return ok; } +bool cuda_direct_pack_bssn_prefix_to_host(double *buffer, + const Parallel::gridseg *src, + const Parallel::gridseg *dst, + int type, + MyList *VarLists, + int Symmetry) +{ +#if USE_CUDA_BSSN + if (!buffer || !src || !dst || !src->Bg || !dst->Bg || !VarLists) + return false; + if (!cuda_can_direct_pack(src, dst, type, VarLists)) + return false; + double *views[BSSN_CUDA_STATE_COUNT]; + double soa_flat[3 * BSSN_CUDA_STATE_COUNT]; + if (!cuda_build_bssn_host_views(src->Bg, VarLists, BSSN_CUDA_STATE_COUNT, views) || + !cuda_build_state_soa(VarLists, BSSN_CUDA_STATE_COUNT, soa_flat)) + return false; + const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; + bool ok = false; + if (type == 1) + { + const int i0 = cuda_seg_begin(dst, src->Bg, 0); + const int j0 = cuda_seg_begin(dst, src->Bg, 1); + const int k0 = cuda_seg_begin(dst, src->Bg, 2); + ok = bssn_cuda_pack_state_batch_to_host_buffer_for_host_views( + src->Bg, views, BSSN_CUDA_STATE_COUNT, buffer, src->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0; + } + else if (type == 2) + { + int first_fine[3]; + if (!cuda_cell_gw3_restrict_params(src, dst, first_fine)) + return false; + ok = bssn_cuda_restrict_state_batch_to_host_buffer_for_host_views( + src->Bg, views, BSSN_CUDA_STATE_COUNT, buffer, src->Bg->shape, + dst->shape[0], dst->shape[1], dst->shape[2], + first_fine[0], first_fine[1], first_fine[2], + soa_flat) == 0; + } + else if (type == 3) + { + int first_fine_ii[3], coarse_lb[3]; + if (!cuda_cell_gw3_prolong_params(src, dst, first_fine_ii, coarse_lb)) + return false; + ok = bssn_cuda_prolong_state_batch_to_host_buffer_for_host_views( + src->Bg, views, BSSN_CUDA_STATE_COUNT, buffer, src->Bg->shape, + dst->shape[0], dst->shape[1], dst->shape[2], + first_fine_ii[0], first_fine_ii[1], first_fine_ii[2], + coarse_lb[0], coarse_lb[1], coarse_lb[2], + soa_flat) == 0; + } + if (sync_profile_enabled()) + sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0; + (void)Symmetry; + return ok; +#else + (void)buffer; (void)src; (void)dst; (void)type; (void)VarLists; (void)Symmetry; + return false; +#endif +} + +bool cuda_direct_unpack_bssn_prefix_from_host(double *buffer, + const Parallel::gridseg *dst, + int type, + MyList *VarListd) +{ +#if USE_CUDA_BSSN + if (!buffer || !dst || !dst->Bg || !VarListd) + return false; + if (!cuda_can_direct_unpack(dst, type, VarListd)) + return false; + double *views[BSSN_CUDA_STATE_COUNT]; + if (!cuda_build_bssn_host_views(dst->Bg, VarListd, BSSN_CUDA_STATE_COUNT, views)) + return false; + const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; + const int i0 = cuda_seg_begin(dst, dst->Bg, 0); + const int j0 = cuda_seg_begin(dst, dst->Bg, 1); + const int k0 = cuda_seg_begin(dst, dst->Bg, 2); + const bool ok = bssn_cuda_unpack_state_batch_from_host_buffer_for_host_views( + dst->Bg, views, BSSN_CUDA_STATE_COUNT, buffer, dst->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0; + if (sync_profile_enabled()) + sync_profile_stats().direct_unpack_sec += MPI_Wtime() - t0; + return ok; +#else + (void)buffer; (void)dst; (void)type; (void)VarListd; + return false; +#endif +} + bool cuda_aware_mpi_enabled() { static int enabled = -1; @@ -5276,6 +5368,7 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data, dst->data, type, VarLists)) @@ -5306,6 +5399,28 @@ int Parallel::data_packer(double *data, MyList *src, MyList

BSSN_CUDA_STATE_COUNT && + dir == PACK && + cuda_direct_pack_bssn_prefix_to_host(data + size_out, src->data, dst->data, + type, VarLists, Symmetry)) + { + handled_by_cuda = true; + cuda_handled_count = BSSN_CUDA_STATE_COUNT; + } + else if (!s_cuda_aware_pack_active && + state_idx == 0 && + state_count > BSSN_CUDA_STATE_COUNT && + dir == UNPACK && + cuda_direct_unpack_bssn_prefix_from_host(data + size_out, dst->data, + type, VarListd)) + { + handled_by_cuda = true; + cuda_handled_count = BSSN_CUDA_STATE_COUNT; + } +#endif if (!handled_by_cuda) { #if USE_CUDA_BSSN || USE_CUDA_Z4C @@ -5408,8 +5523,8 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data->shape[0] * dst->data->shape[1] * dst->data->shape[2]; - while (varls->next && varld->next) + size_out += (cuda_handled_count - 1) * dst->data->shape[0] * dst->data->shape[1] * dst->data->shape[2]; + for (int skipped = 1; skipped < cuda_handled_count && varls->next && varld->next; ++skipped) { varls = varls->next; varld = varld->next; @@ -7270,14 +7385,15 @@ void Parallel::prepare_inter_time_level(Patch *Pat, if (myrank == cg->rank) { #if USE_CUDA_BSSN + bool bssn_prefix_done = false; double *src1_views[BSSN_CUDA_STATE_COUNT]; double *src2_views[BSSN_CUDA_STATE_COUNT]; double *dst_views[BSSN_CUDA_STATE_COUNT]; const int state_count = cuda_state_var_count(VarList1, VarList2); - if (state_count == BSSN_CUDA_STATE_COUNT && - cuda_build_bssn_host_views(cg, VarList1, state_count, src1_views) && - cuda_build_bssn_host_views(cg, VarList2, state_count, src2_views) && - cuda_build_bssn_host_views(cg, VarList3, state_count, dst_views) && + if (state_count >= BSSN_CUDA_STATE_COUNT && + cuda_build_bssn_host_views(cg, VarList1, BSSN_CUDA_STATE_COUNT, src1_views) && + cuda_build_bssn_host_views(cg, VarList2, BSSN_CUDA_STATE_COUNT, src2_views) && + cuda_build_bssn_host_views(cg, VarList3, BSSN_CUDA_STATE_COUNT, dst_views) && bssn_cuda_has_resident_state(cg) && bssn_cuda_resident_state_matches(cg, src1_views) && bssn_cuda_resident_state_matches(cg, src2_views) && @@ -7285,15 +7401,30 @@ void Parallel::prepare_inter_time_level(Patch *Pat, src1_views, src2_views, 0, dst_views, 2, tindex) == 0) { - if (BP == Pat->ble) - break; - BP = BP->next; - continue; + if (state_count == BSSN_CUDA_STATE_COUNT) + { + if (BP == Pat->ble) + break; + BP = BP->next; + continue; + } + bssn_prefix_done = true; } #endif varl1 = VarList1; varl2 = VarList2; varl3 = VarList3; +#if USE_CUDA_BSSN + if (bssn_prefix_done) + { + for (int i = 0; i < BSSN_CUDA_STATE_COUNT && varl1 && varl2 && varl3; ++i) + { + varl1 = varl1->next; + varl2 = varl2->next; + varl3 = varl3->next; + } + } +#endif while (varl1) { if (tindex == 0) @@ -7347,16 +7478,17 @@ void Parallel::prepare_inter_time_level(Patch *Pat, if (myrank == cg->rank) { #if USE_CUDA_BSSN + bool bssn_prefix_done = false; double *src1_views[BSSN_CUDA_STATE_COUNT]; double *src2_views[BSSN_CUDA_STATE_COUNT]; double *src3_views[BSSN_CUDA_STATE_COUNT]; double *dst_views[BSSN_CUDA_STATE_COUNT]; const int state_count = cuda_state_var_count(VarList1, VarList2); - if (state_count == BSSN_CUDA_STATE_COUNT && - cuda_build_bssn_host_views(cg, VarList1, state_count, src1_views) && - cuda_build_bssn_host_views(cg, VarList2, state_count, src2_views) && - cuda_build_bssn_host_views(cg, VarList3, state_count, src3_views) && - cuda_build_bssn_host_views(cg, VarList4, state_count, dst_views) && + if (state_count >= BSSN_CUDA_STATE_COUNT && + cuda_build_bssn_host_views(cg, VarList1, BSSN_CUDA_STATE_COUNT, src1_views) && + cuda_build_bssn_host_views(cg, VarList2, BSSN_CUDA_STATE_COUNT, src2_views) && + cuda_build_bssn_host_views(cg, VarList3, BSSN_CUDA_STATE_COUNT, src3_views) && + cuda_build_bssn_host_views(cg, VarList4, BSSN_CUDA_STATE_COUNT, dst_views) && bssn_cuda_has_resident_state(cg) && bssn_cuda_resident_state_matches(cg, src1_views) && bssn_cuda_resident_state_matches(cg, src2_views) && @@ -7365,16 +7497,32 @@ void Parallel::prepare_inter_time_level(Patch *Pat, src1_views, src2_views, src3_views, dst_views, 3, tindex) == 0) { - if (BP == Pat->ble) - break; - BP = BP->next; - continue; + if (state_count == BSSN_CUDA_STATE_COUNT) + { + if (BP == Pat->ble) + break; + BP = BP->next; + continue; + } + bssn_prefix_done = true; } #endif varl1 = VarList1; varl2 = VarList2; varl3 = VarList3; varl4 = VarList4; +#if USE_CUDA_BSSN + if (bssn_prefix_done) + { + for (int i = 0; i < BSSN_CUDA_STATE_COUNT && varl1 && varl2 && varl3 && varl4; ++i) + { + varl1 = varl1->next; + varl2 = varl2->next; + varl3 = varl3->next; + varl4 = varl4->next; + } + } +#endif while (varl1) { if (tindex == 0) diff --git a/AMSS_NCKU_source/bssnEScalar_class.C b/AMSS_NCKU_source/bssnEScalar_class.C index 927787a..96cd5fe 100644 --- a/AMSS_NCKU_source/bssnEScalar_class.C +++ b/AMSS_NCKU_source/bssnEScalar_class.C @@ -1755,8 +1755,14 @@ void bssnEScalar_class::Step(int lev, int YN) #if (RPS == 0) // mesh refinement boundary part #if USE_CUDA_BSSN - if (!getenv("AMSS_ESCALAR_SPLIT_RP") || atoi(getenv("AMSS_ESCALAR_SPLIT_RP")) == 0) - download_bssn_cuda_prefix_if_present(GH->PatL[lev], SynchList_cor, myrank); + { + const char *mixed_env = getenv("AMSS_ESCALAR_MIXED_GPU_RP"); + const bool mixed_gpu_rp = (!mixed_env || atoi(mixed_env) != 0); + const char *split_env = getenv("AMSS_ESCALAR_SPLIT_RP"); + const bool split_rp = (split_env && atoi(split_env) != 0); + if (!mixed_gpu_rp && !split_rp) + download_bssn_cuda_prefix_if_present(GH->PatL[lev], SynchList_cor, myrank); + } #endif RestrictProlong(lev, YN, BB); diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index 8b1860d..b891ad4 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -102,6 +102,17 @@ int amss_escalar_split_rp_recursive_enabled() return enabled; } +int amss_escalar_mixed_gpu_rp_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_ESCALAR_MIXED_GPU_RP"); + enabled = (!env || atoi(env) != 0) ? 1 : 0; + } + return enabled; +} + MyList *clone_var_sublist(MyList *src, int skip, int take) { for (int i = 0; i < skip && src; ++i) @@ -7197,7 +7208,8 @@ void bssn_class::RestrictProlong(int lev, int YN, bool BB, STEP_TIMER_ADD(TB_RESTRICT_PROLONG, timer_restrict_prolong); return; } - if (lev > 0 && var_list_count(SL) > BSSN_CUDA_STATE_COUNT) + if (lev > 0 && !amss_escalar_mixed_gpu_rp_enabled() && + var_list_count(SL) > BSSN_CUDA_STATE_COUNT) { download_bssn_prefix_for_list(GH->PatL[lev], SL, myrank); download_bssn_prefix_for_list(GH->PatL[lev - 1], SL, myrank); diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index 61a793e..c3ef27d 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -7565,6 +7565,78 @@ int bssn_cuda_unpack_state_batch_from_host_buffer_for_host_views(void *block_tag return 0; } +extern "C" +int bssn_cuda_restrict_state_batch_to_host_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *host_buffer, + int *ex, + int sx, int sy, int sz, + int fi0, int fj0, int fk0, + const double *state_soa) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1; + if (!host_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1; + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + const size_t all = (size_t)ex[0] * ex[1] * ex[2]; + const int bank = active_or_keyed_bank(ctx, state_host_key, all, false); + if (bank < 0 || !ctx.resident_valid[bank]) return 1; + const int region_all = sx * sy * sz; + const size_t total_doubles = (size_t)state_count * (size_t)region_all; + double *d_comm = ensure_step_comm_buffer(ctx, total_doubles); + upload_comm_state_soa(state_soa, state_count); + dim3 launch_grid((unsigned int)grid((size_t)region_all), + (unsigned int)state_count); + kern_restrict_state_region_batch<<>>( + ctx.d_resident_mem[bank], d_comm, + ex[0], ex[1], sx, sy, sz, + fi0, fj0, fk0, region_all, state_count, + ex[0] * ex[1] * ex[2]); + CUDA_CHECK(cudaMemcpy(host_buffer, d_comm, + total_doubles * sizeof(double), + cudaMemcpyDeviceToHost)); + return 0; +} + +extern "C" +int bssn_cuda_prolong_state_batch_to_host_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *host_buffer, + int *ex, + int sx, int sy, int sz, + int ii0, int jj0, int kk0, + int lbc_i, int lbc_j, int lbc_k, + const double *state_soa) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1; + if (!host_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1; + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + const size_t all = (size_t)ex[0] * ex[1] * ex[2]; + const int bank = active_or_keyed_bank(ctx, state_host_key, all, false); + if (bank < 0 || !ctx.resident_valid[bank]) return 1; + const int region_all = sx * sy * sz; + const size_t total_doubles = (size_t)state_count * (size_t)region_all; + double *d_comm = ensure_step_comm_buffer(ctx, total_doubles); + upload_comm_state_soa(state_soa, state_count); + dim3 launch_grid((unsigned int)grid((size_t)region_all), + (unsigned int)state_count); + kern_prolong_state_region_batch<<>>( + ctx.d_resident_mem[bank], d_comm, + ex[0], ex[1], sx, sy, sz, + ii0, jj0, kk0, lbc_i, lbc_j, lbc_k, + region_all, state_count, + ex[0] * ex[1] * ex[2]); + CUDA_CHECK(cudaMemcpy(host_buffer, d_comm, + total_doubles * sizeof(double), + cudaMemcpyDeviceToHost)); + return 0; +} + static void copy_state_device_batch(void *block_tag, int state_count, double *device_buffer, diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.h b/AMSS_NCKU_source/bssn_rhs_cuda.h index 63f5f59..db903fa 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.h +++ b/AMSS_NCKU_source/bssn_rhs_cuda.h @@ -179,8 +179,27 @@ int bssn_cuda_unpack_state_batch_from_host_buffer_for_host_views(void *block_tag int i0, int j0, int k0, int sx, int sy, int sz); +int bssn_cuda_restrict_state_batch_to_host_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *host_buffer, + int *ex, + int sx, int sy, int sz, + int fi0, int fj0, int fk0, + const double *state_soa); + +int bssn_cuda_prolong_state_batch_to_host_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *host_buffer, + int *ex, + int sx, int sy, int sz, + int ii0, int jj0, int kk0, + int lbc_i, int lbc_j, int lbc_k, + const double *state_soa); + int bssn_cuda_pack_state_batch_to_device_buffer(void *block_tag, - int state_count, + int state_count, double *device_buffer, int *ex, int i0, int j0, int k0,