From 18e9c9cc507ad9db038a78ea6a923320d638889a Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Thu, 30 Apr 2026 10:58:15 +0800 Subject: [PATCH] Optimize BSSN CUDA resident AMR prolong path --- AMSS_NCKU_source/Parallel.C | 343 +++++++++++++++++---- AMSS_NCKU_source/bssn_rhs_cuda.cu | 475 +++++++++++++++++++++++++++++- AMSS_NCKU_source/bssn_rhs_cuda.h | 38 ++- 3 files changed, 778 insertions(+), 78 deletions(-) diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index ead3566..2a193a7 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -190,6 +190,25 @@ bool cuda_build_bssn_host_views(Block *block, } return v == 0; } + +bool cuda_build_bssn_soa(MyList *vars, + int state_count, + double *soa_flat) +{ + if (!vars || !soa_flat || state_count != BSSN_CUDA_STATE_COUNT) + return false; + MyList *v = vars; + for (int i = 0; i < BSSN_CUDA_STATE_COUNT; ++i) + { + if (!v) + return false; + soa_flat[3 * i + 0] = v->data->SoA[0]; + soa_flat[3 * i + 1] = v->data->SoA[1]; + soa_flat[3 * i + 2] = v->data->SoA[2]; + v = v->next; + } + return v == 0; +} #endif #if USE_CUDA_BSSN || USE_CUDA_Z4C @@ -198,6 +217,9 @@ int fortran_idint(double x) return (int)x; } +bool cuda_amr_restrict_device_enabled(); +bool cuda_amr_prolong_device_enabled(); + bool cuda_cell_gw3_restrict_params(const Parallel::gridseg *src, const Parallel::gridseg *dst, int first_fine[3]) @@ -226,7 +248,7 @@ bool cuda_cell_gw3_restrict_params(const Parallel::gridseg *src, const int lbc = fortran_idint((llbc - base) / CD + 0.4) + 1; const int lbf = fortran_idint((llbf - base) / FD + 0.4) + 1; first_fine[d] = 2 * lbc - lbf - 1; - if (first_fine[d] - 2 < 0) + if (first_fine[d] < 0) return false; if (first_fine[d] + 2 * (dst->shape[d] - 1) + 3 >= src->Bg->shape[d]) return false; @@ -271,7 +293,7 @@ bool cuda_cell_gw3_prolong_params(const Parallel::gridseg *src, const int first_coarse = first_fine_ii[d] / 2 - coarse_lb[d]; const int last_fine_ii = first_fine_ii[d] + dst->shape[d] - 1; const int last_coarse = last_fine_ii / 2 - coarse_lb[d]; - if (first_coarse - 2 < 0) + if (first_coarse < -1) return false; if (last_coarse + 3 >= src->Bg->shape[d]) return false; @@ -306,13 +328,21 @@ bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg #elif USE_CUDA_BSSN if (bssn_cuda_has_resident_state(src->Bg) == 0) return false; - if (type == 1) - return true; - int a[3], b[3]; - if (type == 2) - return cuda_cell_gw3_restrict_params(src, dst, a); - if (type == 3) - return cuda_cell_gw3_prolong_params(src, dst, a, b); + if (type == 1) + return true; + int a[3], b[3]; + if (type == 2) + { + if (!cuda_amr_restrict_device_enabled()) + return false; + return cuda_cell_gw3_restrict_params(src, dst, a); + } + if (type == 3) + { + if (!cuda_amr_prolong_device_enabled()) + return false; + return cuda_cell_gw3_prolong_params(src, dst, a, b); + } return false; #else (void)type; @@ -427,6 +457,28 @@ bool cuda_aware_mpi_enabled() return enabled != 0; } +bool cuda_amr_restrict_device_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_CUDA_AMR_RESTRICT_DEVICE"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +bool cuda_amr_prolong_device_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_CUDA_AMR_PROLONG_DEVICE"); + enabled = (!env || atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + bool cuda_mpi_diag_enabled() { static int enabled = -1; @@ -438,6 +490,17 @@ bool cuda_mpi_diag_enabled() return enabled != 0 || sync_profile_enabled(); } +int cuda_mpi_diag_limit() +{ + static int limit = -1; + if (limit < 0) + { + const char *env = getenv("AMSS_CUDA_MPI_DIAG_LIMIT"); + limit = (env && atoi(env) > 0) ? atoi(env) : 10; + } + return limit; +} + double *alloc_device_comm_buffer(int length) { if (length <= 0) @@ -486,9 +549,11 @@ bool cuda_direct_pack_segment_to_device(double *buffer, if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT) return false; const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; - bool ok = false; - double *views[BSSN_CUDA_STATE_COUNT]; - const bool have_views = cuda_build_bssn_host_views(src->Bg, VarLists, state_count, views); + bool ok = false; + double *views[BSSN_CUDA_STATE_COUNT]; + double soa_flat[3 * BSSN_CUDA_STATE_COUNT]; + const bool have_views = cuda_build_bssn_host_views(src->Bg, VarLists, state_count, views); + const bool have_soa = cuda_build_bssn_soa(VarLists, state_count, soa_flat); if (type == 1) { const int i0 = cuda_seg_begin(dst, src->Bg, 0); @@ -509,14 +574,15 @@ bool cuda_direct_pack_segment_to_device(double *buffer, int first_fine[3]; if (!cuda_cell_gw3_restrict_params(src, dst, first_fine)) return false; - ok = have_views - ? bssn_cuda_restrict_state_batch_to_device_buffer_for_host_views( - src->Bg, views, state_count, buffer, src->Bg->shape, - dst->shape[0], dst->shape[1], dst->shape[2], - first_fine[0], first_fine[1], first_fine[2]) == 0 - : bssn_cuda_restrict_state_batch_to_device_buffer( - src->Bg, state_count, buffer, src->Bg->shape, - dst->shape[0], dst->shape[1], dst->shape[2], + ok = have_views + ? bssn_cuda_restrict_state_batch_to_device_buffer_for_host_views( + src->Bg, views, state_count, buffer, src->Bg->shape, + dst->shape[0], dst->shape[1], dst->shape[2], + first_fine[0], first_fine[1], first_fine[2], + have_soa ? soa_flat : 0) == 0 + : bssn_cuda_restrict_state_batch_to_device_buffer( + src->Bg, state_count, buffer, src->Bg->shape, + dst->shape[0], dst->shape[1], dst->shape[2], first_fine[0], first_fine[1], first_fine[2]) == 0; } else if (type == 3) @@ -524,13 +590,14 @@ bool cuda_direct_pack_segment_to_device(double *buffer, int first_fine_ii[3], coarse_lb[3]; if (!cuda_cell_gw3_prolong_params(src, dst, first_fine_ii, coarse_lb)) return false; - ok = have_views - ? bssn_cuda_prolong_state_batch_to_device_buffer_for_host_views( - src->Bg, views, state_count, buffer, src->Bg->shape, - dst->shape[0], dst->shape[1], dst->shape[2], - first_fine_ii[0], first_fine_ii[1], first_fine_ii[2], - coarse_lb[0], coarse_lb[1], coarse_lb[2]) == 0 - : bssn_cuda_prolong_state_batch_to_device_buffer( + ok = have_views + ? bssn_cuda_prolong_state_batch_to_device_buffer_for_host_views( + src->Bg, views, state_count, buffer, src->Bg->shape, + dst->shape[0], dst->shape[1], dst->shape[2], + first_fine_ii[0], first_fine_ii[1], first_fine_ii[2], + coarse_lb[0], coarse_lb[1], coarse_lb[2], + have_soa ? soa_flat : 0) == 0 + : bssn_cuda_prolong_state_batch_to_device_buffer( src->Bg, state_count, buffer, src->Bg->shape, dst->shape[0], dst->shape[1], dst->shape[2], first_fine_ii[0], first_fine_ii[1], first_fine_ii[2], @@ -643,19 +710,39 @@ bool cuda_flush_device_segment_batch(Block *block, int state_count, const std::vector &meta, int dir, + int type, MyList *vars) { if (!block || meta.empty()) return true; - const int segment_count = (int)(meta.size() / 8); - double *views[BSSN_CUDA_STATE_COUNT]; - const bool have_views = cuda_build_bssn_host_views(block, vars, state_count, views); - if (dir == PACK) + const int stride = (dir == PACK && type == 3) ? 11 : 8; + const int segment_count = (int)(meta.size() / stride); + double *views[BSSN_CUDA_STATE_COUNT]; + double soa_flat[3 * BSSN_CUDA_STATE_COUNT]; + const bool have_views = cuda_build_bssn_host_views(block, vars, state_count, views); + const bool have_soa = cuda_build_bssn_soa(vars, state_count, soa_flat); + if (dir == PACK) + { + if (type == 2) + return have_views + ? bssn_cuda_restrict_state_segments_to_device_buffer_for_host_views( + block, views, state_count, data, block->shape, segment_count, + meta.data(), have_soa ? soa_flat : 0) == 0 + : bssn_cuda_restrict_state_segments_to_device_buffer( + block, state_count, data, block->shape, segment_count, meta.data()) == 0; + if (type == 3) + return have_views + ? bssn_cuda_prolong_state_segments_to_device_buffer_for_host_views( + block, views, state_count, data, block->shape, segment_count, + meta.data(), have_soa ? soa_flat : 0) == 0 + : bssn_cuda_prolong_state_segments_to_device_buffer( + block, state_count, data, block->shape, segment_count, meta.data()) == 0; return have_views ? bssn_cuda_pack_state_segments_to_device_buffer_for_host_views( block, views, state_count, data, block->shape, segment_count, meta.data()) == 0 : bssn_cuda_pack_state_segments_to_device_buffer( block, state_count, data, block->shape, segment_count, meta.data()) == 0; + } return have_views ? bssn_cuda_unpack_state_segments_from_device_buffer_for_host_views( block, views, state_count, data, block->shape, segment_count, meta.data()) == 0 @@ -685,6 +772,7 @@ int cuda_data_packer_device_batched(double *data, int size_out = 0; Block *batch_block = 0; + int batch_type = 0; std::vector batch_meta; batch_meta.reserve(64); @@ -702,42 +790,72 @@ int cuda_data_packer_device_batched(double *data, type = 2; else type = 3; - if (type != 1) - return -1; Block *block = (dir == PACK) ? src->data->Bg : dst->data->Bg; if ((dir == PACK && !cuda_can_direct_pack(src->data, dst->data, type)) || (dir == UNPACK && !cuda_can_direct_unpack(dst->data, type))) return -1; - if (batch_block && batch_block != block) + if (batch_block && (batch_block != block || batch_type != type)) { MyList *batch_vars = (dir == PACK) ? VarLists : VarListd; - if (!cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir, batch_vars)) + if (!cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir, batch_type, batch_vars)) return -1; batch_meta.clear(); } batch_block = block; + batch_type = type; - const int i0 = (dir == PACK) ? cuda_seg_begin(dst->data, block, 0) - : cuda_seg_begin(dst->data, block, 0); - const int j0 = (dir == PACK) ? cuda_seg_begin(dst->data, block, 1) - : cuda_seg_begin(dst->data, block, 1); - const int k0 = (dir == PACK) ? cuda_seg_begin(dst->data, block, 2) - : cuda_seg_begin(dst->data, block, 2); const int sx = dst->data->shape[0]; const int sy = dst->data->shape[1]; const int sz = dst->data->shape[2]; const int region_all = sx * sy * sz; - batch_meta.push_back(i0); - batch_meta.push_back(j0); - batch_meta.push_back(k0); - batch_meta.push_back(sx); - batch_meta.push_back(sy); - batch_meta.push_back(sz); - batch_meta.push_back(region_all); - batch_meta.push_back(size_out); + if (dir == UNPACK || type == 1) + { + const int i0 = cuda_seg_begin(dst->data, block, 0); + const int j0 = cuda_seg_begin(dst->data, block, 1); + const int k0 = cuda_seg_begin(dst->data, block, 2); + batch_meta.push_back(i0); + batch_meta.push_back(j0); + batch_meta.push_back(k0); + batch_meta.push_back(sx); + batch_meta.push_back(sy); + batch_meta.push_back(sz); + batch_meta.push_back(region_all); + batch_meta.push_back(size_out); + } + else if (type == 2) + { + int first_fine[3]; + if (!cuda_cell_gw3_restrict_params(src->data, dst->data, first_fine)) + return -1; + batch_meta.push_back(sx); + batch_meta.push_back(sy); + batch_meta.push_back(sz); + batch_meta.push_back(region_all); + batch_meta.push_back(size_out); + batch_meta.push_back(first_fine[0]); + batch_meta.push_back(first_fine[1]); + batch_meta.push_back(first_fine[2]); + } + else + { + int first_fine_ii[3], coarse_lb[3]; + if (!cuda_cell_gw3_prolong_params(src->data, dst->data, first_fine_ii, coarse_lb)) + return -1; + batch_meta.push_back(sx); + batch_meta.push_back(sy); + batch_meta.push_back(sz); + batch_meta.push_back(region_all); + batch_meta.push_back(size_out); + batch_meta.push_back(first_fine_ii[0]); + batch_meta.push_back(first_fine_ii[1]); + batch_meta.push_back(first_fine_ii[2]); + batch_meta.push_back(coarse_lb[0]); + batch_meta.push_back(coarse_lb[1]); + batch_meta.push_back(coarse_lb[2]); + } size_out += state_count * region_all; } @@ -748,7 +866,7 @@ int cuda_data_packer_device_batched(double *data, if (batch_block) { MyList *batch_vars = (dir == PACK) ? VarLists : VarListd; - if (!cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir, batch_vars)) + if (!cuda_flush_device_segment_batch(batch_block, data, state_count, batch_meta, dir, batch_type, batch_vars)) return -1; } return size_out; @@ -796,6 +914,89 @@ bool cuda_segments_device_eligible(MyList *src, return has_work; } +struct CudaEligibilityStats +{ + int active; + int type1; + int type2; + int type3; + int null_seg; + int no_resident; + int param_fail; + int unsupported_state; +}; + +void cuda_collect_eligibility_stats(MyList *src, + MyList *dst, + int rank_in, + int dir, + int myrank, + int state_count, + CudaEligibilityStats &stats) +{ + if (!cuda_device_state_count_supported(state_count)) + { + stats.unsupported_state++; + return; + } + while (src && dst) + { + const bool active = + (dir == PACK && dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank) || + (dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank); + if (active) + { + stats.active++; + if (!src->data || !dst->data || !src->data->Bg || !dst->data->Bg) + { + stats.null_seg++; + src = src->next; + dst = dst->next; + continue; + } + int type; + if (src->data->Bg->lev == dst->data->Bg->lev) + type = 1; + else if (src->data->Bg->lev > dst->data->Bg->lev) + type = 2; + else + type = 3; + if (type == 1) stats.type1++; + else if (type == 2) stats.type2++; + else stats.type3++; + +#if USE_CUDA_BSSN + if (dir == PACK) + { + if (bssn_cuda_has_resident_state(src->data->Bg) == 0) + stats.no_resident++; + else if (type == 2) + { + int first_fine[3]; + if (!cuda_cell_gw3_restrict_params(src->data, dst->data, first_fine)) + stats.param_fail++; + } + else if (type == 3) + { + int first_fine_ii[3], coarse_lb[3]; + if (!cuda_cell_gw3_prolong_params(src->data, dst->data, first_fine_ii, coarse_lb)) + stats.param_fail++; + } + } + else + { + if (bssn_cuda_has_resident_state(dst->data->Bg) == 0) + stats.no_resident++; + } +#else + (void)type; +#endif + } + src = src->next; + dst = dst->next; + } +} + bool cuda_pack_to_device_eligible(MyList *src, MyList *dst, int rank_in, @@ -5379,19 +5580,33 @@ void Parallel::transfer_cached(MyList **src, MyList *PatL, MyList *VarList, int Symmetr { static int diag_reported = 0; int rep = diag_reported; - if (myrank == 0 && rep < 20) + if (myrank == 0 && rep < cuda_mpi_diag_limit()) { if (__sync_bool_compare_and_swap(&diag_reported, rep, rep + 1)) fprintf(stderr, "[AMSS-CUDA-MPI][rank %d] Sync_start: device_sends=%d " diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index 80e3918..29b4929 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -102,6 +102,14 @@ struct RhsStageProfileStats { double ms[RHS_STAGE_COUNT]; }; +struct CudaAuxProfileStats { + long long prepare_calls; + long long writeback_calls; + double prepare_ms; + double writeback_ms; + double writeback_gb; +}; + static CudaProfileStats &cuda_profile_stats() { static CudaProfileStats stats = { 0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, @@ -110,6 +118,45 @@ static CudaProfileStats &cuda_profile_stats() { return stats; } +static CudaAuxProfileStats &cuda_aux_profile_stats() { + static CudaAuxProfileStats stats = {}; + return stats; +} + +static bool cuda_aux_profile_enabled() { + static int enabled = -1; + if (enabled < 0) { + const char *env = getenv("AMSS_PROFILE_CUDA_AUX"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + +static int cuda_aux_profile_every() { + static int every = -1; + if (every < 0) { + const char *env = getenv("AMSS_PROFILE_CUDA_AUX_EVERY"); + every = (env && atoi(env) > 0) ? atoi(env) : 100; + } + return every; +} + +static void cuda_aux_profile_maybe_log() { + if (!cuda_aux_profile_enabled()) return; + CudaAuxProfileStats &stats = cuda_aux_profile_stats(); + const long long calls = stats.prepare_calls + stats.writeback_calls; + if (calls <= 0 || calls % cuda_aux_profile_every() != 0) return; + fprintf(stderr, + "[AMSS-CUDA-AUX][rank %d][dev %d] prepare=%lld avg_prepare=%.3f ms writebacks=%lld avg_writeback=%.3f ms writeback_GB=%.3f\n", + g_dispatch.my_rank, g_dispatch.my_device, + stats.prepare_calls, + stats.prepare_calls ? stats.prepare_ms / (double)stats.prepare_calls : 0.0, + stats.writeback_calls, + stats.writeback_calls ? stats.writeback_ms / (double)stats.writeback_calls : 0.0, + stats.writeback_gb); + fflush(stderr); +} + static RhsStageProfileStats &rhs_stage_profile_stats() { static RhsStageProfileStats stats = {}; return stats; @@ -480,6 +527,7 @@ static const int k_lk_rhs_slots[BSSN_LK_FIELD_COUNT] = { }; __constant__ int d_subset_state_indices[BSSN_STATE_COUNT]; +__constant__ double d_comm_state_soa[3 * BSSN_STATE_COUNT]; static const int k_lk_soa_signs[3 * BSSN_LK_FIELD_COUNT] = { 1, 1, 1, @@ -523,6 +571,7 @@ struct StepContext { std::array d_state_next; std::array, BSSN_RESIDENT_BANK_COUNT> d_resident; std::array, BSSN_RESIDENT_BANK_COUNT> resident_host; + std::array, BSSN_RESIDENT_BANK_COUNT> resident_host_clean; std::array resident_age; std::array resident_valid; std::array d_matter; @@ -552,6 +601,7 @@ struct StepContext { for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { d_resident[b].fill(nullptr); resident_host[b].fill(nullptr); + resident_host_clean[b].fill(0); } resident_age.fill(0); resident_valid.fill(false); @@ -634,6 +684,7 @@ static StepAllocation detach_step_allocation(StepContext &ctx) for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { ctx.d_resident[b].fill(nullptr); ctx.resident_host[b].fill(nullptr); + ctx.resident_host_clean[b].fill(0); } ctx.resident_age.fill(0); ctx.resident_valid.fill(false); @@ -661,6 +712,7 @@ static void attach_step_allocation(StepContext &ctx, const StepAllocation &alloc ctx.resident_clock = 0; for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) { ctx.resident_host[b].fill(nullptr); + ctx.resident_host_clean[b].fill(0); } ctx.resident_age.fill(0); ctx.resident_valid.fill(false); @@ -843,6 +895,25 @@ static int *ensure_comm_segment_meta_buffer(size_t needed_ints) return g_comm_segment_meta; } +static void upload_comm_state_soa(const double *state_soa, int state_count) +{ + double soa[3 * BSSN_STATE_COUNT]; + for (int i = 0; i < BSSN_STATE_COUNT; ++i) { + soa[3 * i + 0] = 1.0; + soa[3 * i + 1] = 1.0; + soa[3 * i + 2] = 1.0; + } + if (state_soa) { + const int n = (state_count < BSSN_STATE_COUNT) ? state_count : BSSN_STATE_COUNT; + for (int i = 0; i < n; ++i) { + soa[3 * i + 0] = state_soa[3 * i + 0]; + soa[3 * i + 1] = state_soa[3 * i + 1]; + soa[3 * i + 2] = state_soa[3 * i + 2]; + } + } + CUDA_CHECK(cudaMemcpyToSymbol(d_comm_state_soa, soa, sizeof(soa))); +} + static void upload_grid_params_if_needed(const GridParams &gp) { if (!g_gp_host_cache_valid || @@ -4906,6 +4977,42 @@ static bool resident_key_usable(double **host_key) return true; } +static void set_resident_host_clean(StepContext &ctx, int bank, bool clean) +{ + if (bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT) return; + ctx.resident_host_clean[bank].fill(clean ? 1 : 0); +} + +static bool resident_host_subset_clean(const StepContext &ctx, + int bank, + int subset_count, + const int *state_indices) +{ + if (bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT) return false; + for (int i = 0; i < subset_count; ++i) { + const int state_index = state_indices ? state_indices[i] : i; + if (state_index < 0 || state_index >= BSSN_STATE_COUNT) + return false; + if (!ctx.resident_host_clean[bank][state_index]) + return false; + } + return true; +} + +static void mark_resident_host_subset_clean(StepContext &ctx, + int bank, + int subset_count, + const int *state_indices, + bool clean) +{ + if (bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT) return; + for (int i = 0; i < subset_count; ++i) { + const int state_index = state_indices ? state_indices[i] : i; + if (state_index >= 0 && state_index < BSSN_STATE_COUNT) + ctx.resident_host_clean[bank][state_index] = clean ? 1 : 0; + } +} + static void mark_resident_current_bank(StepContext &ctx, int bank) { if (bank < 0 || bank >= BSSN_RESIDENT_BANK_COUNT) return; @@ -4942,6 +5049,8 @@ static void writeback_resident_bank(StepContext &ctx, int bank, size_t all) for (int i = 0; i < BSSN_STATE_COUNT; ++i) { if (!ctx.resident_host[bank][i]) return; } + const bool profile = cuda_aux_profile_enabled(); + const double t0 = profile ? cuda_profile_now_ms() : 0.0; const size_t bytes = all * sizeof(double); for (int i = 0; i < BSSN_STATE_COUNT; ++i) { CUDA_CHECK(cudaMemcpyAsync(ctx.resident_host[bank][i], @@ -4949,6 +5058,14 @@ static void writeback_resident_bank(StepContext &ctx, int bank, size_t all) bytes, cudaMemcpyDeviceToHost)); } CUDA_CHECK(cudaDeviceSynchronize()); + set_resident_host_clean(ctx, bank, true); + if (profile) { + CudaAuxProfileStats &stats = cuda_aux_profile_stats(); + stats.writeback_calls++; + stats.writeback_ms += cuda_profile_now_ms() - t0; + stats.writeback_gb += (double)((size_t)BSSN_STATE_COUNT * bytes) / 1.0e9; + cuda_aux_profile_maybe_log(); + } } static int choose_resident_bank_for_reuse(StepContext &ctx, int avoid_bank, size_t all) @@ -4971,6 +5088,7 @@ static int choose_resident_bank_for_reuse(StepContext &ctx, int avoid_bank, size writeback_resident_bank(ctx, best, all); ctx.resident_valid[best] = false; ctx.resident_host[best].fill(nullptr); + ctx.resident_host_clean[best].fill(0); ctx.resident_age[best] = 0; if (ctx.current_bank == best) { ctx.current_bank = -1; @@ -4986,6 +5104,7 @@ static void assign_resident_key(StepContext &ctx, int bank, double **host_key) for (int i = 0; i < BSSN_STATE_COUNT; ++i) { ctx.resident_host[bank][i] = host_key[i]; } + set_resident_host_clean(ctx, bank, false); ctx.resident_age[bank] = ++ctx.resident_clock; } @@ -5008,6 +5127,7 @@ static int ensure_resident_bank(StepContext &ctx, bind_state_input_slots(ctx.d_resident[bank]); upload_state_inputs(host_key, all); ctx.resident_valid[bank] = true; + set_resident_host_clean(ctx, bank, true); } return bank; } @@ -5018,8 +5138,10 @@ static int ensure_resident_bank(StepContext &ctx, bind_state_input_slots(ctx.d_resident[bank]); upload_state_inputs(host_key, all); ctx.resident_valid[bank] = true; + set_resident_host_clean(ctx, bank, true); } else { ctx.resident_valid[bank] = false; + set_resident_host_clean(ctx, bank, false); } update_state_ready(ctx); return bank; @@ -5076,6 +5198,7 @@ static int choose_resident_bank_for_reuse_avoiding(StepContext &ctx, writeback_resident_bank(ctx, best, all); ctx.resident_valid[best] = false; ctx.resident_host[best].fill(nullptr); + ctx.resident_host_clean[best].fill(0); ctx.resident_age[best] = 0; if (ctx.current_bank == best) { ctx.current_bank = -1; @@ -5599,6 +5722,12 @@ __global__ void kern_unpack_state_segments_batch(double * __restrict__ dst_mem, } } +__device__ __forceinline__ double load_comm_state_cell_sym(const double * __restrict__ src_mem, + int state_index, + int x, int y, int z, + int nx, int ny, + int all); + __global__ void kern_restrict_state_region_batch(const double * __restrict__ src_mem, double * __restrict__ dst, int nx, int ny, @@ -5635,18 +5764,93 @@ __global__ void kern_restrict_state_region_batch(const double * __restrict__ src { const int y = fc_j + offs[oy]; const double wyz = wz * w[oy]; - for (int ox = 0; ox < 6; ++ox) - { - const int x = fc_i + offs[ox]; - const int src = x + y * nx + z * nx * ny; - sum += wyz * w[ox] * src_mem[(size_t)state_index * all + src]; - } + for (int ox = 0; ox < 6; ++ox) + { + const int x = fc_i + offs[ox]; + sum += wyz * w[ox] * + load_comm_state_cell_sym(src_mem, state_index, x, y, z, nx, ny, all); + } } } dst[(size_t)state_index * region_all + local] = sum; } } +__device__ __forceinline__ double load_comm_state_cell_sym(const double * __restrict__ src_mem, + int state_index, + int x, int y, int z, + int nx, int ny, + int all) +{ + double s = 1.0; + if (x < 0) { + x = -x - 1; + s *= d_comm_state_soa[3 * state_index + 0]; + } + if (y < 0) { + y = -y - 1; + s *= d_comm_state_soa[3 * state_index + 1]; + } + if (z < 0) { + z = -z - 1; + s *= d_comm_state_soa[3 * state_index + 2]; + } + const int src = x + y * nx + z * nx * ny; + return s * src_mem[(size_t)state_index * all + src]; +} + +__global__ void kern_restrict_state_segments_batch(const double * __restrict__ src_mem, + double * __restrict__ dst, + int nx, int ny, + const int * __restrict__ meta, + int state_count, + int all) +{ + const int segment = blockIdx.z; + const int state_index = blockIdx.y; + const int *m = meta + segment * 8; + const int sx = m[0], sy = m[1]; + const int region_all = m[3]; + const int offset = m[4]; + const int fi0 = m[5], fj0 = m[6], fk0 = m[7]; + if (state_index >= state_count) return; + const double c1 = 3.0 / 256.0; + const double c2 = -25.0 / 256.0; + const double c3 = 75.0 / 128.0; + const int offs[6] = {-2, -1, 0, 1, 2, 3}; + const double w[6] = {c1, c2, c3, c3, c2, c1}; + + for (int local = blockIdx.x * blockDim.x + threadIdx.x; + local < region_all; + local += blockDim.x * gridDim.x) + { + const int ii = local % sx; + const int jj = (local / sx) % sy; + const int kk = local / (sx * sy); + const int fc_i = fi0 + 2 * ii; + const int fc_j = fj0 + 2 * jj; + const int fc_k = fk0 + 2 * kk; + double sum = 0.0; + for (int oz = 0; oz < 6; ++oz) + { + const int z = fc_k + offs[oz]; + const double wz = w[oz]; + for (int oy = 0; oy < 6; ++oy) + { + const int y = fc_j + offs[oy]; + const double wyz = wz * w[oy]; + for (int ox = 0; ox < 6; ++ox) + { + const int x = fc_i + offs[ox]; + sum += wyz * w[ox] * + load_comm_state_cell_sym(src_mem, state_index, x, y, z, nx, ny, all); + } + } + } + dst[(size_t)offset + (size_t)state_index * region_all + local] = sum; + } +} + __global__ void kern_prolong_state_region_batch(const double * __restrict__ src_mem, double * __restrict__ dst, int nx, int ny, @@ -5697,8 +5901,8 @@ __global__ void kern_prolong_state_region_batch(const double * __restrict__ src_ for (int ox = 0; ox < 6; ++ox) { const int x = ci + offs[ox]; - const int src = x + y * nx + z * nx * ny; - sum += wyz * wx[ox] * src_mem[(size_t)state_index * all + src]; + sum += wyz * wx[ox] * + load_comm_state_cell_sym(src_mem, state_index, x, y, z, nx, ny, all); } } } @@ -5706,6 +5910,69 @@ __global__ void kern_prolong_state_region_batch(const double * __restrict__ src_ } } +__global__ void kern_prolong_state_segments_batch(const double * __restrict__ src_mem, + double * __restrict__ dst, + int nx, int ny, + const int * __restrict__ meta, + int state_count, + int all) +{ + const int segment = blockIdx.z; + const int state_index = blockIdx.y; + const int *m = meta + segment * 11; + const int sx = m[0], sy = m[1]; + const int region_all = m[3]; + const int offset = m[4]; + const int ii0 = m[5], jj0 = m[6], kk0 = m[7]; + const int lbc_i = m[8], lbc_j = m[9], lbc_k = m[10]; + if (state_index >= state_count) return; + const double c1 = 77.0 / 8192.0; + const double c2 = -693.0 / 8192.0; + const double c3 = 3465.0 / 4096.0; + const double c4 = 1155.0 / 4096.0; + const double c5 = -495.0 / 8192.0; + const double c6 = 63.0 / 8192.0; + const int offs[6] = {-2, -1, 0, 1, 2, 3}; + const double wl[6] = {c1, c2, c3, c4, c5, c6}; + const double wr[6] = {c6, c5, c4, c3, c2, c1}; + + for (int local = blockIdx.x * blockDim.x + threadIdx.x; + local < region_all; + local += blockDim.x * gridDim.x) + { + const int ii = local % sx; + const int jj = (local / sx) % sy; + const int kk = local / (sx * sy); + const int fine_i = ii0 + ii; + const int fine_j = jj0 + jj; + const int fine_k = kk0 + kk; + const int ci = fine_i / 2 - lbc_i; + const int cj = fine_j / 2 - lbc_j; + const int ck = fine_k / 2 - lbc_k; + const double *wx = ((fine_i / 2) * 2 == fine_i) ? wl : wr; + const double *wy = ((fine_j / 2) * 2 == fine_j) ? wl : wr; + const double *wz = ((fine_k / 2) * 2 == fine_k) ? wl : wr; + double sum = 0.0; + for (int oz = 0; oz < 6; ++oz) + { + const int z = ck + offs[oz]; + const double wzv = wz[oz]; + for (int oy = 0; oy < 6; ++oy) + { + const int y = cj + offs[oy]; + const double wyz = wzv * wy[oy]; + for (int ox = 0; ox < 6; ++ox) + { + const int x = ci + offs[ox]; + sum += wyz * wx[ox] * + load_comm_state_cell_sym(src_mem, state_index, x, y, z, nx, ny, all); + } + } + } + dst[(size_t)offset + (size_t)state_index * region_all + local] = sum; + } +} + __global__ void kern_pack_state_subset(const double * __restrict__ src_mem, double * __restrict__ dst, int subset_count, @@ -5777,6 +6044,8 @@ static void copy_state_region_cuda(void *block_tag, ctx.resident_age[bank] = ++ctx.resident_clock; mark_resident_current_bank(ctx, bank); update_state_ready(ctx); + } else { + ctx.resident_host_clean[bank][state_index] = 1; } } @@ -5821,6 +6090,8 @@ static void copy_state_region_packed_cuda(void *block_tag, ctx.resident_age[bank] = ++ctx.resident_clock; mark_resident_current_bank(ctx, bank); update_state_ready(ctx); + } else { + ctx.resident_host_clean[bank][state_index] = 1; } } @@ -5855,6 +6126,10 @@ static void copy_state_region_packed_batch_cuda(void *block_tag, CUDA_CHECK(cudaMemcpy(host_buffer, d_comm, total_doubles * sizeof(double), cudaMemcpyDeviceToHost)); + if (sx == ex[0] && sy == ex[1] && sz == ex[2] && + i0 == 0 && j0 == 0 && k0 == 0) { + mark_resident_host_subset_clean(ctx, bank, state_count, nullptr, true); + } } else { CUDA_CHECK(cudaMemcpy(d_comm, host_buffer, total_doubles * sizeof(double), @@ -5890,11 +6165,14 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos direct_download = env ? ((atoi(env) != 0) ? 1 : 0) : 1; } if (direct_download) { + if (resident_host_subset_clean(ctx, bank, BSSN_STATE_COUNT, nullptr)) + return; for (int i = 0; i < BSSN_STATE_COUNT; ++i) { CUDA_CHECK(cudaMemcpyAsync(state_host_out[i], ctx.d_resident[bank][i], bytes, cudaMemcpyDeviceToHost)); } CUDA_CHECK(cudaDeviceSynchronize()); + set_resident_host_clean(ctx, bank, true); if (profile) { CudaProfileStats &stats = cuda_profile_stats(); stats.resident_download_calls++; @@ -5903,12 +6181,15 @@ static void download_resident_state(void *block_tag, int *ex, double **state_hos } return; } + if (resident_host_subset_clean(ctx, bank, BSSN_STATE_COUNT, nullptr)) + return; CUDA_CHECK(cudaMemcpy(g_buf.h_stage, ctx.d_resident_mem[bank], (size_t)BSSN_STATE_COUNT * bytes, cudaMemcpyDeviceToHost)); for (int i = 0; i < BSSN_STATE_COUNT; ++i) { std::memcpy(state_host_out[i], g_buf.h_stage + (size_t)i * all, bytes); } + set_resident_host_clean(ctx, bank, true); if (profile) { CudaProfileStats &stats = cuda_profile_stats(); stats.resident_download_calls++; @@ -5939,6 +6220,9 @@ static void copy_state_subset(void *block_tag, for (int i = 0; i < subset_count; ++i) { const int state_index = state_indices[i]; if (state_index < 0 || state_index >= BSSN_STATE_COUNT) continue; + if (kind == cudaMemcpyDeviceToHost && + ctx.resident_host_clean[bank][state_index]) + continue; if (!state_host[i]) continue; active_state_indices[active_count] = state_index; active_state_host[active_count] = state_host[i]; @@ -5965,6 +6249,8 @@ static void copy_state_subset(void *block_tag, h_comm + (size_t)i * all, bytes); } + mark_resident_host_subset_clean(ctx, bank, active_count, + active_state_indices, true); } else { for (int i = 0; i < active_count; ++i) { std::memcpy(h_comm + (size_t)i * all, @@ -5980,6 +6266,8 @@ static void copy_state_subset(void *block_tag, ctx.resident_valid[bank] = true; ctx.resident_age[bank] = ++ctx.resident_clock; mark_resident_current_bank(ctx, bank); + mark_resident_host_subset_clean(ctx, bank, active_count, + active_state_indices, true); update_state_ready(ctx); } } @@ -6423,6 +6711,8 @@ int bssn_cuda_rk4_substep(void *block_tag, g_buf.slot[S_dyy], g_buf.slot[S_gyz], g_buf.slot[S_dzz], g_buf.slot[S_Axx], g_buf.slot[S_Axy], g_buf.slot[S_Axz], g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]); + if (use_resident_state && input_bank >= 0) + set_resident_host_clean(ctx, input_bank, false); } if (profile) { cuda_profile_sync(); @@ -6493,6 +6783,7 @@ int bssn_cuda_rk4_substep(void *block_tag, if (use_resident_state) { ctx.resident_valid[output_bank] = true; ctx.resident_age[output_bank] = ++ctx.resident_clock; + set_resident_host_clean(ctx, output_bank, false); mark_resident_current_bank(ctx, output_bank); update_state_ready(ctx); } else { @@ -6683,7 +6974,7 @@ static void copy_state_device_batch(void *block_tag, StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); const size_t all = (size_t)ex[0] * ex[1] * ex[2]; const int bank = active_or_keyed_bank(ctx, state_host_key, all, - pack_not_unpack == 0); + pack_not_unpack == 0 || state_host_key != nullptr); double *base_mem = ctx.d_resident_mem[bank]; const int region_all = sx * sy * sz; dim3 launch_grid((unsigned int)grid((size_t)region_all), @@ -6704,6 +6995,7 @@ static void copy_state_device_batch(void *block_tag, ctx.resident_valid[bank] = true; ctx.resident_age[bank] = ++ctx.resident_clock; mark_resident_current_bank(ctx, bank); + set_resident_host_clean(ctx, bank, false); update_state_ready(ctx); } } @@ -6731,7 +7023,7 @@ static void copy_state_device_segments(void *block_tag, StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); const size_t all = (size_t)ex[0] * ex[1] * ex[2]; const int bank = active_or_keyed_bank(ctx, state_host_key, all, - pack_not_unpack == 0); + pack_not_unpack == 0 || state_host_key != nullptr); double *base_mem = ctx.d_resident_mem[bank]; int *d_meta = ensure_comm_segment_meta_buffer((size_t)segment_count * 8); CUDA_CHECK(cudaMemcpy(d_meta, segment_meta, @@ -6758,6 +7050,84 @@ static void copy_state_device_segments(void *block_tag, } } +static void restrict_state_device_segments(void *block_tag, + int state_count, + double *device_buffer, + const int *ex, + int segment_count, + const int *segment_meta, + double **state_host_key = nullptr, + const double *state_soa = nullptr) +{ + if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return; + if (segment_count <= 0 || !segment_meta || !device_buffer) return; + + int max_region_all = 0; + for (int s = 0; s < segment_count; ++s) { + const int *m = segment_meta + s * 8; + if (m[0] <= 0 || m[1] <= 0 || m[2] <= 0 || m[3] <= 0) return; + if (m[3] > max_region_all) max_region_all = m[3]; + } + if (max_region_all <= 0) return; + + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + const size_t all = (size_t)ex[0] * ex[1] * ex[2]; + const int bank = active_or_keyed_bank(ctx, state_host_key, all, + state_host_key != nullptr); + int *d_meta = ensure_comm_segment_meta_buffer((size_t)segment_count * 8); + CUDA_CHECK(cudaMemcpy(d_meta, segment_meta, + (size_t)segment_count * 8 * sizeof(int), + cudaMemcpyHostToDevice)); + upload_comm_state_soa(state_soa, state_count); + + dim3 launch_grid((unsigned int)grid((size_t)max_region_all), + (unsigned int)state_count, + (unsigned int)segment_count); + kern_restrict_state_segments_batch<<>>( + ctx.d_resident_mem[bank], device_buffer, + ex[0], ex[1], d_meta, state_count, + ex[0] * ex[1] * ex[2]); +} + +static void prolong_state_device_segments(void *block_tag, + int state_count, + double *device_buffer, + const int *ex, + int segment_count, + const int *segment_meta, + double **state_host_key = nullptr, + const double *state_soa = nullptr) +{ + if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return; + if (segment_count <= 0 || !segment_meta || !device_buffer) return; + + int max_region_all = 0; + for (int s = 0; s < segment_count; ++s) { + const int *m = segment_meta + s * 11; + if (m[0] <= 0 || m[1] <= 0 || m[2] <= 0 || m[3] <= 0) return; + if (m[3] > max_region_all) max_region_all = m[3]; + } + if (max_region_all <= 0) return; + + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + const size_t all = (size_t)ex[0] * ex[1] * ex[2]; + const int bank = active_or_keyed_bank(ctx, state_host_key, all, + state_host_key != nullptr); + int *d_meta = ensure_comm_segment_meta_buffer((size_t)segment_count * 11); + CUDA_CHECK(cudaMemcpy(d_meta, segment_meta, + (size_t)segment_count * 11 * sizeof(int), + cudaMemcpyHostToDevice)); + upload_comm_state_soa(state_soa, state_count); + + dim3 launch_grid((unsigned int)grid((size_t)max_region_all), + (unsigned int)state_count, + (unsigned int)segment_count); + kern_prolong_state_segments_batch<<>>( + ctx.d_resident_mem[bank], device_buffer, + ex[0], ex[1], d_meta, state_count, + ex[0] * ex[1] * ex[2]); +} + extern "C" int bssn_cuda_pack_state_batch_to_device_buffer(void *block_tag, int state_count, @@ -6882,6 +7252,70 @@ int bssn_cuda_unpack_state_segments_from_device_buffer_for_host_views(void *bloc return 0; } +extern "C" +int bssn_cuda_restrict_state_segments_to_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + restrict_state_device_segments(block_tag, state_count, device_buffer, ex, + segment_count, segment_meta); + return 0; +} + +extern "C" +int bssn_cuda_restrict_state_segments_to_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta, + const double *state_soa) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + restrict_state_device_segments(block_tag, state_count, device_buffer, ex, + segment_count, segment_meta, state_host_key, state_soa); + return 0; +} + +extern "C" +int bssn_cuda_prolong_state_segments_to_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + prolong_state_device_segments(block_tag, state_count, device_buffer, ex, + segment_count, segment_meta); + return 0; +} + +extern "C" +int bssn_cuda_prolong_state_segments_to_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta, + const double *state_soa) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + prolong_state_device_segments(block_tag, state_count, device_buffer, ex, + segment_count, segment_meta, state_host_key, state_soa); + return 0; +} + extern "C" int bssn_cuda_restrict_state_batch_to_device_buffer(void *block_tag, int state_count, @@ -6896,6 +7330,7 @@ int bssn_cuda_restrict_state_batch_to_device_buffer(void *block_tag, if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1; StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); const int region_all = sx * sy * sz; + upload_comm_state_soa(nullptr, state_count); dim3 launch_grid((unsigned int)grid((size_t)region_all), (unsigned int)state_count); kern_restrict_state_region_batch<<>>( @@ -6913,7 +7348,8 @@ int bssn_cuda_restrict_state_batch_to_device_buffer_for_host_views(void *block_t double *device_buffer, int *ex, int sx, int sy, int sz, - int fi0, int fj0, int fk0) + int fi0, int fj0, int fk0, + const double *state_soa) { init_gpu_dispatch(); CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); @@ -6923,6 +7359,7 @@ int bssn_cuda_restrict_state_batch_to_device_buffer_for_host_views(void *block_t const size_t all = (size_t)ex[0] * ex[1] * ex[2]; const int bank = active_or_keyed_bank(ctx, state_host_key, all, true); const int region_all = sx * sy * sz; + upload_comm_state_soa(state_soa, state_count); dim3 launch_grid((unsigned int)grid((size_t)region_all), (unsigned int)state_count); kern_restrict_state_region_batch<<>>( @@ -6948,6 +7385,7 @@ int bssn_cuda_prolong_state_batch_to_device_buffer(void *block_tag, if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1; StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); const int region_all = sx * sy * sz; + upload_comm_state_soa(nullptr, state_count); dim3 launch_grid((unsigned int)grid((size_t)region_all), (unsigned int)state_count); kern_prolong_state_region_batch<<>>( @@ -6967,7 +7405,8 @@ int bssn_cuda_prolong_state_batch_to_device_buffer_for_host_views(void *block_ta int *ex, int sx, int sy, int sz, int ii0, int jj0, int kk0, - int lbc_i, int lbc_j, int lbc_k) + int lbc_i, int lbc_j, int lbc_k, + const double *state_soa) { init_gpu_dispatch(); CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); @@ -6977,6 +7416,7 @@ int bssn_cuda_prolong_state_batch_to_device_buffer_for_host_views(void *block_ta const size_t all = (size_t)ex[0] * ex[1] * ex[2]; const int bank = active_or_keyed_bank(ctx, state_host_key, all, true); const int region_all = sx * sy * sz; + upload_comm_state_soa(state_soa, state_count); dim3 launch_grid((unsigned int)grid((size_t)region_all), (unsigned int)state_count); kern_prolong_state_region_batch<<>>( @@ -7028,6 +7468,8 @@ int bssn_cuda_prepare_inter_time_level(void *block_tag, { init_gpu_dispatch(); CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + const bool profile = cuda_aux_profile_enabled(); + const double t0 = profile ? cuda_profile_now_ms() : 0.0; if (source_count != 2 && source_count != 3) return 1; if (!resident_key_usable(src1_host_key) || !resident_key_usable(src2_host_key) || @@ -7074,10 +7516,19 @@ int bssn_cuda_prepare_inter_time_level(void *block_tag, (source_count == 3) ? ctx.d_resident_mem[src3_bank] : nullptr, ctx.d_resident_mem[dst_bank], c1, c2, c3, BSSN_STATE_COUNT, (int)all); + if (profile) + cuda_profile_sync(); ctx.resident_valid[dst_bank] = true; ctx.resident_age[dst_bank] = ++ctx.resident_clock; + set_resident_host_clean(ctx, dst_bank, false); mark_resident_current_bank(ctx, dst_bank); update_state_ready(ctx); + if (profile) { + CudaAuxProfileStats &stats = cuda_aux_profile_stats(); + stats.prepare_calls++; + stats.prepare_ms += cuda_profile_now_ms() - t0; + cuda_aux_profile_maybe_log(); + } return 0; } diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.h b/AMSS_NCKU_source/bssn_rhs_cuda.h index fda21d6..3513dd7 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.h +++ b/AMSS_NCKU_source/bssn_rhs_cuda.h @@ -180,6 +180,38 @@ int bssn_cuda_unpack_state_segments_from_device_buffer_for_host_views(void *bloc int segment_count, const int *segment_meta); +int bssn_cuda_restrict_state_segments_to_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta); + +int bssn_cuda_restrict_state_segments_to_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta, + const double *state_soa); + +int bssn_cuda_prolong_state_segments_to_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta); + +int bssn_cuda_prolong_state_segments_to_device_buffer_for_host_views(void *block_tag, + double **state_host_key, + int state_count, + double *device_buffer, + int *ex, + int segment_count, + const int *segment_meta, + const double *state_soa); + int bssn_cuda_restrict_state_batch_to_device_buffer(void *block_tag, int state_count, double *device_buffer, @@ -193,7 +225,8 @@ int bssn_cuda_restrict_state_batch_to_device_buffer_for_host_views(void *block_t double *device_buffer, int *ex, int sx, int sy, int sz, - int fi0, int fj0, int fk0); + int fi0, int fj0, int fk0, + const double *state_soa); int bssn_cuda_prolong_state_batch_to_device_buffer(void *block_tag, int state_count, @@ -210,7 +243,8 @@ int bssn_cuda_prolong_state_batch_to_device_buffer_for_host_views(void *block_ta int *ex, int sx, int sy, int sz, int ii0, int jj0, int kk0, - int lbc_i, int lbc_j, int lbc_k); + int lbc_i, int lbc_j, int lbc_k, + const double *state_soa); int bssn_cuda_download_state_subset(void *block_tag, int *ex,