diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C index 898f7aa..79a89fe 100644 --- a/AMSS_NCKU_source/Parallel.C +++ b/AMSS_NCKU_source/Parallel.C @@ -173,6 +173,96 @@ int cuda_state_var_count(MyList *src_vars, MyList *dst_vars) } #if USE_CUDA_BSSN || USE_CUDA_Z4C +int fortran_idint(double x) +{ + return (int)x; +} + +bool cuda_cell_gw3_restrict_params(const Parallel::gridseg *src, + const Parallel::gridseg *dst, + int first_fine[3]) +{ +#if USE_CUDA_BSSN && defined(Cell) && (ghost_width == 3) + if (!src || !dst || !src->Bg || !dst->Bg) + return false; + for (int d = 0; d < dim; ++d) + { + const double llbc = dst->llb[d]; + const double uubc = dst->uub[d]; + const double llbf = src->Bg->bbox[d]; + const double uubf = src->Bg->bbox[d + dim]; + const double CD = (uubc - llbc) / (double)dst->shape[d]; + const double FD = (uubf - llbf) / (double)src->Bg->shape[d]; + if (fabs(CD - 2.0 * FD) > 1.0e-10) + return false; + double base; + if (llbc <= llbf) + base = llbc; + else + { + const int j = fortran_idint((llbc - llbf) / FD + 0.4); + base = ((j / 2) * 2 == j) ? llbf : (llbf - CD / 2.0); + } + const int lbc = fortran_idint((llbc - base) / CD + 0.4) + 1; + const int lbf = fortran_idint((llbf - base) / FD + 0.4) + 1; + first_fine[d] = 2 * lbc - lbf - 1; + if (first_fine[d] - 2 < 0) + return false; + if (first_fine[d] + 2 * (dst->shape[d] - 1) + 3 >= src->Bg->shape[d]) + return false; + } + return true; +#else + (void)src; (void)dst; (void)first_fine; + return false; +#endif +} + +bool cuda_cell_gw3_prolong_params(const Parallel::gridseg *src, + const Parallel::gridseg *dst, + int first_fine_ii[3], + int coarse_lb[3]) +{ +#if USE_CUDA_BSSN && defined(Cell) && (ghost_width == 3) + if (!src || !dst || !src->Bg || !dst->Bg) + return false; + for (int d = 0; d < dim; ++d) + { + const double llbc = src->Bg->bbox[d]; + const double uubc = src->Bg->bbox[d + dim]; + const double llbf = dst->llb[d]; + const double uubf = dst->uub[d]; + const double CD = (uubc - llbc) / (double)src->Bg->shape[d]; + const double FD = (uubf - llbf) / (double)dst->shape[d]; + if (fabs(CD - 2.0 * FD) > 1.0e-10) + return false; + double base; + if (llbc <= llbf) + base = llbc; + else + { + const int j = fortran_idint((llbc - llbf) / FD + 0.4); + base = ((j / 2) * 2 == j) ? llbf : (llbf - CD / 2.0); + } + const int lbf = fortran_idint((llbf - base) / FD + 0.4) + 1; + const int lbc = fortran_idint((llbc - base) / CD + 0.4) + 1; + first_fine_ii[d] = lbf; + coarse_lb[d] = lbc; + const int first_coarse = first_fine_ii[d] / 2 - coarse_lb[d]; + const int last_fine_ii = first_fine_ii[d] + dst->shape[d] - 1; + const int last_coarse = last_fine_ii / 2 - coarse_lb[d]; + if (first_coarse - 2 < 0) + return false; + if (last_coarse + 3 >= src->Bg->shape[d]) + return false; + } + return true; +#else + (void)src; (void)dst; (void)first_fine_ii; (void)coarse_lb; + return false; +#endif +} + bool cuda_state_count_direct_supported(int state_count) { #if USE_CUDA_Z4C && (ABEtype == 2) @@ -187,22 +277,36 @@ bool cuda_state_count_direct_supported(int state_count) bool cuda_can_direct_pack(const Parallel::gridseg *src, const Parallel::gridseg *dst, int type) { - if (type != 1 || !src || !dst || !src->Bg) + if (!src || !dst || !src->Bg) return false; #if USE_CUDA_Z4C && (ABEtype == 2) + if (type != 1) + return false; return z4c_cuda_has_resident_state(src->Bg) != 0; #elif USE_CUDA_BSSN - return bssn_cuda_has_resident_state(src->Bg) != 0; + if (bssn_cuda_has_resident_state(src->Bg) == 0) + return false; + if (type == 1) + return true; + int a[3], b[3]; + if (type == 2) + return cuda_cell_gw3_restrict_params(src, dst, a); + if (type == 3) + return cuda_cell_gw3_prolong_params(src, dst, a, b); + return false; #else + (void)type; return false; #endif } bool cuda_can_direct_unpack(const Parallel::gridseg *dst, int type) { - if (type != 1 || !dst || !dst->Bg) + if (type < 1 || type > 3 || !dst || !dst->Bg) return false; #if USE_CUDA_Z4C && (ABEtype == 2) + if (type != 1) + return false; return z4c_cuda_has_resident_state(dst->Bg) != 0; #elif USE_CUDA_BSSN return bssn_cuda_has_resident_state(dst->Bg) != 0; @@ -336,24 +440,50 @@ void ensure_device_comm_buffer(double **buffers, int *caps, int idx, int length) bool cuda_direct_pack_segment_to_device(double *buffer, const Parallel::gridseg *src, const Parallel::gridseg *dst, - int state_count) + int state_count, + int type) { #if USE_CUDA_BSSN if (state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT) return false; const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; - const int i0 = cuda_seg_begin(dst, src->Bg, 0); - const int j0 = cuda_seg_begin(dst, src->Bg, 1); - const int k0 = cuda_seg_begin(dst, src->Bg, 2); - const bool ok = bssn_cuda_pack_state_batch_to_device_buffer( - src->Bg, state_count, buffer, src->Bg->shape, - i0, j0, k0, - dst->shape[0], dst->shape[1], dst->shape[2]) == 0; + bool ok = false; + if (type == 1) + { + const int i0 = cuda_seg_begin(dst, src->Bg, 0); + const int j0 = cuda_seg_begin(dst, src->Bg, 1); + const int k0 = cuda_seg_begin(dst, src->Bg, 2); + ok = bssn_cuda_pack_state_batch_to_device_buffer( + src->Bg, state_count, buffer, src->Bg->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0; + } + else if (type == 2) + { + int first_fine[3]; + if (!cuda_cell_gw3_restrict_params(src, dst, first_fine)) + return false; + ok = bssn_cuda_restrict_state_batch_to_device_buffer( + src->Bg, state_count, buffer, src->Bg->shape, + dst->shape[0], dst->shape[1], dst->shape[2], + first_fine[0], first_fine[1], first_fine[2]) == 0; + } + else if (type == 3) + { + int first_fine_ii[3], coarse_lb[3]; + if (!cuda_cell_gw3_prolong_params(src, dst, first_fine_ii, coarse_lb)) + return false; + ok = bssn_cuda_prolong_state_batch_to_device_buffer( + src->Bg, state_count, buffer, src->Bg->shape, + dst->shape[0], dst->shape[1], dst->shape[2], + first_fine_ii[0], first_fine_ii[1], first_fine_ii[2], + coarse_lb[0], coarse_lb[1], coarse_lb[2]) == 0; + } if (sync_profile_enabled()) sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0; return ok; #else - (void)buffer; (void)src; (void)dst; (void)state_count; + (void)buffer; (void)src; (void)dst; (void)state_count; (void)type; return false; #endif } @@ -382,6 +512,56 @@ bool cuda_direct_unpack_segment_from_device(double *buffer, #endif } +bool cuda_download_resident_subset_to_host(Block *block, + MyList *vars, + int state_count) +{ +#if USE_CUDA_BSSN + if (!block || state_count <= 0 || state_count > BSSN_CUDA_STATE_COUNT) + return false; + if (bssn_cuda_has_resident_state(block) == 0) + return true; + int indices[BSSN_CUDA_STATE_COUNT]; + double *views[BSSN_CUDA_STATE_COUNT]; + MyList *v = vars; + for (int i = 0; i < state_count; ++i) + { + if (!v) + return false; + indices[i] = i; + views[i] = block->fgfs[v->data->sgfn]; + v = v->next; + } + return bssn_cuda_download_state_subset(block, block->shape, state_count, indices, views) == 0; +#else + (void)block; (void)vars; (void)state_count; + return false; +#endif +} + +bool cuda_unpack_host_region_to_resident(Block *block, + int state_index, + double *buffer, + const Parallel::gridseg *dst) +{ +#if USE_CUDA_BSSN + if (!block || !dst || state_index < 0 || state_index >= BSSN_CUDA_STATE_COUNT) + return false; + if (bssn_cuda_has_resident_state(block) == 0) + return true; + const int i0 = cuda_seg_begin(dst, block, 0); + const int j0 = cuda_seg_begin(dst, block, 1); + const int k0 = cuda_seg_begin(dst, block, 2); + return bssn_cuda_unpack_state_region_from_host_buffer( + block, state_index, buffer, block->shape, + i0, j0, k0, + dst->shape[0], dst->shape[1], dst->shape[2]) == 0; +#else + (void)block; (void)state_index; (void)buffer; (void)dst; + return false; +#endif +} + bool cuda_device_state_count_supported(int state_count) { #if USE_CUDA_BSSN @@ -499,11 +679,12 @@ int cuda_data_packer_device_batched(double *data, } #endif -bool cuda_segments_same_level(MyList *src, - MyList *dst, - int rank_in, - int dir, - int myrank) +bool cuda_segments_device_eligible(MyList *src, + MyList *dst, + int rank_in, + int dir, + int myrank, + int state_count) { bool has_work = false; while (src && dst) @@ -512,13 +693,30 @@ bool cuda_segments_same_level(MyList *src, (dir == UNPACK && src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank)) { has_work = true; - if (!src->data || !dst->data || !src->data->Bg || !dst->data->Bg || - src->data->Bg->lev != dst->data->Bg->lev) + if (!src->data || !dst->data || !src->data->Bg || !dst->data->Bg) return false; + int type; + if (src->data->Bg->lev == dst->data->Bg->lev) + type = 1; + else if (src->data->Bg->lev > dst->data->Bg->lev) + type = 2; + else + type = 3; + if (dir == PACK) + { + if (!cuda_can_direct_pack(src->data, dst->data, type)) + return false; + } + else + { + if (!cuda_can_direct_unpack(dst->data, type)) + return false; + } } src = src->next; dst = dst->next; } + (void)state_count; return has_work; } @@ -530,16 +728,8 @@ bool cuda_pack_to_device_eligible(MyList *src, { if (!cuda_aware_mpi_enabled() || !cuda_device_state_count_supported(state_count)) return false; - if (!cuda_segments_same_level(src, dst, rank_in, PACK, myrank)) + if (!cuda_segments_device_eligible(src, dst, rank_in, PACK, myrank, state_count)) return false; - while (src && dst) - { - if (dst->data->Bg->rank == rank_in && src->data->Bg->rank == myrank && - !cuda_can_direct_pack(src->data, dst->data, 1)) - return false; - src = src->next; - dst = dst->next; - } return true; } @@ -551,16 +741,8 @@ bool cuda_recv_to_device_eligible(MyList *src, { if (!cuda_aware_mpi_enabled() || !cuda_device_state_count_supported(state_count)) return false; - if (!cuda_segments_same_level(src, dst, rank_in, UNPACK, myrank)) + if (!cuda_segments_device_eligible(src, dst, rank_in, UNPACK, myrank, state_count)) return false; - while (src && dst) - { - if (src->data->Bg->rank == rank_in && dst->data->Bg->rank == myrank && - !cuda_can_direct_unpack(dst->data, 1)) - return false; - src = src->next; - dst = dst->next; - } return true; } @@ -4355,11 +4537,12 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data, dst->data, type)) { if (s_cuda_aware_pack_active) { - handled_by_cuda = cuda_direct_pack_segment_to_device(data + size_out, src->data, dst->data, state_count); + handled_by_cuda = cuda_direct_pack_segment_to_device(data + size_out, src->data, dst->data, state_count, type); } else { handled_by_cuda = cuda_direct_pack_segment(data + size_out, src->data, dst->data, state_count); } @@ -4369,7 +4552,8 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data, type)) { if (s_cuda_aware_pack_active) { @@ -4393,6 +4577,17 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data && src->data->Bg && bssn_cuda_has_resident_state(src->data->Bg)) + { + if (!cuda_download_resident_subset_to_host(src->data->Bg, VarLists, state_count)) + { + cout << "Parallel::data_packer: CUDA resident fallback download failed." << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } #endif if (dir == PACK) switch (type) @@ -4414,9 +4609,22 @@ int Parallel::data_packer(double *data, MyList *src, MyList

data->llb, dst->data->uub, varls->data->SoA, Symmetry); } if (dir == UNPACK) // from target data to corresponding grid + { f_copy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn], dst->data->llb, dst->data->uub, dst->data->shape, data + size_out, dst->data->llb, dst->data->uub); +#if USE_CUDA_BSSN + if (cuda_state_count_direct_supported(state_count) && + dst->data && dst->data->Bg && bssn_cuda_has_resident_state(dst->data->Bg)) + { + if (!cuda_unpack_host_region_to_resident(dst->data->Bg, state_idx, data + size_out, dst->data)) + { + cout << "Parallel::data_packer: CUDA resident fallback upload failed." << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + } +#endif + } #if USE_CUDA_BSSN || USE_CUDA_Z4C } else diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index 878aa46..cdd56e3 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -271,6 +271,24 @@ bool bssn_cuda_use_resident_sync(int lev) #endif } +bool bssn_cuda_keep_resident_after_step(int lev, int trfls_in, int analysis_lev) +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + if (!enabled) + return false; + // Levels at and above trfls can be read by CPU time interpolation through + // State/Old/corrector lists. Keep those conservative until multi-time-level + // resident storage is implemented. + if (lev == analysis_lev) + return false; + return lev < trfls_in; +} + bool bssn_constraint_recompute_from_state(int lev, bool level0_cache_valid) { #if USE_CUDA_BSSN @@ -4131,6 +4149,12 @@ void bssn_class::Step(int lev, int YN) // Warning NOTE: the variables1 are used as temp storege room if (lev == a_lev) { +#if USE_CUDA_BSSN + const bool need_analysis_state_before_predictor = + (LastAnas + dT_lev >= AnasTime); + if (use_cuda_resident_sync && need_analysis_state_before_predictor) + bssn_cuda_download_level_state(GH->PatL[lev], StateList, myrank, false); +#endif AnalysisStuff(lev, dT_lev); } #endif @@ -4623,7 +4647,10 @@ void bssn_class::Step(int lev, int YN) } #if USE_CUDA_BSSN if (use_cuda_resident_sync) - bssn_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, true); + { + if (!bssn_cuda_keep_resident_after_step(lev, trfls, a_lev)) + bssn_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, true); + } #endif #if (RPS == 0) // mesh refinement boundary part diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index f84be43..03b2b73 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -5269,6 +5269,113 @@ __global__ void kern_unpack_state_segments_batch(double * __restrict__ dst_mem, } } +__global__ void kern_restrict_state_region_batch(const double * __restrict__ src_mem, + double * __restrict__ dst, + int nx, int ny, + int sx, int sy, int sz, + int fi0, int fj0, int fk0, + int region_all, + int state_count, + int all) +{ + const int state_index = blockIdx.y; + if (state_index >= state_count) return; + const double c1 = 3.0 / 256.0; + const double c2 = -25.0 / 256.0; + const double c3 = 75.0 / 128.0; + const int offs[6] = {-2, -1, 0, 1, 2, 3}; + const double w[6] = {c1, c2, c3, c3, c2, c1}; + + for (int local = blockIdx.x * blockDim.x + threadIdx.x; + local < region_all; + local += blockDim.x * gridDim.x) + { + const int ii = local % sx; + const int jj = (local / sx) % sy; + const int kk = local / (sx * sy); + const int fc_i = fi0 + 2 * ii; + const int fc_j = fj0 + 2 * jj; + const int fc_k = fk0 + 2 * kk; + double sum = 0.0; + for (int oz = 0; oz < 6; ++oz) + { + const int z = fc_k + offs[oz]; + const double wz = w[oz]; + for (int oy = 0; oy < 6; ++oy) + { + const int y = fc_j + offs[oy]; + const double wyz = wz * w[oy]; + for (int ox = 0; ox < 6; ++ox) + { + const int x = fc_i + offs[ox]; + const int src = x + y * nx + z * nx * ny; + sum += wyz * w[ox] * src_mem[(size_t)state_index * all + src]; + } + } + } + dst[(size_t)state_index * region_all + local] = sum; + } +} + +__global__ void kern_prolong_state_region_batch(const double * __restrict__ src_mem, + double * __restrict__ dst, + int nx, int ny, + int sx, int sy, int sz, + int ii0, int jj0, int kk0, + int lbc_i, int lbc_j, int lbc_k, + int region_all, + int state_count, + int all) +{ + const int state_index = blockIdx.y; + if (state_index >= state_count) return; + const double c1 = 77.0 / 8192.0; + const double c2 = -693.0 / 8192.0; + const double c3 = 3465.0 / 4096.0; + const double c4 = 1155.0 / 4096.0; + const double c5 = -495.0 / 8192.0; + const double c6 = 63.0 / 8192.0; + const int offs[6] = {-2, -1, 0, 1, 2, 3}; + const double wl[6] = {c1, c2, c3, c4, c5, c6}; + const double wr[6] = {c6, c5, c4, c3, c2, c1}; + + for (int local = blockIdx.x * blockDim.x + threadIdx.x; + local < region_all; + local += blockDim.x * gridDim.x) + { + const int ii = local % sx; + const int jj = (local / sx) % sy; + const int kk = local / (sx * sy); + const int fine_i = ii0 + ii; + const int fine_j = jj0 + jj; + const int fine_k = kk0 + kk; + const int ci = fine_i / 2 - lbc_i; + const int cj = fine_j / 2 - lbc_j; + const int ck = fine_k / 2 - lbc_k; + const double *wx = ((fine_i / 2) * 2 == fine_i) ? wl : wr; + const double *wy = ((fine_j / 2) * 2 == fine_j) ? wl : wr; + const double *wz = ((fine_k / 2) * 2 == fine_k) ? wl : wr; + double sum = 0.0; + for (int oz = 0; oz < 6; ++oz) + { + const int z = ck + offs[oz]; + const double wzv = wz[oz]; + for (int oy = 0; oy < 6; ++oy) + { + const int y = cj + offs[oy]; + const double wyz = wzv * wy[oy]; + for (int ox = 0; ox < 6; ++ox) + { + const int x = ci + offs[ox]; + const int src = x + y * nx + z * nx * ny; + sum += wyz * wx[ox] * src_mem[(size_t)state_index * all + src]; + } + } + } + dst[(size_t)state_index * region_all + local] = sum; + } +} + __global__ void kern_pack_state_subset(const double * __restrict__ src_mem, double * __restrict__ dst, int subset_count, @@ -5929,7 +6036,7 @@ int bssn_cuda_rk4_substep(void *block_tag, bind_state_output_slots(ctx.d_state_next); } double t0 = profile ? cuda_profile_now_ms() : 0.0; - if (!use_resident_state || RK4 == 0 || !ctx.state_ready) { + if (!use_resident_state || !ctx.state_ready) { upload_state_inputs(state_host_in, all); } if (apply_enforce_ga) { @@ -6274,6 +6381,56 @@ int bssn_cuda_unpack_state_segments_from_device_buffer(void *block_tag, return 0; } +extern "C" +int bssn_cuda_restrict_state_batch_to_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int sx, int sy, int sz, + int fi0, int fj0, int fk0) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1; + if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1; + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + const int region_all = sx * sy * sz; + dim3 launch_grid((unsigned int)grid((size_t)region_all), + (unsigned int)state_count); + kern_restrict_state_region_batch<<>>( + ctx.d_state_curr_mem, device_buffer, + ex[0], ex[1], sx, sy, sz, + fi0, fj0, fk0, region_all, state_count, + ex[0] * ex[1] * ex[2]); + return 0; +} + +extern "C" +int bssn_cuda_prolong_state_batch_to_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int sx, int sy, int sz, + int ii0, int jj0, int kk0, + int lbc_i, int lbc_j, int lbc_k) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + if (state_count <= 0 || state_count > BSSN_STATE_COUNT) return 1; + if (!device_buffer || sx <= 0 || sy <= 0 || sz <= 0) return 1; + StepContext &ctx = ensure_step_ctx(block_tag, (size_t)ex[0] * ex[1] * ex[2]); + const int region_all = sx * sy * sz; + dim3 launch_grid((unsigned int)grid((size_t)region_all), + (unsigned int)state_count); + kern_prolong_state_region_batch<<>>( + ctx.d_state_curr_mem, device_buffer, + ex[0], ex[1], sx, sy, sz, + ii0, jj0, kk0, lbc_i, lbc_j, lbc_k, + region_all, state_count, + ex[0] * ex[1] * ex[2]); + return 0; +} + extern "C" int bssn_cuda_download_state_subset(void *block_tag, int *ex, diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.h b/AMSS_NCKU_source/bssn_rhs_cuda.h index 37b6ab4..5df93a9 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.h +++ b/AMSS_NCKU_source/bssn_rhs_cuda.h @@ -132,6 +132,21 @@ int bssn_cuda_unpack_state_segments_from_device_buffer(void *block_tag, int segment_count, const int *segment_meta); +int bssn_cuda_restrict_state_batch_to_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int sx, int sy, int sz, + int fi0, int fj0, int fk0); + +int bssn_cuda_prolong_state_batch_to_device_buffer(void *block_tag, + int state_count, + double *device_buffer, + int *ex, + int sx, int sy, int sz, + int ii0, int jj0, int kk0, + int lbc_i, int lbc_j, int lbc_k); + int bssn_cuda_download_state_subset(void *block_tag, int *ex, int subset_count,