From d9c7ea8085d8923d858066b73447a2a73db868a3 Mon Sep 17 00:00:00 2001 From: abnerluo Date: Tue, 28 Apr 2026 08:23:34 +0800 Subject: [PATCH] Use cudaMemcpyAsync with dedicated transfer stream for H2D/D2H transfers Add cudaStream_t to GpuBuffers for async H2D/D2H transfers in BSSN and Z4C substep functions. Adds cudaStreamSynchronize(0) before D2H to enforce kernel/transfer ordering across streams, and a sync between state and matter H2D uploads to prevent h_stage race on RK4==0. Co-Authored-By: Claude Sonnet 4.6 --- AMSS_NCKU_source/bssn_rhs_cuda.cu | 42 +++++++++++++++++++------------ AMSS_NCKU_source/z4c_rhs_cuda.cu | 41 ++++++++++++++++++------------ 2 files changed, 51 insertions(+), 32 deletions(-) diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index ad31c7f..0c36a62 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -279,11 +279,12 @@ struct GpuBuffers { size_t cap_fh3_size; int prev_nx, prev_ny, prev_nz; bool initialized; + cudaStream_t stream; /* dedicated transfer stream */ }; static GpuBuffers g_buf = { nullptr, nullptr, nullptr, nullptr, false, {}, - 0, 0, 0, 0, 0, 0, false + 0, 0, 0, 0, 0, 0, false, nullptr }; /* Slot assignments — INPUT (H2D) */ @@ -565,6 +566,7 @@ static void ensure_gpu_buffers(int nx, int ny, int nz) { || (fh3_size > g_buf.cap_fh3_size); if (need_grow) { + if (g_buf.stream) { cudaStreamDestroy(g_buf.stream); g_buf.stream = nullptr; } if (g_buf.d_mem) { cudaFree(g_buf.d_mem); g_buf.d_mem = nullptr; } if (g_buf.d_fh2) { cudaFree(g_buf.d_fh2); g_buf.d_fh2 = nullptr; } if (g_buf.d_fh3) { cudaFree(g_buf.d_fh3); g_buf.d_fh3 = nullptr; } @@ -592,6 +594,9 @@ static void ensure_gpu_buffers(int nx, int ny, int nz) { } } + if (!g_buf.stream) + CUDA_CHECK(cudaStreamCreate(&g_buf.stream)); + g_buf.cap_all = all; g_buf.cap_fh2_size = fh2_size; g_buf.cap_fh3_size = fh3_size; @@ -4646,9 +4651,9 @@ static void upload_state_inputs(double **state_host, size_t all) for (int i = 0; i < BSSN_STATE_COUNT; ++i) { std::memcpy(g_buf.h_stage + (size_t)i * all, state_host[i], bytes); } - CUDA_CHECK(cudaMemcpy(g_buf.slot[S_chi], g_buf.h_stage, - (size_t)BSSN_STATE_COUNT * bytes, - cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[S_chi], g_buf.h_stage, + (size_t)BSSN_STATE_COUNT * bytes, + cudaMemcpyHostToDevice, g_buf.stream)); } static void upload_matter_cache(StepContext &ctx, @@ -4659,9 +4664,9 @@ static void upload_matter_cache(StepContext &ctx, for (int i = 0; i < BSSN_MATTER_COUNT; ++i) { std::memcpy(g_buf.h_stage + (size_t)i * all, matter_host[i], bytes); } - CUDA_CHECK(cudaMemcpy(ctx.d_matter_mem, g_buf.h_stage, - (size_t)BSSN_MATTER_COUNT * bytes, - cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpyAsync(ctx.d_matter_mem, g_buf.h_stage, + (size_t)BSSN_MATTER_COUNT * bytes, + cudaMemcpyHostToDevice, g_buf.stream)); ctx.matter_ready = true; } @@ -4989,9 +4994,11 @@ static void launch_rhs_pipeline(int all, double eps, int co) static void download_state_outputs(double **state_host_out, size_t all) { const size_t bytes = all * sizeof(double); - CUDA_CHECK(cudaMemcpy(g_buf.h_stage, g_buf.slot[S_chi_rhs], - (size_t)BSSN_STATE_COUNT * bytes, - cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaStreamSynchronize(0)); + CUDA_CHECK(cudaMemcpyAsync(g_buf.h_stage, g_buf.slot[S_chi_rhs], + (size_t)BSSN_STATE_COUNT * bytes, + cudaMemcpyDeviceToHost, g_buf.stream)); + CUDA_CHECK(cudaStreamSynchronize(g_buf.stream)); for (int i = 0; i < BSSN_STATE_COUNT; ++i) { std::memcpy(state_host_out[i], g_buf.h_stage + (size_t)i * all, bytes); } @@ -5000,9 +5007,11 @@ static void download_state_outputs(double **state_host_out, size_t all) static void download_constraint_outputs(double **constraint_host_out, size_t all) { const size_t bytes = all * sizeof(double); - CUDA_CHECK(cudaMemcpy(g_buf.h_stage, g_buf.slot[S_ham_Res], - (size_t)D2H_CONSTRAINT_SLOT_COUNT * bytes, - cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaStreamSynchronize(0)); + CUDA_CHECK(cudaMemcpyAsync(g_buf.h_stage, g_buf.slot[S_ham_Res], + (size_t)D2H_CONSTRAINT_SLOT_COUNT * bytes, + cudaMemcpyDeviceToHost, g_buf.stream)); + CUDA_CHECK(cudaStreamSynchronize(g_buf.stream)); for (int i = 0; i < D2H_CONSTRAINT_SLOT_COUNT; ++i) { std::memcpy(constraint_host_out[i], g_buf.h_stage + (size_t)i * all, bytes); } @@ -5708,11 +5717,12 @@ int bssn_cuda_rk4_substep(void *block_tag, if (use_zero_matter) { if (!ctx.matter_ready) zero_matter_cache(ctx, all); } else { + CUDA_CHECK(cudaStreamSynchronize(g_buf.stream)); upload_matter_cache(ctx, matter_host, all); } - CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi], - (size_t)BSSN_STATE_COUNT * bytes, - cudaMemcpyDeviceToDevice)); + CUDA_CHECK(cudaMemcpyAsync(ctx.d_state0_mem, g_buf.slot[S_chi], + (size_t)BSSN_STATE_COUNT * bytes, + cudaMemcpyDeviceToDevice, g_buf.stream)); } else if (!ctx.matter_ready) { if (use_zero_matter) zero_matter_cache(ctx, all); else upload_matter_cache(ctx, matter_host, all); diff --git a/AMSS_NCKU_source/z4c_rhs_cuda.cu b/AMSS_NCKU_source/z4c_rhs_cuda.cu index 79a11c1..2e7ddeb 100644 --- a/AMSS_NCKU_source/z4c_rhs_cuda.cu +++ b/AMSS_NCKU_source/z4c_rhs_cuda.cu @@ -292,11 +292,12 @@ struct GpuBuffers { size_t cap_fh3_size; int prev_nx, prev_ny, prev_nz; bool initialized; + cudaStream_t stream; /* dedicated transfer stream */ }; static GpuBuffers g_buf = { nullptr, nullptr, nullptr, nullptr, false, {}, - 0, 0, 0, 0, 0, 0, false + 0, 0, 0, 0, 0, 0, false, nullptr }; /* Slot assignments — INPUT (H2D) */ @@ -595,6 +596,7 @@ static void ensure_gpu_buffers(int nx, int ny, int nz) { || (fh3_size > g_buf.cap_fh3_size); if (need_grow) { + if (g_buf.stream) { cudaStreamDestroy(g_buf.stream); g_buf.stream = nullptr; } if (g_buf.d_mem) { cudaFree(g_buf.d_mem); g_buf.d_mem = nullptr; } if (g_buf.d_fh2) { cudaFree(g_buf.d_fh2); g_buf.d_fh2 = nullptr; } if (g_buf.d_fh3) { cudaFree(g_buf.d_fh3); g_buf.d_fh3 = nullptr; } @@ -622,6 +624,9 @@ static void ensure_gpu_buffers(int nx, int ny, int nz) { } } + if (!g_buf.stream) + CUDA_CHECK(cudaStreamCreate(&g_buf.stream)); + g_buf.cap_all = all; g_buf.cap_fh2_size = fh2_size; g_buf.cap_fh3_size = fh3_size; @@ -4679,9 +4684,9 @@ static void upload_state_inputs(double **state_host, size_t all) for (int i = 0; i < BSSN_STATE_COUNT; ++i) { std::memcpy(g_buf.h_stage + (size_t)i * all, state_host[i], bytes); } - CUDA_CHECK(cudaMemcpy(g_buf.slot[S_chi], g_buf.h_stage, - (size_t)BSSN_STATE_COUNT * bytes, - cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[S_chi], g_buf.h_stage, + (size_t)BSSN_STATE_COUNT * bytes, + cudaMemcpyHostToDevice, g_buf.stream)); } static void upload_matter_cache(StepContext &ctx, @@ -4692,9 +4697,9 @@ static void upload_matter_cache(StepContext &ctx, for (int i = 0; i < BSSN_MATTER_COUNT; ++i) { std::memcpy(g_buf.h_stage + (size_t)i * all, matter_host[i], bytes); } - CUDA_CHECK(cudaMemcpy(ctx.d_matter_mem, g_buf.h_stage, - (size_t)BSSN_MATTER_COUNT * bytes, - cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpyAsync(ctx.d_matter_mem, g_buf.h_stage, + (size_t)BSSN_MATTER_COUNT * bytes, + cudaMemcpyHostToDevice, g_buf.stream)); ctx.matter_ready = true; } @@ -5022,9 +5027,11 @@ static void launch_rhs_pipeline(int all, double eps, int co) static void download_state_outputs(double **state_host_out, size_t all) { const size_t bytes = all * sizeof(double); - CUDA_CHECK(cudaMemcpy(g_buf.h_stage, g_buf.slot[S_chi_rhs], - (size_t)BSSN_STATE_COUNT * bytes, - cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaStreamSynchronize(0)); + CUDA_CHECK(cudaMemcpyAsync(g_buf.h_stage, g_buf.slot[S_chi_rhs], + (size_t)BSSN_STATE_COUNT * bytes, + cudaMemcpyDeviceToHost, g_buf.stream)); + CUDA_CHECK(cudaStreamSynchronize(g_buf.stream)); for (int i = 0; i < BSSN_STATE_COUNT; ++i) { std::memcpy(state_host_out[i], g_buf.h_stage + (size_t)i * all, bytes); } @@ -5033,9 +5040,11 @@ static void download_state_outputs(double **state_host_out, size_t all) static void download_constraint_outputs(double **constraint_host_out, size_t all) { const size_t bytes = all * sizeof(double); - CUDA_CHECK(cudaMemcpy(g_buf.h_stage, g_buf.slot[S_ham_Res], - (size_t)D2H_CONSTRAINT_SLOT_COUNT * bytes, - cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaStreamSynchronize(0)); + CUDA_CHECK(cudaMemcpyAsync(g_buf.h_stage, g_buf.slot[S_ham_Res], + (size_t)D2H_CONSTRAINT_SLOT_COUNT * bytes, + cudaMemcpyDeviceToHost, g_buf.stream)); + CUDA_CHECK(cudaStreamSynchronize(g_buf.stream)); for (int i = 0; i < D2H_CONSTRAINT_SLOT_COUNT; ++i) { std::memcpy(constraint_host_out[i], g_buf.h_stage + (size_t)i * all, bytes); } @@ -7306,9 +7315,9 @@ extern "C" int z4c_cuda_rk4_substep(void *block_tag, g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]); } if (RK4 == 0) { - CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi], - (size_t)BSSN_STATE_COUNT * bytes, - cudaMemcpyDeviceToDevice)); + CUDA_CHECK(cudaMemcpyAsync(ctx.d_state0_mem, g_buf.slot[S_chi], + (size_t)BSSN_STATE_COUNT * bytes, + cudaMemcpyDeviceToDevice, g_buf.stream)); } if (profile) { cuda_profile_sync();