Use cudaMemcpyAsync with dedicated transfer stream for H2D/D2H transfers

Add cudaStream_t to GpuBuffers for async H2D/D2H transfers in BSSN and
Z4C substep functions. Adds cudaStreamSynchronize(0) before D2H to
enforce kernel/transfer ordering across streams, and a sync between
state and matter H2D uploads to prevent h_stage race on RK4==0.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
abnerluo
2026-04-28 08:23:34 +08:00
parent c689cc8dc9
commit d9c7ea8085
2 changed files with 51 additions and 32 deletions

View File

@@ -279,11 +279,12 @@ struct GpuBuffers {
size_t cap_fh3_size;
int prev_nx, prev_ny, prev_nz;
bool initialized;
cudaStream_t stream; /* dedicated transfer stream */
};
static GpuBuffers g_buf = {
nullptr, nullptr, nullptr, nullptr, false, {},
0, 0, 0, 0, 0, 0, false
0, 0, 0, 0, 0, 0, false, nullptr
};
/* Slot assignments — INPUT (H2D) */
@@ -565,6 +566,7 @@ static void ensure_gpu_buffers(int nx, int ny, int nz) {
|| (fh3_size > g_buf.cap_fh3_size);
if (need_grow) {
if (g_buf.stream) { cudaStreamDestroy(g_buf.stream); g_buf.stream = nullptr; }
if (g_buf.d_mem) { cudaFree(g_buf.d_mem); g_buf.d_mem = nullptr; }
if (g_buf.d_fh2) { cudaFree(g_buf.d_fh2); g_buf.d_fh2 = nullptr; }
if (g_buf.d_fh3) { cudaFree(g_buf.d_fh3); g_buf.d_fh3 = nullptr; }
@@ -592,6 +594,9 @@ static void ensure_gpu_buffers(int nx, int ny, int nz) {
}
}
if (!g_buf.stream)
CUDA_CHECK(cudaStreamCreate(&g_buf.stream));
g_buf.cap_all = all;
g_buf.cap_fh2_size = fh2_size;
g_buf.cap_fh3_size = fh3_size;
@@ -4646,9 +4651,9 @@ static void upload_state_inputs(double **state_host, size_t all)
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
std::memcpy(g_buf.h_stage + (size_t)i * all, state_host[i], bytes);
}
CUDA_CHECK(cudaMemcpy(g_buf.slot[S_chi], g_buf.h_stage,
(size_t)BSSN_STATE_COUNT * bytes,
cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[S_chi], g_buf.h_stage,
(size_t)BSSN_STATE_COUNT * bytes,
cudaMemcpyHostToDevice, g_buf.stream));
}
static void upload_matter_cache(StepContext &ctx,
@@ -4659,9 +4664,9 @@ static void upload_matter_cache(StepContext &ctx,
for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
std::memcpy(g_buf.h_stage + (size_t)i * all, matter_host[i], bytes);
}
CUDA_CHECK(cudaMemcpy(ctx.d_matter_mem, g_buf.h_stage,
(size_t)BSSN_MATTER_COUNT * bytes,
cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpyAsync(ctx.d_matter_mem, g_buf.h_stage,
(size_t)BSSN_MATTER_COUNT * bytes,
cudaMemcpyHostToDevice, g_buf.stream));
ctx.matter_ready = true;
}
@@ -4989,9 +4994,11 @@ static void launch_rhs_pipeline(int all, double eps, int co)
static void download_state_outputs(double **state_host_out, size_t all)
{
const size_t bytes = all * sizeof(double);
CUDA_CHECK(cudaMemcpy(g_buf.h_stage, g_buf.slot[S_chi_rhs],
(size_t)BSSN_STATE_COUNT * bytes,
cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaStreamSynchronize(0));
CUDA_CHECK(cudaMemcpyAsync(g_buf.h_stage, g_buf.slot[S_chi_rhs],
(size_t)BSSN_STATE_COUNT * bytes,
cudaMemcpyDeviceToHost, g_buf.stream));
CUDA_CHECK(cudaStreamSynchronize(g_buf.stream));
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
std::memcpy(state_host_out[i], g_buf.h_stage + (size_t)i * all, bytes);
}
@@ -5000,9 +5007,11 @@ static void download_state_outputs(double **state_host_out, size_t all)
static void download_constraint_outputs(double **constraint_host_out, size_t all)
{
const size_t bytes = all * sizeof(double);
CUDA_CHECK(cudaMemcpy(g_buf.h_stage, g_buf.slot[S_ham_Res],
(size_t)D2H_CONSTRAINT_SLOT_COUNT * bytes,
cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaStreamSynchronize(0));
CUDA_CHECK(cudaMemcpyAsync(g_buf.h_stage, g_buf.slot[S_ham_Res],
(size_t)D2H_CONSTRAINT_SLOT_COUNT * bytes,
cudaMemcpyDeviceToHost, g_buf.stream));
CUDA_CHECK(cudaStreamSynchronize(g_buf.stream));
for (int i = 0; i < D2H_CONSTRAINT_SLOT_COUNT; ++i) {
std::memcpy(constraint_host_out[i], g_buf.h_stage + (size_t)i * all, bytes);
}
@@ -5708,11 +5717,12 @@ int bssn_cuda_rk4_substep(void *block_tag,
if (use_zero_matter) {
if (!ctx.matter_ready) zero_matter_cache(ctx, all);
} else {
CUDA_CHECK(cudaStreamSynchronize(g_buf.stream));
upload_matter_cache(ctx, matter_host, all);
}
CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi],
(size_t)BSSN_STATE_COUNT * bytes,
cudaMemcpyDeviceToDevice));
CUDA_CHECK(cudaMemcpyAsync(ctx.d_state0_mem, g_buf.slot[S_chi],
(size_t)BSSN_STATE_COUNT * bytes,
cudaMemcpyDeviceToDevice, g_buf.stream));
} else if (!ctx.matter_ready) {
if (use_zero_matter) zero_matter_cache(ctx, all);
else upload_matter_cache(ctx, matter_host, all);

View File

@@ -292,11 +292,12 @@ struct GpuBuffers {
size_t cap_fh3_size;
int prev_nx, prev_ny, prev_nz;
bool initialized;
cudaStream_t stream; /* dedicated transfer stream */
};
static GpuBuffers g_buf = {
nullptr, nullptr, nullptr, nullptr, false, {},
0, 0, 0, 0, 0, 0, false
0, 0, 0, 0, 0, 0, false, nullptr
};
/* Slot assignments — INPUT (H2D) */
@@ -595,6 +596,7 @@ static void ensure_gpu_buffers(int nx, int ny, int nz) {
|| (fh3_size > g_buf.cap_fh3_size);
if (need_grow) {
if (g_buf.stream) { cudaStreamDestroy(g_buf.stream); g_buf.stream = nullptr; }
if (g_buf.d_mem) { cudaFree(g_buf.d_mem); g_buf.d_mem = nullptr; }
if (g_buf.d_fh2) { cudaFree(g_buf.d_fh2); g_buf.d_fh2 = nullptr; }
if (g_buf.d_fh3) { cudaFree(g_buf.d_fh3); g_buf.d_fh3 = nullptr; }
@@ -622,6 +624,9 @@ static void ensure_gpu_buffers(int nx, int ny, int nz) {
}
}
if (!g_buf.stream)
CUDA_CHECK(cudaStreamCreate(&g_buf.stream));
g_buf.cap_all = all;
g_buf.cap_fh2_size = fh2_size;
g_buf.cap_fh3_size = fh3_size;
@@ -4679,9 +4684,9 @@ static void upload_state_inputs(double **state_host, size_t all)
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
std::memcpy(g_buf.h_stage + (size_t)i * all, state_host[i], bytes);
}
CUDA_CHECK(cudaMemcpy(g_buf.slot[S_chi], g_buf.h_stage,
(size_t)BSSN_STATE_COUNT * bytes,
cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[S_chi], g_buf.h_stage,
(size_t)BSSN_STATE_COUNT * bytes,
cudaMemcpyHostToDevice, g_buf.stream));
}
static void upload_matter_cache(StepContext &ctx,
@@ -4692,9 +4697,9 @@ static void upload_matter_cache(StepContext &ctx,
for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
std::memcpy(g_buf.h_stage + (size_t)i * all, matter_host[i], bytes);
}
CUDA_CHECK(cudaMemcpy(ctx.d_matter_mem, g_buf.h_stage,
(size_t)BSSN_MATTER_COUNT * bytes,
cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpyAsync(ctx.d_matter_mem, g_buf.h_stage,
(size_t)BSSN_MATTER_COUNT * bytes,
cudaMemcpyHostToDevice, g_buf.stream));
ctx.matter_ready = true;
}
@@ -5022,9 +5027,11 @@ static void launch_rhs_pipeline(int all, double eps, int co)
static void download_state_outputs(double **state_host_out, size_t all)
{
const size_t bytes = all * sizeof(double);
CUDA_CHECK(cudaMemcpy(g_buf.h_stage, g_buf.slot[S_chi_rhs],
(size_t)BSSN_STATE_COUNT * bytes,
cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaStreamSynchronize(0));
CUDA_CHECK(cudaMemcpyAsync(g_buf.h_stage, g_buf.slot[S_chi_rhs],
(size_t)BSSN_STATE_COUNT * bytes,
cudaMemcpyDeviceToHost, g_buf.stream));
CUDA_CHECK(cudaStreamSynchronize(g_buf.stream));
for (int i = 0; i < BSSN_STATE_COUNT; ++i) {
std::memcpy(state_host_out[i], g_buf.h_stage + (size_t)i * all, bytes);
}
@@ -5033,9 +5040,11 @@ static void download_state_outputs(double **state_host_out, size_t all)
static void download_constraint_outputs(double **constraint_host_out, size_t all)
{
const size_t bytes = all * sizeof(double);
CUDA_CHECK(cudaMemcpy(g_buf.h_stage, g_buf.slot[S_ham_Res],
(size_t)D2H_CONSTRAINT_SLOT_COUNT * bytes,
cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaStreamSynchronize(0));
CUDA_CHECK(cudaMemcpyAsync(g_buf.h_stage, g_buf.slot[S_ham_Res],
(size_t)D2H_CONSTRAINT_SLOT_COUNT * bytes,
cudaMemcpyDeviceToHost, g_buf.stream));
CUDA_CHECK(cudaStreamSynchronize(g_buf.stream));
for (int i = 0; i < D2H_CONSTRAINT_SLOT_COUNT; ++i) {
std::memcpy(constraint_host_out[i], g_buf.h_stage + (size_t)i * all, bytes);
}
@@ -7306,9 +7315,9 @@ extern "C" int z4c_cuda_rk4_substep(void *block_tag,
g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]);
}
if (RK4 == 0) {
CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, g_buf.slot[S_chi],
(size_t)BSSN_STATE_COUNT * bytes,
cudaMemcpyDeviceToDevice));
CUDA_CHECK(cudaMemcpyAsync(ctx.d_state0_mem, g_buf.slot[S_chi],
(size_t)BSSN_STATE_COUNT * bytes,
cudaMemcpyDeviceToDevice, g_buf.stream));
}
if (profile) {
cuda_profile_sync();