From b0dd069a2b8bd753fb97aa008ab1a53c5e33a192 Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Thu, 9 Apr 2026 18:36:10 +0800 Subject: [PATCH] Register GPU transfer buffers as pinned host memory --- AMSS_NCKU_source/bssn_cuda_ops.cu | 10 ++- AMSS_NCKU_source/bssn_gpu.cu | 118 ++++++++++++++++++++++++++++++ AMSS_NCKU_source/bssn_gpu.h | 1 + 3 files changed, 128 insertions(+), 1 deletion(-) diff --git a/AMSS_NCKU_source/bssn_cuda_ops.cu b/AMSS_NCKU_source/bssn_cuda_ops.cu index b5fb35a..f619d15 100644 --- a/AMSS_NCKU_source/bssn_cuda_ops.cu +++ b/AMSS_NCKU_source/bssn_cuda_ops.cu @@ -789,6 +789,11 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, double *stage_ptr = nullptr; const double *mapped_stage_ptr = need_stage_input ? bssn_gpu_find_device_buffer(stage_data) : nullptr; + bssn_gpu_prepare_host_buffer(state0, n); + if (need_boundary_input) bssn_gpu_prepare_host_buffer(boundary_src, n); + if (need_stage_input) bssn_gpu_prepare_host_buffer(stage_data, n); + bssn_gpu_prepare_host_buffer(rhs_accum, n); + ok = ok && (!refresh_state0 || copy_to_device_preferring_device(cache.state0, state0, bytes)) && (!need_boundary_input || copy_to_device_preferring_device(cache.boundary, boundary_src, bytes)) && @@ -931,6 +936,7 @@ int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_ho bssn_gpu_register_device_buffer(chi, device_chi); if (download_to_host) { + bssn_gpu_prepare_host_buffer(chi, n); cudaError_t err = cudaMemcpy(chi, device_chi, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) chi", err); ok = err == cudaSuccess; @@ -945,7 +951,9 @@ int bssn_cuda_download_buffer(int *ex, double *host_ptr) if (!device_ptr) return 1; - const size_t bytes = static_cast(count_points(ex)) * sizeof(double); + const int n = count_points(ex); + bssn_gpu_prepare_host_buffer(host_ptr, n); + const size_t bytes = static_cast(n) * sizeof(double); cudaError_t err = cudaMemcpy(host_ptr, device_ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) { diff --git a/AMSS_NCKU_source/bssn_gpu.cu b/AMSS_NCKU_source/bssn_gpu.cu index 65e17e9..efcef1f 100644 --- a/AMSS_NCKU_source/bssn_gpu.cu +++ b/AMSS_NCKU_source/bssn_gpu.cu @@ -159,6 +159,16 @@ struct OwnedBufferRegistry int mapped_buffer_count = 0; }; +struct PinnedHostRegistry +{ + static const int max_buffers = 512; + const double *host_buffers[max_buffers] = {nullptr}; + size_t capacities[max_buffers] = {0}; + bool registered[max_buffers] = {false}; + bool failed[max_buffers] = {false}; + int buffer_count = 0; +}; + GpuRhsCache &gpu_rhs_cache() { static GpuRhsCache cache; @@ -177,6 +187,12 @@ OwnedBufferRegistry &owned_buffer_registry() return registry; } +PinnedHostRegistry &pinned_host_registry() +{ + static thread_local PinnedHostRegistry registry; + return registry; +} + void reset_meta(Meta *meta) { memset(meta, 0, sizeof(Meta)); @@ -354,6 +370,61 @@ bool prepare_owned_buffer(const double *host_ptr, size_t count, bool zero_fill) return true; } +int find_pinned_host_slot(PinnedHostRegistry ®istry, const double *host_ptr) +{ + for (int i = 0; i < registry.buffer_count; ++i) + { + if (registry.host_buffers[i] == host_ptr) + return i; + } + + if (registry.buffer_count >= PinnedHostRegistry::max_buffers) + return -1; + + const int slot = registry.buffer_count++; + registry.host_buffers[slot] = host_ptr; + registry.capacities[slot] = 0; + registry.registered[slot] = false; + registry.failed[slot] = false; + return slot; +} + +void ensure_host_buffer_registered(const double *host_ptr, size_t bytes) +{ + if (!host_ptr || bytes == 0) + return; + + PinnedHostRegistry ®istry = pinned_host_registry(); + const int slot = find_pinned_host_slot(registry, host_ptr); + if (slot < 0) + return; + + if (registry.registered[slot] && registry.capacities[slot] >= bytes) + return; + if (registry.failed[slot] && registry.capacities[slot] >= bytes) + return; + + if (registry.registered[slot]) + { + cudaError_t unreg_err = cudaHostUnregister(const_cast(registry.host_buffers[slot])); + if (unreg_err != cudaSuccess && unreg_err != cudaErrorHostMemoryNotRegistered) + cerr << "cudaHostUnregister failed: " << cudaGetErrorString(unreg_err) << endl; + registry.registered[slot] = false; + } + + cudaError_t err = cudaHostRegister(const_cast(host_ptr), bytes, cudaHostRegisterPortable); + if (err == cudaSuccess || err == cudaErrorHostMemoryAlreadyRegistered) + { + registry.registered[slot] = true; + registry.failed[slot] = false; + registry.capacities[slot] = bytes; + return; + } + + registry.failed[slot] = true; + registry.capacities[slot] = bytes; +} + bool ensure_device_buffer(double **ptr, size_t count) { if (*ptr) @@ -462,6 +533,7 @@ void cleanup_gpu_rhs_cache() { GpuRhsCache &cache = gpu_rhs_cache(); OwnedBufferRegistry &owned = owned_buffer_registry(); + PinnedHostRegistry &pinned = pinned_host_registry(); if (!cache.allocated) { for (int i = 0; i < owned.mapped_buffer_count; ++i) @@ -478,6 +550,20 @@ void cleanup_gpu_rhs_cache() owned.host_buffers[i] = nullptr; } owned.mapped_buffer_count = 0; + for (int i = 0; i < pinned.buffer_count; ++i) + { + if (pinned.registered[i] && pinned.host_buffers[i]) + { + cudaError_t unreg_err = cudaHostUnregister(const_cast(pinned.host_buffers[i])); + if (unreg_err != cudaSuccess && unreg_err != cudaErrorHostMemoryNotRegistered) + cerr << "cudaHostUnregister failed: " << cudaGetErrorString(unreg_err) << endl; + } + pinned.host_buffers[i] = nullptr; + pinned.capacities[i] = 0; + pinned.registered[i] = false; + pinned.failed[i] = false; + } + pinned.buffer_count = 0; return; } @@ -509,6 +595,20 @@ void cleanup_gpu_rhs_cache() owned.host_buffers[i] = nullptr; } owned.mapped_buffer_count = 0; + for (int i = 0; i < pinned.buffer_count; ++i) + { + if (pinned.registered[i] && pinned.host_buffers[i]) + { + cudaError_t unreg_err = cudaHostUnregister(const_cast(pinned.host_buffers[i])); + if (unreg_err != cudaSuccess && unreg_err != cudaErrorHostMemoryNotRegistered) + cerr << "cudaHostUnregister failed: " << cudaGetErrorString(unreg_err) << endl; + } + pinned.host_buffers[i] = nullptr; + pinned.capacities[i] = 0; + pinned.registered[i] = false; + pinned.failed[i] = false; + } + pinned.buffer_count = 0; } bool register_gpu_rhs_cleanup() @@ -837,13 +937,21 @@ void bssn_gpu_register_device_buffer(const double *host_ptr, const double *devic map_external_buffer(external_buffer_registry(), host_ptr, device_ptr); } +void bssn_gpu_prepare_host_buffer(const double *host_ptr, int count) +{ + if (count > 0) + ensure_host_buffer_registered(host_ptr, static_cast(count) * sizeof(double)); +} + int bssn_gpu_stage_upload_buffer(const double *host_ptr, int count) { + bssn_gpu_prepare_host_buffer(host_ptr, count); return prepare_owned_buffer(host_ptr, static_cast(count), false) ? 0 : 1; } int bssn_gpu_stage_zero_buffer(const double *host_ptr, int count) { + bssn_gpu_prepare_host_buffer(host_ptr, count); return prepare_owned_buffer(host_ptr, static_cast(count), true) ? 0 : 1; } @@ -861,6 +969,11 @@ int bssn_gpu_stage_upload_region(const double *host_ptr, if (!device_ptr) return 1; + int full_count = 1; + for (int i = 0; i < 3; ++i) + full_count *= full_shape[i]; + bssn_gpu_prepare_host_buffer(host_ptr, full_count); + int start[3] = {0, 0, 0}; for (int i = 0; i < 3; ++i) { @@ -928,6 +1041,11 @@ int bssn_gpu_stage_download_region(double *host_ptr, if (!device_ptr) return 1; + int full_count = 1; + for (int i = 0; i < 3; ++i) + full_count *= full_shape[i]; + bssn_gpu_prepare_host_buffer(host_ptr, full_count); + int start[3] = {0, 0, 0}; for (int i = 0; i < 3; ++i) { diff --git a/AMSS_NCKU_source/bssn_gpu.h b/AMSS_NCKU_source/bssn_gpu.h index 3dd890c..d4561e8 100644 --- a/AMSS_NCKU_source/bssn_gpu.h +++ b/AMSS_NCKU_source/bssn_gpu.h @@ -69,6 +69,7 @@ int bssn_gpu_bind_process_device(int mpi_rank); void bssn_gpu_clear_cached_device_buffers(); const double *bssn_gpu_find_device_buffer(const double *host_ptr); void bssn_gpu_register_device_buffer(const double *host_ptr, const double *device_ptr); +void bssn_gpu_prepare_host_buffer(const double *host_ptr, int count); int bssn_gpu_stage_upload_buffer(const double *host_ptr, int count); int bssn_gpu_stage_zero_buffer(const double *host_ptr, int count); int bssn_gpu_stage_upload_region(const double *host_ptr,