diff --git a/AMSS_NCKU_source/bssn_cuda_step.C b/AMSS_NCKU_source/bssn_cuda_step.C index 83fd6f5..303a057 100644 --- a/AMSS_NCKU_source/bssn_cuda_step.C +++ b/AMSS_NCKU_source/bssn_cuda_step.C @@ -317,6 +317,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) auto stage_download_var_list = [&](Block *cg, MyList *var_list, bool skip_unmapped) { + std::vector batch_host_ptrs; + std::vector *> batch_vars; while (var_list) { double *host_ptr = cg->fgfs[var_list->data->sgfn]; @@ -325,18 +327,28 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) var_list = var_list->next; continue; } - if (bssn_cuda_download_buffer(cg->shape, host_ptr)) - { - cerr << "GPU stage download failure: lev=" << lev - << " var=" << var_list->data->name - << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << "," - << cg->bbox[1] << ":" << cg->bbox[4] << "," - << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; - ERROR = 1; - break; - } + batch_host_ptrs.push_back(host_ptr); + batch_vars.push_back(var_list); var_list = var_list->next; } + if (!batch_host_ptrs.empty() && + bssn_gpu_download_buffer_batch(cg->shape, &batch_host_ptrs[0], + static_cast(batch_host_ptrs.size()))) + { + for (size_t i = 0; i < batch_host_ptrs.size(); ++i) + { + if (bssn_cuda_download_buffer(cg->shape, batch_host_ptrs[i])) + { + cerr << "GPU stage download failure: lev=" << lev + << " var=" << batch_vars[i]->data->name + << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << "," + << cg->bbox[1] << ":" << cg->bbox[4] << "," + << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; + ERROR = 1; + break; + } + } + } }; auto stage_download_patch_list = diff --git a/AMSS_NCKU_source/bssn_gpu.cu b/AMSS_NCKU_source/bssn_gpu.cu index 931d3a0..5fef7ec 100644 --- a/AMSS_NCKU_source/bssn_gpu.cu +++ b/AMSS_NCKU_source/bssn_gpu.cu @@ -1103,6 +1103,77 @@ int bssn_gpu_stage_zero_buffer(const double *host_ptr, int count) return prepare_owned_buffer(host_ptr, static_cast(count), true) ? 0 : 1; } +int bssn_gpu_download_buffer_batch(const int *ex, double **host_ptrs, int num_buffers) +{ + if (!ex || !host_ptrs || num_buffers <= 0) + return 1; + + static thread_local cudaStream_t stream = nullptr; + static thread_local cudaEvent_t ready = nullptr; + if (!stream) + { + cudaError_t err = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + if (err != cudaSuccess) + { + cerr << "cudaStreamCreateWithFlags failed: " << cudaGetErrorString(err) << endl; + return 1; + } + } + if (!ready) + { + cudaError_t err = cudaEventCreateWithFlags(&ready, cudaEventDisableTiming); + if (err != cudaSuccess) + { + cerr << "cudaEventCreateWithFlags failed: " << cudaGetErrorString(err) << endl; + return 1; + } + } + cudaError_t err = cudaEventRecord(ready, 0); + if (err != cudaSuccess) + { + cerr << "cudaEventRecord download readiness failed: " << cudaGetErrorString(err) << endl; + return 1; + } + err = cudaStreamWaitEvent(stream, ready, 0); + if (err != cudaSuccess) + { + cerr << "cudaStreamWaitEvent download readiness failed: " << cudaGetErrorString(err) << endl; + return 1; + } + + int n = 1; + for (int i = 0; i < 3; ++i) + n *= ex[i]; + const size_t bytes = static_cast(n) * sizeof(double); + + for (int i = 0; i < num_buffers; ++i) + { + double *host_ptr = host_ptrs[i]; + if (!host_ptr) + return 1; + const double *device_ptr = bssn_gpu_find_device_buffer(host_ptr); + if (!device_ptr) + return 1; + bssn_gpu_prepare_host_buffer(host_ptr, n); + err = cudaMemcpyAsync(host_ptr, device_ptr, bytes, cudaMemcpyDeviceToHost, stream); + if (err != cudaSuccess) + { + cerr << "cudaMemcpyAsync(D2H) buffered batch download failed: " + << cudaGetErrorString(err) << endl; + return 1; + } + } + + err = cudaStreamSynchronize(stream); + if (err != cudaSuccess) + { + cerr << "cudaStreamSynchronize buffered batch download failed: " + << cudaGetErrorString(err) << endl; + return 1; + } + return 0; +} + int bssn_gpu_stage_upload_region(const double *host_ptr, const int *full_shape, const double *full_llb, diff --git a/AMSS_NCKU_source/bssn_gpu.h b/AMSS_NCKU_source/bssn_gpu.h index 5a3337d..2248589 100644 --- a/AMSS_NCKU_source/bssn_gpu.h +++ b/AMSS_NCKU_source/bssn_gpu.h @@ -99,6 +99,7 @@ int bssn_gpu_stage_upload_buffer_to_region(const double *host_src_ptr, const double *full_uub, const int *region_shape, const double *region_llb); +int bssn_gpu_download_buffer_batch(const int *ex, double **host_ptrs, int num_buffers); /** Init GPU side data in GPUMeta. */ // void init_fluid_meta_gpu(GPUMeta *gpu_meta);