Batch GPU stage downloads
This commit is contained in:
@@ -317,6 +317,8 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
|||||||
|
|
||||||
auto stage_download_var_list =
|
auto stage_download_var_list =
|
||||||
[&](Block *cg, MyList<var> *var_list, bool skip_unmapped) {
|
[&](Block *cg, MyList<var> *var_list, bool skip_unmapped) {
|
||||||
|
std::vector<double *> batch_host_ptrs;
|
||||||
|
std::vector<MyList<var> *> batch_vars;
|
||||||
while (var_list)
|
while (var_list)
|
||||||
{
|
{
|
||||||
double *host_ptr = cg->fgfs[var_list->data->sgfn];
|
double *host_ptr = cg->fgfs[var_list->data->sgfn];
|
||||||
@@ -325,18 +327,28 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN)
|
|||||||
var_list = var_list->next;
|
var_list = var_list->next;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (bssn_cuda_download_buffer(cg->shape, host_ptr))
|
batch_host_ptrs.push_back(host_ptr);
|
||||||
{
|
batch_vars.push_back(var_list);
|
||||||
cerr << "GPU stage download failure: lev=" << lev
|
|
||||||
<< " var=" << var_list->data->name
|
|
||||||
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
|
|
||||||
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
|
|
||||||
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
|
||||||
ERROR = 1;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
var_list = var_list->next;
|
var_list = var_list->next;
|
||||||
}
|
}
|
||||||
|
if (!batch_host_ptrs.empty() &&
|
||||||
|
bssn_gpu_download_buffer_batch(cg->shape, &batch_host_ptrs[0],
|
||||||
|
static_cast<int>(batch_host_ptrs.size())))
|
||||||
|
{
|
||||||
|
for (size_t i = 0; i < batch_host_ptrs.size(); ++i)
|
||||||
|
{
|
||||||
|
if (bssn_cuda_download_buffer(cg->shape, batch_host_ptrs[i]))
|
||||||
|
{
|
||||||
|
cerr << "GPU stage download failure: lev=" << lev
|
||||||
|
<< " var=" << batch_vars[i]->data->name
|
||||||
|
<< " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << ","
|
||||||
|
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
|
||||||
|
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
||||||
|
ERROR = 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
auto stage_download_patch_list =
|
auto stage_download_patch_list =
|
||||||
|
|||||||
@@ -1103,6 +1103,77 @@ int bssn_gpu_stage_zero_buffer(const double *host_ptr, int count)
|
|||||||
return prepare_owned_buffer(host_ptr, static_cast<size_t>(count), true) ? 0 : 1;
|
return prepare_owned_buffer(host_ptr, static_cast<size_t>(count), true) ? 0 : 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int bssn_gpu_download_buffer_batch(const int *ex, double **host_ptrs, int num_buffers)
|
||||||
|
{
|
||||||
|
if (!ex || !host_ptrs || num_buffers <= 0)
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
static thread_local cudaStream_t stream = nullptr;
|
||||||
|
static thread_local cudaEvent_t ready = nullptr;
|
||||||
|
if (!stream)
|
||||||
|
{
|
||||||
|
cudaError_t err = cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
|
||||||
|
if (err != cudaSuccess)
|
||||||
|
{
|
||||||
|
cerr << "cudaStreamCreateWithFlags failed: " << cudaGetErrorString(err) << endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!ready)
|
||||||
|
{
|
||||||
|
cudaError_t err = cudaEventCreateWithFlags(&ready, cudaEventDisableTiming);
|
||||||
|
if (err != cudaSuccess)
|
||||||
|
{
|
||||||
|
cerr << "cudaEventCreateWithFlags failed: " << cudaGetErrorString(err) << endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
cudaError_t err = cudaEventRecord(ready, 0);
|
||||||
|
if (err != cudaSuccess)
|
||||||
|
{
|
||||||
|
cerr << "cudaEventRecord download readiness failed: " << cudaGetErrorString(err) << endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
err = cudaStreamWaitEvent(stream, ready, 0);
|
||||||
|
if (err != cudaSuccess)
|
||||||
|
{
|
||||||
|
cerr << "cudaStreamWaitEvent download readiness failed: " << cudaGetErrorString(err) << endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
int n = 1;
|
||||||
|
for (int i = 0; i < 3; ++i)
|
||||||
|
n *= ex[i];
|
||||||
|
const size_t bytes = static_cast<size_t>(n) * sizeof(double);
|
||||||
|
|
||||||
|
for (int i = 0; i < num_buffers; ++i)
|
||||||
|
{
|
||||||
|
double *host_ptr = host_ptrs[i];
|
||||||
|
if (!host_ptr)
|
||||||
|
return 1;
|
||||||
|
const double *device_ptr = bssn_gpu_find_device_buffer(host_ptr);
|
||||||
|
if (!device_ptr)
|
||||||
|
return 1;
|
||||||
|
bssn_gpu_prepare_host_buffer(host_ptr, n);
|
||||||
|
err = cudaMemcpyAsync(host_ptr, device_ptr, bytes, cudaMemcpyDeviceToHost, stream);
|
||||||
|
if (err != cudaSuccess)
|
||||||
|
{
|
||||||
|
cerr << "cudaMemcpyAsync(D2H) buffered batch download failed: "
|
||||||
|
<< cudaGetErrorString(err) << endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
err = cudaStreamSynchronize(stream);
|
||||||
|
if (err != cudaSuccess)
|
||||||
|
{
|
||||||
|
cerr << "cudaStreamSynchronize buffered batch download failed: "
|
||||||
|
<< cudaGetErrorString(err) << endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
int bssn_gpu_stage_upload_region(const double *host_ptr,
|
int bssn_gpu_stage_upload_region(const double *host_ptr,
|
||||||
const int *full_shape,
|
const int *full_shape,
|
||||||
const double *full_llb,
|
const double *full_llb,
|
||||||
|
|||||||
@@ -99,6 +99,7 @@ int bssn_gpu_stage_upload_buffer_to_region(const double *host_src_ptr,
|
|||||||
const double *full_uub,
|
const double *full_uub,
|
||||||
const int *region_shape,
|
const int *region_shape,
|
||||||
const double *region_llb);
|
const double *region_llb);
|
||||||
|
int bssn_gpu_download_buffer_batch(const int *ex, double **host_ptrs, int num_buffers);
|
||||||
|
|
||||||
/** Init GPU side data in GPUMeta. */
|
/** Init GPU side data in GPUMeta. */
|
||||||
// void init_fluid_meta_gpu(GPUMeta *gpu_meta);
|
// void init_fluid_meta_gpu(GPUMeta *gpu_meta);
|
||||||
|
|||||||
Reference in New Issue
Block a user