Checkpoint Z4C CUDA resident sync progress
This commit is contained in:
@@ -424,14 +424,7 @@ bool cuda_can_direct_unpack(const Parallel::gridseg *dst, int type, MyList<var>
|
|||||||
return false;
|
return false;
|
||||||
if (z4c_cuda_has_resident_state(dst->Bg) == 0)
|
if (z4c_cuda_has_resident_state(dst->Bg) == 0)
|
||||||
return false;
|
return false;
|
||||||
if (type != 1 && VarListd)
|
(void)VarListd;
|
||||||
{
|
|
||||||
double *view_ptrs[Z4C_CUDA_STATE_COUNT];
|
|
||||||
if (!cuda_build_z4c_host_views(dst->Bg, VarListd, Z4C_CUDA_STATE_COUNT, view_ptrs))
|
|
||||||
return false;
|
|
||||||
if (z4c_cuda_resident_state_matches(dst->Bg, view_ptrs) == 0)
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
return true;
|
||||||
#elif USE_CUDA_BSSN
|
#elif USE_CUDA_BSSN
|
||||||
return bssn_cuda_has_resident_state(dst->Bg) != 0;
|
return bssn_cuda_has_resident_state(dst->Bg) != 0;
|
||||||
@@ -460,9 +453,16 @@ bool cuda_direct_pack_segment(double *buffer,
|
|||||||
const int j0 = cuda_seg_begin(dst, src->Bg, 1);
|
const int j0 = cuda_seg_begin(dst, src->Bg, 1);
|
||||||
const int k0 = cuda_seg_begin(dst, src->Bg, 2);
|
const int k0 = cuda_seg_begin(dst, src->Bg, 2);
|
||||||
#if USE_CUDA_Z4C && (ABEtype == 2)
|
#if USE_CUDA_Z4C && (ABEtype == 2)
|
||||||
const bool ok = z4c_cuda_pack_state_batch_to_host_buffer(src->Bg, state_count, buffer, src->Bg->shape,
|
double *views[Z4C_CUDA_STATE_COUNT];
|
||||||
i0, j0, k0,
|
const bool have_views = cuda_build_z4c_host_views(src->Bg, VarLists, state_count, views);
|
||||||
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
|
const bool ok = have_views
|
||||||
|
? z4c_cuda_pack_state_batch_to_host_buffer_for_host_views(
|
||||||
|
src->Bg, views, state_count, buffer, src->Bg->shape,
|
||||||
|
i0, j0, k0,
|
||||||
|
dst->shape[0], dst->shape[1], dst->shape[2]) == 0
|
||||||
|
: z4c_cuda_pack_state_batch_to_host_buffer(src->Bg, state_count, buffer, src->Bg->shape,
|
||||||
|
i0, j0, k0,
|
||||||
|
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
|
||||||
#else
|
#else
|
||||||
double *views[BSSN_CUDA_STATE_COUNT];
|
double *views[BSSN_CUDA_STATE_COUNT];
|
||||||
const bool have_views = cuda_build_bssn_host_views(src->Bg, VarLists, state_count, views);
|
const bool have_views = cuda_build_bssn_host_views(src->Bg, VarLists, state_count, views);
|
||||||
@@ -500,9 +500,16 @@ bool cuda_direct_unpack_segment(double *buffer,
|
|||||||
const int j0 = cuda_seg_begin(dst, dst->Bg, 1);
|
const int j0 = cuda_seg_begin(dst, dst->Bg, 1);
|
||||||
const int k0 = cuda_seg_begin(dst, dst->Bg, 2);
|
const int k0 = cuda_seg_begin(dst, dst->Bg, 2);
|
||||||
#if USE_CUDA_Z4C && (ABEtype == 2)
|
#if USE_CUDA_Z4C && (ABEtype == 2)
|
||||||
const bool ok = z4c_cuda_unpack_state_batch_from_host_buffer(dst->Bg, state_count, buffer, dst->Bg->shape,
|
double *views[Z4C_CUDA_STATE_COUNT];
|
||||||
i0, j0, k0,
|
const bool have_views = cuda_build_z4c_host_views(dst->Bg, VarListd, state_count, views);
|
||||||
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
|
const bool ok = have_views
|
||||||
|
? z4c_cuda_unpack_state_batch_from_host_buffer_for_host_views(
|
||||||
|
dst->Bg, views, state_count, buffer, dst->Bg->shape,
|
||||||
|
i0, j0, k0,
|
||||||
|
dst->shape[0], dst->shape[1], dst->shape[2]) == 0
|
||||||
|
: z4c_cuda_unpack_state_batch_from_host_buffer(dst->Bg, state_count, buffer, dst->Bg->shape,
|
||||||
|
i0, j0, k0,
|
||||||
|
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
|
||||||
#else
|
#else
|
||||||
double *views[BSSN_CUDA_STATE_COUNT];
|
double *views[BSSN_CUDA_STATE_COUNT];
|
||||||
const bool have_views = cuda_build_bssn_host_views(dst->Bg, VarListd, state_count, views);
|
const bool have_views = cuda_build_bssn_host_views(dst->Bg, VarListd, state_count, views);
|
||||||
@@ -703,40 +710,60 @@ bool cuda_direct_pack_segment_to_device(double *buffer,
|
|||||||
{
|
{
|
||||||
const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
|
const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
|
||||||
bool ok = false;
|
bool ok = false;
|
||||||
|
double *views[Z4C_CUDA_STATE_COUNT];
|
||||||
double soa_flat[3 * Z4C_CUDA_STATE_COUNT];
|
double soa_flat[3 * Z4C_CUDA_STATE_COUNT];
|
||||||
|
const bool have_views = cuda_build_z4c_host_views(src->Bg, VarLists, state_count, views);
|
||||||
const bool have_soa = cuda_build_state_soa(VarLists, state_count, soa_flat);
|
const bool have_soa = cuda_build_state_soa(VarLists, state_count, soa_flat);
|
||||||
if (type == 1)
|
if (type == 1)
|
||||||
{
|
{
|
||||||
const int i0 = cuda_seg_begin(dst, src->Bg, 0);
|
const int i0 = cuda_seg_begin(dst, src->Bg, 0);
|
||||||
const int j0 = cuda_seg_begin(dst, src->Bg, 1);
|
const int j0 = cuda_seg_begin(dst, src->Bg, 1);
|
||||||
const int k0 = cuda_seg_begin(dst, src->Bg, 2);
|
const int k0 = cuda_seg_begin(dst, src->Bg, 2);
|
||||||
ok = z4c_cuda_pack_state_batch_to_device_buffer(
|
ok = have_views
|
||||||
src->Bg, state_count, buffer, src->Bg->shape,
|
? z4c_cuda_pack_state_batch_to_device_buffer_for_host_views(
|
||||||
i0, j0, k0,
|
src->Bg, views, state_count, buffer, src->Bg->shape,
|
||||||
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
|
i0, j0, k0,
|
||||||
|
dst->shape[0], dst->shape[1], dst->shape[2]) == 0
|
||||||
|
: z4c_cuda_pack_state_batch_to_device_buffer(
|
||||||
|
src->Bg, state_count, buffer, src->Bg->shape,
|
||||||
|
i0, j0, k0,
|
||||||
|
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
|
||||||
}
|
}
|
||||||
else if (type == 2)
|
else if (type == 2)
|
||||||
{
|
{
|
||||||
int first_fine[3];
|
int first_fine[3];
|
||||||
if (!cuda_cell_gw3_restrict_params(src, dst, first_fine))
|
if (!cuda_cell_gw3_restrict_params(src, dst, first_fine))
|
||||||
return false;
|
return false;
|
||||||
ok = z4c_cuda_restrict_state_batch_to_device_buffer(
|
ok = have_views
|
||||||
src->Bg, state_count, buffer, src->Bg->shape,
|
? z4c_cuda_restrict_state_batch_to_device_buffer_for_host_views(
|
||||||
dst->shape[0], dst->shape[1], dst->shape[2],
|
src->Bg, views, state_count, buffer, src->Bg->shape,
|
||||||
first_fine[0], first_fine[1], first_fine[2],
|
dst->shape[0], dst->shape[1], dst->shape[2],
|
||||||
have_soa ? soa_flat : 0) == 0;
|
first_fine[0], first_fine[1], first_fine[2],
|
||||||
|
have_soa ? soa_flat : 0) == 0
|
||||||
|
: z4c_cuda_restrict_state_batch_to_device_buffer(
|
||||||
|
src->Bg, state_count, buffer, src->Bg->shape,
|
||||||
|
dst->shape[0], dst->shape[1], dst->shape[2],
|
||||||
|
first_fine[0], first_fine[1], first_fine[2],
|
||||||
|
have_soa ? soa_flat : 0) == 0;
|
||||||
}
|
}
|
||||||
else if (type == 3)
|
else if (type == 3)
|
||||||
{
|
{
|
||||||
int first_fine_ii[3], coarse_lb[3];
|
int first_fine_ii[3], coarse_lb[3];
|
||||||
if (!cuda_cell_gw3_prolong_params(src, dst, first_fine_ii, coarse_lb))
|
if (!cuda_cell_gw3_prolong_params(src, dst, first_fine_ii, coarse_lb))
|
||||||
return false;
|
return false;
|
||||||
ok = z4c_cuda_prolong_state_batch_to_device_buffer(
|
ok = have_views
|
||||||
src->Bg, state_count, buffer, src->Bg->shape,
|
? z4c_cuda_prolong_state_batch_to_device_buffer_for_host_views(
|
||||||
dst->shape[0], dst->shape[1], dst->shape[2],
|
src->Bg, views, state_count, buffer, src->Bg->shape,
|
||||||
first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
|
dst->shape[0], dst->shape[1], dst->shape[2],
|
||||||
coarse_lb[0], coarse_lb[1], coarse_lb[2],
|
first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
|
||||||
have_soa ? soa_flat : 0) == 0;
|
coarse_lb[0], coarse_lb[1], coarse_lb[2],
|
||||||
|
have_soa ? soa_flat : 0) == 0
|
||||||
|
: z4c_cuda_prolong_state_batch_to_device_buffer(
|
||||||
|
src->Bg, state_count, buffer, src->Bg->shape,
|
||||||
|
dst->shape[0], dst->shape[1], dst->shape[2],
|
||||||
|
first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
|
||||||
|
coarse_lb[0], coarse_lb[1], coarse_lb[2],
|
||||||
|
have_soa ? soa_flat : 0) == 0;
|
||||||
}
|
}
|
||||||
if (sync_profile_enabled())
|
if (sync_profile_enabled())
|
||||||
sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0;
|
sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0;
|
||||||
@@ -919,10 +946,17 @@ bool cuda_direct_unpack_segment_from_device(double *buffer,
|
|||||||
const int i0 = cuda_seg_begin(dst, dst->Bg, 0);
|
const int i0 = cuda_seg_begin(dst, dst->Bg, 0);
|
||||||
const int j0 = cuda_seg_begin(dst, dst->Bg, 1);
|
const int j0 = cuda_seg_begin(dst, dst->Bg, 1);
|
||||||
const int k0 = cuda_seg_begin(dst, dst->Bg, 2);
|
const int k0 = cuda_seg_begin(dst, dst->Bg, 2);
|
||||||
const bool ok = z4c_cuda_unpack_state_batch_from_device_buffer(
|
double *views[Z4C_CUDA_STATE_COUNT];
|
||||||
dst->Bg, state_count, buffer, dst->Bg->shape,
|
const bool have_views = cuda_build_z4c_host_views(dst->Bg, VarListd, state_count, views);
|
||||||
i0, j0, k0,
|
const bool ok = have_views
|
||||||
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
|
? z4c_cuda_unpack_state_batch_from_device_buffer_for_host_views(
|
||||||
|
dst->Bg, views, state_count, buffer, dst->Bg->shape,
|
||||||
|
i0, j0, k0,
|
||||||
|
dst->shape[0], dst->shape[1], dst->shape[2]) == 0
|
||||||
|
: z4c_cuda_unpack_state_batch_from_device_buffer(
|
||||||
|
dst->Bg, state_count, buffer, dst->Bg->shape,
|
||||||
|
i0, j0, k0,
|
||||||
|
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
|
||||||
if (sync_profile_enabled())
|
if (sync_profile_enabled())
|
||||||
sync_profile_stats().direct_unpack_sec += MPI_Wtime() - t0;
|
sync_profile_stats().direct_unpack_sec += MPI_Wtime() - t0;
|
||||||
return ok;
|
return ok;
|
||||||
@@ -1074,23 +1108,39 @@ bool cuda_flush_device_segment_batch(Block *block,
|
|||||||
#if USE_CUDA_Z4C && (ABEtype == 2)
|
#if USE_CUDA_Z4C && (ABEtype == 2)
|
||||||
if (state_count == Z4C_CUDA_STATE_COUNT)
|
if (state_count == Z4C_CUDA_STATE_COUNT)
|
||||||
{
|
{
|
||||||
|
double *views[Z4C_CUDA_STATE_COUNT];
|
||||||
double soa_flat[3 * Z4C_CUDA_STATE_COUNT];
|
double soa_flat[3 * Z4C_CUDA_STATE_COUNT];
|
||||||
|
const bool have_views = cuda_build_z4c_host_views(block, vars, state_count, views);
|
||||||
const bool have_soa = cuda_build_state_soa(vars, state_count, soa_flat);
|
const bool have_soa = cuda_build_state_soa(vars, state_count, soa_flat);
|
||||||
if (dir == PACK)
|
if (dir == PACK)
|
||||||
{
|
{
|
||||||
if (type == 2)
|
if (type == 2)
|
||||||
return z4c_cuda_restrict_state_segments_to_device_buffer(
|
return have_views
|
||||||
block, state_count, data, block->shape, segment_count,
|
? z4c_cuda_restrict_state_segments_to_device_buffer_for_host_views(
|
||||||
meta.data(), have_soa ? soa_flat : 0) == 0;
|
block, views, state_count, data, block->shape, segment_count,
|
||||||
|
meta.data(), have_soa ? soa_flat : 0) == 0
|
||||||
|
: z4c_cuda_restrict_state_segments_to_device_buffer(
|
||||||
|
block, state_count, data, block->shape, segment_count,
|
||||||
|
meta.data(), have_soa ? soa_flat : 0) == 0;
|
||||||
if (type == 3)
|
if (type == 3)
|
||||||
return z4c_cuda_prolong_state_segments_to_device_buffer(
|
return have_views
|
||||||
block, state_count, data, block->shape, segment_count,
|
? z4c_cuda_prolong_state_segments_to_device_buffer_for_host_views(
|
||||||
meta.data(), have_soa ? soa_flat : 0) == 0;
|
block, views, state_count, data, block->shape, segment_count,
|
||||||
return z4c_cuda_pack_state_segments_to_device_buffer(
|
meta.data(), have_soa ? soa_flat : 0) == 0
|
||||||
block, state_count, data, block->shape, segment_count, meta.data()) == 0;
|
: z4c_cuda_prolong_state_segments_to_device_buffer(
|
||||||
|
block, state_count, data, block->shape, segment_count,
|
||||||
|
meta.data(), have_soa ? soa_flat : 0) == 0;
|
||||||
|
return have_views
|
||||||
|
? z4c_cuda_pack_state_segments_to_device_buffer_for_host_views(
|
||||||
|
block, views, state_count, data, block->shape, segment_count, meta.data()) == 0
|
||||||
|
: z4c_cuda_pack_state_segments_to_device_buffer(
|
||||||
|
block, state_count, data, block->shape, segment_count, meta.data()) == 0;
|
||||||
}
|
}
|
||||||
return z4c_cuda_unpack_state_segments_from_device_buffer(
|
return have_views
|
||||||
block, state_count, data, block->shape, segment_count, meta.data()) == 0;
|
? z4c_cuda_unpack_state_segments_from_device_buffer_for_host_views(
|
||||||
|
block, views, state_count, data, block->shape, segment_count, meta.data()) == 0
|
||||||
|
: z4c_cuda_unpack_state_segments_from_device_buffer(
|
||||||
|
block, state_count, data, block->shape, segment_count, meta.data()) == 0;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if USE_CUDA_BSSN
|
#if USE_CUDA_BSSN
|
||||||
@@ -5294,7 +5344,7 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
|
|||||||
dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
|
dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
|
||||||
dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
|
dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (dir == UNPACK) // from target data to corresponding grid
|
if (dir == UNPACK) // from target data to corresponding grid
|
||||||
{
|
{
|
||||||
f_copy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn],
|
f_copy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn],
|
||||||
|
|||||||
@@ -388,41 +388,57 @@ bool z4c_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
|
|||||||
if (z4c_cuda_has_resident_state(block) &&
|
if (z4c_cuda_has_resident_state(block) &&
|
||||||
block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn)
|
block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn)
|
||||||
{
|
{
|
||||||
const int sx = ordn;
|
|
||||||
const int sy = ordn;
|
|
||||||
const int sz = ordn;
|
|
||||||
const int region_all = sx * sy * sz;
|
|
||||||
const int i0 = z4c_cuda_interp_tile_start(block->X[0], block->shape[0], x, DH[0], ordn);
|
|
||||||
const int j0 = z4c_cuda_interp_tile_start(block->X[1], block->shape[1], y, DH[1], ordn);
|
|
||||||
const int k0 = z4c_cuda_interp_tile_start(block->X[2], block->shape[2], z, DH[2], ordn);
|
|
||||||
double *packed_fields = new double[3 * region_all];
|
|
||||||
var *vars[3] = {forx, fory, forz};
|
var *vars[3] = {forx, fory, forz};
|
||||||
for (int f = 0; f < 3; f++)
|
static int use_device_bh_interp = -1;
|
||||||
|
if (use_device_bh_interp < 0)
|
||||||
{
|
{
|
||||||
if (z4c_cuda_pack_state_region_to_host_buffer(block,
|
const char *env = getenv("AMSS_CUDA_Z4C_BH_INTERP_DEVICE");
|
||||||
k_z4c_cuda_bh_state_indices[f],
|
use_device_bh_interp = (env && atoi(env) != 0) ? 1 : 0;
|
||||||
packed_fields + f * region_all,
|
}
|
||||||
block->shape,
|
bool used_device_interp = false;
|
||||||
i0, j0, k0,
|
if (use_device_bh_interp)
|
||||||
sx, sy, sz) != 0)
|
{
|
||||||
|
double soa3[9];
|
||||||
|
for (int f = 0; f < 3; f++)
|
||||||
{
|
{
|
||||||
delete[] packed_fields;
|
soa3[3 * f + 0] = vars[f]->SoA[0];
|
||||||
cout << "CUDA Z4C BH tile download failed" << endl;
|
soa3[3 * f + 1] = vars[f]->SoA[1];
|
||||||
|
soa3[3 * f + 2] = vars[f]->SoA[2];
|
||||||
|
}
|
||||||
|
used_device_interp =
|
||||||
|
(z4c_cuda_interp_state_point3(block, block->shape,
|
||||||
|
k_z4c_cuda_bh_state_indices[0],
|
||||||
|
k_z4c_cuda_bh_state_indices[1],
|
||||||
|
k_z4c_cuda_bh_state_indices[2],
|
||||||
|
block->X[0][0], block->X[1][0], block->X[2][0],
|
||||||
|
DH[0], DH[1], DH[2],
|
||||||
|
x, y, z,
|
||||||
|
interp_ordn, interp_sym,
|
||||||
|
soa3, shellf) == 0);
|
||||||
|
}
|
||||||
|
if (!used_device_interp)
|
||||||
|
{
|
||||||
|
double *shift_views[3] = {
|
||||||
|
block->fgfs[forx->sgfn],
|
||||||
|
block->fgfs[fory->sgfn],
|
||||||
|
block->fgfs[forz->sgfn]};
|
||||||
|
if (z4c_cuda_download_state_subset(block, block->shape, 3,
|
||||||
|
k_z4c_cuda_bh_state_indices,
|
||||||
|
shift_views) != 0)
|
||||||
|
{
|
||||||
|
cout << "CUDA Z4C BH shift download failed" << endl;
|
||||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||||
}
|
}
|
||||||
int tile_shape[3] = {sx, sy, sz};
|
f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
|
||||||
f_global_interp(tile_shape,
|
block->fgfs[forx->sgfn], shellf[0],
|
||||||
block->X[0] + i0,
|
x, y, z, interp_ordn, forx->SoA, interp_sym);
|
||||||
block->X[1] + j0,
|
f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
|
||||||
block->X[2] + k0,
|
block->fgfs[fory->sgfn], shellf[1],
|
||||||
packed_fields + f * region_all,
|
x, y, z, interp_ordn, fory->SoA, interp_sym);
|
||||||
shellf[f],
|
f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
|
||||||
x, y, z,
|
block->fgfs[forz->sgfn], shellf[2],
|
||||||
interp_ordn,
|
x, y, z, interp_ordn, forz->SoA, interp_sym);
|
||||||
vars[f]->SoA,
|
|
||||||
interp_sym);
|
|
||||||
}
|
}
|
||||||
delete[] packed_fields;
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -53,6 +53,14 @@ int z4c_cuda_pack_state_batch_to_host_buffer(void *block_tag,
|
|||||||
int i0, int j0, int k0,
|
int i0, int j0, int k0,
|
||||||
int sx, int sy, int sz);
|
int sx, int sy, int sz);
|
||||||
|
|
||||||
|
int z4c_cuda_pack_state_batch_to_host_buffer_for_host_views(void *block_tag,
|
||||||
|
double **state_host_key,
|
||||||
|
int state_count,
|
||||||
|
double *host_buffer,
|
||||||
|
int *ex,
|
||||||
|
int i0, int j0, int k0,
|
||||||
|
int sx, int sy, int sz);
|
||||||
|
|
||||||
int z4c_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
|
int z4c_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
|
||||||
int state_count,
|
int state_count,
|
||||||
double *host_buffer,
|
double *host_buffer,
|
||||||
@@ -60,6 +68,14 @@ int z4c_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
|
|||||||
int i0, int j0, int k0,
|
int i0, int j0, int k0,
|
||||||
int sx, int sy, int sz);
|
int sx, int sy, int sz);
|
||||||
|
|
||||||
|
int z4c_cuda_unpack_state_batch_from_host_buffer_for_host_views(void *block_tag,
|
||||||
|
double **state_host_key,
|
||||||
|
int state_count,
|
||||||
|
double *host_buffer,
|
||||||
|
int *ex,
|
||||||
|
int i0, int j0, int k0,
|
||||||
|
int sx, int sy, int sz);
|
||||||
|
|
||||||
int z4c_cuda_pack_state_batch_to_device_buffer(void *block_tag,
|
int z4c_cuda_pack_state_batch_to_device_buffer(void *block_tag,
|
||||||
int state_count,
|
int state_count,
|
||||||
double *device_buffer,
|
double *device_buffer,
|
||||||
@@ -67,6 +83,14 @@ int z4c_cuda_pack_state_batch_to_device_buffer(void *block_tag,
|
|||||||
int i0, int j0, int k0,
|
int i0, int j0, int k0,
|
||||||
int sx, int sy, int sz);
|
int sx, int sy, int sz);
|
||||||
|
|
||||||
|
int z4c_cuda_pack_state_batch_to_device_buffer_for_host_views(void *block_tag,
|
||||||
|
double **state_host_key,
|
||||||
|
int state_count,
|
||||||
|
double *device_buffer,
|
||||||
|
int *ex,
|
||||||
|
int i0, int j0, int k0,
|
||||||
|
int sx, int sy, int sz);
|
||||||
|
|
||||||
int z4c_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
|
int z4c_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
|
||||||
int state_count,
|
int state_count,
|
||||||
double *device_buffer,
|
double *device_buffer,
|
||||||
@@ -74,6 +98,14 @@ int z4c_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
|
|||||||
int i0, int j0, int k0,
|
int i0, int j0, int k0,
|
||||||
int sx, int sy, int sz);
|
int sx, int sy, int sz);
|
||||||
|
|
||||||
|
int z4c_cuda_unpack_state_batch_from_device_buffer_for_host_views(void *block_tag,
|
||||||
|
double **state_host_key,
|
||||||
|
int state_count,
|
||||||
|
double *device_buffer,
|
||||||
|
int *ex,
|
||||||
|
int i0, int j0, int k0,
|
||||||
|
int sx, int sy, int sz);
|
||||||
|
|
||||||
int z4c_cuda_pack_state_segments_to_device_buffer(void *block_tag,
|
int z4c_cuda_pack_state_segments_to_device_buffer(void *block_tag,
|
||||||
int state_count,
|
int state_count,
|
||||||
double *device_buffer,
|
double *device_buffer,
|
||||||
@@ -81,6 +113,14 @@ int z4c_cuda_pack_state_segments_to_device_buffer(void *block_tag,
|
|||||||
int segment_count,
|
int segment_count,
|
||||||
const int *segment_meta);
|
const int *segment_meta);
|
||||||
|
|
||||||
|
int z4c_cuda_pack_state_segments_to_device_buffer_for_host_views(void *block_tag,
|
||||||
|
double **state_host_key,
|
||||||
|
int state_count,
|
||||||
|
double *device_buffer,
|
||||||
|
int *ex,
|
||||||
|
int segment_count,
|
||||||
|
const int *segment_meta);
|
||||||
|
|
||||||
int z4c_cuda_unpack_state_segments_from_device_buffer(void *block_tag,
|
int z4c_cuda_unpack_state_segments_from_device_buffer(void *block_tag,
|
||||||
int state_count,
|
int state_count,
|
||||||
double *device_buffer,
|
double *device_buffer,
|
||||||
@@ -88,6 +128,14 @@ int z4c_cuda_unpack_state_segments_from_device_buffer(void *block_tag,
|
|||||||
int segment_count,
|
int segment_count,
|
||||||
const int *segment_meta);
|
const int *segment_meta);
|
||||||
|
|
||||||
|
int z4c_cuda_unpack_state_segments_from_device_buffer_for_host_views(void *block_tag,
|
||||||
|
double **state_host_key,
|
||||||
|
int state_count,
|
||||||
|
double *device_buffer,
|
||||||
|
int *ex,
|
||||||
|
int segment_count,
|
||||||
|
const int *segment_meta);
|
||||||
|
|
||||||
int z4c_cuda_restrict_state_segments_to_device_buffer(void *block_tag,
|
int z4c_cuda_restrict_state_segments_to_device_buffer(void *block_tag,
|
||||||
int state_count,
|
int state_count,
|
||||||
double *device_buffer,
|
double *device_buffer,
|
||||||
@@ -96,6 +144,15 @@ int z4c_cuda_restrict_state_segments_to_device_buffer(void *block_tag,
|
|||||||
const int *segment_meta,
|
const int *segment_meta,
|
||||||
const double *state_soa);
|
const double *state_soa);
|
||||||
|
|
||||||
|
int z4c_cuda_restrict_state_segments_to_device_buffer_for_host_views(void *block_tag,
|
||||||
|
double **state_host_key,
|
||||||
|
int state_count,
|
||||||
|
double *device_buffer,
|
||||||
|
int *ex,
|
||||||
|
int segment_count,
|
||||||
|
const int *segment_meta,
|
||||||
|
const double *state_soa);
|
||||||
|
|
||||||
int z4c_cuda_prolong_state_segments_to_device_buffer(void *block_tag,
|
int z4c_cuda_prolong_state_segments_to_device_buffer(void *block_tag,
|
||||||
int state_count,
|
int state_count,
|
||||||
double *device_buffer,
|
double *device_buffer,
|
||||||
@@ -104,6 +161,15 @@ int z4c_cuda_prolong_state_segments_to_device_buffer(void *block_tag,
|
|||||||
const int *segment_meta,
|
const int *segment_meta,
|
||||||
const double *state_soa);
|
const double *state_soa);
|
||||||
|
|
||||||
|
int z4c_cuda_prolong_state_segments_to_device_buffer_for_host_views(void *block_tag,
|
||||||
|
double **state_host_key,
|
||||||
|
int state_count,
|
||||||
|
double *device_buffer,
|
||||||
|
int *ex,
|
||||||
|
int segment_count,
|
||||||
|
const int *segment_meta,
|
||||||
|
const double *state_soa);
|
||||||
|
|
||||||
int z4c_cuda_restrict_state_batch_to_device_buffer(void *block_tag,
|
int z4c_cuda_restrict_state_batch_to_device_buffer(void *block_tag,
|
||||||
int state_count,
|
int state_count,
|
||||||
double *device_buffer,
|
double *device_buffer,
|
||||||
@@ -112,6 +178,15 @@ int z4c_cuda_restrict_state_batch_to_device_buffer(void *block_tag,
|
|||||||
int fi0, int fj0, int fk0,
|
int fi0, int fj0, int fk0,
|
||||||
const double *state_soa);
|
const double *state_soa);
|
||||||
|
|
||||||
|
int z4c_cuda_restrict_state_batch_to_device_buffer_for_host_views(void *block_tag,
|
||||||
|
double **state_host_key,
|
||||||
|
int state_count,
|
||||||
|
double *device_buffer,
|
||||||
|
int *ex,
|
||||||
|
int sx, int sy, int sz,
|
||||||
|
int fi0, int fj0, int fk0,
|
||||||
|
const double *state_soa);
|
||||||
|
|
||||||
int z4c_cuda_prolong_state_batch_to_device_buffer(void *block_tag,
|
int z4c_cuda_prolong_state_batch_to_device_buffer(void *block_tag,
|
||||||
int state_count,
|
int state_count,
|
||||||
double *device_buffer,
|
double *device_buffer,
|
||||||
@@ -121,6 +196,16 @@ int z4c_cuda_prolong_state_batch_to_device_buffer(void *block_tag,
|
|||||||
int lbc_i, int lbc_j, int lbc_k,
|
int lbc_i, int lbc_j, int lbc_k,
|
||||||
const double *state_soa);
|
const double *state_soa);
|
||||||
|
|
||||||
|
int z4c_cuda_prolong_state_batch_to_device_buffer_for_host_views(void *block_tag,
|
||||||
|
double **state_host_key,
|
||||||
|
int state_count,
|
||||||
|
double *device_buffer,
|
||||||
|
int *ex,
|
||||||
|
int sx, int sy, int sz,
|
||||||
|
int ii0, int jj0, int kk0,
|
||||||
|
int lbc_i, int lbc_j, int lbc_k,
|
||||||
|
const double *state_soa);
|
||||||
|
|
||||||
int z4c_cuda_download_state_subset(void *block_tag,
|
int z4c_cuda_download_state_subset(void *block_tag,
|
||||||
int *ex,
|
int *ex,
|
||||||
int subset_count,
|
int subset_count,
|
||||||
@@ -138,6 +223,25 @@ int z4c_cuda_compute_constraints_resident(void *block_tag,
|
|||||||
int Symmetry, double eps, int co,
|
int Symmetry, double eps, int co,
|
||||||
double **constraint_host_out);
|
double **constraint_host_out);
|
||||||
|
|
||||||
|
int z4c_cuda_interp_state_point3(void *block_tag,
|
||||||
|
int *ex,
|
||||||
|
int state0,
|
||||||
|
int state1,
|
||||||
|
int state2,
|
||||||
|
double x0,
|
||||||
|
double y0,
|
||||||
|
double z0,
|
||||||
|
double dx,
|
||||||
|
double dy,
|
||||||
|
double dz,
|
||||||
|
double px,
|
||||||
|
double py,
|
||||||
|
double pz,
|
||||||
|
int ordn,
|
||||||
|
int symmetry,
|
||||||
|
const double *soa3,
|
||||||
|
double *out3);
|
||||||
|
|
||||||
int z4c_cuda_download_constraint_outputs(int *ex,
|
int z4c_cuda_download_constraint_outputs(int *ex,
|
||||||
double **constraint_host_out);
|
double **constraint_host_out);
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user