Checkpoint Z4C CUDA resident sync progress

This commit is contained in:
2026-05-02 10:53:52 +08:00
parent ba61702fc0
commit 52beb4d153
4 changed files with 1075 additions and 134 deletions

View File

@@ -424,14 +424,7 @@ bool cuda_can_direct_unpack(const Parallel::gridseg *dst, int type, MyList<var>
return false; return false;
if (z4c_cuda_has_resident_state(dst->Bg) == 0) if (z4c_cuda_has_resident_state(dst->Bg) == 0)
return false; return false;
if (type != 1 && VarListd) (void)VarListd;
{
double *view_ptrs[Z4C_CUDA_STATE_COUNT];
if (!cuda_build_z4c_host_views(dst->Bg, VarListd, Z4C_CUDA_STATE_COUNT, view_ptrs))
return false;
if (z4c_cuda_resident_state_matches(dst->Bg, view_ptrs) == 0)
return false;
}
return true; return true;
#elif USE_CUDA_BSSN #elif USE_CUDA_BSSN
return bssn_cuda_has_resident_state(dst->Bg) != 0; return bssn_cuda_has_resident_state(dst->Bg) != 0;
@@ -460,9 +453,16 @@ bool cuda_direct_pack_segment(double *buffer,
const int j0 = cuda_seg_begin(dst, src->Bg, 1); const int j0 = cuda_seg_begin(dst, src->Bg, 1);
const int k0 = cuda_seg_begin(dst, src->Bg, 2); const int k0 = cuda_seg_begin(dst, src->Bg, 2);
#if USE_CUDA_Z4C && (ABEtype == 2) #if USE_CUDA_Z4C && (ABEtype == 2)
const bool ok = z4c_cuda_pack_state_batch_to_host_buffer(src->Bg, state_count, buffer, src->Bg->shape, double *views[Z4C_CUDA_STATE_COUNT];
i0, j0, k0, const bool have_views = cuda_build_z4c_host_views(src->Bg, VarLists, state_count, views);
dst->shape[0], dst->shape[1], dst->shape[2]) == 0; const bool ok = have_views
? z4c_cuda_pack_state_batch_to_host_buffer_for_host_views(
src->Bg, views, state_count, buffer, src->Bg->shape,
i0, j0, k0,
dst->shape[0], dst->shape[1], dst->shape[2]) == 0
: z4c_cuda_pack_state_batch_to_host_buffer(src->Bg, state_count, buffer, src->Bg->shape,
i0, j0, k0,
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
#else #else
double *views[BSSN_CUDA_STATE_COUNT]; double *views[BSSN_CUDA_STATE_COUNT];
const bool have_views = cuda_build_bssn_host_views(src->Bg, VarLists, state_count, views); const bool have_views = cuda_build_bssn_host_views(src->Bg, VarLists, state_count, views);
@@ -500,9 +500,16 @@ bool cuda_direct_unpack_segment(double *buffer,
const int j0 = cuda_seg_begin(dst, dst->Bg, 1); const int j0 = cuda_seg_begin(dst, dst->Bg, 1);
const int k0 = cuda_seg_begin(dst, dst->Bg, 2); const int k0 = cuda_seg_begin(dst, dst->Bg, 2);
#if USE_CUDA_Z4C && (ABEtype == 2) #if USE_CUDA_Z4C && (ABEtype == 2)
const bool ok = z4c_cuda_unpack_state_batch_from_host_buffer(dst->Bg, state_count, buffer, dst->Bg->shape, double *views[Z4C_CUDA_STATE_COUNT];
i0, j0, k0, const bool have_views = cuda_build_z4c_host_views(dst->Bg, VarListd, state_count, views);
dst->shape[0], dst->shape[1], dst->shape[2]) == 0; const bool ok = have_views
? z4c_cuda_unpack_state_batch_from_host_buffer_for_host_views(
dst->Bg, views, state_count, buffer, dst->Bg->shape,
i0, j0, k0,
dst->shape[0], dst->shape[1], dst->shape[2]) == 0
: z4c_cuda_unpack_state_batch_from_host_buffer(dst->Bg, state_count, buffer, dst->Bg->shape,
i0, j0, k0,
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
#else #else
double *views[BSSN_CUDA_STATE_COUNT]; double *views[BSSN_CUDA_STATE_COUNT];
const bool have_views = cuda_build_bssn_host_views(dst->Bg, VarListd, state_count, views); const bool have_views = cuda_build_bssn_host_views(dst->Bg, VarListd, state_count, views);
@@ -703,40 +710,60 @@ bool cuda_direct_pack_segment_to_device(double *buffer,
{ {
const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0; const double t0 = sync_profile_enabled() ? MPI_Wtime() : 0.0;
bool ok = false; bool ok = false;
double *views[Z4C_CUDA_STATE_COUNT];
double soa_flat[3 * Z4C_CUDA_STATE_COUNT]; double soa_flat[3 * Z4C_CUDA_STATE_COUNT];
const bool have_views = cuda_build_z4c_host_views(src->Bg, VarLists, state_count, views);
const bool have_soa = cuda_build_state_soa(VarLists, state_count, soa_flat); const bool have_soa = cuda_build_state_soa(VarLists, state_count, soa_flat);
if (type == 1) if (type == 1)
{ {
const int i0 = cuda_seg_begin(dst, src->Bg, 0); const int i0 = cuda_seg_begin(dst, src->Bg, 0);
const int j0 = cuda_seg_begin(dst, src->Bg, 1); const int j0 = cuda_seg_begin(dst, src->Bg, 1);
const int k0 = cuda_seg_begin(dst, src->Bg, 2); const int k0 = cuda_seg_begin(dst, src->Bg, 2);
ok = z4c_cuda_pack_state_batch_to_device_buffer( ok = have_views
src->Bg, state_count, buffer, src->Bg->shape, ? z4c_cuda_pack_state_batch_to_device_buffer_for_host_views(
i0, j0, k0, src->Bg, views, state_count, buffer, src->Bg->shape,
dst->shape[0], dst->shape[1], dst->shape[2]) == 0; i0, j0, k0,
dst->shape[0], dst->shape[1], dst->shape[2]) == 0
: z4c_cuda_pack_state_batch_to_device_buffer(
src->Bg, state_count, buffer, src->Bg->shape,
i0, j0, k0,
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
} }
else if (type == 2) else if (type == 2)
{ {
int first_fine[3]; int first_fine[3];
if (!cuda_cell_gw3_restrict_params(src, dst, first_fine)) if (!cuda_cell_gw3_restrict_params(src, dst, first_fine))
return false; return false;
ok = z4c_cuda_restrict_state_batch_to_device_buffer( ok = have_views
src->Bg, state_count, buffer, src->Bg->shape, ? z4c_cuda_restrict_state_batch_to_device_buffer_for_host_views(
dst->shape[0], dst->shape[1], dst->shape[2], src->Bg, views, state_count, buffer, src->Bg->shape,
first_fine[0], first_fine[1], first_fine[2], dst->shape[0], dst->shape[1], dst->shape[2],
have_soa ? soa_flat : 0) == 0; first_fine[0], first_fine[1], first_fine[2],
have_soa ? soa_flat : 0) == 0
: z4c_cuda_restrict_state_batch_to_device_buffer(
src->Bg, state_count, buffer, src->Bg->shape,
dst->shape[0], dst->shape[1], dst->shape[2],
first_fine[0], first_fine[1], first_fine[2],
have_soa ? soa_flat : 0) == 0;
} }
else if (type == 3) else if (type == 3)
{ {
int first_fine_ii[3], coarse_lb[3]; int first_fine_ii[3], coarse_lb[3];
if (!cuda_cell_gw3_prolong_params(src, dst, first_fine_ii, coarse_lb)) if (!cuda_cell_gw3_prolong_params(src, dst, first_fine_ii, coarse_lb))
return false; return false;
ok = z4c_cuda_prolong_state_batch_to_device_buffer( ok = have_views
src->Bg, state_count, buffer, src->Bg->shape, ? z4c_cuda_prolong_state_batch_to_device_buffer_for_host_views(
dst->shape[0], dst->shape[1], dst->shape[2], src->Bg, views, state_count, buffer, src->Bg->shape,
first_fine_ii[0], first_fine_ii[1], first_fine_ii[2], dst->shape[0], dst->shape[1], dst->shape[2],
coarse_lb[0], coarse_lb[1], coarse_lb[2], first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
have_soa ? soa_flat : 0) == 0; coarse_lb[0], coarse_lb[1], coarse_lb[2],
have_soa ? soa_flat : 0) == 0
: z4c_cuda_prolong_state_batch_to_device_buffer(
src->Bg, state_count, buffer, src->Bg->shape,
dst->shape[0], dst->shape[1], dst->shape[2],
first_fine_ii[0], first_fine_ii[1], first_fine_ii[2],
coarse_lb[0], coarse_lb[1], coarse_lb[2],
have_soa ? soa_flat : 0) == 0;
} }
if (sync_profile_enabled()) if (sync_profile_enabled())
sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0; sync_profile_stats().direct_pack_sec += MPI_Wtime() - t0;
@@ -919,10 +946,17 @@ bool cuda_direct_unpack_segment_from_device(double *buffer,
const int i0 = cuda_seg_begin(dst, dst->Bg, 0); const int i0 = cuda_seg_begin(dst, dst->Bg, 0);
const int j0 = cuda_seg_begin(dst, dst->Bg, 1); const int j0 = cuda_seg_begin(dst, dst->Bg, 1);
const int k0 = cuda_seg_begin(dst, dst->Bg, 2); const int k0 = cuda_seg_begin(dst, dst->Bg, 2);
const bool ok = z4c_cuda_unpack_state_batch_from_device_buffer( double *views[Z4C_CUDA_STATE_COUNT];
dst->Bg, state_count, buffer, dst->Bg->shape, const bool have_views = cuda_build_z4c_host_views(dst->Bg, VarListd, state_count, views);
i0, j0, k0, const bool ok = have_views
dst->shape[0], dst->shape[1], dst->shape[2]) == 0; ? z4c_cuda_unpack_state_batch_from_device_buffer_for_host_views(
dst->Bg, views, state_count, buffer, dst->Bg->shape,
i0, j0, k0,
dst->shape[0], dst->shape[1], dst->shape[2]) == 0
: z4c_cuda_unpack_state_batch_from_device_buffer(
dst->Bg, state_count, buffer, dst->Bg->shape,
i0, j0, k0,
dst->shape[0], dst->shape[1], dst->shape[2]) == 0;
if (sync_profile_enabled()) if (sync_profile_enabled())
sync_profile_stats().direct_unpack_sec += MPI_Wtime() - t0; sync_profile_stats().direct_unpack_sec += MPI_Wtime() - t0;
return ok; return ok;
@@ -1074,23 +1108,39 @@ bool cuda_flush_device_segment_batch(Block *block,
#if USE_CUDA_Z4C && (ABEtype == 2) #if USE_CUDA_Z4C && (ABEtype == 2)
if (state_count == Z4C_CUDA_STATE_COUNT) if (state_count == Z4C_CUDA_STATE_COUNT)
{ {
double *views[Z4C_CUDA_STATE_COUNT];
double soa_flat[3 * Z4C_CUDA_STATE_COUNT]; double soa_flat[3 * Z4C_CUDA_STATE_COUNT];
const bool have_views = cuda_build_z4c_host_views(block, vars, state_count, views);
const bool have_soa = cuda_build_state_soa(vars, state_count, soa_flat); const bool have_soa = cuda_build_state_soa(vars, state_count, soa_flat);
if (dir == PACK) if (dir == PACK)
{ {
if (type == 2) if (type == 2)
return z4c_cuda_restrict_state_segments_to_device_buffer( return have_views
block, state_count, data, block->shape, segment_count, ? z4c_cuda_restrict_state_segments_to_device_buffer_for_host_views(
meta.data(), have_soa ? soa_flat : 0) == 0; block, views, state_count, data, block->shape, segment_count,
meta.data(), have_soa ? soa_flat : 0) == 0
: z4c_cuda_restrict_state_segments_to_device_buffer(
block, state_count, data, block->shape, segment_count,
meta.data(), have_soa ? soa_flat : 0) == 0;
if (type == 3) if (type == 3)
return z4c_cuda_prolong_state_segments_to_device_buffer( return have_views
block, state_count, data, block->shape, segment_count, ? z4c_cuda_prolong_state_segments_to_device_buffer_for_host_views(
meta.data(), have_soa ? soa_flat : 0) == 0; block, views, state_count, data, block->shape, segment_count,
return z4c_cuda_pack_state_segments_to_device_buffer( meta.data(), have_soa ? soa_flat : 0) == 0
block, state_count, data, block->shape, segment_count, meta.data()) == 0; : z4c_cuda_prolong_state_segments_to_device_buffer(
block, state_count, data, block->shape, segment_count,
meta.data(), have_soa ? soa_flat : 0) == 0;
return have_views
? z4c_cuda_pack_state_segments_to_device_buffer_for_host_views(
block, views, state_count, data, block->shape, segment_count, meta.data()) == 0
: z4c_cuda_pack_state_segments_to_device_buffer(
block, state_count, data, block->shape, segment_count, meta.data()) == 0;
} }
return z4c_cuda_unpack_state_segments_from_device_buffer( return have_views
block, state_count, data, block->shape, segment_count, meta.data()) == 0; ? z4c_cuda_unpack_state_segments_from_device_buffer_for_host_views(
block, views, state_count, data, block->shape, segment_count, meta.data()) == 0
: z4c_cuda_unpack_state_segments_from_device_buffer(
block, state_count, data, block->shape, segment_count, meta.data()) == 0;
} }
#endif #endif
#if USE_CUDA_BSSN #if USE_CUDA_BSSN
@@ -5294,7 +5344,7 @@ int Parallel::data_packer(double *data, MyList<Parallel::gridseg> *src, MyList<P
dst->data->llb, dst->data->uub, dst->data->shape, data + size_out, dst->data->llb, dst->data->uub, dst->data->shape, data + size_out,
dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry); dst->data->llb, dst->data->uub, varls->data->SoA, Symmetry);
} }
} }
if (dir == UNPACK) // from target data to corresponding grid if (dir == UNPACK) // from target data to corresponding grid
{ {
f_copy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn], f_copy(DIM, dst->data->Bg->bbox, dst->data->Bg->bbox + dim, dst->data->Bg->shape, dst->data->Bg->fgfs[varld->data->sgfn],

View File

@@ -388,41 +388,57 @@ bool z4c_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
if (z4c_cuda_has_resident_state(block) && if (z4c_cuda_has_resident_state(block) &&
block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn) block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn)
{ {
const int sx = ordn;
const int sy = ordn;
const int sz = ordn;
const int region_all = sx * sy * sz;
const int i0 = z4c_cuda_interp_tile_start(block->X[0], block->shape[0], x, DH[0], ordn);
const int j0 = z4c_cuda_interp_tile_start(block->X[1], block->shape[1], y, DH[1], ordn);
const int k0 = z4c_cuda_interp_tile_start(block->X[2], block->shape[2], z, DH[2], ordn);
double *packed_fields = new double[3 * region_all];
var *vars[3] = {forx, fory, forz}; var *vars[3] = {forx, fory, forz};
for (int f = 0; f < 3; f++) static int use_device_bh_interp = -1;
if (use_device_bh_interp < 0)
{ {
if (z4c_cuda_pack_state_region_to_host_buffer(block, const char *env = getenv("AMSS_CUDA_Z4C_BH_INTERP_DEVICE");
k_z4c_cuda_bh_state_indices[f], use_device_bh_interp = (env && atoi(env) != 0) ? 1 : 0;
packed_fields + f * region_all, }
block->shape, bool used_device_interp = false;
i0, j0, k0, if (use_device_bh_interp)
sx, sy, sz) != 0) {
double soa3[9];
for (int f = 0; f < 3; f++)
{ {
delete[] packed_fields; soa3[3 * f + 0] = vars[f]->SoA[0];
cout << "CUDA Z4C BH tile download failed" << endl; soa3[3 * f + 1] = vars[f]->SoA[1];
soa3[3 * f + 2] = vars[f]->SoA[2];
}
used_device_interp =
(z4c_cuda_interp_state_point3(block, block->shape,
k_z4c_cuda_bh_state_indices[0],
k_z4c_cuda_bh_state_indices[1],
k_z4c_cuda_bh_state_indices[2],
block->X[0][0], block->X[1][0], block->X[2][0],
DH[0], DH[1], DH[2],
x, y, z,
interp_ordn, interp_sym,
soa3, shellf) == 0);
}
if (!used_device_interp)
{
double *shift_views[3] = {
block->fgfs[forx->sgfn],
block->fgfs[fory->sgfn],
block->fgfs[forz->sgfn]};
if (z4c_cuda_download_state_subset(block, block->shape, 3,
k_z4c_cuda_bh_state_indices,
shift_views) != 0)
{
cout << "CUDA Z4C BH shift download failed" << endl;
MPI_Abort(MPI_COMM_WORLD, 1); MPI_Abort(MPI_COMM_WORLD, 1);
} }
int tile_shape[3] = {sx, sy, sz}; f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
f_global_interp(tile_shape, block->fgfs[forx->sgfn], shellf[0],
block->X[0] + i0, x, y, z, interp_ordn, forx->SoA, interp_sym);
block->X[1] + j0, f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
block->X[2] + k0, block->fgfs[fory->sgfn], shellf[1],
packed_fields + f * region_all, x, y, z, interp_ordn, fory->SoA, interp_sym);
shellf[f], f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
x, y, z, block->fgfs[forz->sgfn], shellf[2],
interp_ordn, x, y, z, interp_ordn, forz->SoA, interp_sym);
vars[f]->SoA,
interp_sym);
} }
delete[] packed_fields;
} }
else else
{ {

File diff suppressed because it is too large Load Diff

View File

@@ -53,6 +53,14 @@ int z4c_cuda_pack_state_batch_to_host_buffer(void *block_tag,
int i0, int j0, int k0, int i0, int j0, int k0,
int sx, int sy, int sz); int sx, int sy, int sz);
// Variant of z4c_cuda_pack_state_batch_to_host_buffer that takes an extra
// state_host_key array (one double* per state). Only the declaration is visible
// here; the notes below describe how the caller in this change uses it.
//
// Caller (cuda_direct_pack_segment): invoked only when cuda_build_z4c_host_views
// succeeds for the source block; otherwise the non-"_for_host_views" function is
// called with the same remaining arguments.
//   block_tag      - the source block (src->Bg).
//   state_host_key - the pointer array filled by cuda_build_z4c_host_views.
//   state_count    - number of states passed to cuda_build_z4c_host_views.
//   host_buffer    - the pack buffer handed to cuda_direct_pack_segment.
//   ex             - the source block's shape array.
//   i0, j0, k0     - start indices the caller obtains from cuda_seg_begin.
//   sx, sy, sz     - dst->shape[0..2].
// Return: the caller treats 0 as success and any other value as failure.
int z4c_cuda_pack_state_batch_to_host_buffer_for_host_views(void *block_tag,
double **state_host_key,
int state_count,
double *host_buffer,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz);
int z4c_cuda_unpack_state_batch_from_host_buffer(void *block_tag, int z4c_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
int state_count, int state_count,
double *host_buffer, double *host_buffer,
@@ -60,6 +68,14 @@ int z4c_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
int i0, int j0, int k0, int i0, int j0, int k0,
int sx, int sy, int sz); int sx, int sy, int sz);
// Variant of z4c_cuda_unpack_state_batch_from_host_buffer that takes an extra
// state_host_key array (one double* per state). Only the declaration is visible
// here; the notes below describe how the caller in this change uses it.
//
// Caller (cuda_direct_unpack_segment): invoked only when
// cuda_build_z4c_host_views succeeds for the destination block; otherwise the
// non-"_for_host_views" function is called with the same remaining arguments.
//   block_tag      - the destination block (dst->Bg).
//   state_host_key - the pointer array filled by cuda_build_z4c_host_views.
//   state_count    - number of states passed to cuda_build_z4c_host_views.
//   host_buffer    - the buffer handed to cuda_direct_unpack_segment.
//   ex             - the destination block's shape array.
//   i0, j0, k0     - start indices the caller obtains from cuda_seg_begin.
//   sx, sy, sz     - dst->shape[0..2].
// Return: the caller treats 0 as success and any other value as failure.
int z4c_cuda_unpack_state_batch_from_host_buffer_for_host_views(void *block_tag,
double **state_host_key,
int state_count,
double *host_buffer,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz);
int z4c_cuda_pack_state_batch_to_device_buffer(void *block_tag, int z4c_cuda_pack_state_batch_to_device_buffer(void *block_tag,
int state_count, int state_count,
double *device_buffer, double *device_buffer,
@@ -67,6 +83,14 @@ int z4c_cuda_pack_state_batch_to_device_buffer(void *block_tag,
int i0, int j0, int k0, int i0, int j0, int k0,
int sx, int sy, int sz); int sx, int sy, int sz);
// Variant of z4c_cuda_pack_state_batch_to_device_buffer that takes an extra
// state_host_key array (one double* per state). Only the declaration is visible
// here; the notes below describe how the caller in this change uses it.
//
// Caller (cuda_direct_pack_segment_to_device, type == 1 branch): invoked only
// when cuda_build_z4c_host_views succeeds for the source block; otherwise the
// non-"_for_host_views" function is called with the same remaining arguments.
//   block_tag      - the source block (src->Bg).
//   state_host_key - the pointer array filled by cuda_build_z4c_host_views.
//   state_count    - number of states passed to cuda_build_z4c_host_views.
//   device_buffer  - the caller's `buffer` argument (presumably a device-memory
//                    pointer, judging by the name - confirm against the
//                    implementation).
//   ex             - the source block's shape array.
//   i0, j0, k0     - cuda_seg_begin(dst, src->Bg, 0/1/2).
//   sx, sy, sz     - dst->shape[0..2].
// Return: the caller treats 0 as success and any other value as failure.
int z4c_cuda_pack_state_batch_to_device_buffer_for_host_views(void *block_tag,
double **state_host_key,
int state_count,
double *device_buffer,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz);
int z4c_cuda_unpack_state_batch_from_device_buffer(void *block_tag, int z4c_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
int state_count, int state_count,
double *device_buffer, double *device_buffer,
@@ -74,6 +98,14 @@ int z4c_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
int i0, int j0, int k0, int i0, int j0, int k0,
int sx, int sy, int sz); int sx, int sy, int sz);
// Variant of z4c_cuda_unpack_state_batch_from_device_buffer that takes an extra
// state_host_key array (one double* per state). Only the declaration is visible
// here; the notes below describe how the caller in this change uses it.
//
// Caller (cuda_direct_unpack_segment_from_device): invoked only when
// cuda_build_z4c_host_views succeeds for the destination block; otherwise the
// non-"_for_host_views" function is called with the same remaining arguments.
//   block_tag      - the destination block (dst->Bg).
//   state_host_key - the pointer array filled by cuda_build_z4c_host_views.
//   state_count    - number of states passed to cuda_build_z4c_host_views.
//   device_buffer  - the caller's `buffer` argument (presumably a device-memory
//                    pointer, judging by the name - confirm against the
//                    implementation).
//   ex             - the destination block's shape array.
//   i0, j0, k0     - cuda_seg_begin(dst, dst->Bg, 0/1/2).
//   sx, sy, sz     - dst->shape[0..2].
// Return: the caller treats 0 as success and any other value as failure.
int z4c_cuda_unpack_state_batch_from_device_buffer_for_host_views(void *block_tag,
double **state_host_key,
int state_count,
double *device_buffer,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz);
int z4c_cuda_pack_state_segments_to_device_buffer(void *block_tag, int z4c_cuda_pack_state_segments_to_device_buffer(void *block_tag,
int state_count, int state_count,
double *device_buffer, double *device_buffer,
@@ -81,6 +113,14 @@ int z4c_cuda_pack_state_segments_to_device_buffer(void *block_tag,
int segment_count, int segment_count,
const int *segment_meta); const int *segment_meta);
// Variant of z4c_cuda_pack_state_segments_to_device_buffer that takes an extra
// state_host_key array (one double* per state). Only the declaration is visible
// here; the notes below describe how the caller in this change uses it.
//
// Caller (cuda_flush_device_segment_batch): reached when
// state_count == Z4C_CUDA_STATE_COUNT, dir == PACK and type is neither 2 nor 3,
// and only if cuda_build_z4c_host_views succeeded for the block; otherwise the
// non-"_for_host_views" function is called with the same remaining arguments.
//   block_tag      - the block being flushed.
//   state_host_key - the pointer array filled by cuda_build_z4c_host_views.
//   state_count    - number of states passed to cuda_build_z4c_host_views.
//   device_buffer  - the caller's `data` pointer.
//   ex             - block->shape.
//   segment_count  - number of segments in this batch.
//   segment_meta   - meta.data(); the per-segment layout of this array is not
//                    visible here (TODO: document once confirmed).
// Return: the caller treats 0 as success and any other value as failure.
int z4c_cuda_pack_state_segments_to_device_buffer_for_host_views(void *block_tag,
double **state_host_key,
int state_count,
double *device_buffer,
int *ex,
int segment_count,
const int *segment_meta);
int z4c_cuda_unpack_state_segments_from_device_buffer(void *block_tag, int z4c_cuda_unpack_state_segments_from_device_buffer(void *block_tag,
int state_count, int state_count,
double *device_buffer, double *device_buffer,
@@ -88,6 +128,14 @@ int z4c_cuda_unpack_state_segments_from_device_buffer(void *block_tag,
int segment_count, int segment_count,
const int *segment_meta); const int *segment_meta);
// Variant of z4c_cuda_unpack_state_segments_from_device_buffer that takes an
// extra state_host_key array (one double* per state). Only the declaration is
// visible here; the notes below describe how the caller in this change uses it.
//
// Caller (cuda_flush_device_segment_batch): reached when
// state_count == Z4C_CUDA_STATE_COUNT and dir is not PACK, and only if
// cuda_build_z4c_host_views succeeded for the block; otherwise the
// non-"_for_host_views" function is called with the same remaining arguments.
//   block_tag      - the block being flushed.
//   state_host_key - the pointer array filled by cuda_build_z4c_host_views.
//   state_count    - number of states passed to cuda_build_z4c_host_views.
//   device_buffer  - the caller's `data` pointer.
//   ex             - block->shape.
//   segment_count  - number of segments in this batch.
//   segment_meta   - meta.data(); the per-segment layout of this array is not
//                    visible here (TODO: document once confirmed).
// Return: the caller treats 0 as success and any other value as failure.
int z4c_cuda_unpack_state_segments_from_device_buffer_for_host_views(void *block_tag,
double **state_host_key,
int state_count,
double *device_buffer,
int *ex,
int segment_count,
const int *segment_meta);
int z4c_cuda_restrict_state_segments_to_device_buffer(void *block_tag, int z4c_cuda_restrict_state_segments_to_device_buffer(void *block_tag,
int state_count, int state_count,
double *device_buffer, double *device_buffer,
@@ -96,6 +144,15 @@ int z4c_cuda_restrict_state_segments_to_device_buffer(void *block_tag,
const int *segment_meta, const int *segment_meta,
const double *state_soa); const double *state_soa);
// Variant of z4c_cuda_restrict_state_segments_to_device_buffer that takes an
// extra state_host_key array (one double* per state). Only the declaration is
// visible here; the notes below describe how the caller in this change uses it.
//
// Caller (cuda_flush_device_segment_batch): reached when
// state_count == Z4C_CUDA_STATE_COUNT, dir == PACK and type == 2, and only if
// cuda_build_z4c_host_views succeeded for the block; otherwise the
// non-"_for_host_views" function is called with the same remaining arguments.
//   block_tag      - the block being flushed.
//   state_host_key - the pointer array filled by cuda_build_z4c_host_views.
//   state_count    - number of states passed to cuda_build_z4c_host_views.
//   device_buffer  - the caller's `data` pointer.
//   ex             - block->shape.
//   segment_count  - number of segments in this batch.
//   segment_meta   - meta.data(); layout not visible here.
//   state_soa      - the caller's soa_flat array (3 doubles per state) when
//                    cuda_build_state_soa succeeded, otherwise a null pointer.
// Return: the caller treats 0 as success and any other value as failure.
int z4c_cuda_restrict_state_segments_to_device_buffer_for_host_views(void *block_tag,
double **state_host_key,
int state_count,
double *device_buffer,
int *ex,
int segment_count,
const int *segment_meta,
const double *state_soa);
int z4c_cuda_prolong_state_segments_to_device_buffer(void *block_tag, int z4c_cuda_prolong_state_segments_to_device_buffer(void *block_tag,
int state_count, int state_count,
double *device_buffer, double *device_buffer,
@@ -104,6 +161,15 @@ int z4c_cuda_prolong_state_segments_to_device_buffer(void *block_tag,
const int *segment_meta, const int *segment_meta,
const double *state_soa); const double *state_soa);
// Variant of z4c_cuda_prolong_state_segments_to_device_buffer that takes an
// extra state_host_key array (one double* per state). Only the declaration is
// visible here; the notes below describe how the caller in this change uses it.
//
// Caller (cuda_flush_device_segment_batch): reached when
// state_count == Z4C_CUDA_STATE_COUNT, dir == PACK and type == 3, and only if
// cuda_build_z4c_host_views succeeded for the block; otherwise the
// non-"_for_host_views" function is called with the same remaining arguments.
//   block_tag      - the block being flushed.
//   state_host_key - the pointer array filled by cuda_build_z4c_host_views.
//   state_count    - number of states passed to cuda_build_z4c_host_views.
//   device_buffer  - the caller's `data` pointer.
//   ex             - block->shape.
//   segment_count  - number of segments in this batch.
//   segment_meta   - meta.data(); layout not visible here.
//   state_soa      - the caller's soa_flat array (3 doubles per state) when
//                    cuda_build_state_soa succeeded, otherwise a null pointer.
// Return: the caller treats 0 as success and any other value as failure.
int z4c_cuda_prolong_state_segments_to_device_buffer_for_host_views(void *block_tag,
double **state_host_key,
int state_count,
double *device_buffer,
int *ex,
int segment_count,
const int *segment_meta,
const double *state_soa);
int z4c_cuda_restrict_state_batch_to_device_buffer(void *block_tag, int z4c_cuda_restrict_state_batch_to_device_buffer(void *block_tag,
int state_count, int state_count,
double *device_buffer, double *device_buffer,
@@ -112,6 +178,15 @@ int z4c_cuda_restrict_state_batch_to_device_buffer(void *block_tag,
int fi0, int fj0, int fk0, int fi0, int fj0, int fk0,
const double *state_soa); const double *state_soa);
// Variant of z4c_cuda_restrict_state_batch_to_device_buffer that takes an extra
// state_host_key array (one double* per state). Only the declaration is visible
// here; the notes below describe how the caller in this change uses it.
//
// Caller (cuda_direct_pack_segment_to_device, type == 2 branch): invoked only
// when cuda_build_z4c_host_views succeeds for the source block; otherwise the
// non-"_for_host_views" function is called with the same remaining arguments.
//   block_tag      - the source block (src->Bg).
//   state_host_key - the pointer array filled by cuda_build_z4c_host_views.
//   state_count    - number of states passed to cuda_build_z4c_host_views.
//   device_buffer  - the caller's `buffer` argument.
//   ex             - the source block's shape array.
//   sx, sy, sz     - dst->shape[0..2].
//   fi0, fj0, fk0  - first_fine[0..2] as filled by
//                    cuda_cell_gw3_restrict_params(src, dst, first_fine).
//   state_soa      - the caller's soa_flat array (3 doubles per state) when
//                    cuda_build_state_soa succeeded, otherwise a null pointer.
// Return: the caller treats 0 as success and any other value as failure.
int z4c_cuda_restrict_state_batch_to_device_buffer_for_host_views(void *block_tag,
double **state_host_key,
int state_count,
double *device_buffer,
int *ex,
int sx, int sy, int sz,
int fi0, int fj0, int fk0,
const double *state_soa);
int z4c_cuda_prolong_state_batch_to_device_buffer(void *block_tag, int z4c_cuda_prolong_state_batch_to_device_buffer(void *block_tag,
int state_count, int state_count,
double *device_buffer, double *device_buffer,
@@ -121,6 +196,16 @@ int z4c_cuda_prolong_state_batch_to_device_buffer(void *block_tag,
int lbc_i, int lbc_j, int lbc_k, int lbc_i, int lbc_j, int lbc_k,
const double *state_soa); const double *state_soa);
// Variant of z4c_cuda_prolong_state_batch_to_device_buffer that takes an extra
// state_host_key array (one double* per state). Only the declaration is visible
// here; the notes below describe how the caller in this change uses it.
//
// Caller (cuda_direct_pack_segment_to_device, type == 3 branch): invoked only
// when cuda_build_z4c_host_views succeeds for the source block; otherwise the
// non-"_for_host_views" function is called with the same remaining arguments.
//   block_tag         - the source block (src->Bg).
//   state_host_key    - the pointer array filled by cuda_build_z4c_host_views.
//   state_count       - number of states passed to cuda_build_z4c_host_views.
//   device_buffer     - the caller's `buffer` argument.
//   ex                - the source block's shape array.
//   sx, sy, sz        - dst->shape[0..2].
//   ii0, jj0, kk0     - first_fine_ii[0..2] from cuda_cell_gw3_prolong_params.
//   lbc_i/lbc_j/lbc_k - coarse_lb[0..2] from cuda_cell_gw3_prolong_params.
//   state_soa         - the caller's soa_flat array (3 doubles per state) when
//                       cuda_build_state_soa succeeded, otherwise a null pointer.
// Return: the caller treats 0 as success and any other value as failure.
int z4c_cuda_prolong_state_batch_to_device_buffer_for_host_views(void *block_tag,
double **state_host_key,
int state_count,
double *device_buffer,
int *ex,
int sx, int sy, int sz,
int ii0, int jj0, int kk0,
int lbc_i, int lbc_j, int lbc_k,
const double *state_soa);
int z4c_cuda_download_state_subset(void *block_tag, int z4c_cuda_download_state_subset(void *block_tag,
int *ex, int *ex,
int subset_count, int subset_count,
@@ -138,6 +223,25 @@ int z4c_cuda_compute_constraints_resident(void *block_tag,
int Symmetry, double eps, int co, int Symmetry, double eps, int co,
double **constraint_host_out); double **constraint_host_out);
// Interpolates three state fields at a single point and writes the three
// results to out3. Only the declaration is visible here; the notes below
// describe how the caller in this change (z4c_cuda_interp_bh_point_resident)
// uses it.
//
// The caller only takes this path when the environment variable
// AMSS_CUDA_Z4C_BH_INTERP_DEVICE parses to a non-zero integer. If this function
// returns non-zero, the caller falls back to z4c_cuda_download_state_subset
// followed by f_global_interp on the host arrays.
//   block_tag              - the block holding the resident state.
//   ex                     - block->shape.
//   state0, state1, state2 - k_z4c_cuda_bh_state_indices[0..2].
//   x0, y0, z0             - block->X[0][0], block->X[1][0], block->X[2][0],
//                            i.e. the first coordinate value along each axis.
//   dx, dy, dz             - DH[0..2].
//   px, py, pz             - the point (x, y, z) to interpolate at.
//   ordn                   - the caller's interp_ordn.
//   symmetry               - the caller's interp_sym.
//   soa3                   - 9 doubles: for field f, entries 3*f..3*f+2 are
//                            copied from that field's var->SoA[0..2].
//   out3                   - the caller's shellf array; receives one value per
//                            field on success.
// Return: 0 is treated as success by the caller.
int z4c_cuda_interp_state_point3(void *block_tag,
int *ex,
int state0,
int state1,
int state2,
double x0,
double y0,
double z0,
double dx,
double dy,
double dz,
double px,
double py,
double pz,
int ordn,
int symmetry,
const double *soa3,
double *out3);
int z4c_cuda_download_constraint_outputs(int *ex, int z4c_cuda_download_constraint_outputs(int *ex,
double **constraint_host_out); double **constraint_host_out);