Remove the compile-time #error that blocked USE_CUDA_Z4C + WithShell. Add GPU-to-CPU state sync at the start of both Z4C Step functions (non-CPBC and CPBC) so shell CPU consumers read valid field data after Cartesian GPU RHS with resident state. Move bssn_cuda_use_resident_sync and bssn_cuda_download_level_state_if_present from anonymous namespace to file scope in bssn_class.C so derived classes (Z4C) can call them. Declare both in bssn_rhs_cuda.h. Include bssn_rhs_cuda.h in Z4c_class.C. Z4C shell RHS remains on CPU (Fortran Z4c_rhs_ss.f90) pending future GPU kernel implementation. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
414 lines
23 KiB
C
414 lines
23 KiB
C
#ifndef BSSN_RHS_CUDA_H
|
|
#define BSSN_RHS_CUDA_H
|
|
|
|
#ifdef __cplusplus
|
|
extern "C" {
|
|
#endif
|
|
|
|
// Array-length constants for the CUDA entry points declared in this header.
// NOTE(review): by name these look like the number of state fields for each
// formulation (BSSN / BSSN+scalar / BSSN+EM), of EM source fields and of
// matter fields; how they are consumed is not visible here -- confirm.
enum {
  BSSN_CUDA_STATE_COUNT         = 24,
  BSSN_ESCALAR_CUDA_STATE_COUNT = 26,
  BSSN_EM_CUDA_STATE_COUNT      = 32,
  BSSN_EM_CUDA_SOURCE_COUNT     = 4,
  BSSN_CUDA_MATTER_COUNT        = 10
};
|
|
|
|
// Declaration of f_compute_rhs_bssn; the definition is not in this header.
// By name this evaluates the BSSN right-hand sides -- confirm against the
// implementation. The meaning of the int return code is not visible here.
//
// Arguments, in order:
//   ex, T                    int array, then a double passed by reference
//   X, Y, Z                  3 double arrays
//   chi ... dtSfz            24 double arrays
//   chi_rhs ... dtSfz_rhs    24 double arrays named after the group above
//   rho ... Szz              10 double arrays
//   Gamxxx ... Gamzzz        18 double arrays
//   Rxx ... Rzz              6 double arrays
//   ham_Res ... Gmz_Res      7 double arrays
//   Symmetry, Lev, eps, co   scalars passed by reference
//
// NOTE(review): the diagonal inputs are spelled dxx/dyy/dzz while their RHS
// counterparts are gxx_rhs/gyy_rhs/gzz_rhs -- confirm this is intentional.
// NOTE(review): reference parameters are C++-only syntax, so this prototype
// cannot be consumed from C even though it sits in the extern "C" region.
int f_compute_rhs_bssn(
    int *ex, double &T,
    double *X, double *Y, double *Z,
    double *chi, double *trK,
    double *dxx, double *gxy, double *gxz,
    double *dyy, double *gyz, double *dzz,
    double *Axx, double *Axy, double *Axz,
    double *Ayy, double *Ayz, double *Azz,
    double *Gamx, double *Gamy, double *Gamz,
    double *Lap, double *betax, double *betay, double *betaz,
    double *dtSfx, double *dtSfy, double *dtSfz,
    double *chi_rhs, double *trK_rhs,
    double *gxx_rhs, double *gxy_rhs, double *gxz_rhs,
    double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
    double *Axx_rhs, double *Axy_rhs, double *Axz_rhs,
    double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
    double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
    double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
    double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
    double *rho, double *Sx, double *Sy, double *Sz,
    double *Sxx, double *Sxy, double *Sxz,
    double *Syy, double *Syz, double *Szz,
    double *Gamxxx, double *Gamxxy, double *Gamxxz,
    double *Gamxyy, double *Gamxyz, double *Gamxzz,
    double *Gamyxx, double *Gamyxy, double *Gamyxz,
    double *Gamyyy, double *Gamyyz, double *Gamyzz,
    double *Gamzxx, double *Gamzxy, double *Gamzxz,
    double *Gamzyy, double *Gamzyz, double *Gamzzz,
    double *Rxx, double *Rxy, double *Rxz,
    double *Ryy, double *Ryz, double *Rzz,
    double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
    double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
    int &Symmetry, int &Lev, double &eps, int &co);
|
|
|
|
// Declaration of bssn_cuda_rk4_substep; the definition is not in this header.
// By name this advances one RK4 substep of the BSSN system on the GPU for the
// block identified by block_tag -- confirm against the implementation.
//
//   block_tag                 opaque per-block handle (void *)
//   ex, X, Y, Z               int array and three double arrays
//   state_host_in/out         arrays of host field pointers
//   matter_host               array of host field pointers
//   propspeed, soa_flat, bbox read-only double arrays
//   remaining arguments       scalars and flags passed by reference
//
// NOTE(review): array lengths (presumably BSSN_CUDA_STATE_COUNT and
// BSSN_CUDA_MATTER_COUNT) and the return-code meaning are not visible here.
// NOTE(review): reference parameters make this prototype C++-only.
int bssn_cuda_rk4_substep(
    void *block_tag,
    int *ex, double *X, double *Y, double *Z,
    double **state_host_in, double **state_host_out, double **matter_host,
    const double *propspeed, const double *soa_flat, const double *bbox,
    double &dT, double &T,
    int &RK4, int &apply_bam_bc, int &Symmetry, int &Lev,
    double &eps, int &co,
    int &use_zero_matter, int &keep_resident_state, int &apply_enforce_ga,
    double &chitiny);
|
|
|
|
// Declaration of bssn_escalar_cuda_rk4_substep; definition not in this header.
// By name this is the RK4 substep for the BSSN + scalar-field variant --
// confirm against the implementation.
//
// Same argument list as bssn_cuda_rk4_substep minus matter_host and
// use_zero_matter.
// NOTE(review): state array length is presumably
// BSSN_ESCALAR_CUDA_STATE_COUNT; the return-code meaning is not visible here.
// NOTE(review): reference parameters make this prototype C++-only.
int bssn_escalar_cuda_rk4_substep(
    void *block_tag,
    int *ex, double *X, double *Y, double *Z,
    double **state_host_in, double **state_host_out,
    const double *propspeed, const double *soa_flat, const double *bbox,
    double &dT, double &T,
    int &RK4, int &apply_bam_bc, int &Symmetry, int &Lev,
    double &eps, int &co,
    int &keep_resident_state, int &apply_enforce_ga,
    double &chitiny);
|
|
|
|
// Declaration of bssn_escalar_cuda_compute_constraints; definition elsewhere.
// By name this evaluates constraint quantities for the BSSN + scalar-field
// variant from state_host_in into constraint_host_out -- confirm.
// Unlike the *_rk4_substep entry points it takes no block_tag.
// NOTE(review): array lengths and return-code meaning are not visible here.
// NOTE(review): reference parameters make this prototype C++-only.
int bssn_escalar_cuda_compute_constraints(
    int *ex, double *X, double *Y, double *Z,
    double **state_host_in, double **constraint_host_out,
    int &Symmetry, int &Lev, double &eps);
|
|
|
|
// Declaration of bssn_em_cuda_rk4_substep; definition not in this header.
// By name this is the RK4 substep for the BSSN + electromagnetic variant --
// confirm against the implementation.
//
// Same argument list as bssn_escalar_cuda_rk4_substep plus source_host,
// which follows state_host_out.
// NOTE(review): array lengths are presumably BSSN_EM_CUDA_STATE_COUNT and
// BSSN_EM_CUDA_SOURCE_COUNT; the return-code meaning is not visible here.
// NOTE(review): reference parameters make this prototype C++-only.
int bssn_em_cuda_rk4_substep(
    void *block_tag,
    int *ex, double *X, double *Y, double *Z,
    double **state_host_in, double **state_host_out, double **source_host,
    const double *propspeed, const double *soa_flat, const double *bbox,
    double &dT, double &T,
    int &RK4, int &apply_bam_bc, int &Symmetry, int &Lev,
    double &eps, int &co,
    int &keep_resident_state, int &apply_enforce_ga,
    double &chitiny);
|
|
|
|
// Declaration only; takes the opaque block_tag handle (void *).
// NOTE(review): by name this zeroes some "fast" part of the resident EM
// state; neither that behaviour nor the return-code meaning is visible in
// this header -- confirm against the implementation.
int bssn_em_cuda_resident_zero_fast_state(void *block_tag);
|
|
|
|
// Declaration only. By name: copies a region of one state field (selected by
// state_index) of the block identified by block_tag into host_state --
// confirm against the implementation.
// NOTE(review): i0/j0/k0 look like the region origin and sx/sy/sz its
// extent; index base and return-code meaning are not visible here.
int bssn_cuda_copy_state_region_to_host(
    void *block_tag, int state_index, double *host_state, int *ex,
    int i0, int j0, int k0,
    int sx, int sy, int sz);
|
|
|
|
// Declaration only. By name: copies a region of host_state into one state
// field (selected by state_index) of the block identified by block_tag --
// confirm against the implementation.
// NOTE(review): i0/j0/k0 look like the region origin and sx/sy/sz its
// extent; index base and return-code meaning are not visible here.
int bssn_cuda_copy_state_region_from_host(
    void *block_tag, int state_index, double *host_state, int *ex,
    int i0, int j0, int k0,
    int sx, int sy, int sz);
|
|
|
|
// Declaration only. By name: downloads the resident state of the block
// identified by block_tag into the host arrays listed in state_host_out --
// confirm against the implementation.
// NOTE(review): the expected length of state_host_out (presumably
// BSSN_CUDA_STATE_COUNT) and the return-code meaning are not visible here.
int bssn_cuda_download_resident_state(
    void *block_tag, int *ex, double **state_host_out);
|
|
|
|
// Declaration only. By name: the BSSN + scalar-field counterpart of
// bssn_cuda_download_resident_state -- confirm against the implementation.
// NOTE(review): the expected length of state_host_out (presumably
// BSSN_ESCALAR_CUDA_STATE_COUNT) and the return-code meaning are not
// visible here.
int bssn_escalar_cuda_download_resident_state(
    void *block_tag, int *ex, double **state_host_out);
|
|
|
|
// Declaration only. By name: uploads state_count host fields listed in
// state_host_in as the resident state of the block identified by block_tag
// -- confirm against the implementation.
// NOTE(review): valid range of state_count and the return-code meaning are
// not visible here.
int bssn_cuda_upload_resident_state_count(
    void *block_tag, int *ex, double **state_host_in, int state_count);
|
|
|
|
// Declaration only. By name: uploads the host fields listed in
// state_host_in as the resident BSSN + scalar-field state of the block
// identified by block_tag -- confirm against the implementation.
// NOTE(review): the expected length of state_host_in (presumably
// BSSN_ESCALAR_CUDA_STATE_COUNT) and the return-code meaning are not
// visible here.
int bssn_escalar_cuda_upload_resident_state(
    void *block_tag, int *ex, double **state_host_in);
|
|
|
|
// Declaration only.
// NOTE(review): by name this keeps only the resident state matching the
// state_count host pointers in state_host_key and drops anything else held
// for block_tag; that behaviour and the return-code meaning are not visible
// in this header -- confirm against the implementation.
int bssn_cuda_keep_only_resident_state_count(
    void *block_tag, int *ex, double **state_host_key, int state_count);
|
|
|
|
// Declaration only.
// NOTE(review): by name this is the BSSN + scalar-field counterpart of
// bssn_cuda_keep_only_resident_state_count with an implied state count;
// behaviour and return-code meaning are not visible in this header --
// confirm against the implementation.
int bssn_escalar_cuda_keep_only_resident_state(
    void *block_tag, int *ex, double **state_host_key);
|
|
|
|
// Declaration only. By name: downloads state_count resident fields of the
// block identified by block_tag into state_host_out, but only when resident
// state exists for that block -- confirm against the implementation.
// NOTE(review): what is returned when nothing is resident is not visible
// here.
int bssn_cuda_download_resident_state_count_if_present(
    void *block_tag, int *ex, double **state_host_out, int state_count);
|
|
|
|
// Declaration only. By name: downloads the resident state of the block
// identified by block_tag into state_host_out, but only when resident state
// exists for that block -- confirm against the implementation.
// NOTE(review): the implied field count and what is returned when nothing
// is resident are not visible here.
int bssn_cuda_download_resident_state_if_present(
    void *block_tag, int *ex, double **state_host_out);
|
|
|
|
// Declaration only. By name: downloads constraint outputs into the host
// arrays listed in constraint_host_out -- confirm against the implementation.
// Takes no block_tag, unlike most entry points in this header.
// NOTE(review): the number of output arrays and the return-code meaning are
// not visible here.
int bssn_cuda_download_constraint_outputs(
    int *ex, double **constraint_host_out);
|
|
|
|
// Declaration only. By name: packs a region of one state field (selected by
// state_index) of the block identified by block_tag into host_buffer --
// confirm against the implementation.
// NOTE(review): i0/j0/k0 look like the region origin and sx/sy/sz its
// extent; buffer layout and return-code meaning are not visible here.
int bssn_cuda_pack_state_region_to_host_buffer(
    void *block_tag, int state_index, double *host_buffer, int *ex,
    int i0, int j0, int k0,
    int sx, int sy, int sz);
|
|
|
|
// Declaration only. By name: interpolates three state fields (state0,
// state1, state2) of the block identified by block_tag at a single point
// (px, py, pz) and writes the results to out3 -- confirm.
// NOTE(review): x0/y0/z0 and dx/dy/dz look like grid origin and spacing,
// ordn like the interpolation order, soa3 like three per-field symmetry
// factors; none of this, nor the return-code meaning, is visible here.
int bssn_cuda_interp_state_point3(
    void *block_tag, int *ex,
    int state0, int state1, int state2,
    double x0, double y0, double z0,
    double dx, double dy, double dz,
    double px, double py, double pz,
    int ordn, int symmetry,
    double **state_host_key,
    const double *soa3,
    double *out3);
|
|
|
|
// Declaration only. By name: interpolates two host fields (field0, field1)
// at npoints points given by the px/py/pz arrays and writes interleaved
// results to out_interleaved -- confirm against the implementation.
// NOTE(review): x0/y0/z0 and dx/dy/dz look like grid origin and spacing,
// ordn like the interpolation order, soa6 like 2 x 3 symmetry factors; the
// output ordering and return-code meaning are not visible here.
int bssn_cuda_interp_host_two_fields(
    void *block_tag, int *ex,
    double *field0, double *field1,
    double x0, double y0, double z0,
    double dx, double dy, double dz,
    const double *px, const double *py, const double *pz,
    int npoints, int ordn, int symmetry,
    const double *soa6,
    double *out_interleaved);
|
|
|
|
// Declaration only. By name: unpacks host_buffer into a region of one state
// field (selected by state_index) of the block identified by block_tag --
// confirm against the implementation.
// NOTE(review): i0/j0/k0 look like the region origin and sx/sy/sz its
// extent; buffer layout and return-code meaning are not visible here.
int bssn_cuda_unpack_state_region_from_host_buffer(
    void *block_tag, int state_index, double *host_buffer, int *ex,
    int i0, int j0, int k0,
    int sx, int sy, int sz);
|
|
|
|
// Declaration only. Variant of bssn_cuda_unpack_state_region_from_host_buffer
// that additionally receives state_host_key / state_count.
// NOTE(review): the host-pointer key presumably selects which resident copy
// ("host view") the data is unpacked into; that, the buffer layout and the
// return-code meaning are not visible here -- confirm.
int bssn_cuda_unpack_state_region_from_host_buffer_for_host_views(
    void *block_tag,
    double **state_host_key, int state_count,
    int state_index, double *host_buffer, int *ex,
    int i0, int j0, int k0,
    int sx, int sy, int sz);
|
|
|
|
// Declaration only. By name: packs the same region of state_count state
// fields of the block identified by block_tag into host_buffer -- confirm
// against the implementation.
// NOTE(review): i0/j0/k0 look like the region origin and sx/sy/sz its
// extent; field ordering in the buffer and return-code meaning are not
// visible here.
int bssn_cuda_pack_state_batch_to_host_buffer(
    void *block_tag, int state_count, double *host_buffer, int *ex,
    int i0, int j0, int k0,
    int sx, int sy, int sz);
|
|
|
|
// Declaration only. Variant of bssn_cuda_pack_state_batch_to_host_buffer
// that additionally receives state_host_key.
// NOTE(review): the host-pointer key presumably selects which resident copy
// ("host view") is packed; that, the buffer layout and the return-code
// meaning are not visible here -- confirm.
int bssn_cuda_pack_state_batch_to_host_buffer_for_host_views(
    void *block_tag,
    double **state_host_key, int state_count,
    double *host_buffer, int *ex,
    int i0, int j0, int k0,
    int sx, int sy, int sz);
|
|
|
|
// Declaration only. By name: unpacks host_buffer into the same region of
// state_count state fields of the block identified by block_tag -- confirm
// against the implementation.
// NOTE(review): i0/j0/k0 look like the region origin and sx/sy/sz its
// extent; field ordering in the buffer and return-code meaning are not
// visible here.
int bssn_cuda_unpack_state_batch_from_host_buffer(
    void *block_tag, int state_count, double *host_buffer, int *ex,
    int i0, int j0, int k0,
    int sx, int sy, int sz);
|
|
|
|
// Declaration only. Variant of bssn_cuda_unpack_state_batch_from_host_buffer
// that additionally receives state_host_key.
// NOTE(review): the host-pointer key presumably selects which resident copy
// ("host view") receives the data; that, the buffer layout and the
// return-code meaning are not visible here -- confirm.
int bssn_cuda_unpack_state_batch_from_host_buffer_for_host_views(
    void *block_tag,
    double **state_host_key, int state_count,
    double *host_buffer, int *ex,
    int i0, int j0, int k0,
    int sx, int sy, int sz);
|
|
|
|
// Declaration only. By name: packs the same region of state_count state
// fields of the block identified by block_tag into device_buffer -- confirm
// against the implementation.
// NOTE(review): device_buffer is presumably a device pointer (by name only);
// i0/j0/k0 look like the region origin and sx/sy/sz its extent; buffer
// layout and return-code meaning are not visible here.
int bssn_cuda_pack_state_batch_to_device_buffer(
    void *block_tag, int state_count, double *device_buffer, int *ex,
    int i0, int j0, int k0,
    int sx, int sy, int sz);
|
|
|
|
// Declaration only. Variant of bssn_cuda_pack_state_batch_to_device_buffer
// that additionally receives state_host_key.
// NOTE(review): the host-pointer key presumably selects which resident copy
// ("host view") is packed, and device_buffer is presumably a device pointer;
// neither that nor the return-code meaning is visible here -- confirm.
int bssn_cuda_pack_state_batch_to_device_buffer_for_host_views(
    void *block_tag,
    double **state_host_key, int state_count,
    double *device_buffer, int *ex,
    int i0, int j0, int k0,
    int sx, int sy, int sz);
|
|
|
|
// Declaration only. By name: unpacks device_buffer into the same region of
// state_count state fields of the block identified by block_tag -- confirm
// against the implementation.
// NOTE(review): device_buffer is presumably a device pointer (by name only);
// i0/j0/k0 look like the region origin and sx/sy/sz its extent; buffer
// layout and return-code meaning are not visible here.
int bssn_cuda_unpack_state_batch_from_device_buffer(
    void *block_tag, int state_count, double *device_buffer, int *ex,
    int i0, int j0, int k0,
    int sx, int sy, int sz);
|
|
|
|
// Declaration only. Variant of bssn_cuda_unpack_state_batch_from_device_buffer
// that additionally receives state_host_key.
// NOTE(review): the host-pointer key presumably selects which resident copy
// ("host view") receives the data, and device_buffer is presumably a device
// pointer; neither that nor the return-code meaning is visible here.
int bssn_cuda_unpack_state_batch_from_device_buffer_for_host_views(
    void *block_tag,
    double **state_host_key, int state_count,
    double *device_buffer, int *ex,
    int i0, int j0, int k0,
    int sx, int sy, int sz);
|
|
|
|
// Declaration only. By name: packs segment_count segments of state_count
// state fields of the block identified by block_tag into device_buffer --
// confirm against the implementation.
// NOTE(review): the per-segment record layout of segment_meta, the buffer
// layout and the return-code meaning are not visible here; device_buffer is
// presumably a device pointer (by name only).
int bssn_cuda_pack_state_segments_to_device_buffer(
    void *block_tag, int state_count, double *device_buffer, int *ex,
    int segment_count, const int *segment_meta);
|
|
|
|
// Declaration only. Variant of bssn_cuda_pack_state_segments_to_device_buffer
// that additionally receives state_host_key.
// NOTE(review): the host-pointer key presumably selects which resident copy
// ("host view") is packed; that, the segment_meta record layout and the
// return-code meaning are not visible here -- confirm.
int bssn_cuda_pack_state_segments_to_device_buffer_for_host_views(
    void *block_tag,
    double **state_host_key, int state_count,
    double *device_buffer, int *ex,
    int segment_count, const int *segment_meta);
|
|
|
|
// Declaration only. By name: unpacks device_buffer into segment_count
// segments of state_count state fields of the block identified by block_tag
// -- confirm against the implementation.
// NOTE(review): the per-segment record layout of segment_meta, the buffer
// layout and the return-code meaning are not visible here; device_buffer is
// presumably a device pointer (by name only).
int bssn_cuda_unpack_state_segments_from_device_buffer(
    void *block_tag, int state_count, double *device_buffer, int *ex,
    int segment_count, const int *segment_meta);
|
|
|
|
// Declaration only. Variant of
// bssn_cuda_unpack_state_segments_from_device_buffer that additionally
// receives state_host_key.
// NOTE(review): the host-pointer key presumably selects which resident copy
// ("host view") receives the data; that, the segment_meta record layout and
// the return-code meaning are not visible here -- confirm.
int bssn_cuda_unpack_state_segments_from_device_buffer_for_host_views(
    void *block_tag,
    double **state_host_key, int state_count,
    double *device_buffer, int *ex,
    int segment_count, const int *segment_meta);
|
|
|
|
// Declaration only. By name: applies a restriction (fine-to-coarse transfer,
// presumably) to segment_count segments of state_count state fields and
// writes the result to device_buffer -- confirm against the implementation.
// NOTE(review): the segment_meta record layout, the restriction stencil and
// the return-code meaning are not visible here.
int bssn_cuda_restrict_state_segments_to_device_buffer(
    void *block_tag, int state_count, double *device_buffer, int *ex,
    int segment_count, const int *segment_meta);
|
|
|
|
// Declaration only. Variant of
// bssn_cuda_restrict_state_segments_to_device_buffer that additionally
// receives state_host_key and a trailing read-only state_soa array.
// NOTE(review): the host-pointer key presumably selects the resident copy
// ("host view") to read and state_soa presumably carries per-field symmetry
// factors; neither that nor the return-code meaning is visible here.
int bssn_cuda_restrict_state_segments_to_device_buffer_for_host_views(
    void *block_tag,
    double **state_host_key, int state_count,
    double *device_buffer, int *ex,
    int segment_count, const int *segment_meta,
    const double *state_soa);
|
|
|
|
// Declaration only. By name: applies a prolongation (coarse-to-fine
// transfer, presumably) to segment_count segments of state_count state
// fields and writes the result to device_buffer -- confirm.
// NOTE(review): the segment_meta record layout, the prolongation stencil and
// the return-code meaning are not visible here.
int bssn_cuda_prolong_state_segments_to_device_buffer(
    void *block_tag, int state_count, double *device_buffer, int *ex,
    int segment_count, const int *segment_meta);
|
|
|
|
// Declaration only. Variant of
// bssn_cuda_prolong_state_segments_to_device_buffer that additionally
// receives state_host_key and a trailing read-only state_soa array.
// NOTE(review): the host-pointer key presumably selects the resident copy
// ("host view") to read and state_soa presumably carries per-field symmetry
// factors; neither that nor the return-code meaning is visible here.
int bssn_cuda_prolong_state_segments_to_device_buffer_for_host_views(
    void *block_tag,
    double **state_host_key, int state_count,
    double *device_buffer, int *ex,
    int segment_count, const int *segment_meta,
    const double *state_soa);
|
|
|
|
// Declaration only. By name: applies a restriction (fine-to-coarse transfer,
// presumably) to state_count state fields of the block identified by
// block_tag and writes the result to device_buffer -- confirm.
// NOTE(review): sx/sy/sz look like the output extent and fi0/fj0/fk0 like
// the starting fine-grid indices; index base, stencil and return-code
// meaning are not visible here.
int bssn_cuda_restrict_state_batch_to_device_buffer(
    void *block_tag, int state_count, double *device_buffer, int *ex,
    int sx, int sy, int sz,
    int fi0, int fj0, int fk0);
|
|
|
|
// Declaration only. Variant of
// bssn_cuda_restrict_state_batch_to_device_buffer that additionally receives
// state_host_key and a trailing read-only state_soa array.
// NOTE(review): the host-pointer key presumably selects the resident copy
// ("host view") to read and state_soa presumably carries per-field symmetry
// factors; neither that nor the return-code meaning is visible here.
int bssn_cuda_restrict_state_batch_to_device_buffer_for_host_views(
    void *block_tag,
    double **state_host_key, int state_count,
    double *device_buffer, int *ex,
    int sx, int sy, int sz,
    int fi0, int fj0, int fk0,
    const double *state_soa);
|
|
|
|
// Declaration only. By name: applies a prolongation (coarse-to-fine
// transfer, presumably) to state_count state fields of the block identified
// by block_tag and writes the result to device_buffer -- confirm.
// NOTE(review): sx/sy/sz look like the output extent, ii0/jj0/kk0 like
// starting indices and lbc_i/lbc_j/lbc_k like per-axis lower-boundary
// flags; none of this, nor the return-code meaning, is visible here.
int bssn_cuda_prolong_state_batch_to_device_buffer(
    void *block_tag, int state_count, double *device_buffer, int *ex,
    int sx, int sy, int sz,
    int ii0, int jj0, int kk0,
    int lbc_i, int lbc_j, int lbc_k);
|
|
|
|
// Declaration only. Variant of
// bssn_cuda_prolong_state_batch_to_device_buffer that additionally receives
// state_host_key and a trailing read-only state_soa array.
// NOTE(review): the host-pointer key presumably selects the resident copy
// ("host view") to read and state_soa presumably carries per-field symmetry
// factors; neither that nor the return-code meaning is visible here.
int bssn_cuda_prolong_state_batch_to_device_buffer_for_host_views(
    void *block_tag,
    double **state_host_key, int state_count,
    double *device_buffer, int *ex,
    int sx, int sy, int sz,
    int ii0, int jj0, int kk0,
    int lbc_i, int lbc_j, int lbc_k,
    const double *state_soa);
|
|
|
|
// Declaration only. By name: downloads subset_count state fields, selected
// by state_indices, of the block identified by block_tag into the host
// arrays listed in state_host_out -- confirm against the implementation.
// NOTE(review): whether state_host_out is indexed by subset position or by
// state index, and the return-code meaning, are not visible here.
int bssn_cuda_download_state_subset(
    void *block_tag, int *ex,
    int subset_count, const int *state_indices,
    double **state_host_out);
|
|
|
|
// Declaration only. By name: uploads subset_count state fields, selected by
// state_indices, from the host arrays listed in state_host_in to the block
// identified by block_tag -- confirm against the implementation.
// NOTE(review): whether state_host_in is indexed by subset position or by
// state index, and the return-code meaning, are not visible here.
int bssn_cuda_upload_state_subset(
    void *block_tag, int *ex,
    int subset_count, const int *state_indices,
    double **state_host_in);
|
|
|
|
// Declaration only.
// NOTE(review): by name this builds an intermediate time level for
// state_count fields from up to three source levels (src1/src2/src3, with
// source_count saying how many are used) into dst, each level identified by
// its host-pointer key; tindex presumably selects the interpolation time.
// None of this, nor the return-code meaning, is visible in this header --
// confirm against the implementation.
int bssn_cuda_prepare_inter_time_level(
    void *block_tag, int *ex, int state_count,
    double **src1_host_key, double **src2_host_key, double **src3_host_key,
    double **dst_host_key,
    int source_count, int tindex);
|
|
|
|
// Declaration only; takes the opaque block_tag handle (void *).
// NOTE(review): by name this reports whether resident state exists for the
// block; presumably non-zero means "present", but the return convention is
// not visible in this header -- confirm against the implementation.
int bssn_cuda_has_resident_state(void *block_tag);
|
|
|
|
// Declaration only; takes the opaque block_tag handle (void *) and returns
// nothing.
// NOTE(review): by name this releases the per-block step context; what is
// freed and whether a repeated call is safe are not visible in this header
// -- confirm against the implementation.
void bssn_cuda_release_step_ctx(void *block_tag);
|
|
|
|
#ifdef __cplusplus
|
|
}
|
|
// C++-only helpers declared for derived equation classes (Z4C, etc.)
|
|
// Defined in bssn_class.C. Requires MyList, Patch, var from including TU.
|
|
// C++-only declaration: it follows the closing brace of the extern "C"
// region inside the __cplusplus guard, so it has C++ linkage.
// NOTE(review): by name this says whether resident-state synchronisation is
// in use for refinement level `lev`; the decision logic is not visible in
// this header -- confirm against the definition.
bool bssn_cuda_use_resident_sync(int lev);
|
|
// C++-only declaration: it follows the closing brace of the extern "C"
// region inside the __cplusplus guard, so it has C++ linkage.
// This header declares none of MyList, Patch or var; the including
// translation unit must make them visible before including this file.
// NOTE(review): by name this downloads resident GPU state for every patch in
// PatL into the host storage of `vars` when such state exists; myrank is
// presumably the calling MPI rank -- confirm against the definition.
void bssn_cuda_download_level_state_if_present(MyList<Patch> *PatL, MyList<var> *vars, int myrank);
|
|
#endif
|
|
|
|
#endif
|