Files
AMSS-NCKU/AMSS_NCKU_source/bssn_rhs_cuda.h
CGH0S7 c4194214c6 Enable Z4C + Shell-Patch GPU coexistence (Phase 3)
Remove the compile-time #error that blocked USE_CUDA_Z4C + WithShell.
Add GPU-to-CPU state sync at the start of both Z4C Step functions
(non-CPBC and CPBC) so shell CPU consumers read valid field data
after Cartesian GPU RHS with resident state.

Move bssn_cuda_use_resident_sync and bssn_cuda_download_level_state
_if_present from anonymous namespace to file scope in bssn_class.C
so derived classes (Z4C) can call them. Declare both in
bssn_rhs_cuda.h. Include bssn_rhs_cuda.h in Z4c_class.C.

Z4C shell RHS remains on CPU (Fortran Z4c_rhs_ss.f90) pending
future GPU kernel implementation.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-10 12:08:02 +08:00

414 lines
23 KiB
C

#ifndef BSSN_RHS_CUDA_H
#define BSSN_RHS_CUDA_H
#ifdef __cplusplus
extern "C" {
#endif
enum {
BSSN_CUDA_STATE_COUNT = 24,
BSSN_ESCALAR_CUDA_STATE_COUNT = 26,
BSSN_EM_CUDA_STATE_COUNT = 32,
BSSN_EM_CUDA_SOURCE_COUNT = 4,
BSSN_CUDA_MATTER_COUNT = 10
};
int f_compute_rhs_bssn(int *ex, double &T,
double *X, double *Y, double *Z,
double *chi, double *trK,
double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
double *Gamx, double *Gamy, double *Gamz,
double *Lap, double *betax, double *betay, double *betaz,
double *dtSfx, double *dtSfy, double *dtSfz,
double *chi_rhs, double *trK_rhs,
double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
double *rho, double *Sx, double *Sy, double *Sz,
double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
int &Symmetry, int &Lev, double &eps, int &co);
int bssn_cuda_rk4_substep(void *block_tag,
int *ex, double *X, double *Y, double *Z,
double **state_host_in,
double **state_host_out,
double **matter_host,
const double *propspeed,
const double *soa_flat,
const double *bbox,
double &dT,
double &T,
int &RK4,
int &apply_bam_bc,
int &Symmetry,
int &Lev,
double &eps,
int &co,
int &use_zero_matter,
int &keep_resident_state,
int &apply_enforce_ga,
double &chitiny);
int bssn_escalar_cuda_rk4_substep(void *block_tag,
int *ex, double *X, double *Y, double *Z,
double **state_host_in,
double **state_host_out,
const double *propspeed,
const double *soa_flat,
const double *bbox,
double &dT,
double &T,
int &RK4,
int &apply_bam_bc,
int &Symmetry,
int &Lev,
double &eps,
int &co,
int &keep_resident_state,
int &apply_enforce_ga,
double &chitiny);
int bssn_escalar_cuda_compute_constraints(int *ex, double *X, double *Y, double *Z,
double **state_host_in,
double **constraint_host_out,
int &Symmetry,
int &Lev,
double &eps);
int bssn_em_cuda_rk4_substep(void *block_tag,
int *ex, double *X, double *Y, double *Z,
double **state_host_in,
double **state_host_out,
double **source_host,
const double *propspeed,
const double *soa_flat,
const double *bbox,
double &dT,
double &T,
int &RK4,
int &apply_bam_bc,
int &Symmetry,
int &Lev,
double &eps,
int &co,
int &keep_resident_state,
int &apply_enforce_ga,
double &chitiny);
int bssn_em_cuda_resident_zero_fast_state(void *block_tag);
int bssn_cuda_copy_state_region_to_host(void *block_tag,
int state_index,
double *host_state,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz);
int bssn_cuda_copy_state_region_from_host(void *block_tag,
int state_index,
double *host_state,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz);
int bssn_cuda_download_resident_state(void *block_tag,
int *ex,
double **state_host_out);
int bssn_escalar_cuda_download_resident_state(void *block_tag,
int *ex,
double **state_host_out);
int bssn_cuda_upload_resident_state_count(void *block_tag,
int *ex,
double **state_host_in,
int state_count);
int bssn_escalar_cuda_upload_resident_state(void *block_tag,
int *ex,
double **state_host_in);
int bssn_cuda_keep_only_resident_state_count(void *block_tag,
int *ex,
double **state_host_key,
int state_count);
int bssn_escalar_cuda_keep_only_resident_state(void *block_tag,
int *ex,
double **state_host_key);
int bssn_cuda_download_resident_state_count_if_present(void *block_tag,
int *ex,
double **state_host_out,
int state_count);
int bssn_cuda_download_resident_state_if_present(void *block_tag,
int *ex,
double **state_host_out);
int bssn_cuda_download_constraint_outputs(int *ex,
double **constraint_host_out);
int bssn_cuda_pack_state_region_to_host_buffer(void *block_tag,
int state_index,
double *host_buffer,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz);
int bssn_cuda_interp_state_point3(void *block_tag,
int *ex,
int state0,
int state1,
int state2,
double x0,
double y0,
double z0,
double dx,
double dy,
double dz,
double px,
double py,
double pz,
int ordn,
int symmetry,
double **state_host_key,
const double *soa3,
double *out3);
int bssn_cuda_interp_host_two_fields(void *block_tag,
int *ex,
double *field0,
double *field1,
double x0,
double y0,
double z0,
double dx,
double dy,
double dz,
const double *px,
const double *py,
const double *pz,
int npoints,
int ordn,
int symmetry,
const double *soa6,
double *out_interleaved);
int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
int state_index,
double *host_buffer,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz);
int bssn_cuda_unpack_state_region_from_host_buffer_for_host_views(void *block_tag,
double **state_host_key,
int state_count,
int state_index,
double *host_buffer,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz);
int bssn_cuda_pack_state_batch_to_host_buffer(void *block_tag,
int state_count,
double *host_buffer,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz);
int bssn_cuda_pack_state_batch_to_host_buffer_for_host_views(void *block_tag,
double **state_host_key,
int state_count,
double *host_buffer,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz);
int bssn_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
int state_count,
double *host_buffer,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz);
int bssn_cuda_unpack_state_batch_from_host_buffer_for_host_views(void *block_tag,
double **state_host_key,
int state_count,
double *host_buffer,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz);
int bssn_cuda_pack_state_batch_to_device_buffer(void *block_tag,
int state_count,
double *device_buffer,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz);
int bssn_cuda_pack_state_batch_to_device_buffer_for_host_views(void *block_tag,
double **state_host_key,
int state_count,
double *device_buffer,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz);
int bssn_cuda_unpack_state_batch_from_device_buffer(void *block_tag,
int state_count,
double *device_buffer,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz);
int bssn_cuda_unpack_state_batch_from_device_buffer_for_host_views(void *block_tag,
double **state_host_key,
int state_count,
double *device_buffer,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz);
int bssn_cuda_pack_state_segments_to_device_buffer(void *block_tag,
int state_count,
double *device_buffer,
int *ex,
int segment_count,
const int *segment_meta);
int bssn_cuda_pack_state_segments_to_device_buffer_for_host_views(void *block_tag,
double **state_host_key,
int state_count,
double *device_buffer,
int *ex,
int segment_count,
const int *segment_meta);
int bssn_cuda_unpack_state_segments_from_device_buffer(void *block_tag,
int state_count,
double *device_buffer,
int *ex,
int segment_count,
const int *segment_meta);
int bssn_cuda_unpack_state_segments_from_device_buffer_for_host_views(void *block_tag,
double **state_host_key,
int state_count,
double *device_buffer,
int *ex,
int segment_count,
const int *segment_meta);
int bssn_cuda_restrict_state_segments_to_device_buffer(void *block_tag,
int state_count,
double *device_buffer,
int *ex,
int segment_count,
const int *segment_meta);
int bssn_cuda_restrict_state_segments_to_device_buffer_for_host_views(void *block_tag,
double **state_host_key,
int state_count,
double *device_buffer,
int *ex,
int segment_count,
const int *segment_meta,
const double *state_soa);
int bssn_cuda_prolong_state_segments_to_device_buffer(void *block_tag,
int state_count,
double *device_buffer,
int *ex,
int segment_count,
const int *segment_meta);
int bssn_cuda_prolong_state_segments_to_device_buffer_for_host_views(void *block_tag,
double **state_host_key,
int state_count,
double *device_buffer,
int *ex,
int segment_count,
const int *segment_meta,
const double *state_soa);
int bssn_cuda_restrict_state_batch_to_device_buffer(void *block_tag,
int state_count,
double *device_buffer,
int *ex,
int sx, int sy, int sz,
int fi0, int fj0, int fk0);
int bssn_cuda_restrict_state_batch_to_device_buffer_for_host_views(void *block_tag,
double **state_host_key,
int state_count,
double *device_buffer,
int *ex,
int sx, int sy, int sz,
int fi0, int fj0, int fk0,
const double *state_soa);
int bssn_cuda_prolong_state_batch_to_device_buffer(void *block_tag,
int state_count,
double *device_buffer,
int *ex,
int sx, int sy, int sz,
int ii0, int jj0, int kk0,
int lbc_i, int lbc_j, int lbc_k);
int bssn_cuda_prolong_state_batch_to_device_buffer_for_host_views(void *block_tag,
double **state_host_key,
int state_count,
double *device_buffer,
int *ex,
int sx, int sy, int sz,
int ii0, int jj0, int kk0,
int lbc_i, int lbc_j, int lbc_k,
const double *state_soa);
int bssn_cuda_download_state_subset(void *block_tag,
int *ex,
int subset_count,
const int *state_indices,
double **state_host_out);
int bssn_cuda_upload_state_subset(void *block_tag,
int *ex,
int subset_count,
const int *state_indices,
double **state_host_in);
int bssn_cuda_prepare_inter_time_level(void *block_tag,
int *ex,
int state_count,
double **src1_host_key,
double **src2_host_key,
double **src3_host_key,
double **dst_host_key,
int source_count,
int tindex);
int bssn_cuda_has_resident_state(void *block_tag);
void bssn_cuda_release_step_ctx(void *block_tag);
#ifdef __cplusplus
}
// C++-only helpers declared for derived equation classes (Z4C, etc.)
// Defined in bssn_class.C. Requires MyList, Patch, var from including TU.
bool bssn_cuda_use_resident_sync(int lev);
void bssn_cuda_download_level_state_if_present(MyList<Patch> *PatL, MyList<var> *vars, int myrank);
#endif
#endif