Add batched first/second derivative kernels for CUDA RHS

This commit is contained in:
2026-04-13 20:51:08 +08:00
parent e952ee8e91
commit b3ec244cf9

View File

@@ -189,6 +189,33 @@ __device__ __forceinline__ int idx_ex_d(int i0, int j0, int k0) {
return i0 + j0 * d_gp.ex[0] + k0 * d_gp.ex[0] * d_gp.ex[1];
}
__device__ __forceinline__ double fetch_sym_ord2_direct(const double *src,
int iF, int jF, int kF,
int SoA0, int SoA1, int SoA2)
{
int siF = iF;
int sjF = jF;
int skF = kF;
double sign = 1.0;
if (iF <= 0) {
siF = 1 - iF;
sign *= (double)SoA0;
}
if (jF <= 0) {
sjF = 1 - jF;
sign *= (double)SoA1;
}
if (kF <= 0) {
skF = 1 - kF;
sign *= (double)SoA2;
}
return sign * src[(siF - 1)
+ (sjF - 1) * d_gp.ex[0]
+ (skF - 1) * d_gp.ex[0] * d_gp.ex[1]];
}
/* ord=2 ghost-padded: Fortran index iF -> flat index */
__device__ __forceinline__ int idx_fh2(int iF, int jF, int kF) {
return (iF + 1) + (jF + 1) * d_gp.fh2_nx + (kF + 1) * d_gp.fh2_nx * d_gp.fh2_ny;
@@ -1272,6 +1299,25 @@ struct LopsidedKodisTables {
int soa_signs[3 * BSSN_LK_FIELD_COUNT];
};
struct FDerivTables {
const double *src_fields[BSSN_STATE_COUNT];
double *fx_fields[BSSN_STATE_COUNT];
double *fy_fields[BSSN_STATE_COUNT];
double *fz_fields[BSSN_STATE_COUNT];
int soa_signs[3 * BSSN_STATE_COUNT];
};
struct FDDerivTables {
const double *src_fields[BSSN_STATE_COUNT];
double *fxx_fields[BSSN_STATE_COUNT];
double *fxy_fields[BSSN_STATE_COUNT];
double *fxz_fields[BSSN_STATE_COUNT];
double *fyy_fields[BSSN_STATE_COUNT];
double *fyz_fields[BSSN_STATE_COUNT];
double *fzz_fields[BSSN_STATE_COUNT];
int soa_signs[3 * BSSN_STATE_COUNT];
};
struct Rk4FinalizeTables {
const double *f0_fields[BSSN_STATE_COUNT];
double *rhs_fields[BSSN_STATE_COUNT];
@@ -1291,6 +1337,248 @@ static inline int grid(size_t n) {
return (int)g;
}
__global__ __launch_bounds__(128, 4)
void kern_fderivs_batched(FDerivTables tables, int field_count)
{
const int field = blockIdx.y;
if (field >= field_count) return;
const double *src = tables.src_fields[field];
double *fx = tables.fx_fields[field];
double *fy = tables.fy_fields[field];
double *fz = tables.fz_fields[field];
const int SoA0 = tables.soa_signs[3 * field + 0];
const int SoA1 = tables.soa_signs[3 * field + 1];
const int SoA2 = tables.soa_signs[3 * field + 2];
const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF;
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= d_gp.all) return;
const int i0 = tid % nx;
const int j0 = (tid / nx) % ny;
const int k0 = tid / (nx * ny);
if (i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2) {
fx[tid] = 0.0;
fy[tid] = 0.0;
fz[tid] = 0.0;
return;
}
const int iF = i0 + 1;
const int jF = j0 + 1;
const int kF = k0 + 1;
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
{
fx[tid] = d_gp.d12dx * (
fetch_sym_ord2_direct(src, iF - 2, jF, kF, SoA0, SoA1, SoA2)
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF + 2, jF, kF, SoA0, SoA1, SoA2));
fy[tid] = d_gp.d12dy * (
fetch_sym_ord2_direct(src, iF, jF - 2, kF, SoA0, SoA1, SoA2)
- 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF, jF + 2, kF, SoA0, SoA1, SoA2));
fz[tid] = d_gp.d12dz * (
fetch_sym_ord2_direct(src, iF, jF, kF - 2, SoA0, SoA1, SoA2)
- 8.0 * fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF, jF, kF + 2, SoA0, SoA1, SoA2));
}
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
{
fx[tid] = d_gp.d2dx * (
-fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
+fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2));
fy[tid] = d_gp.d2dy * (
-fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
+fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2));
fz[tid] = d_gp.d2dz * (
-fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
+fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2));
}
else {
fx[tid] = 0.0;
fy[tid] = 0.0;
fz[tid] = 0.0;
}
}
__global__ __launch_bounds__(128, 4)
void kern_fdderivs_batched(FDDerivTables tables, int field_count)
{
const int field = blockIdx.y;
if (field >= field_count) return;
const double *src = tables.src_fields[field];
double *fxx = tables.fxx_fields[field];
double *fxy = tables.fxy_fields[field];
double *fxz = tables.fxz_fields[field];
double *fyy = tables.fyy_fields[field];
double *fyz = tables.fyz_fields[field];
double *fzz = tables.fzz_fields[field];
const int SoA0 = tables.soa_signs[3 * field + 0];
const int SoA1 = tables.soa_signs[3 * field + 1];
const int SoA2 = tables.soa_signs[3 * field + 2];
const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF;
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid >= d_gp.all) return;
const int i0 = tid % nx;
const int j0 = (tid / nx) % ny;
const int k0 = tid / (nx * ny);
if (i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2) {
fxx[tid] = 0.0; fxy[tid] = 0.0; fxz[tid] = 0.0;
fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0;
return;
}
const int iF = i0 + 1;
const int jF = j0 + 1;
const int kF = k0 + 1;
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
{
const double c = fetch_sym_ord2_direct(src, iF, jF, kF, SoA0, SoA1, SoA2);
fxx[tid] = d_gp.Fdxdx * (
-fetch_sym_ord2_direct(src, iF - 2, jF, kF, SoA0, SoA1, SoA2)
+16.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
-30.0 * c
+16.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF + 2, jF, kF, SoA0, SoA1, SoA2));
fyy[tid] = d_gp.Fdydy * (
-fetch_sym_ord2_direct(src, iF, jF - 2, kF, SoA0, SoA1, SoA2)
+16.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
-30.0 * c
+16.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF, jF + 2, kF, SoA0, SoA1, SoA2));
fzz[tid] = d_gp.Fdzdz * (
-fetch_sym_ord2_direct(src, iF, jF, kF - 2, SoA0, SoA1, SoA2)
+16.0 * fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
-30.0 * c
+16.0 * fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF, jF, kF + 2, SoA0, SoA1, SoA2));
const double t_jm2 =
fetch_sym_ord2_direct(src, iF - 2, jF - 2, kF, SoA0, SoA1, SoA2)
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF - 2, kF, SoA0, SoA1, SoA2)
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF - 2, kF, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF + 2, jF - 2, kF, SoA0, SoA1, SoA2);
const double t_jm1 =
fetch_sym_ord2_direct(src, iF - 2, jF - 1, kF, SoA0, SoA1, SoA2)
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF - 1, kF, SoA0, SoA1, SoA2)
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF - 1, kF, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF + 2, jF - 1, kF, SoA0, SoA1, SoA2);
const double t_jp1 =
fetch_sym_ord2_direct(src, iF - 2, jF + 1, kF, SoA0, SoA1, SoA2)
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF + 1, kF, SoA0, SoA1, SoA2)
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF + 1, kF, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF + 2, jF + 1, kF, SoA0, SoA1, SoA2);
const double t_jp2 =
fetch_sym_ord2_direct(src, iF - 2, jF + 2, kF, SoA0, SoA1, SoA2)
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF + 2, kF, SoA0, SoA1, SoA2)
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF + 2, kF, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF + 2, jF + 2, kF, SoA0, SoA1, SoA2);
fxy[tid] = d_gp.Fdxdy * (t_jm2 - 8.0 * t_jm1 + 8.0 * t_jp1 - t_jp2);
const double t_km2_x =
fetch_sym_ord2_direct(src, iF - 2, jF, kF - 2, SoA0, SoA1, SoA2)
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF - 2, SoA0, SoA1, SoA2)
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF - 2, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF + 2, jF, kF - 2, SoA0, SoA1, SoA2);
const double t_km1_x =
fetch_sym_ord2_direct(src, iF - 2, jF, kF - 1, SoA0, SoA1, SoA2)
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF - 1, SoA0, SoA1, SoA2)
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF - 1, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF + 2, jF, kF - 1, SoA0, SoA1, SoA2);
const double t_kp1_x =
fetch_sym_ord2_direct(src, iF - 2, jF, kF + 1, SoA0, SoA1, SoA2)
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF + 1, SoA0, SoA1, SoA2)
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF + 1, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF + 2, jF, kF + 1, SoA0, SoA1, SoA2);
const double t_kp2_x =
fetch_sym_ord2_direct(src, iF - 2, jF, kF + 2, SoA0, SoA1, SoA2)
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF + 2, SoA0, SoA1, SoA2)
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF + 2, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF + 2, jF, kF + 2, SoA0, SoA1, SoA2);
fxz[tid] = d_gp.Fdxdz * (t_km2_x - 8.0 * t_km1_x + 8.0 * t_kp1_x - t_kp2_x);
const double t_km2_y =
fetch_sym_ord2_direct(src, iF, jF - 2, kF - 2, SoA0, SoA1, SoA2)
- 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF - 2, SoA0, SoA1, SoA2)
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF - 2, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF, jF + 2, kF - 2, SoA0, SoA1, SoA2);
const double t_km1_y =
fetch_sym_ord2_direct(src, iF, jF - 2, kF - 1, SoA0, SoA1, SoA2)
- 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF - 1, SoA0, SoA1, SoA2)
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF - 1, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF, jF + 2, kF - 1, SoA0, SoA1, SoA2);
const double t_kp1_y =
fetch_sym_ord2_direct(src, iF, jF - 2, kF + 1, SoA0, SoA1, SoA2)
- 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF + 1, SoA0, SoA1, SoA2)
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF + 1, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF, jF + 2, kF + 1, SoA0, SoA1, SoA2);
const double t_kp2_y =
fetch_sym_ord2_direct(src, iF, jF - 2, kF + 2, SoA0, SoA1, SoA2)
- 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF + 2, SoA0, SoA1, SoA2)
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF + 2, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF, jF + 2, kF + 2, SoA0, SoA1, SoA2);
fyz[tid] = d_gp.Fdydz * (t_km2_y - 8.0 * t_km1_y + 8.0 * t_kp1_y - t_kp2_y);
}
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
{
const double c = fetch_sym_ord2_direct(src, iF, jF, kF, SoA0, SoA1, SoA2);
fxx[tid] = d_gp.Sdxdx * (
fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
- 2.0 * c
+ fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2));
fyy[tid] = d_gp.Sdydy * (
fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
- 2.0 * c
+ fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2));
fzz[tid] = d_gp.Sdzdz * (
fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
- 2.0 * c
+ fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2));
fxy[tid] = d_gp.Sdxdy * (
fetch_sym_ord2_direct(src, iF - 1, jF - 1, kF, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF + 1, jF - 1, kF, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF - 1, jF + 1, kF, SoA0, SoA1, SoA2)
+ fetch_sym_ord2_direct(src, iF + 1, jF + 1, kF, SoA0, SoA1, SoA2));
fxz[tid] = d_gp.Sdxdz * (
fetch_sym_ord2_direct(src, iF - 1, jF, kF - 1, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF + 1, jF, kF - 1, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF - 1, jF, kF + 1, SoA0, SoA1, SoA2)
+ fetch_sym_ord2_direct(src, iF + 1, jF, kF + 1, SoA0, SoA1, SoA2));
fyz[tid] = d_gp.Sdydz * (
fetch_sym_ord2_direct(src, iF, jF - 1, kF - 1, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF, jF + 1, kF - 1, SoA0, SoA1, SoA2)
- fetch_sym_ord2_direct(src, iF, jF - 1, kF + 1, SoA0, SoA1, SoA2)
+ fetch_sym_ord2_direct(src, iF, jF + 1, kF + 1, SoA0, SoA1, SoA2));
}
else {
fxx[tid] = 0.0; fxy[tid] = 0.0; fxz[tid] = 0.0;
fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0;
}
}
/* symmetry_bd on GPU for ord=2, then launch fderivs kernel */
static void gpu_fderivs(double *d_f, double *d_fx, double *d_fy, double *d_fz,
double SoA0, double SoA1, double SoA2, int all)
@@ -1321,6 +1609,58 @@ static void gpu_fdderivs(double *d_f,
kern_fdderivs<<<grid(all), BLK>>>(fh, d_fxx, d_fxy, d_fxz, d_fyy, d_fyz, d_fzz);
}
static void gpu_fderivs_batch(int field_count,
double *const *src_fields,
double *const *fx_fields,
double *const *fy_fields,
double *const *fz_fields,
const int *soa_signs,
int all)
{
if (field_count <= 0) return;
FDerivTables tables = {};
for (int i = 0; i < field_count; ++i) {
tables.src_fields[i] = src_fields[i];
tables.fx_fields[i] = fx_fields[i];
tables.fy_fields[i] = fy_fields[i];
tables.fz_fields[i] = fz_fields[i];
tables.soa_signs[3 * i + 0] = soa_signs[3 * i + 0];
tables.soa_signs[3 * i + 1] = soa_signs[3 * i + 1];
tables.soa_signs[3 * i + 2] = soa_signs[3 * i + 2];
}
dim3 launch_grid((unsigned int)grid((size_t)all), (unsigned int)field_count);
kern_fderivs_batched<<<launch_grid, BLK>>>(tables, field_count);
}
static void gpu_fdderivs_batch(int field_count,
double *const *src_fields,
double *const *fxx_fields,
double *const *fxy_fields,
double *const *fxz_fields,
double *const *fyy_fields,
double *const *fyz_fields,
double *const *fzz_fields,
const int *soa_signs,
int all)
{
if (field_count <= 0) return;
FDDerivTables tables = {};
for (int i = 0; i < field_count; ++i) {
tables.src_fields[i] = src_fields[i];
tables.fxx_fields[i] = fxx_fields[i];
tables.fxy_fields[i] = fxy_fields[i];
tables.fxz_fields[i] = fxz_fields[i];
tables.fyy_fields[i] = fyy_fields[i];
tables.fyz_fields[i] = fyz_fields[i];
tables.fzz_fields[i] = fzz_fields[i];
tables.soa_signs[3 * i + 0] = soa_signs[3 * i + 0];
tables.soa_signs[3 * i + 1] = soa_signs[3 * i + 1];
tables.soa_signs[3 * i + 2] = soa_signs[3 * i + 2];
}
dim3 launch_grid((unsigned int)grid((size_t)all), (unsigned int)field_count);
kern_fdderivs_batched<<<launch_grid, BLK>>>(tables, field_count);
}
/* Combined ord=3 advection + KO dissipation.
* When advection and KO use the same source field, symmetry packing is shared.
* If they differ (e.g. gxx advection + dxx KO), only KO repacks.
@@ -3203,23 +3543,49 @@ static void launch_rhs_pipeline(int all, double eps, int co)
const double ANTI = -1.0;
#define D(s) g_buf.slot[s]
kern_phase1_prep<<<grid(all),BLK>>>(
D(S_Lap), D(S_chi), D(S_dxx), D(S_dyy), D(S_dzz),
D(S_alpn1), D(S_chin1), D(S_gxx), D(S_gyy), D(S_gzz));
gpu_fderivs(D(S_betax), D(S_betaxx),D(S_betaxy),D(S_betaxz), ANTI,SYM,SYM, all);
gpu_fderivs(D(S_betay), D(S_betayx),D(S_betayy),D(S_betayz), SYM,ANTI,SYM, all);
gpu_fderivs(D(S_betaz), D(S_betazx),D(S_betazy),D(S_betazz), SYM,SYM,ANTI, all);
gpu_fderivs(D(S_chi), D(S_chix),D(S_chiy),D(S_chiz), SYM,SYM,SYM, all);
gpu_fderivs(D(S_dxx), D(S_gxxx),D(S_gxxy),D(S_gxxz), SYM,SYM,SYM, all);
gpu_fderivs(D(S_gxy), D(S_gxyx),D(S_gxyy),D(S_gxyz), ANTI,ANTI,SYM, all);
gpu_fderivs(D(S_gxz), D(S_gxzx),D(S_gxzy),D(S_gxzz), ANTI,SYM,ANTI, all);
gpu_fderivs(D(S_dyy), D(S_gyyx),D(S_gyyy),D(S_gyyz), SYM,SYM,SYM, all);
gpu_fderivs(D(S_gyz), D(S_gyzx),D(S_gyzy),D(S_gyzz), SYM,ANTI,ANTI, all);
gpu_fderivs(D(S_dzz), D(S_gzzx),D(S_gzzy),D(S_gzzz), SYM,SYM,SYM, all);
gpu_fderivs(D(S_Lap), D(S_Lapx),D(S_Lapy),D(S_Lapz), SYM,SYM,SYM, all);
gpu_fderivs(D(S_trK), D(S_Kx),D(S_Ky),D(S_Kz), SYM,SYM,SYM, all);
{
double *src_fields[] = {
D(S_betax), D(S_betay), D(S_betaz), D(S_chi),
D(S_dxx), D(S_gxy), D(S_gxz), D(S_dyy),
D(S_gyz), D(S_dzz), D(S_Lap), D(S_trK)
};
double *fx_fields[] = {
D(S_betaxx), D(S_betayx), D(S_betazx), D(S_chix),
D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx),
D(S_gyzx), D(S_gzzx), D(S_Lapx), D(S_Kx)
};
double *fy_fields[] = {
D(S_betaxy), D(S_betayy), D(S_betazy), D(S_chiy),
D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy),
D(S_gyzy), D(S_gzzy), D(S_Lapy), D(S_Ky)
};
double *fz_fields[] = {
D(S_betaxz), D(S_betayz), D(S_betazz), D(S_chiz),
D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz),
D(S_gyzz), D(S_gzzz), D(S_Lapz), D(S_Kz)
};
const int soa_signs[] = {
(int)ANTI, (int)SYM, (int)SYM,
(int)SYM, (int)ANTI, (int)SYM,
(int)SYM, (int)SYM, (int)ANTI,
(int)SYM, (int)SYM, (int)SYM,
(int)SYM, (int)SYM, (int)SYM,
(int)ANTI, (int)ANTI, (int)SYM,
(int)ANTI, (int)SYM, (int)ANTI,
(int)SYM, (int)SYM, (int)SYM,
(int)SYM, (int)ANTI, (int)ANTI,
(int)SYM, (int)SYM, (int)SYM,
(int)SYM, (int)SYM, (int)SYM,
(int)SYM, (int)SYM, (int)SYM
};
gpu_fderivs_batch((int)(sizeof(src_fields) / sizeof(src_fields[0])),
src_fields, fx_fields, fy_fields, fz_fields,
soa_signs, all);
}
kern_phase2_metric_rhs<<<grid(all),BLK>>>(
D(S_alpn1), D(S_chin1),
@@ -3285,15 +3651,38 @@ static void launch_rhs_pipeline(int all, double eps, int co)
D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
D(S_Gamx_rhs), D(S_Gamy_rhs), D(S_Gamz_rhs));
gpu_fdderivs(D(S_betax), D(S_gxxx),D(S_gxyx),D(S_gxzx),
D(S_gyyx),D(S_gyzx),D(S_gzzx), ANTI,SYM,SYM, all);
gpu_fdderivs(D(S_betay), D(S_gxxy),D(S_gxyy),D(S_gxzy),
D(S_gyyy),D(S_gyzy),D(S_gzzy), SYM,ANTI,SYM, all);
gpu_fdderivs(D(S_betaz), D(S_gxxz),D(S_gxyz),D(S_gxzz),
D(S_gyyz),D(S_gyzz),D(S_gzzz), SYM,SYM,ANTI, all);
gpu_fderivs(D(S_Gamx), D(S_Gamxx),D(S_Gamxy),D(S_Gamxz), ANTI,SYM,SYM, all);
gpu_fderivs(D(S_Gamy), D(S_Gamyx),D(S_Gamyy_t),D(S_Gamyz_t), SYM,ANTI,SYM, all);
gpu_fderivs(D(S_Gamz), D(S_Gamzx),D(S_Gamzy),D(S_Gamzz_t), SYM,SYM,ANTI, all);
{
double *src_fields[] = {D(S_betax), D(S_betay), D(S_betaz)};
double *fxx_fields[] = {D(S_gxxx), D(S_gxxy), D(S_gxxz)};
double *fxy_fields[] = {D(S_gxyx), D(S_gxyy), D(S_gxyz)};
double *fxz_fields[] = {D(S_gxzx), D(S_gxzy), D(S_gxzz)};
double *fyy_fields[] = {D(S_gyyx), D(S_gyyy), D(S_gyyz)};
double *fyz_fields[] = {D(S_gyzx), D(S_gyzy), D(S_gyzz)};
double *fzz_fields[] = {D(S_gzzx), D(S_gzzy), D(S_gzzz)};
const int soa_signs[] = {
(int)ANTI, (int)SYM, (int)SYM,
(int)SYM, (int)ANTI, (int)SYM,
(int)SYM, (int)SYM, (int)ANTI
};
gpu_fdderivs_batch((int)(sizeof(src_fields) / sizeof(src_fields[0])),
src_fields, fxx_fields, fxy_fields, fxz_fields,
fyy_fields, fyz_fields, fzz_fields,
soa_signs, all);
}
{
double *src_fields[] = {D(S_Gamx), D(S_Gamy), D(S_Gamz)};
double *fx_fields[] = {D(S_Gamxx), D(S_Gamyx), D(S_Gamzx)};
double *fy_fields[] = {D(S_Gamxy), D(S_Gamyy_t), D(S_Gamzy)};
double *fz_fields[] = {D(S_Gamxz), D(S_Gamyz_t), D(S_Gamzz_t)};
const int soa_signs[] = {
(int)ANTI, (int)SYM, (int)SYM,
(int)SYM, (int)ANTI, (int)SYM,
(int)SYM, (int)SYM, (int)ANTI
};
gpu_fderivs_batch((int)(sizeof(src_fields) / sizeof(src_fields[0])),
src_fields, fx_fields, fy_fields, fz_fields,
soa_signs, all);
}
kern_phase8_gamma_rhs_part2<<<grid(all),BLK>>>(
D(S_gupxx), D(S_gupxy), D(S_gupxz),
@@ -3462,12 +3851,23 @@ static void launch_rhs_pipeline(int all, double eps, int co)
gpu_lopsided_kodis_state_batch(eps, all);
if (co == 0) {
gpu_fderivs(D(S_Axx), D(S_gxxx),D(S_gxxy),D(S_gxxz), SYM,SYM,SYM, all);
gpu_fderivs(D(S_Axy), D(S_gxyx),D(S_gxyy),D(S_gxyz), ANTI,ANTI,SYM, all);
gpu_fderivs(D(S_Axz), D(S_gxzx),D(S_gxzy),D(S_gxzz), ANTI,SYM,ANTI, all);
gpu_fderivs(D(S_Ayy), D(S_gyyx),D(S_gyyy),D(S_gyyz), SYM,SYM,SYM, all);
gpu_fderivs(D(S_Ayz), D(S_gyzx),D(S_gyzy),D(S_gyzz), SYM,ANTI,ANTI, all);
gpu_fderivs(D(S_Azz), D(S_gzzx),D(S_gzzy),D(S_gzzz), SYM,SYM,SYM, all);
{
double *src_fields[] = {D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz)};
double *fx_fields[] = {D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), D(S_gyzx), D(S_gzzx)};
double *fy_fields[] = {D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), D(S_gyzy), D(S_gzzy)};
double *fz_fields[] = {D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), D(S_gyzz), D(S_gzzz)};
const int soa_signs[] = {
(int)SYM, (int)SYM, (int)SYM,
(int)ANTI, (int)ANTI, (int)SYM,
(int)ANTI, (int)SYM, (int)ANTI,
(int)SYM, (int)SYM, (int)SYM,
(int)SYM, (int)ANTI, (int)ANTI,
(int)SYM, (int)SYM, (int)SYM
};
gpu_fderivs_batch((int)(sizeof(src_fields) / sizeof(src_fields[0])),
src_fields, fx_fields, fy_fields, fz_fields,
soa_signs, all);
}
kern_phase18_constraints<<<grid(all),BLK>>>(
D(S_chin1),