Add batched first/second derivative kernels for CUDA RHS
This commit is contained in:
@@ -189,6 +189,33 @@ __device__ __forceinline__ int idx_ex_d(int i0, int j0, int k0) {
|
||||
return i0 + j0 * d_gp.ex[0] + k0 * d_gp.ex[0] * d_gp.ex[1];
|
||||
}
|
||||
|
||||
__device__ __forceinline__ double fetch_sym_ord2_direct(const double *src,
|
||||
int iF, int jF, int kF,
|
||||
int SoA0, int SoA1, int SoA2)
|
||||
{
|
||||
int siF = iF;
|
||||
int sjF = jF;
|
||||
int skF = kF;
|
||||
double sign = 1.0;
|
||||
|
||||
if (iF <= 0) {
|
||||
siF = 1 - iF;
|
||||
sign *= (double)SoA0;
|
||||
}
|
||||
if (jF <= 0) {
|
||||
sjF = 1 - jF;
|
||||
sign *= (double)SoA1;
|
||||
}
|
||||
if (kF <= 0) {
|
||||
skF = 1 - kF;
|
||||
sign *= (double)SoA2;
|
||||
}
|
||||
|
||||
return sign * src[(siF - 1)
|
||||
+ (sjF - 1) * d_gp.ex[0]
|
||||
+ (skF - 1) * d_gp.ex[0] * d_gp.ex[1]];
|
||||
}
|
||||
|
||||
/* ord=2 ghost-padded: Fortran index iF -> flat index */
|
||||
__device__ __forceinline__ int idx_fh2(int iF, int jF, int kF) {
|
||||
return (iF + 1) + (jF + 1) * d_gp.fh2_nx + (kF + 1) * d_gp.fh2_nx * d_gp.fh2_ny;
|
||||
@@ -1272,6 +1299,25 @@ struct LopsidedKodisTables {
|
||||
int soa_signs[3 * BSSN_LK_FIELD_COUNT];
|
||||
};
|
||||
|
||||
struct FDerivTables {
|
||||
const double *src_fields[BSSN_STATE_COUNT];
|
||||
double *fx_fields[BSSN_STATE_COUNT];
|
||||
double *fy_fields[BSSN_STATE_COUNT];
|
||||
double *fz_fields[BSSN_STATE_COUNT];
|
||||
int soa_signs[3 * BSSN_STATE_COUNT];
|
||||
};
|
||||
|
||||
struct FDDerivTables {
|
||||
const double *src_fields[BSSN_STATE_COUNT];
|
||||
double *fxx_fields[BSSN_STATE_COUNT];
|
||||
double *fxy_fields[BSSN_STATE_COUNT];
|
||||
double *fxz_fields[BSSN_STATE_COUNT];
|
||||
double *fyy_fields[BSSN_STATE_COUNT];
|
||||
double *fyz_fields[BSSN_STATE_COUNT];
|
||||
double *fzz_fields[BSSN_STATE_COUNT];
|
||||
int soa_signs[3 * BSSN_STATE_COUNT];
|
||||
};
|
||||
|
||||
struct Rk4FinalizeTables {
|
||||
const double *f0_fields[BSSN_STATE_COUNT];
|
||||
double *rhs_fields[BSSN_STATE_COUNT];
|
||||
@@ -1291,6 +1337,248 @@ static inline int grid(size_t n) {
|
||||
return (int)g;
|
||||
}
|
||||
|
||||
__global__ __launch_bounds__(128, 4)
|
||||
void kern_fderivs_batched(FDerivTables tables, int field_count)
|
||||
{
|
||||
const int field = blockIdx.y;
|
||||
if (field >= field_count) return;
|
||||
|
||||
const double *src = tables.src_fields[field];
|
||||
double *fx = tables.fx_fields[field];
|
||||
double *fy = tables.fy_fields[field];
|
||||
double *fz = tables.fz_fields[field];
|
||||
const int SoA0 = tables.soa_signs[3 * field + 0];
|
||||
const int SoA1 = tables.soa_signs[3 * field + 1];
|
||||
const int SoA2 = tables.soa_signs[3 * field + 2];
|
||||
const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
|
||||
const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
|
||||
const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF;
|
||||
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (tid >= d_gp.all) return;
|
||||
|
||||
const int i0 = tid % nx;
|
||||
const int j0 = (tid / nx) % ny;
|
||||
const int k0 = tid / (nx * ny);
|
||||
|
||||
if (i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2) {
|
||||
fx[tid] = 0.0;
|
||||
fy[tid] = 0.0;
|
||||
fz[tid] = 0.0;
|
||||
return;
|
||||
}
|
||||
|
||||
const int iF = i0 + 1;
|
||||
const int jF = j0 + 1;
|
||||
const int kF = k0 + 1;
|
||||
|
||||
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
||||
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
||||
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
||||
{
|
||||
fx[tid] = d_gp.d12dx * (
|
||||
fetch_sym_ord2_direct(src, iF - 2, jF, kF, SoA0, SoA1, SoA2)
|
||||
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
|
||||
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF + 2, jF, kF, SoA0, SoA1, SoA2));
|
||||
fy[tid] = d_gp.d12dy * (
|
||||
fetch_sym_ord2_direct(src, iF, jF - 2, kF, SoA0, SoA1, SoA2)
|
||||
- 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
|
||||
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF, jF + 2, kF, SoA0, SoA1, SoA2));
|
||||
fz[tid] = d_gp.d12dz * (
|
||||
fetch_sym_ord2_direct(src, iF, jF, kF - 2, SoA0, SoA1, SoA2)
|
||||
- 8.0 * fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
|
||||
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF, jF, kF + 2, SoA0, SoA1, SoA2));
|
||||
}
|
||||
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
|
||||
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
|
||||
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
|
||||
{
|
||||
fx[tid] = d_gp.d2dx * (
|
||||
-fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
|
||||
+fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2));
|
||||
fy[tid] = d_gp.d2dy * (
|
||||
-fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
|
||||
+fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2));
|
||||
fz[tid] = d_gp.d2dz * (
|
||||
-fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
|
||||
+fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2));
|
||||
}
|
||||
else {
|
||||
fx[tid] = 0.0;
|
||||
fy[tid] = 0.0;
|
||||
fz[tid] = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ __launch_bounds__(128, 4)
|
||||
void kern_fdderivs_batched(FDDerivTables tables, int field_count)
|
||||
{
|
||||
const int field = blockIdx.y;
|
||||
if (field >= field_count) return;
|
||||
|
||||
const double *src = tables.src_fields[field];
|
||||
double *fxx = tables.fxx_fields[field];
|
||||
double *fxy = tables.fxy_fields[field];
|
||||
double *fxz = tables.fxz_fields[field];
|
||||
double *fyy = tables.fyy_fields[field];
|
||||
double *fyz = tables.fyz_fields[field];
|
||||
double *fzz = tables.fzz_fields[field];
|
||||
const int SoA0 = tables.soa_signs[3 * field + 0];
|
||||
const int SoA1 = tables.soa_signs[3 * field + 1];
|
||||
const int SoA2 = tables.soa_signs[3 * field + 2];
|
||||
const int nx = d_gp.ex[0], ny = d_gp.ex[1], nz = d_gp.ex[2];
|
||||
const int imaxF = d_gp.imaxF, jmaxF = d_gp.jmaxF, kmaxF = d_gp.kmaxF;
|
||||
const int iminF = d_gp.iminF, jminF = d_gp.jminF, kminF = d_gp.kminF;
|
||||
|
||||
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (tid >= d_gp.all) return;
|
||||
|
||||
const int i0 = tid % nx;
|
||||
const int j0 = (tid / nx) % ny;
|
||||
const int k0 = tid / (nx * ny);
|
||||
|
||||
if (i0 > nx - 2 || j0 > ny - 2 || k0 > nz - 2) {
|
||||
fxx[tid] = 0.0; fxy[tid] = 0.0; fxz[tid] = 0.0;
|
||||
fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0;
|
||||
return;
|
||||
}
|
||||
|
||||
const int iF = i0 + 1;
|
||||
const int jF = j0 + 1;
|
||||
const int kF = k0 + 1;
|
||||
|
||||
if ((iF + 2) <= imaxF && (iF - 2) >= iminF &&
|
||||
(jF + 2) <= jmaxF && (jF - 2) >= jminF &&
|
||||
(kF + 2) <= kmaxF && (kF - 2) >= kminF)
|
||||
{
|
||||
const double c = fetch_sym_ord2_direct(src, iF, jF, kF, SoA0, SoA1, SoA2);
|
||||
fxx[tid] = d_gp.Fdxdx * (
|
||||
-fetch_sym_ord2_direct(src, iF - 2, jF, kF, SoA0, SoA1, SoA2)
|
||||
+16.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
|
||||
-30.0 * c
|
||||
+16.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF + 2, jF, kF, SoA0, SoA1, SoA2));
|
||||
fyy[tid] = d_gp.Fdydy * (
|
||||
-fetch_sym_ord2_direct(src, iF, jF - 2, kF, SoA0, SoA1, SoA2)
|
||||
+16.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
|
||||
-30.0 * c
|
||||
+16.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF, jF + 2, kF, SoA0, SoA1, SoA2));
|
||||
fzz[tid] = d_gp.Fdzdz * (
|
||||
-fetch_sym_ord2_direct(src, iF, jF, kF - 2, SoA0, SoA1, SoA2)
|
||||
+16.0 * fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
|
||||
-30.0 * c
|
||||
+16.0 * fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF, jF, kF + 2, SoA0, SoA1, SoA2));
|
||||
|
||||
const double t_jm2 =
|
||||
fetch_sym_ord2_direct(src, iF - 2, jF - 2, kF, SoA0, SoA1, SoA2)
|
||||
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF - 2, kF, SoA0, SoA1, SoA2)
|
||||
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF - 2, kF, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF + 2, jF - 2, kF, SoA0, SoA1, SoA2);
|
||||
const double t_jm1 =
|
||||
fetch_sym_ord2_direct(src, iF - 2, jF - 1, kF, SoA0, SoA1, SoA2)
|
||||
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF - 1, kF, SoA0, SoA1, SoA2)
|
||||
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF - 1, kF, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF + 2, jF - 1, kF, SoA0, SoA1, SoA2);
|
||||
const double t_jp1 =
|
||||
fetch_sym_ord2_direct(src, iF - 2, jF + 1, kF, SoA0, SoA1, SoA2)
|
||||
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF + 1, kF, SoA0, SoA1, SoA2)
|
||||
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF + 1, kF, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF + 2, jF + 1, kF, SoA0, SoA1, SoA2);
|
||||
const double t_jp2 =
|
||||
fetch_sym_ord2_direct(src, iF - 2, jF + 2, kF, SoA0, SoA1, SoA2)
|
||||
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF + 2, kF, SoA0, SoA1, SoA2)
|
||||
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF + 2, kF, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF + 2, jF + 2, kF, SoA0, SoA1, SoA2);
|
||||
fxy[tid] = d_gp.Fdxdy * (t_jm2 - 8.0 * t_jm1 + 8.0 * t_jp1 - t_jp2);
|
||||
|
||||
const double t_km2_x =
|
||||
fetch_sym_ord2_direct(src, iF - 2, jF, kF - 2, SoA0, SoA1, SoA2)
|
||||
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF - 2, SoA0, SoA1, SoA2)
|
||||
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF - 2, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF + 2, jF, kF - 2, SoA0, SoA1, SoA2);
|
||||
const double t_km1_x =
|
||||
fetch_sym_ord2_direct(src, iF - 2, jF, kF - 1, SoA0, SoA1, SoA2)
|
||||
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF - 1, SoA0, SoA1, SoA2)
|
||||
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF - 1, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF + 2, jF, kF - 1, SoA0, SoA1, SoA2);
|
||||
const double t_kp1_x =
|
||||
fetch_sym_ord2_direct(src, iF - 2, jF, kF + 1, SoA0, SoA1, SoA2)
|
||||
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF + 1, SoA0, SoA1, SoA2)
|
||||
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF + 1, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF + 2, jF, kF + 1, SoA0, SoA1, SoA2);
|
||||
const double t_kp2_x =
|
||||
fetch_sym_ord2_direct(src, iF - 2, jF, kF + 2, SoA0, SoA1, SoA2)
|
||||
- 8.0 * fetch_sym_ord2_direct(src, iF - 1, jF, kF + 2, SoA0, SoA1, SoA2)
|
||||
+ 8.0 * fetch_sym_ord2_direct(src, iF + 1, jF, kF + 2, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF + 2, jF, kF + 2, SoA0, SoA1, SoA2);
|
||||
fxz[tid] = d_gp.Fdxdz * (t_km2_x - 8.0 * t_km1_x + 8.0 * t_kp1_x - t_kp2_x);
|
||||
|
||||
const double t_km2_y =
|
||||
fetch_sym_ord2_direct(src, iF, jF - 2, kF - 2, SoA0, SoA1, SoA2)
|
||||
- 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF - 2, SoA0, SoA1, SoA2)
|
||||
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF - 2, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF, jF + 2, kF - 2, SoA0, SoA1, SoA2);
|
||||
const double t_km1_y =
|
||||
fetch_sym_ord2_direct(src, iF, jF - 2, kF - 1, SoA0, SoA1, SoA2)
|
||||
- 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF - 1, SoA0, SoA1, SoA2)
|
||||
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF - 1, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF, jF + 2, kF - 1, SoA0, SoA1, SoA2);
|
||||
const double t_kp1_y =
|
||||
fetch_sym_ord2_direct(src, iF, jF - 2, kF + 1, SoA0, SoA1, SoA2)
|
||||
- 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF + 1, SoA0, SoA1, SoA2)
|
||||
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF + 1, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF, jF + 2, kF + 1, SoA0, SoA1, SoA2);
|
||||
const double t_kp2_y =
|
||||
fetch_sym_ord2_direct(src, iF, jF - 2, kF + 2, SoA0, SoA1, SoA2)
|
||||
- 8.0 * fetch_sym_ord2_direct(src, iF, jF - 1, kF + 2, SoA0, SoA1, SoA2)
|
||||
+ 8.0 * fetch_sym_ord2_direct(src, iF, jF + 1, kF + 2, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF, jF + 2, kF + 2, SoA0, SoA1, SoA2);
|
||||
fyz[tid] = d_gp.Fdydz * (t_km2_y - 8.0 * t_km1_y + 8.0 * t_kp1_y - t_kp2_y);
|
||||
}
|
||||
else if ((iF + 1) <= imaxF && (iF - 1) >= iminF &&
|
||||
(jF + 1) <= jmaxF && (jF - 1) >= jminF &&
|
||||
(kF + 1) <= kmaxF && (kF - 1) >= kminF)
|
||||
{
|
||||
const double c = fetch_sym_ord2_direct(src, iF, jF, kF, SoA0, SoA1, SoA2);
|
||||
fxx[tid] = d_gp.Sdxdx * (
|
||||
fetch_sym_ord2_direct(src, iF - 1, jF, kF, SoA0, SoA1, SoA2)
|
||||
- 2.0 * c
|
||||
+ fetch_sym_ord2_direct(src, iF + 1, jF, kF, SoA0, SoA1, SoA2));
|
||||
fyy[tid] = d_gp.Sdydy * (
|
||||
fetch_sym_ord2_direct(src, iF, jF - 1, kF, SoA0, SoA1, SoA2)
|
||||
- 2.0 * c
|
||||
+ fetch_sym_ord2_direct(src, iF, jF + 1, kF, SoA0, SoA1, SoA2));
|
||||
fzz[tid] = d_gp.Sdzdz * (
|
||||
fetch_sym_ord2_direct(src, iF, jF, kF - 1, SoA0, SoA1, SoA2)
|
||||
- 2.0 * c
|
||||
+ fetch_sym_ord2_direct(src, iF, jF, kF + 1, SoA0, SoA1, SoA2));
|
||||
fxy[tid] = d_gp.Sdxdy * (
|
||||
fetch_sym_ord2_direct(src, iF - 1, jF - 1, kF, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF + 1, jF - 1, kF, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF - 1, jF + 1, kF, SoA0, SoA1, SoA2)
|
||||
+ fetch_sym_ord2_direct(src, iF + 1, jF + 1, kF, SoA0, SoA1, SoA2));
|
||||
fxz[tid] = d_gp.Sdxdz * (
|
||||
fetch_sym_ord2_direct(src, iF - 1, jF, kF - 1, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF + 1, jF, kF - 1, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF - 1, jF, kF + 1, SoA0, SoA1, SoA2)
|
||||
+ fetch_sym_ord2_direct(src, iF + 1, jF, kF + 1, SoA0, SoA1, SoA2));
|
||||
fyz[tid] = d_gp.Sdydz * (
|
||||
fetch_sym_ord2_direct(src, iF, jF - 1, kF - 1, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF, jF + 1, kF - 1, SoA0, SoA1, SoA2)
|
||||
- fetch_sym_ord2_direct(src, iF, jF - 1, kF + 1, SoA0, SoA1, SoA2)
|
||||
+ fetch_sym_ord2_direct(src, iF, jF + 1, kF + 1, SoA0, SoA1, SoA2));
|
||||
}
|
||||
else {
|
||||
fxx[tid] = 0.0; fxy[tid] = 0.0; fxz[tid] = 0.0;
|
||||
fyy[tid] = 0.0; fyz[tid] = 0.0; fzz[tid] = 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
/* symmetry_bd on GPU for ord=2, then launch fderivs kernel */
|
||||
static void gpu_fderivs(double *d_f, double *d_fx, double *d_fy, double *d_fz,
|
||||
double SoA0, double SoA1, double SoA2, int all)
|
||||
@@ -1321,6 +1609,58 @@ static void gpu_fdderivs(double *d_f,
|
||||
kern_fdderivs<<<grid(all), BLK>>>(fh, d_fxx, d_fxy, d_fxz, d_fyy, d_fyz, d_fzz);
|
||||
}
|
||||
|
||||
static void gpu_fderivs_batch(int field_count,
|
||||
double *const *src_fields,
|
||||
double *const *fx_fields,
|
||||
double *const *fy_fields,
|
||||
double *const *fz_fields,
|
||||
const int *soa_signs,
|
||||
int all)
|
||||
{
|
||||
if (field_count <= 0) return;
|
||||
FDerivTables tables = {};
|
||||
for (int i = 0; i < field_count; ++i) {
|
||||
tables.src_fields[i] = src_fields[i];
|
||||
tables.fx_fields[i] = fx_fields[i];
|
||||
tables.fy_fields[i] = fy_fields[i];
|
||||
tables.fz_fields[i] = fz_fields[i];
|
||||
tables.soa_signs[3 * i + 0] = soa_signs[3 * i + 0];
|
||||
tables.soa_signs[3 * i + 1] = soa_signs[3 * i + 1];
|
||||
tables.soa_signs[3 * i + 2] = soa_signs[3 * i + 2];
|
||||
}
|
||||
dim3 launch_grid((unsigned int)grid((size_t)all), (unsigned int)field_count);
|
||||
kern_fderivs_batched<<<launch_grid, BLK>>>(tables, field_count);
|
||||
}
|
||||
|
||||
static void gpu_fdderivs_batch(int field_count,
|
||||
double *const *src_fields,
|
||||
double *const *fxx_fields,
|
||||
double *const *fxy_fields,
|
||||
double *const *fxz_fields,
|
||||
double *const *fyy_fields,
|
||||
double *const *fyz_fields,
|
||||
double *const *fzz_fields,
|
||||
const int *soa_signs,
|
||||
int all)
|
||||
{
|
||||
if (field_count <= 0) return;
|
||||
FDDerivTables tables = {};
|
||||
for (int i = 0; i < field_count; ++i) {
|
||||
tables.src_fields[i] = src_fields[i];
|
||||
tables.fxx_fields[i] = fxx_fields[i];
|
||||
tables.fxy_fields[i] = fxy_fields[i];
|
||||
tables.fxz_fields[i] = fxz_fields[i];
|
||||
tables.fyy_fields[i] = fyy_fields[i];
|
||||
tables.fyz_fields[i] = fyz_fields[i];
|
||||
tables.fzz_fields[i] = fzz_fields[i];
|
||||
tables.soa_signs[3 * i + 0] = soa_signs[3 * i + 0];
|
||||
tables.soa_signs[3 * i + 1] = soa_signs[3 * i + 1];
|
||||
tables.soa_signs[3 * i + 2] = soa_signs[3 * i + 2];
|
||||
}
|
||||
dim3 launch_grid((unsigned int)grid((size_t)all), (unsigned int)field_count);
|
||||
kern_fdderivs_batched<<<launch_grid, BLK>>>(tables, field_count);
|
||||
}
|
||||
|
||||
/* Combined ord=3 advection + KO dissipation.
|
||||
* When advection and KO use the same source field, symmetry packing is shared.
|
||||
* If they differ (e.g. gxx advection + dxx KO), only KO repacks.
|
||||
@@ -3203,23 +3543,49 @@ static void launch_rhs_pipeline(int all, double eps, int co)
|
||||
const double ANTI = -1.0;
|
||||
|
||||
#define D(s) g_buf.slot[s]
|
||||
|
||||
kern_phase1_prep<<<grid(all),BLK>>>(
|
||||
D(S_Lap), D(S_chi), D(S_dxx), D(S_dyy), D(S_dzz),
|
||||
D(S_alpn1), D(S_chin1), D(S_gxx), D(S_gyy), D(S_gzz));
|
||||
|
||||
gpu_fderivs(D(S_betax), D(S_betaxx),D(S_betaxy),D(S_betaxz), ANTI,SYM,SYM, all);
|
||||
gpu_fderivs(D(S_betay), D(S_betayx),D(S_betayy),D(S_betayz), SYM,ANTI,SYM, all);
|
||||
gpu_fderivs(D(S_betaz), D(S_betazx),D(S_betazy),D(S_betazz), SYM,SYM,ANTI, all);
|
||||
gpu_fderivs(D(S_chi), D(S_chix),D(S_chiy),D(S_chiz), SYM,SYM,SYM, all);
|
||||
gpu_fderivs(D(S_dxx), D(S_gxxx),D(S_gxxy),D(S_gxxz), SYM,SYM,SYM, all);
|
||||
gpu_fderivs(D(S_gxy), D(S_gxyx),D(S_gxyy),D(S_gxyz), ANTI,ANTI,SYM, all);
|
||||
gpu_fderivs(D(S_gxz), D(S_gxzx),D(S_gxzy),D(S_gxzz), ANTI,SYM,ANTI, all);
|
||||
gpu_fderivs(D(S_dyy), D(S_gyyx),D(S_gyyy),D(S_gyyz), SYM,SYM,SYM, all);
|
||||
gpu_fderivs(D(S_gyz), D(S_gyzx),D(S_gyzy),D(S_gyzz), SYM,ANTI,ANTI, all);
|
||||
gpu_fderivs(D(S_dzz), D(S_gzzx),D(S_gzzy),D(S_gzzz), SYM,SYM,SYM, all);
|
||||
gpu_fderivs(D(S_Lap), D(S_Lapx),D(S_Lapy),D(S_Lapz), SYM,SYM,SYM, all);
|
||||
gpu_fderivs(D(S_trK), D(S_Kx),D(S_Ky),D(S_Kz), SYM,SYM,SYM, all);
|
||||
{
|
||||
double *src_fields[] = {
|
||||
D(S_betax), D(S_betay), D(S_betaz), D(S_chi),
|
||||
D(S_dxx), D(S_gxy), D(S_gxz), D(S_dyy),
|
||||
D(S_gyz), D(S_dzz), D(S_Lap), D(S_trK)
|
||||
};
|
||||
double *fx_fields[] = {
|
||||
D(S_betaxx), D(S_betayx), D(S_betazx), D(S_chix),
|
||||
D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx),
|
||||
D(S_gyzx), D(S_gzzx), D(S_Lapx), D(S_Kx)
|
||||
};
|
||||
double *fy_fields[] = {
|
||||
D(S_betaxy), D(S_betayy), D(S_betazy), D(S_chiy),
|
||||
D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy),
|
||||
D(S_gyzy), D(S_gzzy), D(S_Lapy), D(S_Ky)
|
||||
};
|
||||
double *fz_fields[] = {
|
||||
D(S_betaxz), D(S_betayz), D(S_betazz), D(S_chiz),
|
||||
D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz),
|
||||
D(S_gyzz), D(S_gzzz), D(S_Lapz), D(S_Kz)
|
||||
};
|
||||
const int soa_signs[] = {
|
||||
(int)ANTI, (int)SYM, (int)SYM,
|
||||
(int)SYM, (int)ANTI, (int)SYM,
|
||||
(int)SYM, (int)SYM, (int)ANTI,
|
||||
(int)SYM, (int)SYM, (int)SYM,
|
||||
(int)SYM, (int)SYM, (int)SYM,
|
||||
(int)ANTI, (int)ANTI, (int)SYM,
|
||||
(int)ANTI, (int)SYM, (int)ANTI,
|
||||
(int)SYM, (int)SYM, (int)SYM,
|
||||
(int)SYM, (int)ANTI, (int)ANTI,
|
||||
(int)SYM, (int)SYM, (int)SYM,
|
||||
(int)SYM, (int)SYM, (int)SYM,
|
||||
(int)SYM, (int)SYM, (int)SYM
|
||||
};
|
||||
gpu_fderivs_batch((int)(sizeof(src_fields) / sizeof(src_fields[0])),
|
||||
src_fields, fx_fields, fy_fields, fz_fields,
|
||||
soa_signs, all);
|
||||
}
|
||||
|
||||
kern_phase2_metric_rhs<<<grid(all),BLK>>>(
|
||||
D(S_alpn1), D(S_chin1),
|
||||
@@ -3285,15 +3651,38 @@ static void launch_rhs_pipeline(int all, double eps, int co)
|
||||
D(S_Gamzyy), D(S_Gamzyz), D(S_Gamzzz),
|
||||
D(S_Gamx_rhs), D(S_Gamy_rhs), D(S_Gamz_rhs));
|
||||
|
||||
gpu_fdderivs(D(S_betax), D(S_gxxx),D(S_gxyx),D(S_gxzx),
|
||||
D(S_gyyx),D(S_gyzx),D(S_gzzx), ANTI,SYM,SYM, all);
|
||||
gpu_fdderivs(D(S_betay), D(S_gxxy),D(S_gxyy),D(S_gxzy),
|
||||
D(S_gyyy),D(S_gyzy),D(S_gzzy), SYM,ANTI,SYM, all);
|
||||
gpu_fdderivs(D(S_betaz), D(S_gxxz),D(S_gxyz),D(S_gxzz),
|
||||
D(S_gyyz),D(S_gyzz),D(S_gzzz), SYM,SYM,ANTI, all);
|
||||
gpu_fderivs(D(S_Gamx), D(S_Gamxx),D(S_Gamxy),D(S_Gamxz), ANTI,SYM,SYM, all);
|
||||
gpu_fderivs(D(S_Gamy), D(S_Gamyx),D(S_Gamyy_t),D(S_Gamyz_t), SYM,ANTI,SYM, all);
|
||||
gpu_fderivs(D(S_Gamz), D(S_Gamzx),D(S_Gamzy),D(S_Gamzz_t), SYM,SYM,ANTI, all);
|
||||
{
|
||||
double *src_fields[] = {D(S_betax), D(S_betay), D(S_betaz)};
|
||||
double *fxx_fields[] = {D(S_gxxx), D(S_gxxy), D(S_gxxz)};
|
||||
double *fxy_fields[] = {D(S_gxyx), D(S_gxyy), D(S_gxyz)};
|
||||
double *fxz_fields[] = {D(S_gxzx), D(S_gxzy), D(S_gxzz)};
|
||||
double *fyy_fields[] = {D(S_gyyx), D(S_gyyy), D(S_gyyz)};
|
||||
double *fyz_fields[] = {D(S_gyzx), D(S_gyzy), D(S_gyzz)};
|
||||
double *fzz_fields[] = {D(S_gzzx), D(S_gzzy), D(S_gzzz)};
|
||||
const int soa_signs[] = {
|
||||
(int)ANTI, (int)SYM, (int)SYM,
|
||||
(int)SYM, (int)ANTI, (int)SYM,
|
||||
(int)SYM, (int)SYM, (int)ANTI
|
||||
};
|
||||
gpu_fdderivs_batch((int)(sizeof(src_fields) / sizeof(src_fields[0])),
|
||||
src_fields, fxx_fields, fxy_fields, fxz_fields,
|
||||
fyy_fields, fyz_fields, fzz_fields,
|
||||
soa_signs, all);
|
||||
}
|
||||
{
|
||||
double *src_fields[] = {D(S_Gamx), D(S_Gamy), D(S_Gamz)};
|
||||
double *fx_fields[] = {D(S_Gamxx), D(S_Gamyx), D(S_Gamzx)};
|
||||
double *fy_fields[] = {D(S_Gamxy), D(S_Gamyy_t), D(S_Gamzy)};
|
||||
double *fz_fields[] = {D(S_Gamxz), D(S_Gamyz_t), D(S_Gamzz_t)};
|
||||
const int soa_signs[] = {
|
||||
(int)ANTI, (int)SYM, (int)SYM,
|
||||
(int)SYM, (int)ANTI, (int)SYM,
|
||||
(int)SYM, (int)SYM, (int)ANTI
|
||||
};
|
||||
gpu_fderivs_batch((int)(sizeof(src_fields) / sizeof(src_fields[0])),
|
||||
src_fields, fx_fields, fy_fields, fz_fields,
|
||||
soa_signs, all);
|
||||
}
|
||||
|
||||
kern_phase8_gamma_rhs_part2<<<grid(all),BLK>>>(
|
||||
D(S_gupxx), D(S_gupxy), D(S_gupxz),
|
||||
@@ -3462,12 +3851,23 @@ static void launch_rhs_pipeline(int all, double eps, int co)
|
||||
gpu_lopsided_kodis_state_batch(eps, all);
|
||||
|
||||
if (co == 0) {
|
||||
gpu_fderivs(D(S_Axx), D(S_gxxx),D(S_gxxy),D(S_gxxz), SYM,SYM,SYM, all);
|
||||
gpu_fderivs(D(S_Axy), D(S_gxyx),D(S_gxyy),D(S_gxyz), ANTI,ANTI,SYM, all);
|
||||
gpu_fderivs(D(S_Axz), D(S_gxzx),D(S_gxzy),D(S_gxzz), ANTI,SYM,ANTI, all);
|
||||
gpu_fderivs(D(S_Ayy), D(S_gyyx),D(S_gyyy),D(S_gyyz), SYM,SYM,SYM, all);
|
||||
gpu_fderivs(D(S_Ayz), D(S_gyzx),D(S_gyzy),D(S_gyzz), SYM,ANTI,ANTI, all);
|
||||
gpu_fderivs(D(S_Azz), D(S_gzzx),D(S_gzzy),D(S_gzzz), SYM,SYM,SYM, all);
|
||||
{
|
||||
double *src_fields[] = {D(S_Axx), D(S_Axy), D(S_Axz), D(S_Ayy), D(S_Ayz), D(S_Azz)};
|
||||
double *fx_fields[] = {D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), D(S_gyzx), D(S_gzzx)};
|
||||
double *fy_fields[] = {D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), D(S_gyzy), D(S_gzzy)};
|
||||
double *fz_fields[] = {D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), D(S_gyzz), D(S_gzzz)};
|
||||
const int soa_signs[] = {
|
||||
(int)SYM, (int)SYM, (int)SYM,
|
||||
(int)ANTI, (int)ANTI, (int)SYM,
|
||||
(int)ANTI, (int)SYM, (int)ANTI,
|
||||
(int)SYM, (int)SYM, (int)SYM,
|
||||
(int)SYM, (int)ANTI, (int)ANTI,
|
||||
(int)SYM, (int)SYM, (int)SYM
|
||||
};
|
||||
gpu_fderivs_batch((int)(sizeof(src_fields) / sizeof(src_fields[0])),
|
||||
src_fields, fx_fields, fy_fields, fz_fields,
|
||||
soa_signs, all);
|
||||
}
|
||||
|
||||
kern_phase18_constraints<<<grid(all),BLK>>>(
|
||||
D(S_chin1),
|
||||
|
||||
Reference in New Issue
Block a user