2244 lines
73 KiB
Plaintext
2244 lines
73 KiB
Plaintext
#include "bssn_cuda_ops.h"
|
|
#include "bssn_gpu.h"
|
|
#include "rungekutta4_rout.h"
|
|
#include "sommerfeld_rout.h"
|
|
|
|
#include <cmath>
|
|
#include <cstdio>
|
|
#include <cuda_runtime.h>
|
|
#include <unordered_map>
|
|
#include <vector>
|
|
|
|
namespace {
|
|
|
|
inline void report_cuda_error(const char *where, cudaError_t err)
|
|
{
|
|
if (err != cudaSuccess)
|
|
std::fprintf(stderr, "CUDA error at %s: %s\n", where, cudaGetErrorString(err));
|
|
}
|
|
|
|
inline int count_points(const int ex[3])
|
|
{
|
|
return ex[0] * ex[1] * ex[2];
|
|
}
|
|
|
|
inline int div_up(int a, int b)
|
|
{
|
|
return (a + b - 1) / b;
|
|
}
|
|
|
|
struct DeviceArrays
|
|
{
|
|
double *x = nullptr;
|
|
double *y = nullptr;
|
|
double *z = nullptr;
|
|
double *a = nullptr;
|
|
double *b = nullptr;
|
|
double *c = nullptr;
|
|
double *d = nullptr;
|
|
};
|
|
|
|
struct CachedBuffer
|
|
{
|
|
double *ptr = nullptr;
|
|
size_t capacity = 0;
|
|
};
|
|
|
|
struct CachedIntBuffer
|
|
{
|
|
int *ptr = nullptr;
|
|
size_t capacity = 0;
|
|
};
|
|
|
|
struct CachedPtrBuffer
|
|
{
|
|
void *ptr = nullptr;
|
|
size_t capacity = 0;
|
|
};
|
|
|
|
inline void release_buffer(CachedBuffer &buffer)
|
|
{
|
|
if (buffer.ptr)
|
|
{
|
|
cudaError_t free_err = cudaFree(buffer.ptr);
|
|
if (free_err != cudaSuccess)
|
|
report_cuda_error("cudaFree", free_err);
|
|
buffer.ptr = nullptr;
|
|
}
|
|
buffer.capacity = 0;
|
|
}
|
|
|
|
inline void release_buffer(CachedIntBuffer &buffer)
|
|
{
|
|
if (buffer.ptr)
|
|
{
|
|
cudaError_t free_err = cudaFree(buffer.ptr);
|
|
if (free_err != cudaSuccess)
|
|
report_cuda_error("cudaFree", free_err);
|
|
buffer.ptr = nullptr;
|
|
}
|
|
buffer.capacity = 0;
|
|
}
|
|
|
|
inline void release_buffer(CachedPtrBuffer &buffer)
|
|
{
|
|
if (buffer.ptr)
|
|
{
|
|
cudaError_t free_err = cudaFree(buffer.ptr);
|
|
if (free_err != cudaSuccess)
|
|
report_cuda_error("cudaFree", free_err);
|
|
buffer.ptr = nullptr;
|
|
}
|
|
buffer.capacity = 0;
|
|
}
|
|
|
|
inline bool ensure_capacity(CachedBuffer &buffer, size_t bytes)
|
|
{
|
|
if (bytes <= buffer.capacity && buffer.ptr)
|
|
return true;
|
|
|
|
if (buffer.ptr)
|
|
{
|
|
cudaError_t free_err = cudaFree(buffer.ptr);
|
|
if (free_err != cudaSuccess)
|
|
report_cuda_error("cudaFree", free_err);
|
|
buffer.ptr = nullptr;
|
|
buffer.capacity = 0;
|
|
}
|
|
|
|
cudaError_t err = cudaMalloc(&buffer.ptr, bytes);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMalloc", err);
|
|
return false;
|
|
}
|
|
|
|
buffer.capacity = bytes;
|
|
return true;
|
|
}
|
|
|
|
inline bool ensure_capacity(CachedIntBuffer &buffer, size_t bytes)
|
|
{
|
|
if (bytes <= buffer.capacity && buffer.ptr)
|
|
return true;
|
|
|
|
if (buffer.ptr)
|
|
{
|
|
cudaError_t free_err = cudaFree(buffer.ptr);
|
|
if (free_err != cudaSuccess)
|
|
report_cuda_error("cudaFree", free_err);
|
|
buffer.ptr = nullptr;
|
|
buffer.capacity = 0;
|
|
}
|
|
|
|
cudaError_t err = cudaMalloc(&buffer.ptr, bytes);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMalloc", err);
|
|
return false;
|
|
}
|
|
|
|
buffer.capacity = bytes;
|
|
return true;
|
|
}
|
|
|
|
inline bool ensure_capacity(CachedPtrBuffer &buffer, size_t bytes)
|
|
{
|
|
if (bytes <= buffer.capacity && buffer.ptr)
|
|
return true;
|
|
|
|
if (buffer.ptr)
|
|
{
|
|
cudaError_t free_err = cudaFree(buffer.ptr);
|
|
if (free_err != cudaSuccess)
|
|
report_cuda_error("cudaFree", free_err);
|
|
buffer.ptr = nullptr;
|
|
buffer.capacity = 0;
|
|
}
|
|
|
|
cudaError_t err = cudaMalloc(&buffer.ptr, bytes);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMalloc", err);
|
|
return false;
|
|
}
|
|
|
|
buffer.capacity = bytes;
|
|
return true;
|
|
}
|
|
|
|
struct Rk4VarCache
|
|
{
|
|
CachedBuffer X, Y, Z;
|
|
CachedBuffer state0, boundary, stage, rhs;
|
|
const double *host_X = nullptr;
|
|
const double *host_Y = nullptr;
|
|
const double *host_Z = nullptr;
|
|
const double *host_state0 = nullptr;
|
|
double *host_rhs = nullptr;
|
|
int nx = 0;
|
|
int ny = 0;
|
|
int nz = 0;
|
|
bool rhs_resident = false;
|
|
};
|
|
|
|
struct InterpStencilCacheEntry
|
|
{
|
|
const double *X = nullptr;
|
|
const double *Y = nullptr;
|
|
const double *Z = nullptr;
|
|
const double *px = nullptr;
|
|
const double *py = nullptr;
|
|
const double *pz = nullptr;
|
|
int nx = 0;
|
|
int ny = 0;
|
|
int nz = 0;
|
|
int num_points = 0;
|
|
int ordn = 0;
|
|
int symmetry = 0;
|
|
bool valid = false;
|
|
CachedBuffer weights;
|
|
CachedIntBuffer indices;
|
|
CachedIntBuffer reflect;
|
|
};
|
|
|
|
struct InterpBatchCache
|
|
{
|
|
CachedBuffer out;
|
|
CachedBuffer soa;
|
|
CachedBuffer field_ptrs;
|
|
CachedIntBuffer error_flag;
|
|
std::vector<CachedBuffer> host_field_copies;
|
|
InterpStencilCacheEntry stencil_entry;
|
|
};
|
|
|
|
struct Rk4BatchCache
|
|
{
|
|
CachedPtrBuffer state0_ptrs;
|
|
CachedPtrBuffer stage_ptrs;
|
|
CachedPtrBuffer rhs_ptrs;
|
|
};
|
|
|
|
std::unordered_map<const double *, Rk4VarCache> &rk4_var_cache_map()
|
|
{
|
|
static thread_local std::unordered_map<const double *, Rk4VarCache> cache_map;
|
|
return cache_map;
|
|
}
|
|
|
|
InterpBatchCache &interp_batch_cache()
|
|
{
|
|
static thread_local InterpBatchCache cache;
|
|
return cache;
|
|
}
|
|
|
|
Rk4BatchCache &rk4_batch_cache()
|
|
{
|
|
static thread_local Rk4BatchCache cache;
|
|
return cache;
|
|
}
|
|
|
|
inline void release_interp_stencil_cache(InterpStencilCacheEntry &entry)
|
|
{
|
|
release_buffer(entry.weights);
|
|
release_buffer(entry.indices);
|
|
release_buffer(entry.reflect);
|
|
entry.X = nullptr;
|
|
entry.Y = nullptr;
|
|
entry.Z = nullptr;
|
|
entry.px = nullptr;
|
|
entry.py = nullptr;
|
|
entry.pz = nullptr;
|
|
entry.nx = 0;
|
|
entry.ny = 0;
|
|
entry.nz = 0;
|
|
entry.num_points = 0;
|
|
entry.ordn = 0;
|
|
entry.symmetry = 0;
|
|
entry.valid = false;
|
|
}
|
|
|
|
inline void release_interp_batch_cache(InterpBatchCache &cache)
|
|
{
|
|
release_buffer(cache.out);
|
|
release_buffer(cache.soa);
|
|
release_buffer(cache.field_ptrs);
|
|
release_buffer(cache.error_flag);
|
|
for (size_t i = 0; i < cache.host_field_copies.size(); ++i)
|
|
release_buffer(cache.host_field_copies[i]);
|
|
cache.host_field_copies.clear();
|
|
release_interp_stencil_cache(cache.stencil_entry);
|
|
}
|
|
|
|
inline bool copy_to_device(CachedIntBuffer &dst, const int *src, size_t bytes)
|
|
{
|
|
if (!ensure_capacity(dst, bytes))
|
|
return false;
|
|
|
|
cudaError_t err = cudaMemcpy(dst.ptr, src, bytes, cudaMemcpyHostToDevice);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMemcpy(H2D int)", err);
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
inline bool copy_to_device(CachedBuffer &dst, const double *src, size_t bytes)
|
|
{
|
|
if (!ensure_capacity(dst, bytes))
|
|
return false;
|
|
|
|
cudaError_t err = cudaMemcpy(dst.ptr, src, bytes, cudaMemcpyHostToDevice);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMemcpy(H2D)", err);
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
inline bool copy_to_device_preferring_device(CachedBuffer &dst, const double *src, size_t bytes)
|
|
{
|
|
if (!ensure_capacity(dst, bytes))
|
|
return false;
|
|
|
|
const double *device_src = bssn_gpu_find_device_buffer(src);
|
|
cudaMemcpyKind kind = cudaMemcpyHostToDevice;
|
|
const void *copy_src = src;
|
|
if (device_src)
|
|
{
|
|
copy_src = device_src;
|
|
kind = cudaMemcpyDeviceToDevice;
|
|
}
|
|
|
|
cudaError_t err = cudaMemcpy(dst.ptr, copy_src, bytes, kind);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error(kind == cudaMemcpyDeviceToDevice ? "cudaMemcpy(D2D)" : "cudaMemcpy(H2D)", err);
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
inline bool sync_host_from_mapped_device(double *host_ptr, int count, const char *label)
|
|
{
|
|
const double *device_ptr = bssn_gpu_find_device_buffer(host_ptr);
|
|
if (!device_ptr)
|
|
return true;
|
|
|
|
bssn_gpu_prepare_host_buffer(host_ptr, count);
|
|
const size_t bytes = static_cast<size_t>(count) * sizeof(double);
|
|
cudaError_t err = cudaMemcpy(host_ptr, device_ptr, bytes, cudaMemcpyDeviceToHost);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error(label, err);
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
inline bool copy_region_to_padded_stage(CachedBuffer &dst,
|
|
const double *src,
|
|
const int src_shape[3],
|
|
int src_i0, int src_j0, int src_k0,
|
|
int region_nx, int region_ny, int region_nz,
|
|
int dst_sx, int dst_sy, int dst_sz,
|
|
int dst_x0, int dst_y0, int dst_z0,
|
|
const char *where)
|
|
{
|
|
if (!src || !where || region_nx <= 0 || region_ny <= 0 || region_nz <= 0 ||
|
|
dst_sx <= 0 || dst_sy <= 0 || dst_sz <= 0)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
const size_t dst_bytes =
|
|
static_cast<size_t>(dst_sx) * static_cast<size_t>(dst_sy) * static_cast<size_t>(dst_sz) * sizeof(double);
|
|
if (!ensure_capacity(dst, dst_bytes))
|
|
return false;
|
|
|
|
const double *device_src = bssn_gpu_find_device_buffer(src);
|
|
const void *copy_src = device_src ? static_cast<const void *>(device_src) : static_cast<const void *>(src);
|
|
cudaMemcpyKind kind = device_src ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice;
|
|
|
|
if (!device_src)
|
|
{
|
|
const int src_count = src_shape[0] * src_shape[1] * src_shape[2];
|
|
bssn_gpu_prepare_host_buffer(src, src_count);
|
|
}
|
|
|
|
cudaMemcpy3DParms parms = {};
|
|
parms.srcPtr = make_cudaPitchedPtr(const_cast<void *>(copy_src),
|
|
static_cast<size_t>(src_shape[0]) * sizeof(double),
|
|
static_cast<size_t>(src_shape[0]),
|
|
static_cast<size_t>(src_shape[1]));
|
|
parms.dstPtr = make_cudaPitchedPtr(dst.ptr,
|
|
static_cast<size_t>(dst_sx) * sizeof(double),
|
|
static_cast<size_t>(dst_sx),
|
|
static_cast<size_t>(dst_sy));
|
|
parms.srcPos = make_cudaPos(static_cast<size_t>(src_i0) * sizeof(double),
|
|
static_cast<size_t>(src_j0),
|
|
static_cast<size_t>(src_k0));
|
|
parms.dstPos = make_cudaPos(static_cast<size_t>(dst_x0) * sizeof(double),
|
|
static_cast<size_t>(dst_y0),
|
|
static_cast<size_t>(dst_z0));
|
|
parms.extent = make_cudaExtent(static_cast<size_t>(region_nx) * sizeof(double),
|
|
static_cast<size_t>(region_ny),
|
|
static_cast<size_t>(region_nz));
|
|
parms.kind = kind;
|
|
|
|
cudaError_t err = cudaMemcpy3D(&parms);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error(where, err);
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
inline int interp_idint_like_host(double x)
|
|
{
|
|
return static_cast<int>(x);
|
|
}
|
|
|
|
inline void lagrange_weights_ord6_host(double x, double *w)
|
|
{
|
|
static const double denom[6] = {-120.0, 24.0, -12.0, 12.0, -24.0, 120.0};
|
|
for (int i = 0; i < 6; ++i)
|
|
{
|
|
double num = 1.0;
|
|
for (int j = 0; j < 6; ++j)
|
|
{
|
|
if (j != i)
|
|
num *= (x - static_cast<double>(j));
|
|
}
|
|
w[i] = num / denom[i];
|
|
}
|
|
}
|
|
|
|
inline bool map_interp_index_host(int logical_idx, int n, int *mapped_idx, int *reflected)
|
|
{
|
|
int idx = logical_idx;
|
|
*reflected = 0;
|
|
if (idx < 0)
|
|
{
|
|
idx = -idx;
|
|
*reflected = 1;
|
|
}
|
|
if (idx < 0 || idx >= n)
|
|
return false;
|
|
*mapped_idx = idx;
|
|
return true;
|
|
}
|
|
|
|
inline bool compute_interp_window_host(double x, const double *coord, int n,
|
|
int ordn, int allow_reflect,
|
|
int *start_idx, double *cx)
|
|
{
|
|
const double dx = coord[1] - coord[0];
|
|
const int center = interp_idint_like_host((x - coord[0]) / dx + 0.4);
|
|
const int cmin = allow_reflect ? (-ordn / 2 + 1) : 0;
|
|
const int cmax = n - 1;
|
|
|
|
int begin = center - ordn / 2 + 1;
|
|
int end = begin + ordn - 1;
|
|
if (begin < cmin)
|
|
{
|
|
begin = cmin;
|
|
end = begin + ordn - 1;
|
|
}
|
|
if (end > cmax)
|
|
{
|
|
end = cmax;
|
|
begin = end + 1 - ordn;
|
|
}
|
|
|
|
if (begin >= 0)
|
|
*cx = (x - coord[begin]) / dx;
|
|
else
|
|
*cx = (x + coord[-begin]) / dx;
|
|
|
|
*start_idx = begin;
|
|
return (begin >= cmin && end <= cmax);
|
|
}
|
|
|
|
__global__ void enforce_ga_kernel(int n,
|
|
double *dxx, double *gxy, double *gxz,
|
|
double *dyy, double *gyz, double *dzz,
|
|
double *Axx, double *Axy, double *Axz,
|
|
double *Ayy, double *Ayz, double *Azz)
|
|
{
|
|
const double one = 1.0;
|
|
const double two = 2.0;
|
|
const double one_third = 1.0 / 3.0;
|
|
|
|
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
|
|
{
|
|
double lgxx = dxx[idx] + one;
|
|
double lgyy = dyy[idx] + one;
|
|
double lgzz = dzz[idx] + one;
|
|
double lgxy = gxy[idx];
|
|
double lgxz = gxz[idx];
|
|
double lgyz = gyz[idx];
|
|
|
|
double det = lgxx * lgyy * lgzz + lgxy * lgyz * lgxz + lgxz * lgxy * lgyz
|
|
- lgxz * lgyy * lgxz - lgxy * lgxy * lgzz - lgxx * lgyz * lgyz;
|
|
double scale = 1.0 / cbrt(det);
|
|
|
|
lgxx *= scale;
|
|
lgxy *= scale;
|
|
lgxz *= scale;
|
|
lgyy *= scale;
|
|
lgyz *= scale;
|
|
lgzz *= scale;
|
|
|
|
dxx[idx] = lgxx - one;
|
|
gxy[idx] = lgxy;
|
|
gxz[idx] = lgxz;
|
|
dyy[idx] = lgyy - one;
|
|
gyz[idx] = lgyz;
|
|
dzz[idx] = lgzz - one;
|
|
|
|
double gupxx = (lgyy * lgzz - lgyz * lgyz);
|
|
double gupxy = -(lgxy * lgzz - lgyz * lgxz);
|
|
double gupxz = (lgxy * lgyz - lgyy * lgxz);
|
|
double gupyy = (lgxx * lgzz - lgxz * lgxz);
|
|
double gupyz = -(lgxx * lgyz - lgxy * lgxz);
|
|
double gupzz = (lgxx * lgyy - lgxy * lgxy);
|
|
|
|
double trA = gupxx * Axx[idx] + gupyy * Ayy[idx] + gupzz * Azz[idx]
|
|
+ two * (gupxy * Axy[idx] + gupxz * Axz[idx] + gupyz * Ayz[idx]);
|
|
|
|
Axx[idx] -= one_third * lgxx * trA;
|
|
Axy[idx] -= one_third * lgxy * trA;
|
|
Axz[idx] -= one_third * lgxz * trA;
|
|
Ayy[idx] -= one_third * lgyy * trA;
|
|
Ayz[idx] -= one_third * lgyz * trA;
|
|
Azz[idx] -= one_third * lgzz * trA;
|
|
}
|
|
}
|
|
|
|
__device__ inline int index3(int i, int j, int k, int nx, int ny)
|
|
{
|
|
return i + j * nx + k * nx * ny;
|
|
}
|
|
|
|
__device__ inline double load_symmetry_ord1(const double *f, int i, int j, int k,
|
|
int nx, int ny, int nz,
|
|
const double soa[3], int symmetry)
|
|
{
|
|
double sign = 1.0;
|
|
if (i < 0)
|
|
{
|
|
i = -i - 1;
|
|
sign *= soa[0];
|
|
}
|
|
if (j < 0)
|
|
{
|
|
j = -j - 1;
|
|
sign *= soa[1];
|
|
}
|
|
if (k < 0)
|
|
{
|
|
k = -k - 1;
|
|
sign *= soa[2];
|
|
}
|
|
if (i >= nx) i = nx - 1;
|
|
if (j >= ny) j = ny - 1;
|
|
if (k >= nz) k = nz - 1;
|
|
(void)symmetry;
|
|
return sign * f[index3(i, j, k, nx, ny)];
|
|
}
|
|
|
|
__device__ inline double load_prolong_cell_padded(const double *f,
|
|
int i, int j, int k,
|
|
int sx, int sy)
|
|
{
|
|
return f[(i + 2) + (j + 2) * sx + (k + 2) * sx * sy];
|
|
}
|
|
|
|
__device__ inline double load_restrict_cell_padded(const double *f,
|
|
int i, int j, int k,
|
|
int sx, int sy)
|
|
{
|
|
return f[(i + 1) + (j + 1) * sx + (k + 1) * sx * sy];
|
|
}
|
|
|
|
__global__ void prolong3_cell_kernel(const double *funcc, double *funf,
|
|
int sxc, int syc,
|
|
int nxc, int nyc, int nzc,
|
|
int nxf, int nyf, int nzf,
|
|
int lbc0, int lbc1, int lbc2,
|
|
int lbf0, int lbf1, int lbf2,
|
|
int ibegin, int iend,
|
|
int jbegin, int jend,
|
|
int kbegin, int kend,
|
|
int *error_flag)
|
|
{
|
|
const double C1 = 7.7e1 / 8.192e3;
|
|
const double C2 = -6.93e2 / 8.192e3;
|
|
const double C3 = 3.465e3 / 4.096e3;
|
|
const double C4 = 1.155e3 / 4.096e3;
|
|
const double C5 = -4.95e2 / 8.192e3;
|
|
const double C6 = 6.3e1 / 8.192e3;
|
|
const double w_even[6] = {C1, C2, C3, C4, C5, C6};
|
|
const double w_odd[6] = {C6, C5, C4, C3, C2, C1};
|
|
|
|
const int n = nxf * nyf * nzf;
|
|
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
|
|
{
|
|
const int plane = nxf * nyf;
|
|
const int k = idx / plane;
|
|
const int rem = idx - k * plane;
|
|
const int j = rem / nxf;
|
|
const int i = rem - j * nxf;
|
|
|
|
if (i < ibegin || i > iend || j < jbegin || j > jend || k < kbegin || k > kend)
|
|
continue;
|
|
|
|
const int ii = i + lbf0;
|
|
const int jj = j + lbf1;
|
|
const int kk = k + lbf2;
|
|
|
|
const int ic0 = ii / 2 - lbc0 - 1;
|
|
const int jc0 = jj / 2 - lbc1 - 1;
|
|
const int kc0 = kk / 2 - lbc2 - 1;
|
|
|
|
const double *wx = (ii & 1) ? w_odd : w_even;
|
|
const double *wy = (jj & 1) ? w_odd : w_even;
|
|
const double *wz = (kk & 1) ? w_odd : w_even;
|
|
|
|
double value = 0.0;
|
|
for (int ox = 0; ox < 6; ++ox)
|
|
{
|
|
const int cx = ic0 + ox;
|
|
double yz = 0.0;
|
|
for (int oy = 0; oy < 6; ++oy)
|
|
{
|
|
const int cy = jc0 + oy;
|
|
double zsum = 0.0;
|
|
for (int oz = 0; oz < 6; ++oz)
|
|
{
|
|
const int cz = kc0 + oz;
|
|
if (cx < -2 || cx > nxc || cy < -2 || cy > nyc || cz < -2 || cz > nzc)
|
|
{
|
|
if (error_flag)
|
|
*error_flag = 1;
|
|
zsum = 0.0;
|
|
yz = 0.0;
|
|
value = 0.0;
|
|
goto prolong3_cell_kernel_next_point;
|
|
}
|
|
zsum += wz[oz] * load_prolong_cell_padded(funcc, cx, cy, cz, sxc, syc);
|
|
}
|
|
yz += wy[oy] * zsum;
|
|
}
|
|
value += wx[ox] * yz;
|
|
}
|
|
|
|
funf[idx] = value;
|
|
prolong3_cell_kernel_next_point:
|
|
;
|
|
}
|
|
}
|
|
|
|
__global__ void prolong3_fill_equatorial_reflect_kernel(double *funcc,
|
|
int sxc, int syc,
|
|
int nxc, int nyc,
|
|
double z_soa)
|
|
{
|
|
const int n = nxc * nyc * 3;
|
|
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
|
|
{
|
|
const int plane = nxc * nyc;
|
|
const int zoff = idx / plane;
|
|
const int rem = idx - zoff * plane;
|
|
const int j = rem / nxc;
|
|
const int i = rem - j * nxc;
|
|
|
|
const int li = i + 1;
|
|
const int lj = j + 1;
|
|
const int source_k = zoff + 1;
|
|
const int target_k = -zoff;
|
|
|
|
funcc[(li + 2) + (lj + 2) * sxc + (target_k + 2) * sxc * syc] =
|
|
z_soa * funcc[(li + 2) + (lj + 2) * sxc + (source_k + 2) * sxc * syc];
|
|
}
|
|
}
|
|
|
|
__global__ void restrict3_cell_kernel(const double *funf, double *func,
|
|
int sxf, int syf,
|
|
int nxf, int nyf, int nzf,
|
|
int nxc, int nyc, int nzc,
|
|
int lbc0, int lbc1, int lbc2,
|
|
int lbf0, int lbf1, int lbf2,
|
|
int ibegin, int iend,
|
|
int jbegin, int jend,
|
|
int kbegin, int kend,
|
|
int *error_flag)
|
|
{
|
|
const double C1 = 3.0 / 256.0;
|
|
const double C2 = -25.0 / 256.0;
|
|
const double C3 = 75.0 / 128.0;
|
|
const double w[6] = {C1, C2, C3, C3, C2, C1};
|
|
|
|
const int n = nxc * nyc * nzc;
|
|
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
|
|
{
|
|
const int plane = nxc * nyc;
|
|
const int k = idx / plane;
|
|
const int rem = idx - k * plane;
|
|
const int j = rem / nxc;
|
|
const int i = rem - j * nxc;
|
|
|
|
if (i < ibegin || i > iend || j < jbegin || j > jend || k < kbegin || k > kend)
|
|
continue;
|
|
|
|
const int fi = 2 * (i + lbc0) - lbf0;
|
|
const int fj = 2 * (j + lbc1) - lbf1;
|
|
const int fk = 2 * (k + lbc2) - lbf2;
|
|
|
|
double value = 0.0;
|
|
for (int ox = 0; ox < 6; ++ox)
|
|
{
|
|
const int fx = fi - 2 + ox;
|
|
double yz = 0.0;
|
|
for (int oy = 0; oy < 6; ++oy)
|
|
{
|
|
const int fy = fj - 2 + oy;
|
|
double zsum = 0.0;
|
|
for (int oz = 0; oz < 6; ++oz)
|
|
{
|
|
const int fz = fk - 2 + oz;
|
|
if (fx < -1 || fx > nxf || fy < -1 || fy > nyf || fz < -1 || fz > nzf)
|
|
{
|
|
if (error_flag)
|
|
*error_flag = 1;
|
|
zsum = 0.0;
|
|
yz = 0.0;
|
|
value = 0.0;
|
|
goto restrict3_cell_kernel_next_point;
|
|
}
|
|
zsum += w[oz] * load_restrict_cell_padded(funf, fx, fy, fz, sxf, syf);
|
|
}
|
|
yz += w[oy] * zsum;
|
|
}
|
|
value += w[ox] * yz;
|
|
}
|
|
|
|
func[idx] = value;
|
|
restrict3_cell_kernel_next_point:
|
|
;
|
|
}
|
|
}
|
|
|
|
__global__ void interp_points_ord6_kernel(int num_points, int num_var,
|
|
int nx, int ny,
|
|
const double *const *fields,
|
|
const double *soa_flat,
|
|
const int *stencil_idx,
|
|
const int *stencil_reflect,
|
|
const double *stencil_weights,
|
|
double *out,
|
|
int *error_flag)
|
|
{
|
|
const int total = num_points * num_var;
|
|
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total; idx += blockDim.x * gridDim.x)
|
|
{
|
|
const int point_id = idx / num_var;
|
|
const int var_id = idx - point_id * num_var;
|
|
const double *field = fields[var_id];
|
|
const double *soa = soa_flat + 3 * var_id;
|
|
const int *idxp = stencil_idx + point_id * 18;
|
|
const int *refp = stencil_reflect + point_id * 18;
|
|
const double *wp = stencil_weights + point_id * 18;
|
|
|
|
double value = 0.0;
|
|
#pragma unroll
|
|
for (int kz = 0; kz < 6; ++kz)
|
|
{
|
|
const int z_idx = idxp[12 + kz];
|
|
const double z_sign = refp[12 + kz] ? soa[2] : 1.0;
|
|
double yz_sum = 0.0;
|
|
#pragma unroll
|
|
for (int jy = 0; jy < 6; ++jy)
|
|
{
|
|
const int y_idx = idxp[6 + jy];
|
|
const double y_sign = refp[6 + jy] ? soa[1] : 1.0;
|
|
double x_sum = 0.0;
|
|
#pragma unroll
|
|
for (int ix = 0; ix < 6; ++ix)
|
|
{
|
|
const int x_idx = idxp[ix];
|
|
const double x_sign = refp[ix] ? soa[0] : 1.0;
|
|
const double sign = x_sign * y_sign * z_sign;
|
|
x_sum += wp[ix] * sign * field[index3(x_idx, y_idx, z_idx, nx, ny)];
|
|
}
|
|
yz_sum += wp[6 + jy] * x_sum;
|
|
}
|
|
value += wp[12 + kz] * yz_sum;
|
|
}
|
|
out[idx] = value;
|
|
}
|
|
(void)error_flag;
|
|
}
|
|
|
|
__global__ void rk4_kernel(int n, double dT,
|
|
const double *f0,
|
|
double *f1,
|
|
double *rhs,
|
|
int stage)
|
|
{
|
|
const double half = 0.5;
|
|
const double one_sixth = 1.0 / 6.0;
|
|
|
|
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
|
|
{
|
|
if (stage == 0)
|
|
{
|
|
f1[idx] = f0[idx] + half * dT * rhs[idx];
|
|
}
|
|
else if (stage == 1)
|
|
{
|
|
rhs[idx] += 2.0 * f1[idx];
|
|
f1[idx] = f0[idx] + half * dT * f1[idx];
|
|
}
|
|
else if (stage == 2)
|
|
{
|
|
rhs[idx] += 2.0 * f1[idx];
|
|
f1[idx] = f0[idx] + dT * f1[idx];
|
|
}
|
|
else
|
|
{
|
|
f1[idx] = f0[idx] + one_sixth * dT * (f1[idx] + rhs[idx]);
|
|
}
|
|
}
|
|
}
|
|
|
|
__global__ void lowerbound_kernel(int n, double *chi, double tinny)
|
|
{
|
|
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
|
|
{
|
|
if (chi[idx] < tinny)
|
|
chi[idx] = tinny;
|
|
}
|
|
}
|
|
|
|
__global__ void copy_physical_boundary_kernel(int nx, int ny, int nz,
|
|
int has_xmin, int has_ymin, int has_zmin,
|
|
int has_xmax, int has_ymax, int has_zmax,
|
|
const double *src, double *dst)
|
|
{
|
|
const int n = nx * ny * nz;
|
|
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
|
|
{
|
|
const int plane = nx * ny;
|
|
const int k = idx / plane;
|
|
const int rem = idx - k * plane;
|
|
const int j = rem / nx;
|
|
const int i = rem - j * nx;
|
|
|
|
if ((has_xmin && i == 0) || (has_xmax && i == nx - 1) ||
|
|
(has_ymin && j == 0) || (has_ymax && j == ny - 1) ||
|
|
(has_zmin && k == 0) || (has_zmax && k == nz - 1))
|
|
{
|
|
dst[idx] = src[idx];
|
|
}
|
|
}
|
|
}
|
|
|
|
__global__ void rk4_boundary_batch_kernel(int n, int nx, int ny, int nz,
|
|
int has_xmin, int has_ymin, int has_zmin,
|
|
int has_xmax, int has_ymax, int has_zmax,
|
|
int num_var, double dT,
|
|
const double *const *state0_list,
|
|
double *const *stage_list,
|
|
double *const *rhs_list,
|
|
int stage)
|
|
{
|
|
const double half = 0.5;
|
|
const double one_sixth = 1.0 / 6.0;
|
|
|
|
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
|
|
{
|
|
const int plane = nx * ny;
|
|
const int k = idx / plane;
|
|
const int rem = idx - k * plane;
|
|
const int j = rem / nx;
|
|
const int i = rem - j * nx;
|
|
const bool is_boundary =
|
|
(has_xmin && i == 0) || (has_xmax && i == nx - 1) ||
|
|
(has_ymin && j == 0) || (has_ymax && j == ny - 1) ||
|
|
(has_zmin && k == 0) || (has_zmax && k == nz - 1);
|
|
|
|
for (int v = 0; v < num_var; ++v)
|
|
{
|
|
const double *f0 = state0_list[v];
|
|
double *f1 = stage_list[v];
|
|
double *rhs = rhs_list[v];
|
|
|
|
double out;
|
|
if (stage == 0)
|
|
{
|
|
out = f0[idx] + half * dT * rhs[idx];
|
|
}
|
|
else if (stage == 1)
|
|
{
|
|
rhs[idx] += 2.0 * f1[idx];
|
|
out = f0[idx] + half * dT * f1[idx];
|
|
}
|
|
else if (stage == 2)
|
|
{
|
|
rhs[idx] += 2.0 * f1[idx];
|
|
out = f0[idx] + dT * f1[idx];
|
|
}
|
|
else
|
|
{
|
|
out = f0[idx] + one_sixth * dT * (f1[idx] + rhs[idx]);
|
|
}
|
|
|
|
if (is_boundary)
|
|
out = f0[idx];
|
|
f1[idx] = out;
|
|
}
|
|
}
|
|
}
|
|
|
|
__global__ void sommerfeld_bam_kernel(int nx, int ny, int nz,
|
|
const double *X, const double *Y, const double *Z,
|
|
double xmin, double ymin, double zmin,
|
|
double xmax, double ymax, double zmax,
|
|
int has_xmin, int has_ymin, int has_zmin,
|
|
int has_xmax, int has_ymax, int has_zmax,
|
|
int imin, int jmin, int kmin,
|
|
double propspeed,
|
|
const double *f0,
|
|
double *target,
|
|
double soa0, double soa1, double soa2,
|
|
int symmetry)
|
|
{
|
|
const double one = 1.0;
|
|
const double two = 2.0;
|
|
const int n = nx * ny * nz;
|
|
const double dX = X[1] - X[0];
|
|
const double dY = Y[1] - Y[0];
|
|
const double dZ = Z[1] - Z[0];
|
|
const double d2dx = one / (two * dX);
|
|
const double d2dy = one / (two * dY);
|
|
const double d2dz = one / (two * dZ);
|
|
const double soa[3] = {soa0, soa1, soa2};
|
|
|
|
for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x)
|
|
{
|
|
const int plane = nx * ny;
|
|
const int k = idx / plane;
|
|
const int rem = idx - k * plane;
|
|
const int j = rem / nx;
|
|
const int i = rem - j * nx;
|
|
|
|
const bool on_boundary =
|
|
(has_xmin && i == 0) || (has_xmax && i == nx - 1) ||
|
|
(has_ymin && j == 0) || (has_ymax && j == ny - 1) ||
|
|
(has_zmin && k == 0) || (has_zmax && k == nz - 1);
|
|
|
|
if (!on_boundary)
|
|
continue;
|
|
|
|
const double radius = sqrt(X[i] * X[i] + Y[j] * Y[j] + Z[k] * Z[k]);
|
|
if (radius == 0.0)
|
|
continue;
|
|
|
|
const double wx = propspeed * X[i] / radius;
|
|
const double wy = propspeed * Y[j] / radius;
|
|
const double wz = propspeed * Z[k] / radius;
|
|
|
|
double fx = 0.0;
|
|
double fy = 0.0;
|
|
double fz = 0.0;
|
|
|
|
if (wx > 0.0)
|
|
{
|
|
if (i - 2 >= imin)
|
|
fx = d2dx * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
|
|
- 4.0 * load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i - 2, j, k, nx, ny, nz, soa, symmetry));
|
|
else if (i - 1 >= imin)
|
|
fx = d2dx * (-load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry));
|
|
else
|
|
fx = d2dx * (-load_symmetry_ord1(f0, i + 2, j, k, nx, ny, nz, soa, symmetry)
|
|
+ 4.0 * load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry)
|
|
- 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
|
|
}
|
|
else if (wx < 0.0)
|
|
{
|
|
if (i + 2 <= nx - 1)
|
|
fx = d2dx * (-load_symmetry_ord1(f0, i + 2, j, k, nx, ny, nz, soa, symmetry)
|
|
+ 4.0 * load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry)
|
|
- 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
|
|
else if (i + 1 <= nx - 1)
|
|
fx = d2dx * (-load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i + 1, j, k, nx, ny, nz, soa, symmetry));
|
|
else
|
|
fx = d2dx * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
|
|
- 4.0 * load_symmetry_ord1(f0, i - 1, j, k, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i - 2, j, k, nx, ny, nz, soa, symmetry));
|
|
}
|
|
|
|
if (wy > 0.0)
|
|
{
|
|
if (j - 2 >= jmin)
|
|
fy = d2dy * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
|
|
- 4.0 * load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i, j - 2, k, nx, ny, nz, soa, symmetry));
|
|
else if (j - 1 >= jmin)
|
|
fy = d2dy * (-load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry));
|
|
else
|
|
fy = d2dy * (-load_symmetry_ord1(f0, i, j + 2, k, nx, ny, nz, soa, symmetry)
|
|
+ 4.0 * load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry)
|
|
- 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
|
|
}
|
|
else if (wy < 0.0)
|
|
{
|
|
if (j + 2 <= ny - 1)
|
|
fy = d2dy * (-load_symmetry_ord1(f0, i, j + 2, k, nx, ny, nz, soa, symmetry)
|
|
+ 4.0 * load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry)
|
|
- 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
|
|
else if (j + 1 <= ny - 1)
|
|
fy = d2dy * (-load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i, j + 1, k, nx, ny, nz, soa, symmetry));
|
|
else
|
|
fy = d2dy * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
|
|
- 4.0 * load_symmetry_ord1(f0, i, j - 1, k, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i, j - 2, k, nx, ny, nz, soa, symmetry));
|
|
}
|
|
|
|
if (wz > 0.0)
|
|
{
|
|
if (k - 2 >= kmin)
|
|
fz = d2dz * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
|
|
- 4.0 * load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i, j, k - 2, nx, ny, nz, soa, symmetry));
|
|
else if (k - 1 >= kmin)
|
|
fz = d2dz * (-load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry));
|
|
else
|
|
fz = d2dz * (-load_symmetry_ord1(f0, i, j, k + 2, nx, ny, nz, soa, symmetry)
|
|
+ 4.0 * load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry)
|
|
- 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
|
|
}
|
|
else if (wz < 0.0)
|
|
{
|
|
if (k + 2 <= nz - 1)
|
|
fz = d2dz * (-load_symmetry_ord1(f0, i, j, k + 2, nx, ny, nz, soa, symmetry)
|
|
+ 4.0 * load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry)
|
|
- 3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry));
|
|
else if (k + 1 <= nz - 1)
|
|
fz = d2dz * (-load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i, j, k + 1, nx, ny, nz, soa, symmetry));
|
|
else
|
|
fz = d2dz * (3.0 * load_symmetry_ord1(f0, i, j, k, nx, ny, nz, soa, symmetry)
|
|
- 4.0 * load_symmetry_ord1(f0, i, j, k - 1, nx, ny, nz, soa, symmetry)
|
|
+ load_symmetry_ord1(f0, i, j, k - 2, nx, ny, nz, soa, symmetry));
|
|
}
|
|
|
|
target[idx] = -propspeed * (fx * X[i] + fy * Y[j] + fz * Z[k] + f0[idx]) / radius;
|
|
}
|
|
}
|
|
|
|
inline bool launch_kernel(dim3 grid, dim3 block, const void *kernel, void **args)
|
|
{
|
|
cudaError_t err = cudaLaunchKernel(kernel, grid, block, args, 0, nullptr);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaLaunchKernel", err);
|
|
return false;
|
|
}
|
|
err = cudaPeekAtLastError();
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaPeekAtLastError", err);
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
} // namespace
|
|
|
|
int bssn_cuda_enforce_ga(int *ex,
|
|
double *dxx, double *gxy, double *gxz,
|
|
double *dyy, double *gyz, double *dzz,
|
|
double *Axx, double *Axy, double *Axz,
|
|
double *Ayy, double *Ayz, double *Azz)
|
|
{
|
|
struct EnforceGaCache
|
|
{
|
|
CachedBuffer dxx, gxy, gxz, dyy, gyz, dzz;
|
|
CachedBuffer Axx, Axy, Axz, Ayy, Ayz, Azz;
|
|
};
|
|
static thread_local EnforceGaCache cache;
|
|
|
|
int n = count_points(ex);
|
|
const size_t bytes = static_cast<size_t>(n) * sizeof(double);
|
|
dim3 block(256);
|
|
dim3 grid(div_up(n, static_cast<int>(block.x)));
|
|
|
|
bool ok = copy_to_device_preferring_device(cache.dxx, dxx, bytes) &&
|
|
copy_to_device_preferring_device(cache.gxy, gxy, bytes) &&
|
|
copy_to_device_preferring_device(cache.gxz, gxz, bytes) &&
|
|
copy_to_device_preferring_device(cache.dyy, dyy, bytes) &&
|
|
copy_to_device_preferring_device(cache.gyz, gyz, bytes) &&
|
|
copy_to_device_preferring_device(cache.dzz, dzz, bytes) &&
|
|
copy_to_device_preferring_device(cache.Axx, Axx, bytes) &&
|
|
copy_to_device_preferring_device(cache.Axy, Axy, bytes) &&
|
|
copy_to_device_preferring_device(cache.Axz, Axz, bytes) &&
|
|
copy_to_device_preferring_device(cache.Ayy, Ayy, bytes) &&
|
|
copy_to_device_preferring_device(cache.Ayz, Ayz, bytes) &&
|
|
copy_to_device_preferring_device(cache.Azz, Azz, bytes);
|
|
|
|
if (ok)
|
|
{
|
|
double *d_dxx = cache.dxx.ptr, *d_gxy = cache.gxy.ptr, *d_gxz = cache.gxz.ptr;
|
|
double *d_dyy = cache.dyy.ptr, *d_gyz = cache.gyz.ptr, *d_dzz = cache.dzz.ptr;
|
|
double *d_Axx = cache.Axx.ptr, *d_Axy = cache.Axy.ptr, *d_Axz = cache.Axz.ptr;
|
|
double *d_Ayy = cache.Ayy.ptr, *d_Ayz = cache.Ayz.ptr, *d_Azz = cache.Azz.ptr;
|
|
void *args[] = {&n, &d_dxx, &d_gxy, &d_gxz, &d_dyy, &d_gyz, &d_dzz,
|
|
&d_Axx, &d_Axy, &d_Axz, &d_Ayy, &d_Ayz, &d_Azz};
|
|
ok = launch_kernel(grid, block, (const void *)enforce_ga_kernel, args);
|
|
}
|
|
|
|
if (ok)
|
|
{
|
|
// The next GPU RHS stage consumes these fields immediately.
|
|
// Keep them device-resident and expose the mapping so gpu_rhs() can
|
|
// reuse them via D2D copies instead of forcing an intermediate D2H round-trip.
|
|
bssn_gpu_register_device_buffer(dxx, cache.dxx.ptr);
|
|
bssn_gpu_register_device_buffer(gxy, cache.gxy.ptr);
|
|
bssn_gpu_register_device_buffer(gxz, cache.gxz.ptr);
|
|
bssn_gpu_register_device_buffer(dyy, cache.dyy.ptr);
|
|
bssn_gpu_register_device_buffer(gyz, cache.gyz.ptr);
|
|
bssn_gpu_register_device_buffer(dzz, cache.dzz.ptr);
|
|
bssn_gpu_register_device_buffer(Axx, cache.Axx.ptr);
|
|
bssn_gpu_register_device_buffer(Axy, cache.Axy.ptr);
|
|
bssn_gpu_register_device_buffer(Axz, cache.Axz.ptr);
|
|
bssn_gpu_register_device_buffer(Ayy, cache.Ayy.ptr);
|
|
bssn_gpu_register_device_buffer(Ayz, cache.Ayz.ptr);
|
|
bssn_gpu_register_device_buffer(Azz, cache.Azz.ptr);
|
|
}
|
|
|
|
return ok ? 0 : 1;
|
|
}
|
|
|
|
int bssn_cuda_rk4_boundary_var(int *ex, double dT,
|
|
const double *X, const double *Y, const double *Z,
|
|
double xmin, double ymin, double zmin,
|
|
double xmax, double ymax, double zmax,
|
|
const double *state0,
|
|
const double *phi_field,
|
|
const double *lap_field,
|
|
const double *boundary_src,
|
|
double *stage_data,
|
|
double *rhs_accum,
|
|
double propspeed,
|
|
const double SoA[3],
|
|
int symmetry,
|
|
int lev,
|
|
int rk_stage,
|
|
bool force_host_boundary_fix,
|
|
bool download_to_host)
|
|
{
|
|
Rk4VarCache &cache = rk4_var_cache_map()[state0];
|
|
|
|
int nx = ex[0];
|
|
int ny = ex[1];
|
|
int nz = ex[2];
|
|
int n = count_points(ex);
|
|
const size_t bytes = static_cast<size_t>(n) * sizeof(double);
|
|
const size_t bytes_x = static_cast<size_t>(nx) * sizeof(double);
|
|
const size_t bytes_y = static_cast<size_t>(ny) * sizeof(double);
|
|
const size_t bytes_z = static_cast<size_t>(nz) * sizeof(double);
|
|
dim3 block(256);
|
|
dim3 grid(div_up(n, static_cast<int>(block.x)));
|
|
|
|
const bool need_bam_boundary = (lev == 0);
|
|
const bool need_coord_copy = need_bam_boundary;
|
|
const bool need_boundary_input = need_bam_boundary && (rk_stage != 0);
|
|
const bool need_stage_input = (rk_stage != 0);
|
|
bool ok = true;
|
|
if (need_coord_copy &&
|
|
(cache.host_X != X || cache.host_Y != Y || cache.host_Z != Z ||
|
|
cache.nx != nx || cache.ny != ny || cache.nz != nz))
|
|
{
|
|
ok = copy_to_device(cache.X, X, bytes_x) &&
|
|
copy_to_device(cache.Y, Y, bytes_y) &&
|
|
copy_to_device(cache.Z, Z, bytes_z);
|
|
if (ok)
|
|
{
|
|
cache.host_X = X;
|
|
cache.host_Y = Y;
|
|
cache.host_Z = Z;
|
|
cache.nx = nx;
|
|
cache.ny = ny;
|
|
cache.nz = nz;
|
|
}
|
|
}
|
|
|
|
const bool refresh_state0 =
|
|
(rk_stage == 0) || cache.host_state0 != state0 || cache.nx != nx || cache.ny != ny || cache.nz != nz;
|
|
const bool refresh_rhs =
|
|
(rk_stage == 0) || !cache.rhs_resident || cache.host_rhs != rhs_accum;
|
|
double *stage_ptr = nullptr;
|
|
const double *mapped_state0_ptr = refresh_state0 ? bssn_gpu_find_device_buffer(state0) : cache.state0.ptr;
|
|
const double *mapped_boundary_ptr = need_boundary_input ? bssn_gpu_find_device_buffer(boundary_src) : nullptr;
|
|
const double *mapped_stage_ptr = need_stage_input ? bssn_gpu_find_device_buffer(stage_data) : nullptr;
|
|
const double *mapped_rhs_ptr = refresh_rhs ? bssn_gpu_find_device_buffer(rhs_accum) : cache.rhs.ptr;
|
|
|
|
if (refresh_state0 && !mapped_state0_ptr)
|
|
bssn_gpu_prepare_host_buffer(state0, n);
|
|
if (need_boundary_input && !mapped_boundary_ptr)
|
|
bssn_gpu_prepare_host_buffer(boundary_src, n);
|
|
if (need_stage_input && !mapped_stage_ptr)
|
|
bssn_gpu_prepare_host_buffer(stage_data, n);
|
|
if (refresh_rhs && !mapped_rhs_ptr)
|
|
bssn_gpu_prepare_host_buffer(rhs_accum, n);
|
|
|
|
ok = ok &&
|
|
(!refresh_state0 || copy_to_device_preferring_device(cache.state0, state0, bytes)) &&
|
|
(!need_boundary_input || copy_to_device_preferring_device(cache.boundary, boundary_src, bytes)) &&
|
|
(!refresh_rhs || copy_to_device_preferring_device(cache.rhs, rhs_accum, bytes));
|
|
|
|
if (ok && need_stage_input)
|
|
{
|
|
if (mapped_stage_ptr)
|
|
{
|
|
stage_ptr = const_cast<double *>(mapped_stage_ptr);
|
|
}
|
|
else
|
|
{
|
|
ok = copy_to_device_preferring_device(cache.stage, stage_data, bytes);
|
|
stage_ptr = cache.stage.ptr;
|
|
}
|
|
}
|
|
else if (ok)
|
|
{
|
|
ok = ensure_capacity(cache.stage, bytes);
|
|
stage_ptr = cache.stage.ptr;
|
|
}
|
|
|
|
if (!ok)
|
|
return 1;
|
|
|
|
if (refresh_state0)
|
|
{
|
|
cache.host_state0 = state0;
|
|
bssn_gpu_register_device_buffer(state0, cache.state0.ptr);
|
|
}
|
|
if (refresh_rhs)
|
|
{
|
|
cache.host_rhs = rhs_accum;
|
|
cache.rhs_resident = true;
|
|
bssn_gpu_register_device_buffer(rhs_accum, cache.rhs.ptr);
|
|
}
|
|
|
|
double dX = X[1] - X[0];
|
|
double dY = Y[1] - Y[0];
|
|
double dZ = Z[1] - Z[0];
|
|
const int no_symm = 0, eq_symm = 1, octant = 2;
|
|
int has_xmax = (std::fabs(X[nx - 1] - xmax) < dX);
|
|
int has_ymax = (std::fabs(Y[ny - 1] - ymax) < dY);
|
|
int has_zmax = (std::fabs(Z[nz - 1] - zmax) < dZ);
|
|
int has_xmin = (std::fabs(X[0] - xmin) < dX) && !(symmetry == octant && std::fabs(xmin) < dX / 2.0);
|
|
int has_ymin = (std::fabs(Y[0] - ymin) < dY) && !(symmetry == octant && std::fabs(ymin) < dY / 2.0);
|
|
int has_zmin = (std::fabs(Z[0] - zmin) < dZ) && !(symmetry > no_symm && std::fabs(zmin) < dZ / 2.0);
|
|
double soa0 = SoA[0];
|
|
double soa1 = SoA[1];
|
|
double soa2 = SoA[2];
|
|
|
|
if (need_bam_boundary)
|
|
{
|
|
int imin = 1;
|
|
int jmin = 1;
|
|
int kmin = 1;
|
|
if (symmetry > no_symm && std::fabs(Z[0]) < dZ) kmin = 0;
|
|
if (symmetry > eq_symm && std::fabs(X[0]) < dX) imin = 0;
|
|
if (symmetry > eq_symm && std::fabs(Y[0]) < dY) jmin = 0;
|
|
|
|
double *d_X = cache.X.ptr, *d_Y = cache.Y.ptr, *d_Z = cache.Z.ptr;
|
|
double *d_state0 = cache.state0.ptr, *d_boundary = cache.boundary.ptr;
|
|
double *d_rhs = cache.rhs.ptr;
|
|
double *bam_target = (rk_stage == 0) ? d_rhs : stage_ptr;
|
|
const double *bam_source = (rk_stage == 0) ? d_state0 : d_boundary;
|
|
void *args[] = {&nx, &ny, &nz, &d_X, &d_Y, &d_Z,
|
|
&xmin, &ymin, &zmin, &xmax, &ymax, &zmax,
|
|
&has_xmin, &has_ymin, &has_zmin,
|
|
&has_xmax, &has_ymax, &has_zmax,
|
|
&imin, &jmin, &kmin, &propspeed,
|
|
&bam_source, &bam_target,
|
|
&soa0, &soa1, &soa2,
|
|
&symmetry};
|
|
ok = launch_kernel(grid, block, (const void *)sommerfeld_bam_kernel, args);
|
|
}
|
|
|
|
if (ok && (lev == 0 || !force_host_boundary_fix))
|
|
{
|
|
double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr, *d_rhs = cache.rhs.ptr;
|
|
void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage};
|
|
ok = launch_kernel(grid, block, (const void *)rk4_kernel, args);
|
|
}
|
|
|
|
if (ok && lev > 0 && !force_host_boundary_fix)
|
|
{
|
|
double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr;
|
|
void *args[] = {&nx, &ny, &nz,
|
|
&has_xmin, &has_ymin, &has_zmin,
|
|
&has_xmax, &has_ymax, &has_zmax,
|
|
&d_state0, &d_stage};
|
|
ok = launch_kernel(grid, block, (const void *)copy_physical_boundary_kernel, args);
|
|
}
|
|
|
|
if (ok && lev > 0 && force_host_boundary_fix)
|
|
{
|
|
double *host_state0 = const_cast<double *>(state0);
|
|
double *host_phi = const_cast<double *>(phi_field);
|
|
double *host_lap = const_cast<double *>(lap_field);
|
|
double *host_rhs = rhs_accum;
|
|
|
|
// state0/phi/lap are read-only during the current RK step, so the host copies
|
|
// remain valid even if cached device mirrors exist. Only the RHS accumulator
|
|
// is updated on device and must be synchronized back for the CPU fallback.
|
|
ok = sync_host_from_mapped_device(host_rhs, n, "cudaMemcpy(D2H) rhs_accum");
|
|
if (ok)
|
|
{
|
|
bssn_gpu_prepare_host_buffer(stage_data, n);
|
|
cudaError_t err = cudaMemcpy(stage_data, stage_ptr, bytes, cudaMemcpyDeviceToHost);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMemcpy(D2H) stage_data", err);
|
|
ok = false;
|
|
}
|
|
}
|
|
|
|
if (ok)
|
|
{
|
|
int rk_stage_host = rk_stage;
|
|
int cor = 1;
|
|
f_rungekutta4_rout(ex, dT, host_state0, stage_data, host_rhs, rk_stage_host);
|
|
f_sommerfeld_rout(ex,
|
|
const_cast<double *>(X), const_cast<double *>(Y), const_cast<double *>(Z),
|
|
xmin, ymin, zmin, xmax, ymax, zmax,
|
|
dT,
|
|
host_phi, host_lap,
|
|
host_state0, stage_data,
|
|
const_cast<double *>(SoA),
|
|
symmetry, cor);
|
|
|
|
cudaError_t err = cudaMemcpy(stage_ptr, stage_data, bytes, cudaMemcpyHostToDevice);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMemcpy(H2D) stage_data sommerfeld", err);
|
|
ok = false;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (ok)
|
|
{
|
|
bssn_gpu_register_device_buffer(stage_data, stage_ptr);
|
|
if (download_to_host)
|
|
{
|
|
cudaError_t err = cudaMemcpy(stage_data, stage_ptr, bytes, cudaMemcpyDeviceToHost);
|
|
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err);
|
|
ok = err == cudaSuccess;
|
|
}
|
|
}
|
|
|
|
return ok ? 0 : 1;
|
|
}
|
|
|
|
int bssn_cuda_rk4_boundary_batch(int *ex, double dT,
|
|
const double *X, const double *Y, const double *Z,
|
|
double xmin, double ymin, double zmin,
|
|
double xmax, double ymax, double zmax,
|
|
int symmetry,
|
|
const double *const *state0_list,
|
|
double *const *stage_data_list,
|
|
double *const *rhs_accum_list,
|
|
int num_var,
|
|
int rk_stage,
|
|
bool download_to_host)
|
|
{
|
|
if (!state0_list || !stage_data_list || !rhs_accum_list || num_var <= 0)
|
|
return 1;
|
|
|
|
const int nx = ex[0];
|
|
const int ny = ex[1];
|
|
const int nz = ex[2];
|
|
const int n = count_points(ex);
|
|
const size_t bytes = static_cast<size_t>(n) * sizeof(double);
|
|
const size_t ptr_bytes = static_cast<size_t>(num_var) * sizeof(double *);
|
|
dim3 block(256);
|
|
dim3 grid(div_up(n, static_cast<int>(block.x)));
|
|
|
|
std::vector<const double *> host_state0_ptrs(num_var);
|
|
std::vector<double *> host_stage_ptrs(num_var);
|
|
std::vector<double *> host_rhs_ptrs(num_var);
|
|
|
|
bool ok = true;
|
|
for (int v = 0; v < num_var && ok; ++v)
|
|
{
|
|
const double *state0 = state0_list[v];
|
|
double *stage_data = stage_data_list[v];
|
|
double *rhs_accum = rhs_accum_list[v];
|
|
Rk4VarCache &cache = rk4_var_cache_map()[state0];
|
|
|
|
const bool refresh_state0 =
|
|
(rk_stage == 0) || cache.host_state0 != state0 || cache.nx != nx || cache.ny != ny || cache.nz != nz;
|
|
const bool refresh_rhs =
|
|
(rk_stage == 0) || !cache.rhs_resident || cache.host_rhs != rhs_accum;
|
|
const bool need_stage_input = (rk_stage != 0);
|
|
double *stage_ptr = nullptr;
|
|
const double *mapped_state0_ptr = refresh_state0 ? bssn_gpu_find_device_buffer(state0) : cache.state0.ptr;
|
|
const double *mapped_stage_ptr = need_stage_input ? bssn_gpu_find_device_buffer(stage_data) : nullptr;
|
|
const double *mapped_rhs_ptr = refresh_rhs ? bssn_gpu_find_device_buffer(rhs_accum) : cache.rhs.ptr;
|
|
|
|
if (refresh_state0 && !mapped_state0_ptr)
|
|
bssn_gpu_prepare_host_buffer(state0, n);
|
|
if (need_stage_input && !mapped_stage_ptr)
|
|
bssn_gpu_prepare_host_buffer(stage_data, n);
|
|
if (refresh_rhs && !mapped_rhs_ptr)
|
|
bssn_gpu_prepare_host_buffer(rhs_accum, n);
|
|
|
|
ok = (!refresh_state0 || copy_to_device_preferring_device(cache.state0, state0, bytes)) &&
|
|
(!refresh_rhs || copy_to_device_preferring_device(cache.rhs, rhs_accum, bytes));
|
|
if (!ok)
|
|
break;
|
|
|
|
if (need_stage_input)
|
|
{
|
|
if (mapped_stage_ptr)
|
|
{
|
|
stage_ptr = const_cast<double *>(mapped_stage_ptr);
|
|
}
|
|
else
|
|
{
|
|
ok = copy_to_device_preferring_device(cache.stage, stage_data, bytes);
|
|
stage_ptr = cache.stage.ptr;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
ok = ensure_capacity(cache.stage, bytes);
|
|
stage_ptr = cache.stage.ptr;
|
|
}
|
|
if (!ok)
|
|
break;
|
|
|
|
if (refresh_state0)
|
|
{
|
|
cache.host_state0 = state0;
|
|
cache.nx = nx;
|
|
cache.ny = ny;
|
|
cache.nz = nz;
|
|
bssn_gpu_register_device_buffer(state0, cache.state0.ptr);
|
|
}
|
|
if (refresh_rhs)
|
|
{
|
|
cache.host_rhs = rhs_accum;
|
|
cache.rhs_resident = true;
|
|
bssn_gpu_register_device_buffer(rhs_accum, cache.rhs.ptr);
|
|
}
|
|
|
|
host_state0_ptrs[v] = cache.state0.ptr;
|
|
host_stage_ptrs[v] = stage_ptr;
|
|
host_rhs_ptrs[v] = cache.rhs.ptr;
|
|
}
|
|
|
|
if (!ok)
|
|
return 1;
|
|
|
|
Rk4BatchCache &batch_cache = rk4_batch_cache();
|
|
ok = ensure_capacity(batch_cache.state0_ptrs, ptr_bytes) &&
|
|
ensure_capacity(batch_cache.stage_ptrs, ptr_bytes) &&
|
|
ensure_capacity(batch_cache.rhs_ptrs, ptr_bytes);
|
|
if (!ok)
|
|
return 1;
|
|
|
|
cudaError_t err = cudaMemcpy(batch_cache.state0_ptrs.ptr, &host_state0_ptrs[0], ptr_bytes, cudaMemcpyHostToDevice);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMemcpy(H2D) batch state0 ptrs", err);
|
|
return 1;
|
|
}
|
|
err = cudaMemcpy(batch_cache.stage_ptrs.ptr, &host_stage_ptrs[0], ptr_bytes, cudaMemcpyHostToDevice);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMemcpy(H2D) batch stage ptrs", err);
|
|
return 1;
|
|
}
|
|
err = cudaMemcpy(batch_cache.rhs_ptrs.ptr, &host_rhs_ptrs[0], ptr_bytes, cudaMemcpyHostToDevice);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMemcpy(H2D) batch rhs ptrs", err);
|
|
return 1;
|
|
}
|
|
|
|
double dX = X[1] - X[0];
|
|
double dY = Y[1] - Y[0];
|
|
double dZ = Z[1] - Z[0];
|
|
const int no_symm = 0, octant = 2;
|
|
int has_xmax = (std::fabs(X[nx - 1] - xmax) < dX);
|
|
int has_ymax = (std::fabs(Y[ny - 1] - ymax) < dY);
|
|
int has_zmax = (std::fabs(Z[nz - 1] - zmax) < dZ);
|
|
int has_xmin = (std::fabs(X[0] - xmin) < dX) && !(symmetry == octant && std::fabs(xmin) < dX / 2.0);
|
|
int has_ymin = (std::fabs(Y[0] - ymin) < dY) && !(symmetry == octant && std::fabs(ymin) < dY / 2.0);
|
|
int has_zmin = (std::fabs(Z[0] - zmin) < dZ) && !(symmetry > no_symm && std::fabs(zmin) < dZ / 2.0);
|
|
|
|
int n_arg = n, nx_arg = nx, ny_arg = ny, nz_arg = nz;
|
|
int num_var_arg = num_var, rk_stage_arg = rk_stage;
|
|
void *args[] = {&n_arg, &nx_arg, &ny_arg, &nz_arg,
|
|
&has_xmin, &has_ymin, &has_zmin,
|
|
&has_xmax, &has_ymax, &has_zmax,
|
|
&num_var_arg, &dT,
|
|
&batch_cache.state0_ptrs.ptr,
|
|
&batch_cache.stage_ptrs.ptr,
|
|
&batch_cache.rhs_ptrs.ptr,
|
|
&rk_stage_arg};
|
|
ok = launch_kernel(grid, block, (const void *)rk4_boundary_batch_kernel, args);
|
|
|
|
if (!ok)
|
|
return 1;
|
|
|
|
for (int v = 0; v < num_var; ++v)
|
|
{
|
|
bssn_gpu_register_device_buffer(stage_data_list[v], host_stage_ptrs[v]);
|
|
if (download_to_host)
|
|
{
|
|
err = cudaMemcpy(stage_data_list[v], host_stage_ptrs[v], bytes, cudaMemcpyDeviceToHost);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMemcpy(D2H) batch stage_data", err);
|
|
return 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
void bssn_cuda_release_rk4_caches()
|
|
{
|
|
std::unordered_map<const double *, Rk4VarCache> &cache_map = rk4_var_cache_map();
|
|
for (std::unordered_map<const double *, Rk4VarCache>::iterator it = cache_map.begin();
|
|
it != cache_map.end(); ++it)
|
|
{
|
|
Rk4VarCache &cache = it->second;
|
|
release_buffer(cache.X);
|
|
release_buffer(cache.Y);
|
|
release_buffer(cache.Z);
|
|
release_buffer(cache.state0);
|
|
release_buffer(cache.boundary);
|
|
release_buffer(cache.stage);
|
|
release_buffer(cache.rhs);
|
|
}
|
|
cache_map.clear();
|
|
release_buffer(rk4_batch_cache().state0_ptrs);
|
|
release_buffer(rk4_batch_cache().stage_ptrs);
|
|
release_buffer(rk4_batch_cache().rhs_ptrs);
|
|
}
|
|
|
|
void bssn_cuda_release_interp_caches()
|
|
{
|
|
release_interp_batch_cache(interp_batch_cache());
|
|
}
|
|
|
|
int bssn_cuda_lowerbound(int *ex, double *chi, double tinny, bool download_to_host)
|
|
{
|
|
static thread_local CachedBuffer d_chi;
|
|
|
|
int n = count_points(ex);
|
|
const size_t bytes = static_cast<size_t>(n) * sizeof(double);
|
|
dim3 block(256);
|
|
dim3 grid(div_up(n, static_cast<int>(block.x)));
|
|
|
|
double *device_chi = nullptr;
|
|
const double *mapped = bssn_gpu_find_device_buffer(chi);
|
|
bool ok = true;
|
|
if (mapped)
|
|
{
|
|
device_chi = const_cast<double *>(mapped);
|
|
}
|
|
else
|
|
{
|
|
ok = copy_to_device_preferring_device(d_chi, chi, bytes);
|
|
device_chi = d_chi.ptr;
|
|
}
|
|
|
|
if (ok)
|
|
{
|
|
double *ptr = device_chi;
|
|
void *args[] = {&n, &ptr, &tinny};
|
|
ok = launch_kernel(grid, block, (const void *)lowerbound_kernel, args);
|
|
}
|
|
|
|
if (ok)
|
|
{
|
|
bssn_gpu_register_device_buffer(chi, device_chi);
|
|
if (download_to_host)
|
|
{
|
|
bssn_gpu_prepare_host_buffer(chi, n);
|
|
cudaError_t err = cudaMemcpy(chi, device_chi, bytes, cudaMemcpyDeviceToHost);
|
|
if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) chi", err);
|
|
ok = err == cudaSuccess;
|
|
}
|
|
}
|
|
return ok ? 0 : 1;
|
|
}
|
|
|
|
int bssn_cuda_download_buffer(int *ex, double *host_ptr)
|
|
{
|
|
const double *device_ptr = bssn_gpu_find_device_buffer(host_ptr);
|
|
if (!device_ptr)
|
|
return 1;
|
|
|
|
const int n = count_points(ex);
|
|
bssn_gpu_prepare_host_buffer(host_ptr, n);
|
|
const size_t bytes = static_cast<size_t>(n) * sizeof(double);
|
|
cudaError_t err = cudaMemcpy(host_ptr, device_ptr, bytes, cudaMemcpyDeviceToHost);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMemcpy(D2H) buffered download", err);
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
int bssn_cuda_interp_points_batch(const int *ex,
|
|
const double *X, const double *Y, const double *Z,
|
|
const double *const *fields,
|
|
const double *soa_flat,
|
|
int num_var,
|
|
const double *px, const double *py, const double *pz,
|
|
int num_points,
|
|
int ordn,
|
|
int symmetry,
|
|
double *out)
|
|
{
|
|
if (!ex || !X || !Y || !Z || !fields || !soa_flat || !px || !py || !pz || !out)
|
|
return 1;
|
|
if (num_var <= 0 || num_points <= 0 || ordn != 6)
|
|
return 1;
|
|
if (ex[0] < ordn || ex[1] < ordn || ex[2] < ordn)
|
|
return 1;
|
|
|
|
InterpBatchCache &cache = interp_batch_cache();
|
|
|
|
const int nx = ex[0];
|
|
const int ny = ex[1];
|
|
const int nz = ex[2];
|
|
const int field_points = count_points(ex);
|
|
const size_t field_bytes = static_cast<size_t>(field_points) * sizeof(double);
|
|
const size_t out_bytes = static_cast<size_t>(num_points) * static_cast<size_t>(num_var) * sizeof(double);
|
|
const size_t soa_bytes = static_cast<size_t>(3 * num_var) * sizeof(double);
|
|
const size_t ptr_bytes = static_cast<size_t>(num_var) * sizeof(double *);
|
|
const size_t point_stencil_doubles = static_cast<size_t>(num_points) * 18;
|
|
const size_t point_stencil_ints = static_cast<size_t>(num_points) * 18;
|
|
const size_t weights_bytes = point_stencil_doubles * sizeof(double);
|
|
const size_t indices_bytes = point_stencil_ints * sizeof(int);
|
|
|
|
bool ok = true;
|
|
InterpStencilCacheEntry &stencil_cache = cache.stencil_entry;
|
|
const bool stencil_match =
|
|
stencil_cache.valid &&
|
|
stencil_cache.X == X && stencil_cache.Y == Y && stencil_cache.Z == Z &&
|
|
stencil_cache.px == px && stencil_cache.py == py && stencil_cache.pz == pz &&
|
|
stencil_cache.nx == nx && stencil_cache.ny == ny && stencil_cache.nz == nz &&
|
|
stencil_cache.num_points == num_points && stencil_cache.ordn == ordn &&
|
|
stencil_cache.symmetry == symmetry;
|
|
|
|
if (!stencil_match)
|
|
{
|
|
release_interp_stencil_cache(stencil_cache);
|
|
stencil_cache.X = X;
|
|
stencil_cache.Y = Y;
|
|
stencil_cache.Z = Z;
|
|
stencil_cache.px = px;
|
|
stencil_cache.py = py;
|
|
stencil_cache.pz = pz;
|
|
stencil_cache.nx = nx;
|
|
stencil_cache.ny = ny;
|
|
stencil_cache.nz = nz;
|
|
stencil_cache.num_points = num_points;
|
|
stencil_cache.ordn = ordn;
|
|
stencil_cache.symmetry = symmetry;
|
|
stencil_cache.valid = true;
|
|
|
|
std::vector<double> host_weights(point_stencil_doubles);
|
|
std::vector<int> host_indices(point_stencil_ints);
|
|
std::vector<int> host_reflect(point_stencil_ints);
|
|
const double dx = X[1] - X[0];
|
|
const double dy = Y[1] - Y[0];
|
|
const double dz = Z[1] - Z[0];
|
|
const int allow_reflect_x = (symmetry == 2 && std::fabs(X[0]) < dx);
|
|
const int allow_reflect_y = (symmetry == 2 && std::fabs(Y[0]) < dy);
|
|
const int allow_reflect_z = (symmetry != 0 && std::fabs(Z[0]) < dz);
|
|
for (int p = 0; p < num_points; ++p)
|
|
{
|
|
int start_x = 0, start_y = 0, start_z = 0;
|
|
double cx = 0.0, cy = 0.0, cz = 0.0;
|
|
const bool ok_x = compute_interp_window_host(px[p], X, nx, ordn, allow_reflect_x, &start_x, &cx);
|
|
const bool ok_y = compute_interp_window_host(py[p], Y, ny, ordn, allow_reflect_y, &start_y, &cy);
|
|
const bool ok_z = compute_interp_window_host(pz[p], Z, nz, ordn, allow_reflect_z, &start_z, &cz);
|
|
if (!ok_x || !ok_y || !ok_z)
|
|
return 1;
|
|
|
|
lagrange_weights_ord6_host(cx, host_weights.data() + p * 18);
|
|
lagrange_weights_ord6_host(cy, host_weights.data() + p * 18 + 6);
|
|
lagrange_weights_ord6_host(cz, host_weights.data() + p * 18 + 12);
|
|
|
|
for (int i = 0; i < 6; ++i)
|
|
{
|
|
if (!map_interp_index_host(start_x + i, nx, &host_indices[p * 18 + i], &host_reflect[p * 18 + i]))
|
|
return 1;
|
|
if (!map_interp_index_host(start_y + i, ny, &host_indices[p * 18 + 6 + i], &host_reflect[p * 18 + 6 + i]))
|
|
return 1;
|
|
if (!map_interp_index_host(start_z + i, nz, &host_indices[p * 18 + 12 + i], &host_reflect[p * 18 + 12 + i]))
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
ok = ok &&
|
|
copy_to_device(stencil_cache.weights, host_weights.data(), weights_bytes) &&
|
|
copy_to_device(stencil_cache.indices, host_indices.data(), indices_bytes) &&
|
|
copy_to_device(stencil_cache.reflect, host_reflect.data(), indices_bytes);
|
|
if (!ok)
|
|
return 1;
|
|
}
|
|
|
|
ok = ok &&
|
|
copy_to_device(cache.soa, soa_flat, soa_bytes) &&
|
|
ensure_capacity(cache.out, out_bytes) &&
|
|
ensure_capacity(cache.field_ptrs, ptr_bytes) &&
|
|
ensure_capacity(cache.error_flag, sizeof(int));
|
|
if (!ok)
|
|
return 1;
|
|
|
|
if (static_cast<int>(cache.host_field_copies.size()) < num_var)
|
|
cache.host_field_copies.resize(num_var);
|
|
|
|
std::vector<const double *> device_fields(num_var);
|
|
for (int v = 0; v < num_var; ++v)
|
|
{
|
|
const double *device_field = bssn_gpu_find_device_buffer(fields[v]);
|
|
if (!device_field)
|
|
{
|
|
ok = copy_to_device(cache.host_field_copies[v], fields[v], field_bytes);
|
|
device_field = cache.host_field_copies[v].ptr;
|
|
}
|
|
device_fields[v] = device_field;
|
|
if (!ok || !device_fields[v])
|
|
return 1;
|
|
}
|
|
|
|
int zero = 0;
|
|
cudaError_t err = cudaMemcpy(cache.field_ptrs.ptr, device_fields.data(), ptr_bytes, cudaMemcpyHostToDevice);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMemcpy(H2D) field_ptrs", err);
|
|
return 1;
|
|
}
|
|
err = cudaMemcpy(cache.error_flag.ptr, &zero, sizeof(int), cudaMemcpyHostToDevice);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMemcpy(H2D) interp_error_flag", err);
|
|
return 1;
|
|
}
|
|
|
|
dim3 block(128);
|
|
dim3 grid(div_up(num_points * num_var, static_cast<int>(block.x)));
|
|
if (grid.x > 4096)
|
|
grid.x = 4096;
|
|
|
|
int nx_local = nx;
|
|
int ny_local = ny;
|
|
const double *dsoa = cache.soa.ptr;
|
|
const double *const *dfields = reinterpret_cast<const double *const *>(cache.field_ptrs.ptr);
|
|
const double *dweights = stencil_cache.weights.ptr;
|
|
const int *dindices = stencil_cache.indices.ptr;
|
|
const int *dreflect = stencil_cache.reflect.ptr;
|
|
double *dout = cache.out.ptr;
|
|
int *derror = cache.error_flag.ptr;
|
|
|
|
void *args[] = {
|
|
&num_points, &num_var,
|
|
&nx_local, &ny_local,
|
|
&dfields,
|
|
&dsoa,
|
|
&dindices,
|
|
&dreflect,
|
|
&dweights,
|
|
&dout,
|
|
&derror};
|
|
ok = launch_kernel(grid, block, (const void *)interp_points_ord6_kernel, args);
|
|
if (!ok)
|
|
return 1;
|
|
|
|
int error_flag = 0;
|
|
err = cudaMemcpy(&error_flag, cache.error_flag.ptr, sizeof(int), cudaMemcpyDeviceToHost);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMemcpy(D2H) interp_error_flag", err);
|
|
return 1;
|
|
}
|
|
if (error_flag != 0)
|
|
return 1;
|
|
|
|
err = cudaMemcpy(out, cache.out.ptr, out_bytes, cudaMemcpyDeviceToHost);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMemcpy(D2H) interp_out", err);
|
|
return 1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
int bssn_cuda_prolong3_pack(int wei,
|
|
const double *llbc, const double *uubc, const int *extc, const double *func,
|
|
const double *llbf, const double *uubf, const int *extf, double *funf,
|
|
const double *llbp, const double *uubp,
|
|
const double *SoA, int symmetry)
|
|
{
|
|
const int no_symm = 0;
|
|
const int eq_symm = 1;
|
|
|
|
if (wei != 3 || !llbc || !uubc || !extc || !func || !llbf || !uubf || !extf || !funf || !llbp || !uubp || !SoA)
|
|
return 1;
|
|
|
|
// Keep the CPU fallback for octant and any unrecognized symmetry modes.
|
|
if (symmetry != no_symm && symmetry != eq_symm)
|
|
return 1;
|
|
|
|
auto idint_like = [](double x) -> int {
|
|
return static_cast<int>(std::trunc(x));
|
|
};
|
|
|
|
double base[3];
|
|
double CD[3];
|
|
double FD[3];
|
|
int lbc[3], ubc[3], lbf[3], ubf[3], lbp[3], ubp[3], lbpc[3], ubpc[3];
|
|
for (int d = 0; d < 3; ++d)
|
|
{
|
|
CD[d] = (uubc[d] - llbc[d]) / static_cast<double>(extc[d]);
|
|
FD[d] = (uubf[d] - llbf[d]) / static_cast<double>(extf[d]);
|
|
if (std::fabs(CD[d] - 2.0 * FD[d]) > 1.0e-10)
|
|
return 1;
|
|
|
|
if (llbc[d] <= llbf[d])
|
|
{
|
|
base[d] = llbc[d];
|
|
}
|
|
else
|
|
{
|
|
int j = idint_like((llbc[d] - llbf[d]) / FD[d] + 0.4);
|
|
base[d] = (j / 2 * 2 == j) ? llbf[d] : (llbf[d] - CD[d] / 2.0);
|
|
}
|
|
|
|
lbf[d] = idint_like((llbf[d] - base[d]) / FD[d] + 0.4) + 1;
|
|
ubf[d] = idint_like((uubf[d] - base[d]) / FD[d] + 0.4);
|
|
lbc[d] = idint_like((llbc[d] - base[d]) / CD[d] + 0.4) + 1;
|
|
ubc[d] = idint_like((uubc[d] - base[d]) / CD[d] + 0.4);
|
|
lbp[d] = idint_like((llbp[d] - base[d]) / FD[d] + 0.4) + 1;
|
|
lbpc[d] = idint_like((llbp[d] - base[d]) / CD[d] + 0.4) + 1;
|
|
ubp[d] = idint_like((uubp[d] - base[d]) / FD[d] + 0.4);
|
|
ubpc[d] = idint_like((uubp[d] - base[d]) / CD[d] + 0.4);
|
|
(void)ubc[d];
|
|
(void)ubf[d];
|
|
}
|
|
|
|
const int imino = lbp[0] - lbf[0] + 1;
|
|
const int imaxo = ubp[0] - lbf[0] + 1;
|
|
const int jmino = lbp[1] - lbf[1] + 1;
|
|
const int jmaxo = ubp[1] - lbf[1] + 1;
|
|
const int kmino = lbp[2] - lbf[2] + 1;
|
|
const int kmaxo = ubp[2] - lbf[2] + 1;
|
|
|
|
const int imini = lbpc[0] - lbc[0] + 1;
|
|
const int imaxi = ubpc[0] - lbc[0] + 1;
|
|
const int jmini = lbpc[1] - lbc[1] + 1;
|
|
const int jmaxi = ubpc[1] - lbc[1] + 1;
|
|
const int kmini = lbpc[2] - lbc[2] + 1;
|
|
const int kmaxi = ubpc[2] - lbc[2] + 1;
|
|
|
|
if (imino < 1 || jmino < 1 || kmino < 1 ||
|
|
imini < 1 || jmini < 1 || kmini < 1 ||
|
|
imaxo > extf[0] || jmaxo > extf[1] || kmaxo > extf[2] ||
|
|
imaxi > extc[0] - 2 || jmaxi > extc[1] - 2 || kmaxi > extc[2] - 2)
|
|
{
|
|
return 1;
|
|
}
|
|
|
|
const bool thin_x = (extf[0] <= 2 * wei);
|
|
const bool thin_y = (extf[1] <= 2 * wei);
|
|
const bool thin_z = (extf[2] <= 2 * wei);
|
|
// Keep thin slabs on the CPU path until the dedicated slab kernel is
|
|
// fully validated. The first z-thin specialization still triggers illegal
|
|
// accesses on the common extf=(48,48,6) case.
|
|
if (thin_x || thin_y || thin_z)
|
|
return 1;
|
|
|
|
auto coarse_center_index = [](int fine_idx, int lbf_d, int lbc_d) -> int {
|
|
int ii = fine_idx + lbf_d - 1;
|
|
return ii / 2 - lbc_d + 1;
|
|
};
|
|
const int ic_min = coarse_center_index(imino, lbf[0], lbc[0]);
|
|
const int ic_max = coarse_center_index(imaxo, lbf[0], lbc[0]);
|
|
const int jc_min = coarse_center_index(jmino, lbf[1], lbc[1]);
|
|
const int jc_max = coarse_center_index(jmaxo, lbf[1], lbc[1]);
|
|
const int kc_min = coarse_center_index(kmino, lbf[2], lbc[2]);
|
|
const int kc_max = coarse_center_index(kmaxo, lbf[2], lbc[2]);
|
|
const bool need_equatorial_z_reflect = (symmetry == eq_symm && kc_min - 2 < 1);
|
|
|
|
// Match the active Fortran implementation's later stencil safety check.
|
|
if (ic_max + 3 > extc[0] || jc_max + 3 > extc[1] || kc_max + 3 > extc[2] ||
|
|
ic_min < 1 || jc_min < 1 || kc_min < 0)
|
|
{
|
|
return 1;
|
|
}
|
|
|
|
// Match the active Cell-centered Fortran fast path: only the z<0
|
|
// reflection needed by equatorial symmetry is handled here.
|
|
if (ic_min - 2 < 1 || jc_min - 2 < 1 || (kc_min - 2 < 1 && !need_equatorial_z_reflect))
|
|
return 1;
|
|
|
|
struct ProlongCache
|
|
{
|
|
CachedBuffer coarse;
|
|
CachedBuffer fine;
|
|
CachedIntBuffer error_flag;
|
|
};
|
|
static thread_local ProlongCache cache;
|
|
|
|
int nxf = extf[0], nyf = extf[1], nzf = extf[2];
|
|
const int stage_ix_lo = ic_min - 2;
|
|
const int stage_ix_hi = ic_max + 3;
|
|
const int stage_iy_lo = jc_min - 2;
|
|
const int stage_iy_hi = jc_max + 3;
|
|
const int stage_iz_lo = need_equatorial_z_reflect ? 1 : (kc_min - 2);
|
|
const int stage_iz_hi = kc_max + 3;
|
|
int nxc = stage_ix_hi - stage_ix_lo + 1;
|
|
int nyc = stage_iy_hi - stage_iy_lo + 1;
|
|
int nzc = stage_iz_hi - stage_iz_lo + 1;
|
|
int sxc = nxc + 3;
|
|
int syc = nyc + 3;
|
|
int szc = nzc + 3;
|
|
int fine_points = nxf * nyf * nzf;
|
|
const size_t fine_bytes = static_cast<size_t>(fine_points) * sizeof(double);
|
|
if (!copy_region_to_padded_stage(cache.coarse,
|
|
func,
|
|
extc,
|
|
stage_ix_lo - 1,
|
|
stage_iy_lo - 1,
|
|
stage_iz_lo - 1,
|
|
nxc,
|
|
nyc,
|
|
nzc,
|
|
sxc,
|
|
syc,
|
|
szc,
|
|
3,
|
|
3,
|
|
3,
|
|
"cudaMemcpy3D prolong3 stage"))
|
|
{
|
|
return 1;
|
|
}
|
|
|
|
double *d_func = cache.coarse.ptr;
|
|
|
|
if (need_equatorial_z_reflect)
|
|
{
|
|
dim3 reflect_block(256);
|
|
dim3 reflect_grid(div_up(nxc * nyc * 3, static_cast<int>(reflect_block.x)));
|
|
double z_soa = SoA[2];
|
|
void *reflect_args[] = {
|
|
&d_func,
|
|
&sxc,
|
|
&syc,
|
|
&nxc,
|
|
&nyc,
|
|
&z_soa};
|
|
if (!launch_kernel(reflect_grid, reflect_block,
|
|
(const void *)prolong3_fill_equatorial_reflect_kernel,
|
|
reflect_args))
|
|
{
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
if (!ensure_capacity(cache.fine, fine_bytes))
|
|
{
|
|
return 1;
|
|
}
|
|
if (!ensure_capacity(cache.error_flag, sizeof(int)))
|
|
{
|
|
return 1;
|
|
}
|
|
|
|
int zero = 0;
|
|
cudaError_t err = cudaMemcpy(cache.error_flag.ptr, &zero, sizeof(int), cudaMemcpyHostToDevice);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMemcpy(H2D) prolong3_error_flag", err);
|
|
return 1;
|
|
}
|
|
|
|
dim3 block(256);
|
|
dim3 grid(div_up(fine_points, static_cast<int>(block.x)));
|
|
double *d_funf = cache.fine.ptr;
|
|
int lbc_local[3] = {lbc[0] + stage_ix_lo - 1,
|
|
lbc[1] + stage_iy_lo - 1,
|
|
lbc[2] + stage_iz_lo - 1};
|
|
int ibegin = imino - 1, iend = imaxo - 1;
|
|
int jbegin = jmino - 1, jend = jmaxo - 1;
|
|
int kbegin = kmino - 1, kend = kmaxo - 1;
|
|
void *args[] = {&d_func, &d_funf,
|
|
&sxc, &syc,
|
|
&nxc, &nyc, &nzc,
|
|
&nxf, &nyf, &nzf,
|
|
&lbc_local[0], &lbc_local[1], &lbc_local[2],
|
|
&lbf[0], &lbf[1], &lbf[2],
|
|
&ibegin, &iend,
|
|
&jbegin, &jend,
|
|
&kbegin, &kend,
|
|
&cache.error_flag.ptr};
|
|
|
|
if (!launch_kernel(grid, block, (const void *)prolong3_cell_kernel, args))
|
|
return 1;
|
|
|
|
int host_error_flag = 0;
|
|
err = cudaMemcpy(&host_error_flag, cache.error_flag.ptr, sizeof(int), cudaMemcpyDeviceToHost);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMemcpy(D2H) prolong3_error_flag", err);
|
|
return 1;
|
|
}
|
|
if (host_error_flag)
|
|
return 1;
|
|
|
|
err = cudaMemcpy(funf, cache.fine.ptr, fine_bytes, cudaMemcpyDeviceToHost);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMemcpy(D2H) prolong3", err);
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
int bssn_cuda_restrict3_pack(int wei,
|
|
const double *llbc, const double *uubc, const int *extc, double *func,
|
|
const double *llbf, const double *uubf, const int *extf, const double *funf,
|
|
const double *llbr, const double *uubr,
|
|
const double *SoA, int symmetry)
|
|
{
|
|
const int no_symm = 0;
|
|
const int eq_symm = 1;
|
|
|
|
if (wei != 3 || !llbc || !uubc || !extc || !func || !llbf || !uubf || !extf || !funf || !llbr || !uubr || !SoA)
|
|
return 1;
|
|
|
|
if (symmetry != no_symm && symmetry != eq_symm)
|
|
return 1;
|
|
|
|
auto idint_like = [](double x) -> int {
|
|
return static_cast<int>(std::trunc(x));
|
|
};
|
|
|
|
double base[3];
|
|
double CD[3];
|
|
double FD[3];
|
|
int lbc[3], ubc[3], lbf[3], ubf[3], lbr_i[3], ubr_i[3], lbrf[3], ubrf[3];
|
|
for (int d = 0; d < 3; ++d)
|
|
{
|
|
CD[d] = (uubc[d] - llbc[d]) / static_cast<double>(extc[d]);
|
|
FD[d] = (uubf[d] - llbf[d]) / static_cast<double>(extf[d]);
|
|
if (std::fabs(CD[d] - 2.0 * FD[d]) > 1.0e-10)
|
|
return 1;
|
|
|
|
if (llbc[d] <= llbf[d])
|
|
{
|
|
base[d] = llbc[d];
|
|
}
|
|
else
|
|
{
|
|
int j = idint_like((llbc[d] - llbf[d]) / FD[d] + 0.4);
|
|
base[d] = (j / 2 * 2 == j) ? llbf[d] : (llbf[d] - CD[d] / 2.0);
|
|
}
|
|
|
|
lbf[d] = idint_like((llbf[d] - base[d]) / FD[d] + 0.4) + 1;
|
|
ubf[d] = idint_like((uubf[d] - base[d]) / FD[d] + 0.4);
|
|
lbc[d] = idint_like((llbc[d] - base[d]) / CD[d] + 0.4) + 1;
|
|
ubc[d] = idint_like((uubc[d] - base[d]) / CD[d] + 0.4);
|
|
lbr_i[d] = idint_like((llbr[d] - base[d]) / CD[d] + 0.4) + 1;
|
|
lbrf[d] = idint_like((llbr[d] - base[d]) / FD[d] + 0.4) + 1;
|
|
ubr_i[d] = idint_like((uubr[d] - base[d]) / CD[d] + 0.4);
|
|
ubrf[d] = idint_like((uubr[d] - base[d]) / FD[d] + 0.4);
|
|
(void)ubc[d];
|
|
(void)ubf[d];
|
|
}
|
|
|
|
const int imino = lbr_i[0] - lbc[0] + 1;
|
|
const int imaxo = ubr_i[0] - lbc[0] + 1;
|
|
const int jmino = lbr_i[1] - lbc[1] + 1;
|
|
const int jmaxo = ubr_i[1] - lbc[1] + 1;
|
|
const int kmino = lbr_i[2] - lbc[2] + 1;
|
|
const int kmaxo = ubr_i[2] - lbc[2] + 1;
|
|
|
|
const int imini = lbrf[0] - lbf[0] + 1;
|
|
const int imaxi = ubrf[0] - lbf[0] + 1;
|
|
const int jmini = lbrf[1] - lbf[1] + 1;
|
|
const int jmaxi = ubrf[1] - lbf[1] + 1;
|
|
const int kmini = lbrf[2] - lbf[2] + 1;
|
|
const int kmaxi = ubrf[2] - lbf[2] + 1;
|
|
|
|
if (imino < 1 || jmino < 1 || kmino < 1 ||
|
|
imini < 1 || jmini < 1 || kmini < 1 ||
|
|
imaxo > extc[0] || jmaxo > extc[1] || kmaxo > extc[2] ||
|
|
imaxi > extf[0] - 2 || jmaxi > extf[1] - 2 || kmaxi > extf[2] - 2)
|
|
{
|
|
return 1;
|
|
}
|
|
|
|
// Keep thin restrict slabs on the established CPU path for now.
|
|
if (extc[0] <= 2 * wei || extc[1] <= 2 * wei || extc[2] <= 2 * wei ||
|
|
extf[0] <= 2 * wei || extf[1] <= 2 * wei || extf[2] <= 2 * wei)
|
|
{
|
|
return 1;
|
|
}
|
|
|
|
const int fi_min = 2 * (imino + lbc[0] - 1) - lbf[0];
|
|
const int fi_max = 2 * (imaxo + lbc[0] - 1) - lbf[0];
|
|
const int fj_min = 2 * (jmino + lbc[1] - 1) - lbf[1];
|
|
const int fj_max = 2 * (jmaxo + lbc[1] - 1) - lbf[1];
|
|
const int fk_min = 2 * (kmino + lbc[2] - 1) - lbf[2];
|
|
const int fk_max = 2 * (kmaxo + lbc[2] - 1) - lbf[2];
|
|
|
|
const int ii_lo = fi_min - 2;
|
|
const int ii_hi = fi_max + 3;
|
|
const int jj_lo = fj_min - 2;
|
|
const int jj_hi = fj_max + 3;
|
|
const int kk_lo = fk_min - 2;
|
|
const int kk_hi = fk_max + 3;
|
|
|
|
if (ii_hi > extf[0] || jj_hi > extf[1] || kk_hi > extf[2] ||
|
|
ii_lo < -1 || jj_lo < -1 || kk_lo < -1)
|
|
{
|
|
return 1;
|
|
}
|
|
|
|
const bool need_equatorial_z_reflect = (symmetry == eq_symm && kk_lo < 1);
|
|
// Keep symmetry-reflected restrict windows on the CPU path for now.
|
|
if (ii_lo < 1 || jj_lo < 1 || kk_lo < 1)
|
|
return 1;
|
|
|
|
struct RestrictCache
|
|
{
|
|
CachedBuffer fine;
|
|
CachedBuffer coarse;
|
|
CachedIntBuffer error_flag;
|
|
};
|
|
static thread_local RestrictCache cache;
|
|
|
|
int nxc = extc[0];
|
|
int nyc = extc[1];
|
|
int nzc = extc[2];
|
|
const int stage_fx_lo = ii_lo;
|
|
const int stage_fx_hi = ii_hi;
|
|
const int stage_fy_lo = jj_lo;
|
|
const int stage_fy_hi = jj_hi;
|
|
const int stage_fz_lo = kk_lo;
|
|
const int stage_fz_hi = kk_hi;
|
|
int nxf = stage_fx_hi - stage_fx_lo + 1;
|
|
int nyf = stage_fy_hi - stage_fy_lo + 1;
|
|
int nzf = stage_fz_hi - stage_fz_lo + 1;
|
|
int sxf = nxf + 2;
|
|
int syf = nyf + 2;
|
|
int szf = nzf + 2;
|
|
int coarse_points = nxc * nyc * nzc;
|
|
const size_t coarse_bytes = static_cast<size_t>(coarse_points) * sizeof(double);
|
|
(void)need_equatorial_z_reflect;
|
|
|
|
if (!copy_region_to_padded_stage(cache.fine,
|
|
funf,
|
|
extf,
|
|
stage_fx_lo - 1,
|
|
stage_fy_lo - 1,
|
|
stage_fz_lo - 1,
|
|
nxf,
|
|
nyf,
|
|
nzf,
|
|
sxf,
|
|
syf,
|
|
szf,
|
|
2,
|
|
2,
|
|
2,
|
|
"cudaMemcpy3D restrict3 stage"))
|
|
return 1;
|
|
if (!ensure_capacity(cache.coarse, coarse_bytes))
|
|
return 1;
|
|
if (!ensure_capacity(cache.error_flag, sizeof(int)))
|
|
return 1;
|
|
|
|
int zero = 0;
|
|
cudaError_t err = cudaMemcpy(cache.error_flag.ptr, &zero, sizeof(int), cudaMemcpyHostToDevice);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMemcpy(H2D) restrict3_error_flag", err);
|
|
return 1;
|
|
}
|
|
|
|
dim3 block(256);
|
|
dim3 grid(div_up(coarse_points, static_cast<int>(block.x)));
|
|
double *d_funf = cache.fine.ptr;
|
|
double *d_func = cache.coarse.ptr;
|
|
int lbf_local[3] = {lbf[0] + stage_fx_lo - 1,
|
|
lbf[1] + stage_fy_lo - 1,
|
|
lbf[2] + stage_fz_lo - 1};
|
|
int ibegin = imino - 1, iend = imaxo - 1;
|
|
int jbegin = jmino - 1, jend = jmaxo - 1;
|
|
int kbegin = kmino - 1, kend = kmaxo - 1;
|
|
void *args[] = {&d_funf, &d_func,
|
|
&sxf, &syf,
|
|
&nxf, &nyf, &nzf,
|
|
&nxc, &nyc, &nzc,
|
|
&lbc[0], &lbc[1], &lbc[2],
|
|
&lbf_local[0], &lbf_local[1], &lbf_local[2],
|
|
&ibegin, &iend,
|
|
&jbegin, &jend,
|
|
&kbegin, &kend,
|
|
&cache.error_flag.ptr};
|
|
|
|
if (!launch_kernel(grid, block, (const void *)restrict3_cell_kernel, args))
|
|
return 1;
|
|
|
|
int host_error_flag = 0;
|
|
err = cudaMemcpy(&host_error_flag, cache.error_flag.ptr, sizeof(int), cudaMemcpyDeviceToHost);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMemcpy(D2H) restrict3_error_flag", err);
|
|
return 1;
|
|
}
|
|
if (host_error_flag)
|
|
return 1;
|
|
|
|
err = cudaMemcpy(func, cache.coarse.ptr, coarse_bytes, cudaMemcpyDeviceToHost);
|
|
if (err != cudaSuccess)
|
|
{
|
|
report_cuda_error("cudaMemcpy(D2H) restrict3", err);
|
|
return 1;
|
|
}
|
|
|
|
return 0;
|
|
}
|