Add optional CUDA surface interpolation
This commit is contained in:
@@ -11,12 +11,15 @@
|
||||
using namespace std;
|
||||
|
||||
#include "misc.h"
|
||||
#include "MPatch.h"
|
||||
#include "Parallel.h"
|
||||
#include "fmisc.h"
|
||||
#ifdef INTERP_LB_PROFILE
|
||||
#include "interp_lb_profile.h"
|
||||
#endif
|
||||
#include "MPatch.h"
|
||||
#include "Parallel.h"
|
||||
#include "fmisc.h"
|
||||
#if USE_CUDA_BSSN
|
||||
#include "bssn_rhs_cuda.h"
|
||||
#endif
|
||||
#ifdef INTERP_LB_PROFILE
|
||||
#include "interp_lb_profile.h"
|
||||
#endif
|
||||
|
||||
namespace
|
||||
{
|
||||
@@ -195,6 +198,17 @@ bool interp_fast_enabled()
|
||||
return enabled != 0;
|
||||
}
|
||||
|
||||
bool interp_gpu_enabled()
|
||||
{
|
||||
static int enabled = -1;
|
||||
if (enabled < 0)
|
||||
{
|
||||
const char *env = getenv("AMSS_INTERP_GPU");
|
||||
enabled = (env && atoi(env) != 0) ? 1 : 0;
|
||||
}
|
||||
return enabled != 0;
|
||||
}
|
||||
|
||||
bool interp_fast_compare_enabled()
|
||||
{
|
||||
static int enabled = -1;
|
||||
@@ -1113,6 +1127,101 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
}
|
||||
|
||||
// --- Interpolation phase (identical to original) ---
|
||||
#if USE_CUDA_BSSN
|
||||
const bool use_gpu_interp = interp_gpu_enabled() && use_surface_cache && num_var == 2 &&
|
||||
VarList && VarList->next && !VarList->next->next;
|
||||
#else
|
||||
const bool use_gpu_interp = false;
|
||||
#endif
|
||||
if (use_gpu_interp)
|
||||
{
|
||||
#if USE_CUDA_BSSN
|
||||
vector<vector<int> > local_points(block_index.views.size());
|
||||
for (int j = 0; j < NN; j++)
|
||||
{
|
||||
for (int i = 0; i < dim; i++)
|
||||
{
|
||||
if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
|
||||
{
|
||||
cout << "Patch::Interp_Points: point (";
|
||||
for (int k = 0; k < dim; k++)
|
||||
{
|
||||
cout << XX[k][j];
|
||||
if (k < dim - 1)
|
||||
cout << ",";
|
||||
else
|
||||
cout << ") is out of current Patch." << endl;
|
||||
}
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
}
|
||||
|
||||
CachedInterpPoint &cp = surface_cache->points[j];
|
||||
Block *BP = cp.bp;
|
||||
owner_rank[j] = cp.owner_rank;
|
||||
if (BP && myrank == BP->rank)
|
||||
{
|
||||
for (size_t bi = 0; bi < block_index.views.size(); bi++)
|
||||
{
|
||||
if (block_index.views[bi].bp == BP)
|
||||
{
|
||||
local_points[bi].push_back(j);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var *v0 = VarList->data;
|
||||
var *v1 = VarList->next->data;
|
||||
double soa6[6] = {
|
||||
v0->SoA[0], v0->SoA[1], v0->SoA[2],
|
||||
v1->SoA[0], v1->SoA[1], v1->SoA[2]};
|
||||
|
||||
for (size_t bi = 0; bi < local_points.size(); bi++)
|
||||
{
|
||||
const int count = int(local_points[bi].size());
|
||||
if (count <= 0)
|
||||
continue;
|
||||
|
||||
Block *BP = block_index.views[bi].bp;
|
||||
vector<double> px(count), py(count), pz(count), out(2 * count);
|
||||
for (int q = 0; q < count; q++)
|
||||
{
|
||||
const int j = local_points[bi][q];
|
||||
px[q] = XX[0][j];
|
||||
py[q] = XX[1][j];
|
||||
pz[q] = XX[2][j];
|
||||
}
|
||||
|
||||
const double dx = BP->X[0][1] - BP->X[0][0];
|
||||
const double dy = BP->X[1][1] - BP->X[1][0];
|
||||
const double dz = BP->X[2][1] - BP->X[2][0];
|
||||
const int ok = bssn_cuda_interp_host_two_fields(
|
||||
BP, BP->shape,
|
||||
BP->fgfs[v0->sgfn], BP->fgfs[v1->sgfn],
|
||||
BP->X[0][0], BP->X[1][0], BP->X[2][0],
|
||||
dx, dy, dz,
|
||||
&px[0], &py[0], &pz[0], count,
|
||||
ordn, Symmetry, soa6, &out[0]);
|
||||
if (ok != 0)
|
||||
{
|
||||
if (myrank == 0)
|
||||
cout << "Patch::Interp_Points: CUDA two-field interpolation failed" << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
|
||||
for (int q = 0; q < count; q++)
|
||||
{
|
||||
const int j = local_points[bi][q];
|
||||
Shellf[j * num_var] = out[2 * q];
|
||||
Shellf[j * num_var + 1] = out[2 * q + 1];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int j = 0; j < NN; j++)
|
||||
{
|
||||
double pox[dim];
|
||||
@@ -1183,6 +1292,7 @@ void Patch::Interp_Points(MyList<var> *VarList,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef INTERP_LB_PROFILE
|
||||
double t_interp_end = MPI_Wtime();
|
||||
|
||||
@@ -5765,6 +5765,65 @@ __global__ void kern_interp_state_point3(const double * __restrict__ mem,
|
||||
out[f] = value;
|
||||
}
|
||||
|
||||
__global__ void kern_interp_host_two_fields(const double * __restrict__ field0,
|
||||
const double * __restrict__ field1,
|
||||
const double * __restrict__ px,
|
||||
const double * __restrict__ py,
|
||||
const double * __restrict__ pz,
|
||||
double * __restrict__ out,
|
||||
int nx,
|
||||
int ny,
|
||||
int nz,
|
||||
int all,
|
||||
double x0,
|
||||
double y0,
|
||||
double z0,
|
||||
double dx,
|
||||
double dy,
|
||||
double dz,
|
||||
int npoints,
|
||||
int ordn,
|
||||
int symmetry,
|
||||
double soa00, double soa01, double soa02,
|
||||
double soa10, double soa11, double soa12)
|
||||
{
|
||||
const int p = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if (p >= npoints || ordn <= 0 || ordn > 8)
|
||||
return;
|
||||
|
||||
int ib, jb, kb;
|
||||
double tx, ty, tz;
|
||||
interp_axis_window(px[p], x0, dx, nx, ordn, symmetry, 0, ib, tx);
|
||||
interp_axis_window(py[p], y0, dy, ny, ordn, symmetry, 1, jb, ty);
|
||||
interp_axis_window(pz[p], z0, dz, nz, ordn, symmetry, 2, kb, tz);
|
||||
|
||||
double wx[8], wy[8], wz[8];
|
||||
for (int i = 0; i < ordn; ++i) {
|
||||
wx[i] = interp_lagrange_weight(i, tx, ordn);
|
||||
wy[i] = interp_lagrange_weight(i, ty, ordn);
|
||||
wz[i] = interp_lagrange_weight(i, tz, ordn);
|
||||
}
|
||||
|
||||
double v0 = 0.0;
|
||||
double v1 = 0.0;
|
||||
const double soa0[3] = {soa00, soa01, soa02};
|
||||
const double soa1[3] = {soa10, soa11, soa12};
|
||||
for (int k = 0; k < ordn; ++k) {
|
||||
for (int j = 0; j < ordn; ++j) {
|
||||
const double wyz = wy[j] * wz[k];
|
||||
for (int i = 0; i < ordn; ++i) {
|
||||
const double coeff = wx[i] * wyz;
|
||||
v0 += coeff * load_interp_value(field0, nx, ny, nz, all, 0,
|
||||
ib + i, jb + j, kb + k, soa0);
|
||||
v1 += coeff * load_interp_value(field1, nx, ny, nz, all, 0,
|
||||
ib + i, jb + j, kb + k, soa1);
|
||||
}
|
||||
}
|
||||
}
|
||||
out[2 * p] = v0;
|
||||
out[2 * p + 1] = v1;
|
||||
}
|
||||
|
||||
__global__ void kern_pack_state_region_batch(const double * __restrict__ src_mem,
|
||||
double * __restrict__ dst,
|
||||
int nx, int ny,
|
||||
@@ -7076,6 +7135,82 @@ int bssn_cuda_interp_state_point3(void *block_tag,
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
int bssn_cuda_interp_host_two_fields(void *block_tag,
|
||||
int *ex,
|
||||
double *field0,
|
||||
double *field1,
|
||||
double x0,
|
||||
double y0,
|
||||
double z0,
|
||||
double dx,
|
||||
double dy,
|
||||
double dz,
|
||||
const double *px,
|
||||
const double *py,
|
||||
const double *pz,
|
||||
int npoints,
|
||||
int ordn,
|
||||
int symmetry,
|
||||
const double *soa6,
|
||||
double *out_interleaved)
|
||||
{
|
||||
(void)block_tag;
|
||||
init_gpu_dispatch();
|
||||
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
|
||||
if (!ex || !field0 || !field1 || !px || !py || !pz || !soa6 ||
|
||||
!out_interleaved || npoints <= 0)
|
||||
return 1;
|
||||
if (ex[0] <= 0 || ex[1] <= 0 || ex[2] <= 0 ||
|
||||
ordn <= 0 || ordn > 8 ||
|
||||
ex[0] < ordn || ex[1] < ordn || ex[2] < ordn)
|
||||
return 1;
|
||||
|
||||
const int all = ex[0] * ex[1] * ex[2];
|
||||
const size_t field_bytes = (size_t)all * sizeof(double);
|
||||
const size_t point_bytes = (size_t)npoints * sizeof(double);
|
||||
const size_t out_bytes = (size_t)2 * npoints * sizeof(double);
|
||||
|
||||
double *d_field0 = nullptr;
|
||||
double *d_field1 = nullptr;
|
||||
double *d_px = nullptr;
|
||||
double *d_py = nullptr;
|
||||
double *d_pz = nullptr;
|
||||
double *d_out = nullptr;
|
||||
CUDA_CHECK(cudaMalloc(&d_field0, field_bytes));
|
||||
CUDA_CHECK(cudaMalloc(&d_field1, field_bytes));
|
||||
CUDA_CHECK(cudaMalloc(&d_px, point_bytes));
|
||||
CUDA_CHECK(cudaMalloc(&d_py, point_bytes));
|
||||
CUDA_CHECK(cudaMalloc(&d_pz, point_bytes));
|
||||
CUDA_CHECK(cudaMalloc(&d_out, out_bytes));
|
||||
|
||||
CUDA_CHECK(cudaMemcpy(d_field0, field0, field_bytes, cudaMemcpyHostToDevice));
|
||||
CUDA_CHECK(cudaMemcpy(d_field1, field1, field_bytes, cudaMemcpyHostToDevice));
|
||||
CUDA_CHECK(cudaMemcpy(d_px, px, point_bytes, cudaMemcpyHostToDevice));
|
||||
CUDA_CHECK(cudaMemcpy(d_py, py, point_bytes, cudaMemcpyHostToDevice));
|
||||
CUDA_CHECK(cudaMemcpy(d_pz, pz, point_bytes, cudaMemcpyHostToDevice));
|
||||
|
||||
const int threads = 256;
|
||||
const int blocks = (npoints + threads - 1) / threads;
|
||||
kern_interp_host_two_fields<<<blocks, threads>>>(
|
||||
d_field0, d_field1, d_px, d_py, d_pz, d_out,
|
||||
ex[0], ex[1], ex[2], all,
|
||||
x0, y0, z0, dx, dy, dz,
|
||||
npoints, ordn, symmetry,
|
||||
soa6[0], soa6[1], soa6[2],
|
||||
soa6[3], soa6[4], soa6[5]);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
CUDA_CHECK(cudaMemcpy(out_interleaved, d_out, out_bytes, cudaMemcpyDeviceToHost));
|
||||
|
||||
cudaFree(d_out);
|
||||
cudaFree(d_pz);
|
||||
cudaFree(d_py);
|
||||
cudaFree(d_px);
|
||||
cudaFree(d_field1);
|
||||
cudaFree(d_field0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern "C"
|
||||
int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
|
||||
int state_index,
|
||||
|
||||
@@ -102,6 +102,25 @@ int bssn_cuda_interp_state_point3(void *block_tag,
|
||||
const double *soa3,
|
||||
double *out3);
|
||||
|
||||
int bssn_cuda_interp_host_two_fields(void *block_tag,
|
||||
int *ex,
|
||||
double *field0,
|
||||
double *field1,
|
||||
double x0,
|
||||
double y0,
|
||||
double z0,
|
||||
double dx,
|
||||
double dy,
|
||||
double dz,
|
||||
const double *px,
|
||||
const double *py,
|
||||
const double *pz,
|
||||
int npoints,
|
||||
int ordn,
|
||||
int symmetry,
|
||||
const double *soa6,
|
||||
double *out_interleaved);
|
||||
|
||||
int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
|
||||
int state_index,
|
||||
double *host_buffer,
|
||||
|
||||
Reference in New Issue
Block a user