Add optional CUDA surface interpolation

This commit is contained in:
2026-04-30 19:21:19 +08:00
parent 6835608f92
commit be9033f449
3 changed files with 270 additions and 6 deletions

View File

@@ -14,6 +14,9 @@ using namespace std;
#include "MPatch.h"
#include "Parallel.h"
#include "fmisc.h"
#if USE_CUDA_BSSN
#include "bssn_rhs_cuda.h"
#endif
#ifdef INTERP_LB_PROFILE
#include "interp_lb_profile.h"
#endif
@@ -195,6 +198,17 @@ bool interp_fast_enabled()
return enabled != 0;
}
bool interp_gpu_enabled()
{
static int enabled = -1;
if (enabled < 0)
{
const char *env = getenv("AMSS_INTERP_GPU");
enabled = (env && atoi(env) != 0) ? 1 : 0;
}
return enabled != 0;
}
bool interp_fast_compare_enabled()
{
static int enabled = -1;
@@ -1113,6 +1127,101 @@ void Patch::Interp_Points(MyList<var> *VarList,
}
// --- Interpolation phase (identical to original) ---
#if USE_CUDA_BSSN
const bool use_gpu_interp = interp_gpu_enabled() && use_surface_cache && num_var == 2 &&
VarList && VarList->next && !VarList->next->next;
#else
const bool use_gpu_interp = false;
#endif
if (use_gpu_interp)
{
#if USE_CUDA_BSSN
vector<vector<int> > local_points(block_index.views.size());
for (int j = 0; j < NN; j++)
{
for (int i = 0; i < dim; i++)
{
if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i]))
{
cout << "Patch::Interp_Points: point (";
for (int k = 0; k < dim; k++)
{
cout << XX[k][j];
if (k < dim - 1)
cout << ",";
else
cout << ") is out of current Patch." << endl;
}
MPI_Abort(MPI_COMM_WORLD, 1);
}
}
CachedInterpPoint &cp = surface_cache->points[j];
Block *BP = cp.bp;
owner_rank[j] = cp.owner_rank;
if (BP && myrank == BP->rank)
{
for (size_t bi = 0; bi < block_index.views.size(); bi++)
{
if (block_index.views[bi].bp == BP)
{
local_points[bi].push_back(j);
break;
}
}
}
}
var *v0 = VarList->data;
var *v1 = VarList->next->data;
double soa6[6] = {
v0->SoA[0], v0->SoA[1], v0->SoA[2],
v1->SoA[0], v1->SoA[1], v1->SoA[2]};
for (size_t bi = 0; bi < local_points.size(); bi++)
{
const int count = int(local_points[bi].size());
if (count <= 0)
continue;
Block *BP = block_index.views[bi].bp;
vector<double> px(count), py(count), pz(count), out(2 * count);
for (int q = 0; q < count; q++)
{
const int j = local_points[bi][q];
px[q] = XX[0][j];
py[q] = XX[1][j];
pz[q] = XX[2][j];
}
const double dx = BP->X[0][1] - BP->X[0][0];
const double dy = BP->X[1][1] - BP->X[1][0];
const double dz = BP->X[2][1] - BP->X[2][0];
const int ok = bssn_cuda_interp_host_two_fields(
BP, BP->shape,
BP->fgfs[v0->sgfn], BP->fgfs[v1->sgfn],
BP->X[0][0], BP->X[1][0], BP->X[2][0],
dx, dy, dz,
&px[0], &py[0], &pz[0], count,
ordn, Symmetry, soa6, &out[0]);
if (ok != 0)
{
if (myrank == 0)
cout << "Patch::Interp_Points: CUDA two-field interpolation failed" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
for (int q = 0; q < count; q++)
{
const int j = local_points[bi][q];
Shellf[j * num_var] = out[2 * q];
Shellf[j * num_var + 1] = out[2 * q + 1];
}
}
#endif
}
else
{
for (int j = 0; j < NN; j++)
{
double pox[dim];
@@ -1183,6 +1292,7 @@ void Patch::Interp_Points(MyList<var> *VarList,
}
}
}
}
#ifdef INTERP_LB_PROFILE
double t_interp_end = MPI_Wtime();

View File

@@ -5765,6 +5765,65 @@ __global__ void kern_interp_state_point3(const double * __restrict__ mem,
out[f] = value;
}
__global__ void kern_interp_host_two_fields(const double * __restrict__ field0,
const double * __restrict__ field1,
const double * __restrict__ px,
const double * __restrict__ py,
const double * __restrict__ pz,
double * __restrict__ out,
int nx,
int ny,
int nz,
int all,
double x0,
double y0,
double z0,
double dx,
double dy,
double dz,
int npoints,
int ordn,
int symmetry,
double soa00, double soa01, double soa02,
double soa10, double soa11, double soa12)
{
const int p = blockIdx.x * blockDim.x + threadIdx.x;
if (p >= npoints || ordn <= 0 || ordn > 8)
return;
int ib, jb, kb;
double tx, ty, tz;
interp_axis_window(px[p], x0, dx, nx, ordn, symmetry, 0, ib, tx);
interp_axis_window(py[p], y0, dy, ny, ordn, symmetry, 1, jb, ty);
interp_axis_window(pz[p], z0, dz, nz, ordn, symmetry, 2, kb, tz);
double wx[8], wy[8], wz[8];
for (int i = 0; i < ordn; ++i) {
wx[i] = interp_lagrange_weight(i, tx, ordn);
wy[i] = interp_lagrange_weight(i, ty, ordn);
wz[i] = interp_lagrange_weight(i, tz, ordn);
}
double v0 = 0.0;
double v1 = 0.0;
const double soa0[3] = {soa00, soa01, soa02};
const double soa1[3] = {soa10, soa11, soa12};
for (int k = 0; k < ordn; ++k) {
for (int j = 0; j < ordn; ++j) {
const double wyz = wy[j] * wz[k];
for (int i = 0; i < ordn; ++i) {
const double coeff = wx[i] * wyz;
v0 += coeff * load_interp_value(field0, nx, ny, nz, all, 0,
ib + i, jb + j, kb + k, soa0);
v1 += coeff * load_interp_value(field1, nx, ny, nz, all, 0,
ib + i, jb + j, kb + k, soa1);
}
}
}
out[2 * p] = v0;
out[2 * p + 1] = v1;
}
__global__ void kern_pack_state_region_batch(const double * __restrict__ src_mem,
double * __restrict__ dst,
int nx, int ny,
@@ -7076,6 +7135,82 @@ int bssn_cuda_interp_state_point3(void *block_tag,
return 0;
}
extern "C"
int bssn_cuda_interp_host_two_fields(void *block_tag,
int *ex,
double *field0,
double *field1,
double x0,
double y0,
double z0,
double dx,
double dy,
double dz,
const double *px,
const double *py,
const double *pz,
int npoints,
int ordn,
int symmetry,
const double *soa6,
double *out_interleaved)
{
(void)block_tag;
init_gpu_dispatch();
CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
if (!ex || !field0 || !field1 || !px || !py || !pz || !soa6 ||
!out_interleaved || npoints <= 0)
return 1;
if (ex[0] <= 0 || ex[1] <= 0 || ex[2] <= 0 ||
ordn <= 0 || ordn > 8 ||
ex[0] < ordn || ex[1] < ordn || ex[2] < ordn)
return 1;
const int all = ex[0] * ex[1] * ex[2];
const size_t field_bytes = (size_t)all * sizeof(double);
const size_t point_bytes = (size_t)npoints * sizeof(double);
const size_t out_bytes = (size_t)2 * npoints * sizeof(double);
double *d_field0 = nullptr;
double *d_field1 = nullptr;
double *d_px = nullptr;
double *d_py = nullptr;
double *d_pz = nullptr;
double *d_out = nullptr;
CUDA_CHECK(cudaMalloc(&d_field0, field_bytes));
CUDA_CHECK(cudaMalloc(&d_field1, field_bytes));
CUDA_CHECK(cudaMalloc(&d_px, point_bytes));
CUDA_CHECK(cudaMalloc(&d_py, point_bytes));
CUDA_CHECK(cudaMalloc(&d_pz, point_bytes));
CUDA_CHECK(cudaMalloc(&d_out, out_bytes));
CUDA_CHECK(cudaMemcpy(d_field0, field0, field_bytes, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(d_field1, field1, field_bytes, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(d_px, px, point_bytes, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(d_py, py, point_bytes, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(d_pz, pz, point_bytes, cudaMemcpyHostToDevice));
const int threads = 256;
const int blocks = (npoints + threads - 1) / threads;
kern_interp_host_two_fields<<<blocks, threads>>>(
d_field0, d_field1, d_px, d_py, d_pz, d_out,
ex[0], ex[1], ex[2], all,
x0, y0, z0, dx, dy, dz,
npoints, ordn, symmetry,
soa6[0], soa6[1], soa6[2],
soa6[3], soa6[4], soa6[5]);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaMemcpy(out_interleaved, d_out, out_bytes, cudaMemcpyDeviceToHost));
cudaFree(d_out);
cudaFree(d_pz);
cudaFree(d_py);
cudaFree(d_px);
cudaFree(d_field1);
cudaFree(d_field0);
return 0;
}
extern "C"
int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
int state_index,

View File

@@ -102,6 +102,25 @@ int bssn_cuda_interp_state_point3(void *block_tag,
const double *soa3,
double *out3);
int bssn_cuda_interp_host_two_fields(void *block_tag,
int *ex,
double *field0,
double *field1,
double x0,
double y0,
double z0,
double dx,
double dy,
double dz,
const double *px,
const double *py,
const double *pz,
int npoints,
int ordn,
int symmetry,
const double *soa6,
double *out_interleaved);
int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
int state_index,
double *host_buffer,