diff --git a/AMSS_NCKU_source/MPatch.C b/AMSS_NCKU_source/MPatch.C index 81ff61f..05e5b65 100644 --- a/AMSS_NCKU_source/MPatch.C +++ b/AMSS_NCKU_source/MPatch.C @@ -11,12 +11,15 @@ using namespace std; #include "misc.h" -#include "MPatch.h" -#include "Parallel.h" -#include "fmisc.h" -#ifdef INTERP_LB_PROFILE -#include "interp_lb_profile.h" -#endif +#include "MPatch.h" +#include "Parallel.h" +#include "fmisc.h" +#if USE_CUDA_BSSN +#include "bssn_rhs_cuda.h" +#endif +#ifdef INTERP_LB_PROFILE +#include "interp_lb_profile.h" +#endif namespace { @@ -195,6 +198,17 @@ bool interp_fast_enabled() return enabled != 0; } +bool interp_gpu_enabled() +{ + static int enabled = -1; + if (enabled < 0) + { + const char *env = getenv("AMSS_INTERP_GPU"); + enabled = (env && atoi(env) != 0) ? 1 : 0; + } + return enabled != 0; +} + bool interp_fast_compare_enabled() { static int enabled = -1; @@ -1113,6 +1127,101 @@ void Patch::Interp_Points(MyList *VarList, } // --- Interpolation phase (identical to original) --- +#if USE_CUDA_BSSN + const bool use_gpu_interp = interp_gpu_enabled() && use_surface_cache && num_var == 2 && + VarList && VarList->next && !VarList->next->next; +#else + const bool use_gpu_interp = false; +#endif + if (use_gpu_interp) + { +#if USE_CUDA_BSSN + vector > local_points(block_index.views.size()); + for (int j = 0; j < NN; j++) + { + for (int i = 0; i < dim; i++) + { + if (myrank == 0 && (XX[i][j] < bbox[i] + lli[i] * DH[i] || XX[i][j] > bbox[dim + i] - uui[i] * DH[i])) + { + cout << "Patch::Interp_Points: point ("; + for (int k = 0; k < dim; k++) + { + cout << XX[k][j]; + if (k < dim - 1) + cout << ","; + else + cout << ") is out of current Patch." << endl; + } + MPI_Abort(MPI_COMM_WORLD, 1); + } + } + + CachedInterpPoint &cp = surface_cache->points[j]; + Block *BP = cp.bp; + owner_rank[j] = cp.owner_rank; + if (BP && myrank == BP->rank) + { + for (size_t bi = 0; bi < block_index.views.size(); bi++) + { + if (block_index.views[bi].bp == BP) + { + local_points[bi].push_back(j); + break; + } + } + } + } + + var *v0 = VarList->data; + var *v1 = VarList->next->data; + double soa6[6] = { + v0->SoA[0], v0->SoA[1], v0->SoA[2], + v1->SoA[0], v1->SoA[1], v1->SoA[2]}; + + for (size_t bi = 0; bi < local_points.size(); bi++) + { + const int count = int(local_points[bi].size()); + if (count <= 0) + continue; + + Block *BP = block_index.views[bi].bp; + vector px(count), py(count), pz(count), out(2 * count); + for (int q = 0; q < count; q++) + { + const int j = local_points[bi][q]; + px[q] = XX[0][j]; + py[q] = XX[1][j]; + pz[q] = XX[2][j]; + } + + const double dx = BP->X[0][1] - BP->X[0][0]; + const double dy = BP->X[1][1] - BP->X[1][0]; + const double dz = BP->X[2][1] - BP->X[2][0]; + const int ok = bssn_cuda_interp_host_two_fields( + BP, BP->shape, + BP->fgfs[v0->sgfn], BP->fgfs[v1->sgfn], + BP->X[0][0], BP->X[1][0], BP->X[2][0], + dx, dy, dz, + &px[0], &py[0], &pz[0], count, + ordn, Symmetry, soa6, &out[0]); + if (ok != 0) + { + if (myrank == 0) + cout << "Patch::Interp_Points: CUDA two-field interpolation failed" << endl; + MPI_Abort(MPI_COMM_WORLD, 1); + } + + for (int q = 0; q < count; q++) + { + const int j = local_points[bi][q]; + Shellf[j * num_var] = out[2 * q]; + Shellf[j * num_var + 1] = out[2 * q + 1]; + } + } +#endif + } + else + { for (int j = 0; j < NN; j++) { double pox[dim]; @@ -1183,6 +1292,7 @@ void Patch::Interp_Points(MyList *VarList, } } } + } #ifdef INTERP_LB_PROFILE double t_interp_end = MPI_Wtime(); diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index 0135ad0..c61df51 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -5765,6 +5765,65 @@ __global__ void kern_interp_state_point3(const double * __restrict__ mem, out[f] = value; } +__global__ void kern_interp_host_two_fields(const double * __restrict__ field0, + const double * __restrict__ field1, + const double * __restrict__ px, + const double * __restrict__ py, + const double * __restrict__ pz, + double * __restrict__ out, + int nx, + int ny, + int nz, + int all, + double x0, + double y0, + double z0, + double dx, + double dy, + double dz, + int npoints, + int ordn, + int symmetry, + double soa00, double soa01, double soa02, + double soa10, double soa11, double soa12) +{ + const int p = blockIdx.x * blockDim.x + threadIdx.x; + if (p >= npoints || ordn <= 0 || ordn > 8) + return; + + int ib, jb, kb; + double tx, ty, tz; + interp_axis_window(px[p], x0, dx, nx, ordn, symmetry, 0, ib, tx); + interp_axis_window(py[p], y0, dy, ny, ordn, symmetry, 1, jb, ty); + interp_axis_window(pz[p], z0, dz, nz, ordn, symmetry, 2, kb, tz); + + double wx[8], wy[8], wz[8]; + for (int i = 0; i < ordn; ++i) { + wx[i] = interp_lagrange_weight(i, tx, ordn); + wy[i] = interp_lagrange_weight(i, ty, ordn); + wz[i] = interp_lagrange_weight(i, tz, ordn); + } + + double v0 = 0.0; + double v1 = 0.0; + const double soa0[3] = {soa00, soa01, soa02}; + const double soa1[3] = {soa10, soa11, soa12}; + for (int k = 0; k < ordn; ++k) { + for (int j = 0; j < ordn; ++j) { + const double wyz = wy[j] * wz[k]; + for (int i = 0; i < ordn; ++i) { + const double coeff = wx[i] * wyz; + v0 += coeff * load_interp_value(field0, nx, ny, nz, all, 0, + ib + i, jb + j, kb + k, soa0); + v1 += coeff * load_interp_value(field1, nx, ny, nz, all, 0, + ib + i, jb + j, kb + k, soa1); + } + } + } + out[2 * p] = v0; + out[2 * p + 1] = v1; +} + __global__ void kern_pack_state_region_batch(const double * __restrict__ src_mem, double * __restrict__ dst, int nx, int ny, @@ -7076,6 +7135,82 @@ int bssn_cuda_interp_state_point3(void *block_tag, return 0; } +extern "C" +int bssn_cuda_interp_host_two_fields(void *block_tag, + int *ex, + double *field0, + double *field1, + double x0, + double y0, + double z0, + double dx, + double dy, + double dz, + const double *px, + const double *py, + const double *pz, + int npoints, + int ordn, + int symmetry, + const double *soa6, + double *out_interleaved) +{ + (void)block_tag; + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + if (!ex || !field0 || !field1 || !px || !py || !pz || !soa6 || + !out_interleaved || npoints <= 0) + return 1; + if (ex[0] <= 0 || ex[1] <= 0 || ex[2] <= 0 || + ordn <= 0 || ordn > 8 || + ex[0] < ordn || ex[1] < ordn || ex[2] < ordn) + return 1; + + const int all = ex[0] * ex[1] * ex[2]; + const size_t field_bytes = (size_t)all * sizeof(double); + const size_t point_bytes = (size_t)npoints * sizeof(double); + const size_t out_bytes = (size_t)2 * npoints * sizeof(double); + + double *d_field0 = nullptr; + double *d_field1 = nullptr; + double *d_px = nullptr; + double *d_py = nullptr; + double *d_pz = nullptr; + double *d_out = nullptr; + CUDA_CHECK(cudaMalloc(&d_field0, field_bytes)); + CUDA_CHECK(cudaMalloc(&d_field1, field_bytes)); + CUDA_CHECK(cudaMalloc(&d_px, point_bytes)); + CUDA_CHECK(cudaMalloc(&d_py, point_bytes)); + CUDA_CHECK(cudaMalloc(&d_pz, point_bytes)); + CUDA_CHECK(cudaMalloc(&d_out, out_bytes)); + + CUDA_CHECK(cudaMemcpy(d_field0, field0, field_bytes, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_field1, field1, field_bytes, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_px, px, point_bytes, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_py, py, point_bytes, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_pz, pz, point_bytes, cudaMemcpyHostToDevice)); + + const int threads = 256; + const int blocks = (npoints + threads - 1) / threads; + kern_interp_host_two_fields<<>>( + d_field0, d_field1, d_px, d_py, d_pz, d_out, + ex[0], ex[1], ex[2], all, + x0, y0, z0, dx, dy, dz, + npoints, ordn, symmetry, + soa6[0], soa6[1], soa6[2], + soa6[3], soa6[4], soa6[5]); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaMemcpy(out_interleaved, d_out, out_bytes, cudaMemcpyDeviceToHost)); + + cudaFree(d_out); + cudaFree(d_pz); + cudaFree(d_py); + cudaFree(d_px); + cudaFree(d_field1); + cudaFree(d_field0); + return 0; +} + extern "C" int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag, int state_index, diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.h b/AMSS_NCKU_source/bssn_rhs_cuda.h index c3a5195..66ce74c 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.h +++ b/AMSS_NCKU_source/bssn_rhs_cuda.h @@ -102,6 +102,25 @@ int bssn_cuda_interp_state_point3(void *block_tag, const double *soa3, double *out3); +int bssn_cuda_interp_host_two_fields(void *block_tag, + int *ex, + double *field0, + double *field1, + double x0, + double y0, + double z0, + double dx, + double dy, + double dz, + const double *px, + const double *py, + const double *pz, + int npoints, + int ordn, + int symmetry, + const double *soa6, + double *out_interleaved); + int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag, int state_index, double *host_buffer,