From bd4ce3fbf3876e72ddc9eb36093f3d3d7a7adf84 Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Sat, 9 May 2026 18:50:10 +0800 Subject: [PATCH] GPU-accelerate Shell-Patch BSSN evolution Phase 1: Enable GPU resident state for Cartesian patches in Shell mode. - Remove WithShell guard from bssn_cuda_use_resident_sync(). - Add GPU-to-CPU state sync before shell CPU consumers (SHStep, CS_Inter, inline shell RHS blocks). Phase 2: GPU-accelerate BSSN Shell Patch RHS. - Create bssn_gpu.h with RHS_SS_PARA macro and gpu_rhs_ss declaration. - Fix compilation bugs in legacy bssn_gpu_rhs_ss.cu (deprecated cudaThreadSynchronize, tmp_con2 redeclaration, ijkmin3_h typo, CUDA_SAFE_CALL, missing compare_result guard). - Add bssn_gpu_rhs_ss.o to CFILES_CUDA_BSSN with build rule. - Write cuda_compute_rhs_bssn_ss() wrapper bridging Fortran and GPU parameter conventions, redirect all shell RHS call sites via #define. Verified: 30-step Shell-Patch GPU run completes without errors/NaN. Step wall time ~4.4s (step_fn ~2.0s + RP ~0.68s + constraint ~0.70s). Co-Authored-By: Claude Opus 4.7 --- AMSS_NCKU_source/bssn_class.C | 138 +++++++++++++++++++++++++++- AMSS_NCKU_source/bssn_gpu.h | 52 +++++++++++ AMSS_NCKU_source/bssn_gpu_rhs_ss.cu | 70 +++++++------- AMSS_NCKU_source/makefile | 20 ++-- 4 files changed, 234 insertions(+), 46 deletions(-) create mode 100644 AMSS_NCKU_source/bssn_gpu.h diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index 3a53e9c..6c0e749 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -27,7 +27,81 @@ using namespace std; #include "bssn_rhs.h" #if USE_CUDA_BSSN #include "bssn_rhs_cuda.h" +#ifdef WithShell +#include "bssn_gpu.h" #endif +#endif + +#if USE_CUDA_BSSN && defined(WithShell) +// GPU-accelerated shell RHS: same parameter signature as f_compute_rhs_bssn_ss. +// Internally calls gpu_rhs_ss with calledby=0, mpi_rank=0 (device 0). +extern "C" { +static int cuda_compute_rhs_bssn_ss( + int *ex, double &T, double *crho, double *sigma, double *R, + double *X, double *Y, double *Z, + double *drhodx, double *drhody, double *drhodz, + double *dsigmadx, double *dsigmady, double *dsigmadz, + double *dRdx, double *dRdy, double *dRdz, + double *drhodxx, double *drhodxy, double *drhodxz, double *drhodyy, double *drhodyz, double *drhodzz, + double *dsigmadxx, double *dsigmadxy, double *dsigmadxz, double *dsigmadyy, double *dsigmadyz, double *dsigmadzz, + double *dRdxx, double *dRdxy, double *dRdxz, double *dRdyy, double *dRdyz, double *dRdzz, + double *chi, double *trK, + double *gxx, double *gxy, double *gxz, double *gyy, double *gyz, double *gzz, + double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz, + double *Gamx, double *Gamy, double *Gamz, + double *Lap, double *betax, double *betay, double *betaz, + double *dtSfx, double *dtSfy, double *dtSfz, + double *chi_rhs, double *trK_rhs, + double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs, + double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs, + double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs, + double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs, + double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs, + double *rho, double *Sx, double *Sy, double *Sz, + double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz, + double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz, + double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz, + double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz, + double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz, + double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res, + double *Gmx_Res, double *Gmy_Res, double *Gmz_Res, + int &Symmetry, int &Lev, double &eps, int &sst, int &co) +{ + return gpu_rhs_ss(0, 0, // calledby=ABE_main, mpi_rank=device_0 + ex, T, crho, sigma, R, X, Y, Z, + drhodx, drhody, drhodz, + dsigmadx, dsigmady, dsigmadz, + dRdx, dRdy, dRdz, + drhodxx, drhodxy, drhodxz, drhodyy, drhodyz, drhodzz, + dsigmadxx, dsigmadxy, dsigmadxz, dsigmadyy, dsigmadyz, dsigmadzz, + dRdxx, dRdxy, dRdxz, dRdyy, dRdyz, dRdzz, + chi, trK, + gxx, gxy, gxz, gyy, gyz, gzz, + Axx, Axy, Axz, Ayy, Ayz, Azz, + Gamx, Gamy, Gamz, + Lap, betax, betay, betaz, + dtSfx, dtSfy, dtSfz, + chi_rhs, trK_rhs, + gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs, + Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs, + Gamx_rhs, Gamy_rhs, Gamz_rhs, + Lap_rhs, betax_rhs, betay_rhs, betaz_rhs, + dtSfx_rhs, dtSfy_rhs, dtSfz_rhs, + rho, Sx, Sy, Sz, + Sxx, Sxy, Sxz, Syy, Syz, Szz, + Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz, + Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz, + Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz, + Rxx, Rxy, Rxz, Ryy, Ryz, Rzz, + ham_Res, movx_Res, movy_Res, movz_Res, + Gmx_Res, Gmy_Res, Gmz_Res, + Symmetry, Lev, eps, sst, co); +} +} +// All call sites below that use f_compute_rhs_bssn_ss get redirected to GPU +#define f_compute_rhs_bssn_ss cuda_compute_rhs_bssn_ss +#endif + #include "initial_puncture.h" #include "enforce_algebra.h" #include "rungekutta4_rout.h" @@ -474,12 +548,8 @@ bool fill_bssn_cuda_views_count(Block *cg, MyList *vars, bool bssn_cuda_use_resident_sync(int lev) { -#ifdef WithShell (void)lev; - return false; -#else return true; -#endif } bool bssn_cuda_keep_resident_after_step(int lev, int trfls_in, int analysis_lev) @@ -3464,6 +3534,13 @@ void bssn_class::RecursiveStep(int lev) // RestrictProlong(lev,YN,false,StateList,OldStateList,SynchList_cor); #ifdef WithShell +#if USE_CUDA_BSSN + if (bssn_cuda_use_resident_sync(lev)) + { + for (int dl = 0; dl < GH->levels; dl++) + bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank); + } +#endif if (lev == 0) { clock_t prev_clock, curr_clock; @@ -3622,6 +3699,16 @@ void bssn_class::ParallelStep() #endif #ifdef WithShell +#if USE_CUDA_BSSN + { + const int lev0 = 0; + if (bssn_cuda_use_resident_sync(lev0)) + { + for (int dl = 0; dl < GH->levels; dl++) + bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank); + } + } +#endif SHStep(); #if (RPS == 1) { @@ -3976,6 +4063,13 @@ void bssn_class::ParallelStep() } #ifdef WithShell +#if USE_CUDA_BSSN + if (bssn_cuda_use_resident_sync(lev)) + { + for (int dl = 0; dl < GH->levels; dl++) + bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank); + } +#endif SHStep(); // a_stream.clear(); // a_stream.str(""); @@ -4427,6 +4521,13 @@ void bssn_class::Step(int lev, int YN) // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls #ifdef WithShell +#if USE_CUDA_BSSN + if (bssn_cuda_use_resident_sync(lev)) + { + for (int dl = 0; dl < GH->levels; dl++) + bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank); + } +#endif // evolve Shell Patches if (lev == 0) { @@ -4878,6 +4979,13 @@ void bssn_class::Step(int lev, int YN) // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls #ifdef WithShell +#if USE_CUDA_BSSN + if (bssn_cuda_use_resident_sync(lev)) + { + for (int dl = 0; dl < GH->levels; dl++) + bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank); + } +#endif // evolve Shell Patches if (lev == 0) { @@ -5398,6 +5506,13 @@ void bssn_class::Step(int lev, int YN) // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls #ifdef WithShell +#if USE_CUDA_BSSN + if (bssn_cuda_use_resident_sync(lev)) + { + for (int dl = 0; dl < GH->levels; dl++) + bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank); + } +#endif // evolve Shell Patches if (lev == 0) { @@ -5750,6 +5865,13 @@ void bssn_class::Step(int lev, int YN) // NOTE: error check deferred to after Shell Patch computation to reduce MPI_Allreduce calls #ifdef WithShell +#if USE_CUDA_BSSN + if (bssn_cuda_use_resident_sync(lev)) + { + for (int dl = 0; dl < GH->levels; dl++) + bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank); + } +#endif // evolve Shell Patches if (lev == 0) { @@ -6673,6 +6795,14 @@ void bssn_class::SHStep() // misc::tillherecheck(GH->Commlev[lev],GH->start_rank[lev],"start Step"); // #endif +#if USE_CUDA_BSSN + if (bssn_cuda_use_resident_sync(lev)) + { + for (int dl = 0; dl < GH->levels; dl++) + bssn_cuda_download_level_state_if_present(GH->PatL[dl], StateList, myrank); + } +#endif + setpbh(BH_num, Porg0, Mass, BH_num_input); double dT_lev = dT * pow(0.5, Mymax(lev, trfls)); diff --git a/AMSS_NCKU_source/bssn_gpu.h b/AMSS_NCKU_source/bssn_gpu.h new file mode 100644 index 0000000..97d7c74 --- /dev/null +++ b/AMSS_NCKU_source/bssn_gpu.h @@ -0,0 +1,52 @@ + +#ifndef BSSN_GPU_H_ +#define BSSN_GPU_H_ +#include "bssn_macro.h" +#include "macrodef.fh" + +#define DEVICE_ID 0 +// #define DEVICE_ID_BY_MPI_RANK +#define GRID_DIM 256 +#define BLOCK_DIM 128 + +#define _FH2_(i, j, k) fh[(i) + (j) * _1D_SIZE[2] + (k) * _2D_SIZE[2]] +#define _FH3_(i, j, k) fh[(i) + (j) * _1D_SIZE[3] + (k) * _2D_SIZE[3]] +#define pow2(x) ((x) * (x)) +#define TimeBetween(a, b) ((b.tv_sec - a.tv_sec) + (b.tv_usec - a.tv_usec) / 1000000.0f) +#define M_ metac. +#define Mh_ meta-> +#define Ms_ metassc. +#define Msh_ metass-> + +// #define TIMING + +#define RHS_SS_PARA int calledby, int mpi_rank, int *ex, double &T, double *crho, double *sigma, double *R, double *X, double *Y, double *Z, double *drhodx, double *drhody, double *drhodz, double *dsigmadx, double *dsigmady, double *dsigmadz, double *dRdx, double *dRdy, double *dRdz, double *drhodxx, double *drhodxy, double *drhodxz, double *drhodyy, double *drhodyz, double *drhodzz, double *dsigmadxx, double *dsigmadxy, double *dsigmadxz, double *dsigmadyy, double *dsigmadyz, double *dsigmadzz, double *dRdxx, double *dRdxy, double *dRdxz, double *dRdyy, double *dRdyz, double *dRdzz, double *chi, double *trK, double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz, double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz, double *Gamx, double *Gamy, double *Gamz, double *Lap, double *betax, double *betay, double *betaz, double *dtSfx, double *dtSfy, double *dtSfz, double *chi_rhs, double *trK_rhs, double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs, double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs, double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs, double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs, double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs, double *rho, double *Sx, double *Sy, double *Sz, double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz, double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz, double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz, double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz, double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz, double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res, double *Gmx_Res, double *Gmy_Res, double *Gmz_Res, int &Symmetry, int &Lev, double &eps, int &sst, int &co + +/** main function */ +int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T, + double *X, double *Y, double *Z, + double *chi, double *trK, + double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz, + double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz, + double *Gamx, double *Gamy, double *Gamz, + double *Lap, double *betax, double *betay, double *betaz, + double *dtSfx, double *dtSfy, double *dtSfz, + double *chi_rhs, double *trK_rhs, + double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs, + double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs, + double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs, + double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs, + double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs, + double *rho, double *Sx, double *Sy, double *Sz, double *Sxx, + double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz, + double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz, + double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz, + double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz, + double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz, + double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res, + double *Gmx_Res, double *Gmy_Res, double *Gmz_Res, + int &Symmetry, int &Lev, double &eps, int &co); + +int gpu_rhs_ss(RHS_SS_PARA); + +#endif diff --git a/AMSS_NCKU_source/bssn_gpu_rhs_ss.cu b/AMSS_NCKU_source/bssn_gpu_rhs_ss.cu index 11530ae..dac3ba1 100644 --- a/AMSS_NCKU_source/bssn_gpu_rhs_ss.cu +++ b/AMSS_NCKU_source/bssn_gpu_rhs_ss.cu @@ -20,12 +20,14 @@ using namespace std; __device__ volatile unsigned int global_count = 0; +#ifdef RESULT_CHECK void compare_result_gpu(int ftag1,double * datac,int data_num){ double * data = (double*)malloc(sizeof(double)*data_num); cudaMemcpy(data, datac, data_num * sizeof(double), cudaMemcpyDeviceToHost); compare_result(ftag1,data,data_num); free(data); } +#endif __global__ void sub_symmetry_bd_ss_partF(int ord, double * func, double *funcc) { @@ -153,11 +155,11 @@ __global__ void sub_symmetry_bd_ss_partJ(int ord,double * func, double * funcc,d inline void sub_symmetry_bd_ss(int ord,double * func, double * funcc,double * SoA){ sub_symmetry_bd_ss_partF<<>>(ord,func,funcc); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sub_symmetry_bd_ss_partI<<>>(ord,func,funcc,SoA[0]); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sub_symmetry_bd_ss_partJ<<>>(ord,func,funcc,SoA[1]); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); } __global__ void sub_fderivs_shc_part1(double *fx,double *fy,double *fz){ @@ -247,13 +249,13 @@ inline void sub_fderivs_shc(int& sst,double * f,double * fh,double *fx,double *f //cudaMemset(Msh_ gy,0,h_3D_SIZE[0] * sizeof(double)); //cudaMemset(Msh_ gz,0,h_3D_SIZE[0] * sizeof(double)); sub_symmetry_bd_ss(2,f,fh,SoA1); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); //compare_result_gpu(0,fh,h_3D_SIZE[2]); sub_fderivs_sh<<>>(fh,Msh_ gx,Msh_ gy,Msh_ gz); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sub_fderivs_shc_part1<<>>(fx,fy,fz); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); //compare_result_gpu(1,fx,h_3D_SIZE[0]); //compare_result_gpu(2,fy,h_3D_SIZE[0]); //compare_result_gpu(3,fz,h_3D_SIZE[0]); @@ -451,17 +453,17 @@ inline void sub_fdderivs_shc(int& sst,double * f,double * fh, //fderivs_sh sub_symmetry_bd_ss(2,f,fh,SoA1); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); //compare_result_gpu(1,fh,h_3D_SIZE[2]); sub_fderivs_sh<<>>(fh,Msh_ gx,Msh_ gy,Msh_ gz); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); //fdderivs_sh sub_symmetry_bd_ss(2,f,fh,SoA1); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); //compare_result_gpu(21,fh,h_3D_SIZE[2]); sub_fdderivs_sh<<>>(fh,Msh_ gxx,Msh_ gxy,Msh_ gxz,Msh_ gyy,Msh_ gyz,Msh_ gzz); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); /*compare_result_gpu(11,Msh_ gx,h_3D_SIZE[0]); compare_result_gpu(12,Msh_ gy,h_3D_SIZE[0]); compare_result_gpu(13,Msh_ gz,h_3D_SIZE[0]); @@ -472,7 +474,7 @@ inline void sub_fdderivs_shc(int& sst,double * f,double * fh, compare_result_gpu(5,Msh_ gyz,h_3D_SIZE[0]); compare_result_gpu(6,Msh_ gzz,h_3D_SIZE[0]);*/ sub_fdderivs_shc_part1<<>>(fxx,fxy,fxz,fyy,fyz,fzz); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); /*compare_result_gpu(1,fxx,h_3D_SIZE[0]); compare_result_gpu(2,fxy,h_3D_SIZE[0]); compare_result_gpu(3,fxz,h_3D_SIZE[0]); @@ -496,9 +498,9 @@ __global__ void computeRicci_ss_part1(double * dst) inline void computeRicci_ss(int &sst,double * src,double* dst,double * SoA, Meta* meta) { sub_fdderivs_shc(sst,src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,SoA); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); computeRicci_ss_part1<<>>(dst); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); } __global__ void sub_lopsided_ss_part1(double * dst) @@ -516,9 +518,9 @@ __global__ void sub_lopsided_ss_part1(double * dst) inline void sub_lopsided_ss(int& sst,double *src,double* dst,double *SoA) { sub_fderivs_shc(sst,src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,SoA); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sub_lopsided_ss_part1<<>>(dst); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); } __global__ void sub_kodis_sh_part1(double *f,double *fh,double *f_rhs) @@ -590,11 +592,11 @@ inline void sub_kodis_ss(int &sst,double *f,double *fh,double *f_rhs,double *SoA } //compare_result_gpu(10,f,h_3D_SIZE[0]); sub_symmetry_bd_ss(3,f,fh,SoA1); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); //compare_result_gpu(0,fh,h_3D_SIZE[3]); sub_kodis_sh_part1<<>>(f,fh,f_rhs); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); //compare_result_gpu(1,f_rhs,h_3D_SIZE[0]); } @@ -1699,7 +1701,7 @@ void destroy_meta(Meta *meta,Metass *metass) if(Msh_ gzz) cudaFree(Msh_ gzz); #if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5 || GAUGE == 6 || GAUGE == 7) - if(Mh_ reta) CUDA_SAFE_CALL(cudaFree(Mh_ reta)); + if(Mh_ reta) cudaFree(Mh_ reta); #endif @@ -1895,7 +1897,7 @@ int gpu_rhs_ss(RHS_SS_PARA) //1.2 local Data cudaMalloc((void**)&(Mh_ gxx), matrix_size * sizeof(double)); - CUDA_SAFE_CALL( cudaMalloc((void**)&(Mh_ gyy), matrix_size * sizeof(double))); + cudaMalloc((void**)&(Mh_ gyy), matrix_size * sizeof(double)); cudaMalloc((void**)&(Mh_ gzz), matrix_size * sizeof(double)); cudaMalloc((void**)&(Mh_ chix), matrix_size * sizeof(double)); cudaMalloc((void**)&(Mh_ chiy), matrix_size * sizeof(double)); @@ -2160,7 +2162,7 @@ int gpu_rhs_ss(RHS_SS_PARA) double tmp_con2 = 1/Mass[0] - tmp_con; cudaMemcpyToSymbol(C1, &tmp_con2, sizeof(double)); - double tmp_con2 = 1/Mass[1] - tmp_con; + tmp_con2 = 1/Mass[1] - tmp_con; cudaMemcpyToSymbol(C2, &tmp_con2, sizeof(double)); @@ -2233,7 +2235,7 @@ int gpu_rhs_ss(RHS_SS_PARA) if((sst == 2 || sst == 4) && abs[1] < dYh) { ijkmin_h[1] = -2; - ijkmin_h[1] = -3; + ijkmin3_h[1] = -3; } if((sst == 3 || sst == 5) && abs_Y_ex2 < dYh) { @@ -2287,13 +2289,13 @@ int gpu_rhs_ss(RHS_SS_PARA) #ifdef TIMING1 - cudaThreadSynchronize(); + cudaDeviceSynchronize(); gettimeofday(&tv2, NULL); cout<<"TIME USED"<>>(); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sub_fderivs_shc(sst,Mh_ betax,Mh_ fh,Mh_ betaxx,Mh_ betaxy,Mh_ betaxz,ass); sub_fderivs_shc(sst,Mh_ betay,Mh_ fh,Mh_ betayx,Mh_ betayy,Mh_ betayz,sas); @@ -2322,7 +2324,7 @@ int gpu_rhs_ss(RHS_SS_PARA) sub_fderivs_shc(sst,Mh_ gyz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz, saa); compute_rhs_ss_part2<<>>(); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sub_fdderivs_shc(sst,Mh_ betax,Mh_ fh,Mh_ gxxx,Mh_ gxyx,Mh_ gxzx,Mh_ gyyx,Mh_ gyzx,Mh_ gzzx,ass); sub_fdderivs_shc(sst,Mh_ betay,Mh_ fh,Mh_ gxxy,Mh_ gxyy,Mh_ gxzy,Mh_ gyyy,Mh_ gyzy,Mh_ gzzy,sas); @@ -2332,7 +2334,7 @@ int gpu_rhs_ss(RHS_SS_PARA) sub_fderivs_shc( sst,Mh_ Gamz, Mh_ fh,Mh_ Gamzx, Mh_ Gamzy, Mh_ Gamzz,ssa); compute_rhs_ss_part3<<>>(); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); computeRicci_ss(sst,Mh_ dxx,Mh_ Rxx,sss, meta); computeRicci_ss(sst,Mh_ dyy,Mh_ Ryy,sss, meta); @@ -2340,25 +2342,25 @@ int gpu_rhs_ss(RHS_SS_PARA) computeRicci_ss(sst,Mh_ gxy,Mh_ Rxy,aas, meta); computeRicci_ss(sst,Mh_ gxz,Mh_ Rxz,asa, meta); computeRicci_ss(sst,Mh_ gyz,Mh_ Ryz,saa, meta); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); compute_rhs_ss_part4<<>>(); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sub_fdderivs_shc(sst,Mh_ chi,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss); - //cudaThreadSynchronize(); + //cudaDeviceSynchronize(); //compare_result_gpu(0,Mh_ chi,h_3D_SIZE[0]); //compare_result_gpu(1,Mh_ chi,h_3D_SIZE[0]); //compare_result_gpu(2,Mh_ fyz,h_3D_SIZE[0]); compute_rhs_ss_part5<<>>(); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sub_fdderivs_shc(sst,Mh_ Lap,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss); compute_rhs_ss_part6<<>>(); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); #if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5) sub_fderivs_shc(sst,Mh_ chi,Mh_ fh, Mh_ dtSfx_rhs, Mh_ dtSfy_rhs, Mh_ dtSfz_rhs,sss); @@ -2423,7 +2425,7 @@ int gpu_rhs_ss(RHS_SS_PARA) } if(co == 0){ compute_rhs_ss_part7<<>>(); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); sub_fderivs_shc(sst,Mh_ Axx,Mh_ fh,Mh_ gxxx,Mh_ gxxy,Mh_ gxxz,sss); sub_fderivs_shc(sst,Mh_ Axy,Mh_ fh,Mh_ gxyx,Mh_ gxyy,Mh_ gxyz,aas); @@ -2432,7 +2434,7 @@ int gpu_rhs_ss(RHS_SS_PARA) sub_fderivs_shc(sst,Mh_ Ayz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz,saa); sub_fderivs_shc(sst,Mh_ Azz,Mh_ fh,Mh_ gzzx,Mh_ gzzy,Mh_ gzzz,sss); compute_rhs_ss_part8<<>>(); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); } #if (ABV == 1) @@ -2512,7 +2514,7 @@ int gpu_rhs_ss(RHS_SS_PARA) //test kodis //sub_kodis_sh(sst,Msh_ drhodx,Mh_ fh2,Msh_ drhody,sss); #ifdef TIMING - cudaThreadSynchronize(); + cudaDeviceSynchronize(); gettimeofday(&tv2, NULL); cout<<"MPI rank is: "<