Compare commits
7 Commits
chb-copilo...chb-local

| Author | SHA1 | Date |
|---|---|---|
| | 95575d9450 | |
| | 54600327da | |
| | 75be0968fc | |
| | b27e071cde | |
| | a1125d4c79 | |
| | dcc66588fc | |
| | 950d448edf | |
.gitignore (vendored, Normal file, +3)
@@ -0,0 +1,3 @@
+__pycache__
+GW150914
@@ -16,12 +16,12 @@ import numpy
 File_directory = "GW150914" ## output file directory
 Output_directory = "binary_output" ## binary data file directory
 ## The file directory name should not be too long
-MPI_processes = 64 ## number of mpi processes used in the simulation
+MPI_processes = 96 ## number of mpi processes used in the simulation

-GPU_Calculation = "no" ## Use GPU or not
-## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
-CPU_Part = 1.0
-GPU_Part = 0.0
+GPU_Calculation = "yes" ## Use GPU or not
+## GPU support has been updated for CUDA 13
+CPU_Part = 0.0
+GPU_Part = 1.0

 #################################################
@@ -18,7 +18,7 @@ using namespace std;
 #include <fstream>
 #endif

-void compare_result_gpu(int ftag1,double * datac,int data_num){
+static void compare_result_gpu(int ftag1,double * datac,int data_num){
 double * data = (double*)malloc(sizeof(double)*data_num);
 cudaMemcpy(data, datac, data_num * sizeof(double), cudaMemcpyDeviceToHost);
 compare_result(ftag1,data,data_num);
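Marking compare_result_gpu as static (here and on the identical helper in the second .cu file later in this comparison) gives the function internal linkage, so the duplicated definition no longer collides at link time; this appears to be the same duplicate-definition concern called out for SHStep() further down.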
@@ -83,7 +83,7 @@ inline void sub_enforce_ga(int matrix_size){
 double * trA = M_ chin1;
 enforce_ga<<<GRID_DIM,BLOCK_DIM>>>(trA);
 cudaMemset(trA,0,matrix_size * sizeof(double));
-cudaThreadSynchronize();
+cudaDeviceSynchronize();

 //cudaMemset(Mh_ gupxx,0,matrix_size * sizeof(double));
 //trA gxx,gyy,gzz gupxx,gupxy,gupxz,gupyy,gupyz,gupzz
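The change repeated throughout the rest of this comparison swaps cudaThreadSynchronize() for cudaDeviceSynchronize(). cudaThreadSynchronize() has long been deprecated and is dropped from recent CUDA toolkits, while cudaDeviceSynchronize() blocks the host until all previously issued device work has finished, so it is a drop-in replacement. A minimal sketch of the pattern; the kernel and sizes here are illustrative, not taken from this repository:

```cpp
#include <cstdio>
#include <cuda_runtime.h>

__global__ void scale(double *v, double a, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) v[i] *= a;
}

int main() {
    const int n = 1 << 20;
    double *d = nullptr;
    cudaMalloc(&d, n * sizeof(double));
    cudaMemset(d, 0, n * sizeof(double));

    scale<<<(n + 255) / 256, 256>>>(d, 2.0, n);

    // Previously: cudaThreadSynchronize();  (deprecated, removed from recent toolkits)
    cudaError_t err = cudaDeviceSynchronize();   // drop-in replacement, same blocking behaviour
    if (err != cudaSuccess)
        fprintf(stderr, "kernel failed: %s\n", cudaGetErrorString(err));

    cudaFree(d);
    return 0;
}
```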
@@ -273,13 +273,13 @@ __global__ void sub_symmetry_bd_partK(int ord,double * func, double * funcc,doub
 #endif //ifdef Vertex
 inline void sub_symmetry_bd(int ord,double * func, double * funcc,double * SoA){
 sub_symmetry_bd_partF<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc);
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 sub_symmetry_bd_partI<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[0]);
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 sub_symmetry_bd_partJ<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[1]);
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 sub_symmetry_bd_partK<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[2]);
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 }
@@ -378,9 +378,9 @@ inline void sub_fdderivs(double * f,double *fh,double *fxx,double *fxy,double *f
 cudaMemset(fyy,0,_3D_SIZE[0] * sizeof(double));
 cudaMemset(fyz,0,_3D_SIZE[0] * sizeof(double));
 cudaMemset(fzz,0,_3D_SIZE[0] * sizeof(double));
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 sub_fdderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fxx,fxy,fxz,fyy,fyz,fzz);
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 }

 __global__ void sub_fderivs_part1(double * f,double * fh,double *fx,double *fy,double *fz )
@@ -445,9 +445,9 @@ inline void sub_fderivs(double * f,double * fh,double *fx,double *fy,double *fz,
 cudaMemset(fy,0,_3D_SIZE[0] * sizeof(double));
 cudaMemset(fz,0,_3D_SIZE[0] * sizeof(double));

-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 sub_fderivs_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,fx,fy,fz);
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 }

 __global__ void computeRicci_part1(double * dst)
@@ -465,9 +465,9 @@ __global__ void computeRicci_part1(double * dst)
 inline void computeRicci(double * src,double* dst,double * SoA, Meta* meta)
 {
 sub_fdderivs(src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,SoA);
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 computeRicci_part1<<<GRID_DIM,BLOCK_DIM>>>(dst);
-cudaThreadSynchronize();
+cudaDeviceSynchronize();

 }/*Exception*/
@@ -524,9 +524,9 @@ __global__ void sub_kodis_part1(double *f,double *fh,double *f_rhs)
 inline void sub_kodis(double *f,double *fh,double *f_rhs,double *SoA)
 {
 sub_symmetry_bd(3,f,fh,SoA);
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 sub_kodis_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs);
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 }

 __global__ void sub_lopsided_part1(double *f,double* fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz)
@@ -617,9 +617,9 @@ __global__ void sub_lopsided_part1(double *f,double* fh,double *f_rhs,double *S

 inline void sub_lopsided(double *f,double*fh,double *f_rhs,double *Sfx,double *Sfy,double *Sfz,double *SoA){
 sub_symmetry_bd(3,f,fh,SoA);
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 sub_lopsided_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs,Sfx,Sfy,Sfz);
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 }

 __global__ void compute_rhs_bssn_part1()
@@ -2656,13 +2656,13 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,


 #ifdef TIMING1
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 gettimeofday(&tv2, NULL);
 cout<<"TIME USED"<<TimeBetween(tv1, tv2)<<endl;
 #endif
 //cout<<"GPU meta data ready.\n";

-cudaThreadSynchronize();
+cudaDeviceSynchronize();

 //--------------test constant memory address & value--------------
 /* double rank = mpi_rank;
@@ -2685,7 +2685,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 //sub_enforce_ga(matrix_size);
 //4.1-----compute rhs---------
 compute_rhs_bssn_part1<<<GRID_DIM,BLOCK_DIM>>>();
-cudaThreadSynchronize();
+cudaDeviceSynchronize();

 sub_fderivs(Mh_ betax,Mh_ fh,Mh_ betaxx,Mh_ betaxy,Mh_ betaxz,ass);
 sub_fderivs(Mh_ betay,Mh_ fh,Mh_ betayx,Mh_ betayy,Mh_ betayz,sas);
@@ -2701,7 +2701,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 sub_fderivs(Mh_ gyz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz, saa);

 compute_rhs_bssn_part2<<<GRID_DIM,BLOCK_DIM>>>();
-cudaThreadSynchronize();
+cudaDeviceSynchronize();

 sub_fdderivs(Mh_ betax,Mh_ fh,Mh_ gxxx,Mh_ gxyx,Mh_ gxzx,Mh_ gyyx,Mh_ gyzx,Mh_ gzzx,ass);
 sub_fdderivs(Mh_ betay,Mh_ fh,Mh_ gxxy,Mh_ gxyy,Mh_ gxzy,Mh_ gyyy,Mh_ gyzy,Mh_ gzzy,sas);
@@ -2711,7 +2711,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 sub_fderivs( Mh_ Gamz, Mh_ fh,Mh_ Gamzx, Mh_ Gamzy, Mh_ Gamzz,ssa);

 compute_rhs_bssn_part3<<<GRID_DIM,BLOCK_DIM>>>();
-cudaThreadSynchronize();
+cudaDeviceSynchronize();

 computeRicci(Mh_ dxx,Mh_ Rxx,sss, meta);
 computeRicci(Mh_ dyy,Mh_ Ryy,sss, meta);
@@ -2720,20 +2720,20 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 computeRicci(Mh_ gxz,Mh_ Rxz,asa, meta);
 computeRicci(Mh_ gyz,Mh_ Ryz,saa, meta);

-cudaThreadSynchronize();
+cudaDeviceSynchronize();

 compute_rhs_bssn_part4<<<GRID_DIM,BLOCK_DIM>>>();
-cudaThreadSynchronize();
+cudaDeviceSynchronize();

 sub_fdderivs(Mh_ chi,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);

 compute_rhs_bssn_part5<<<GRID_DIM,BLOCK_DIM>>>();
-cudaThreadSynchronize();
+cudaDeviceSynchronize();

 sub_fdderivs(Mh_ Lap,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);

 compute_rhs_bssn_part6<<<GRID_DIM,BLOCK_DIM>>>();
-cudaThreadSynchronize();
+cudaDeviceSynchronize();

 #if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5)
 sub_fderivs(Mh_ chi,Mh_ fh, Mh_ dtSfx_rhs, Mh_ dtSfy_rhs, Mh_ dtSfz_rhs,sss);
@@ -2805,7 +2805,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,

 if(co == 0){
 compute_rhs_bssn_part7<<<GRID_DIM,BLOCK_DIM>>>();
-cudaThreadSynchronize();
+cudaDeviceSynchronize();

 sub_fderivs(Mh_ Axx,Mh_ fh,Mh_ gxxx,Mh_ gxxy,Mh_ gxxz,sss);
 sub_fderivs(Mh_ Axy,Mh_ fh,Mh_ gxyx,Mh_ gxyy,Mh_ gxyz,aas);
@@ -2814,7 +2814,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 sub_fderivs(Mh_ Ayz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz,saa);
 sub_fderivs(Mh_ Azz,Mh_ fh,Mh_ gzzx,Mh_ gzzy,Mh_ gzzz,sss);
 compute_rhs_bssn_part8<<<GRID_DIM,BLOCK_DIM>>>();
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 }

 #if (ABV == 1)
@@ -2895,7 +2895,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y,
 //-------------------FOR GPU TEST----------------------
 //-----------------------------------------------------
 #ifdef TIMING
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 gettimeofday(&tv2, NULL);
 cout<<"MPI rank is: "<<mpi_rank<<" GPU TIME is"<<TimeBetween(tv1, tv2)<<" (s)."<<endl;
 #endif
@@ -4,6 +4,17 @@
 #include "bssn_macro.h"
 #include "macrodef.fh"

+// CUDA error checking macro for CUDA 13 compatibility
+#define CUDA_SAFE_CALL(call) \
+do { \
+cudaError_t err = call; \
+if (err != cudaSuccess) { \
+fprintf(stderr, "CUDA error in %s:%d: %s\n", __FILE__, __LINE__, \
+cudaGetErrorString(err)); \
+exit(EXIT_FAILURE); \
+} \
+} while(0)
+
 #define DEVICE_ID 0
 // #define DEVICE_ID_BY_MPI_RANK
 #define GRID_DIM 256
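The hunks in this comparison only add the CUDA_SAFE_CALL definition; no call sites are shown being wrapped. If applied, usage would look like the following sketch (the buffer and the wrapped calls are illustrative, not taken from the diff):

```cpp
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
// Assumes the CUDA_SAFE_CALL macro from the header hunk above is visible here.

int main() {
    double h_buf[1024] = {0.0};
    double *d_buf = nullptr;

    // Each failed runtime call prints file, line, and error string, then exits.
    CUDA_SAFE_CALL(cudaMalloc(&d_buf, sizeof(h_buf)));
    CUDA_SAFE_CALL(cudaMemcpy(d_buf, h_buf, sizeof(h_buf), cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaDeviceSynchronize());
    CUDA_SAFE_CALL(cudaFree(d_buf));
    return 0;
}
```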
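DEVICE_ID stays fixed at 0 and DEVICE_ID_BY_MPI_RANK remains commented out. Judging only by the name, enabling that toggle would select the GPU per MPI rank, roughly like this hypothetical helper (not code from this repository):

```cpp
#include <cuda_runtime.h>

// Hypothetical sketch of what DEVICE_ID_BY_MPI_RANK presumably implies:
// bind each MPI rank to a device round-robin instead of the fixed DEVICE_ID 0.
static int select_device_for_rank(int mpi_rank) {
    int device_count = 0;
    cudaGetDeviceCount(&device_count);
    int device_id = (device_count > 0) ? mpi_rank % device_count : 0;
    cudaSetDevice(device_id);
    return device_id;
}
```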
@@ -2134,7 +2134,9 @@ void bssn_class::Evolve(int Steps)

 CheckPoint->write_Black_Hole_position(BH_num_input, BH_num, Porg0, Porgbr, Mass);
 CheckPoint->writecheck_cgh(PhysTime, GH);
+#ifdef WithShell
 CheckPoint->writecheck_sh(PhysTime, SH);
+#endif
 CheckPoint->write_bssn(LastDump, Last2dDump, LastAnas);
 }
 }
@@ -20,7 +20,7 @@ using namespace std;

 __device__ volatile unsigned int global_count = 0;

-void compare_result_gpu(int ftag1,double * datac,int data_num){
+static void compare_result_gpu(int ftag1,double * datac,int data_num){
 double * data = (double*)malloc(sizeof(double)*data_num);
 cudaMemcpy(data, datac, data_num * sizeof(double), cudaMemcpyDeviceToHost);
 compare_result(ftag1,data,data_num);
@@ -153,11 +153,11 @@ __global__ void sub_symmetry_bd_ss_partJ(int ord,double * func, double * funcc,d

 inline void sub_symmetry_bd_ss(int ord,double * func, double * funcc,double * SoA){
 sub_symmetry_bd_ss_partF<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc);
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 sub_symmetry_bd_ss_partI<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[0]);
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 sub_symmetry_bd_ss_partJ<<<GRID_DIM,BLOCK_DIM>>>(ord,func,funcc,SoA[1]);
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 }

 __global__ void sub_fderivs_shc_part1(double *fx,double *fy,double *fz){
@@ -247,13 +247,13 @@ inline void sub_fderivs_shc(int& sst,double * f,double * fh,double *fx,double *f
 //cudaMemset(Msh_ gy,0,h_3D_SIZE[0] * sizeof(double));
 //cudaMemset(Msh_ gz,0,h_3D_SIZE[0] * sizeof(double));
 sub_symmetry_bd_ss(2,f,fh,SoA1);
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 //compare_result_gpu(0,fh,h_3D_SIZE[2]);
 sub_fderivs_sh<<<GRID_DIM,BLOCK_DIM>>>(fh,Msh_ gx,Msh_ gy,Msh_ gz);
-cudaThreadSynchronize();
+cudaDeviceSynchronize();

 sub_fderivs_shc_part1<<<GRID_DIM,BLOCK_DIM>>>(fx,fy,fz);
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 //compare_result_gpu(1,fx,h_3D_SIZE[0]);
 //compare_result_gpu(2,fy,h_3D_SIZE[0]);
 //compare_result_gpu(3,fz,h_3D_SIZE[0]);
@@ -451,17 +451,17 @@ inline void sub_fdderivs_shc(int& sst,double * f,double * fh,

 //fderivs_sh
 sub_symmetry_bd_ss(2,f,fh,SoA1);
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 //compare_result_gpu(1,fh,h_3D_SIZE[2]);
 sub_fderivs_sh<<<GRID_DIM,BLOCK_DIM>>>(fh,Msh_ gx,Msh_ gy,Msh_ gz);
-cudaThreadSynchronize();
+cudaDeviceSynchronize();

 //fdderivs_sh
 sub_symmetry_bd_ss(2,f,fh,SoA1);
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 //compare_result_gpu(21,fh,h_3D_SIZE[2]);
 sub_fdderivs_sh<<<GRID_DIM,BLOCK_DIM>>>(fh,Msh_ gxx,Msh_ gxy,Msh_ gxz,Msh_ gyy,Msh_ gyz,Msh_ gzz);
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 /*compare_result_gpu(11,Msh_ gx,h_3D_SIZE[0]);
 compare_result_gpu(12,Msh_ gy,h_3D_SIZE[0]);
 compare_result_gpu(13,Msh_ gz,h_3D_SIZE[0]);
@@ -472,7 +472,7 @@ inline void sub_fdderivs_shc(int& sst,double * f,double * fh,
 compare_result_gpu(5,Msh_ gyz,h_3D_SIZE[0]);
 compare_result_gpu(6,Msh_ gzz,h_3D_SIZE[0]);*/
 sub_fdderivs_shc_part1<<<GRID_DIM,BLOCK_DIM>>>(fxx,fxy,fxz,fyy,fyz,fzz);
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 /*compare_result_gpu(1,fxx,h_3D_SIZE[0]);
 compare_result_gpu(2,fxy,h_3D_SIZE[0]);
 compare_result_gpu(3,fxz,h_3D_SIZE[0]);
@@ -496,9 +496,9 @@ __global__ void computeRicci_ss_part1(double * dst)
 inline void computeRicci_ss(int &sst,double * src,double* dst,double * SoA, Meta* meta)
 {
 sub_fdderivs_shc(sst,src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,SoA);
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 computeRicci_ss_part1<<<GRID_DIM,BLOCK_DIM>>>(dst);
-cudaThreadSynchronize();
+cudaDeviceSynchronize();

 }
 __global__ void sub_lopsided_ss_part1(double * dst)
@@ -516,9 +516,9 @@ __global__ void sub_lopsided_ss_part1(double * dst)
 inline void sub_lopsided_ss(int& sst,double *src,double* dst,double *SoA)
 {
 sub_fderivs_shc(sst,src,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,SoA);
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 sub_lopsided_ss_part1<<<GRID_DIM,BLOCK_DIM>>>(dst);
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 }

 __global__ void sub_kodis_sh_part1(double *f,double *fh,double *f_rhs)
@@ -590,11 +590,11 @@ inline void sub_kodis_ss(int &sst,double *f,double *fh,double *f_rhs,double *SoA
 }
 //compare_result_gpu(10,f,h_3D_SIZE[0]);
 sub_symmetry_bd_ss(3,f,fh,SoA1);
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 //compare_result_gpu(0,fh,h_3D_SIZE[3]);

 sub_kodis_sh_part1<<<GRID_DIM,BLOCK_DIM>>>(f,fh,f_rhs);
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 //compare_result_gpu(1,f_rhs,h_3D_SIZE[0]);
 }
@@ -2287,13 +2287,13 @@ int gpu_rhs_ss(RHS_SS_PARA)


 #ifdef TIMING1
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 gettimeofday(&tv2, NULL);
 cout<<"TIME USED"<<TimeBetween(tv1, tv2)<<endl;
 #endif
 //cout<<"GPU meta data ready.\n";

-cudaThreadSynchronize();
+cudaDeviceSynchronize();


 //-------------get device info-------------------------------------
@@ -2306,7 +2306,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 //sub_enforce_ga(matrix_size);
 //4.1-----compute rhs---------
 compute_rhs_ss_part1<<<GRID_DIM,BLOCK_DIM>>>();
-cudaThreadSynchronize();
+cudaDeviceSynchronize();

 sub_fderivs_shc(sst,Mh_ betax,Mh_ fh,Mh_ betaxx,Mh_ betaxy,Mh_ betaxz,ass);
 sub_fderivs_shc(sst,Mh_ betay,Mh_ fh,Mh_ betayx,Mh_ betayy,Mh_ betayz,sas);
@@ -2322,7 +2322,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 sub_fderivs_shc(sst,Mh_ gyz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz, saa);

 compute_rhs_ss_part2<<<GRID_DIM,BLOCK_DIM>>>();
-cudaThreadSynchronize();
+cudaDeviceSynchronize();

 sub_fdderivs_shc(sst,Mh_ betax,Mh_ fh,Mh_ gxxx,Mh_ gxyx,Mh_ gxzx,Mh_ gyyx,Mh_ gyzx,Mh_ gzzx,ass);
 sub_fdderivs_shc(sst,Mh_ betay,Mh_ fh,Mh_ gxxy,Mh_ gxyy,Mh_ gxzy,Mh_ gyyy,Mh_ gyzy,Mh_ gzzy,sas);
@@ -2332,7 +2332,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 sub_fderivs_shc( sst,Mh_ Gamz, Mh_ fh,Mh_ Gamzx, Mh_ Gamzy, Mh_ Gamzz,ssa);

 compute_rhs_ss_part3<<<GRID_DIM,BLOCK_DIM>>>();
-cudaThreadSynchronize();
+cudaDeviceSynchronize();

 computeRicci_ss(sst,Mh_ dxx,Mh_ Rxx,sss, meta);
 computeRicci_ss(sst,Mh_ dyy,Mh_ Ryy,sss, meta);
@@ -2340,25 +2340,25 @@ int gpu_rhs_ss(RHS_SS_PARA)
 computeRicci_ss(sst,Mh_ gxy,Mh_ Rxy,aas, meta);
 computeRicci_ss(sst,Mh_ gxz,Mh_ Rxz,asa, meta);
 computeRicci_ss(sst,Mh_ gyz,Mh_ Ryz,saa, meta);
-cudaThreadSynchronize();
+cudaDeviceSynchronize();

 compute_rhs_ss_part4<<<GRID_DIM,BLOCK_DIM>>>();
-cudaThreadSynchronize();
+cudaDeviceSynchronize();

 sub_fdderivs_shc(sst,Mh_ chi,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);

-//cudaThreadSynchronize();
+//cudaDeviceSynchronize();
 //compare_result_gpu(0,Mh_ chi,h_3D_SIZE[0]);
 //compare_result_gpu(1,Mh_ chi,h_3D_SIZE[0]);
 //compare_result_gpu(2,Mh_ fyz,h_3D_SIZE[0]);

 compute_rhs_ss_part5<<<GRID_DIM,BLOCK_DIM>>>();
-cudaThreadSynchronize();
+cudaDeviceSynchronize();

 sub_fdderivs_shc(sst,Mh_ Lap,Mh_ fh,Mh_ fxx,Mh_ fxy,Mh_ fxz,Mh_ fyy,Mh_ fyz,Mh_ fzz,sss);

 compute_rhs_ss_part6<<<GRID_DIM,BLOCK_DIM>>>();
-cudaThreadSynchronize();
+cudaDeviceSynchronize();

 #if (GAUGE == 2 || GAUGE == 3 || GAUGE == 4 || GAUGE == 5)
 sub_fderivs_shc(sst,Mh_ chi,Mh_ fh, Mh_ dtSfx_rhs, Mh_ dtSfy_rhs, Mh_ dtSfz_rhs,sss);
@@ -2423,7 +2423,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 }
 if(co == 0){
 compute_rhs_ss_part7<<<GRID_DIM,BLOCK_DIM>>>();
-cudaThreadSynchronize();
+cudaDeviceSynchronize();

 sub_fderivs_shc(sst,Mh_ Axx,Mh_ fh,Mh_ gxxx,Mh_ gxxy,Mh_ gxxz,sss);
 sub_fderivs_shc(sst,Mh_ Axy,Mh_ fh,Mh_ gxyx,Mh_ gxyy,Mh_ gxyz,aas);
@@ -2432,7 +2432,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 sub_fderivs_shc(sst,Mh_ Ayz,Mh_ fh,Mh_ gyzx,Mh_ gyzy,Mh_ gyzz,saa);
 sub_fderivs_shc(sst,Mh_ Azz,Mh_ fh,Mh_ gzzx,Mh_ gzzy,Mh_ gzzz,sss);
 compute_rhs_ss_part8<<<GRID_DIM,BLOCK_DIM>>>();
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 }

 #if (ABV == 1)
@@ -2512,7 +2512,7 @@ int gpu_rhs_ss(RHS_SS_PARA)
 //test kodis
 //sub_kodis_sh(sst,Msh_ drhodx,Mh_ fh2,Msh_ drhody,sss);
 #ifdef TIMING
-cudaThreadSynchronize();
+cudaDeviceSynchronize();
 gettimeofday(&tv2, NULL);
 cout<<"MPI rank is: "<<mpi_rank<<" GPU TIME is"<<TimeBetween(tv1, tv2)<<" (s)."<<endl;
 #endif
@@ -1676,8 +1676,11 @@ void bssn_class::Step_GPU(int lev, int YN)
 #endif // PSTR == ?

 //--------------------------With Shell--------------------------
+// Note: SHStep() implementation is in bssn_gpu_class.C

 #ifdef WithShell
+#if 0
+// This SHStep() implementation has been moved to bssn_gpu_class.C to avoid duplicate definition
 void bssn_class::SHStep()
 {
 int lev = 0;
@@ -1938,5 +1941,5 @@ void bssn_class::SHStep()
 sPp = sPp->next;
 }
 }
-d
+#endif // #if 0
 #endif // withshell
@@ -1,10 +1,10 @@

 ## filein = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
+filein = -I/usr/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/14/ -I/usr/include/c++/14/
-filein = -I/usr/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/ -I/usr/lib/cuda/include
+##filein = -I/usr/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/ -I/usr/lib/cuda/include

 ## LDLIBS = -L/usr/lib/x86_64-linux-gnu -lmpich -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran
-LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -L/usr/lib/cuda/lib64 -lcudart -lmpi -lgfortran
+LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/14 -lgfortran -lmpi -lgfortran -lcudart -lcuda
+##LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -L/usr/lib/cuda/lib64 -lcudart -lmpi -lgfortran

 CXXAPPFLAGS = -O3 -Wno-deprecated -Dfortran3 -Dnewc
 #f90appflags = -O3 -fpp
@@ -18,4 +18,5 @@ CLINKER = mpic++
 Cu = nvcc
 CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
 #CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
-CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc
+# RTX 4050 uses Ada Lovelace architecture (compute capability 8.9)
+CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch=sm_89 -Dfortran3 -Dnewc
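-arch=sm_89 generates code only for compute capability 8.9 (Ada Lovelace, e.g. the RTX 4050 named in the comment above), so the resulting binaries will not run on older GPUs unless further -gencode targets are added. A small standalone check of what the installed device reports (illustrative, not part of this Makefile):

```cpp
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    cudaDeviceProp prop;
    if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess) {
        fprintf(stderr, "no CUDA device visible\n");
        return 1;
    }
    // A binary built with -arch=sm_89 needs the reported capability to be 8.9 or newer.
    printf("%s: compute capability %d.%d\n", prop.name, prop.major, prop.minor);
    return 0;
}
```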