diff --git a/AMSS_NCKU_source/bssn_cuda_ops.cu b/AMSS_NCKU_source/bssn_cuda_ops.cu index 3654959..7101227 100644 --- a/AMSS_NCKU_source/bssn_cuda_ops.cu +++ b/AMSS_NCKU_source/bssn_cuda_ops.cu @@ -461,20 +461,21 @@ int bssn_cuda_enforce_ga(int *ex, if (ok) { - cudaError_t err = cudaMemcpy(dxx, cache.dxx.ptr, bytes, cudaMemcpyDeviceToHost); - if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dxx", err); - ok = err == cudaSuccess; - if (ok) { err = cudaMemcpy(gxy, cache.gxy.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxy", err); ok = err == cudaSuccess; } - if (ok) { err = cudaMemcpy(gxz, cache.gxz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gxz", err); ok = err == cudaSuccess; } - if (ok) { err = cudaMemcpy(dyy, cache.dyy.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dyy", err); ok = err == cudaSuccess; } - if (ok) { err = cudaMemcpy(gyz, cache.gyz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) gyz", err); ok = err == cudaSuccess; } - if (ok) { err = cudaMemcpy(dzz, cache.dzz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) dzz", err); ok = err == cudaSuccess; } - if (ok) { err = cudaMemcpy(Axx, cache.Axx.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axx", err); ok = err == cudaSuccess; } - if (ok) { err = cudaMemcpy(Axy, cache.Axy.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axy", err); ok = err == cudaSuccess; } - if (ok) { err = cudaMemcpy(Axz, cache.Axz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Axz", err); ok = err == cudaSuccess; } - if (ok) { err = cudaMemcpy(Ayy, cache.Ayy.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayy", err); ok = err == cudaSuccess; } - if (ok) { err = cudaMemcpy(Ayz, cache.Ayz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Ayz", err); ok = err == cudaSuccess; } - if (ok) { err = cudaMemcpy(Azz, cache.Azz.ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) Azz", err); ok = err == cudaSuccess; } + // The next GPU RHS stage consumes these fields immediately. + // Keep them device-resident and expose the mapping so gpu_rhs() can + // reuse them via D2D copies instead of forcing an intermediate D2H round-trip. + bssn_gpu_register_device_buffer(dxx, cache.dxx.ptr); + bssn_gpu_register_device_buffer(gxy, cache.gxy.ptr); + bssn_gpu_register_device_buffer(gxz, cache.gxz.ptr); + bssn_gpu_register_device_buffer(dyy, cache.dyy.ptr); + bssn_gpu_register_device_buffer(gyz, cache.gyz.ptr); + bssn_gpu_register_device_buffer(dzz, cache.dzz.ptr); + bssn_gpu_register_device_buffer(Axx, cache.Axx.ptr); + bssn_gpu_register_device_buffer(Axy, cache.Axy.ptr); + bssn_gpu_register_device_buffer(Axz, cache.Axz.ptr); + bssn_gpu_register_device_buffer(Ayy, cache.Ayy.ptr); + bssn_gpu_register_device_buffer(Ayz, cache.Ayz.ptr); + bssn_gpu_register_device_buffer(Azz, cache.Azz.ptr); } return ok ? 0 : 1; @@ -549,15 +550,31 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, (rk_stage == 0) || cache.host_state0 != state0 || cache.nx != nx || cache.ny != ny || cache.nz != nz; const bool refresh_rhs = (rk_stage == 0) || !cache.rhs_resident || cache.host_rhs != rhs_accum; + double *stage_ptr = nullptr; + const double *mapped_stage_ptr = need_stage_input ? bssn_gpu_find_device_buffer(stage_data) : nullptr; ok = ok && (!refresh_state0 || copy_to_device_preferring_device(cache.state0, state0, bytes)) && (!need_boundary_input || copy_to_device(cache.boundary, boundary_src, bytes)) && - (!need_stage_input || copy_to_device_preferring_device(cache.stage, stage_data, bytes)) && (!refresh_rhs || copy_to_device_preferring_device(cache.rhs, rhs_accum, bytes)); - if (ok && !need_stage_input) + if (ok && need_stage_input) + { + if (mapped_stage_ptr) + { + stage_ptr = const_cast(mapped_stage_ptr); + } + else + { + ok = copy_to_device_preferring_device(cache.stage, stage_data, bytes); + stage_ptr = cache.stage.ptr; + } + } + else if (ok) + { ok = ensure_capacity(cache.stage, bytes); + stage_ptr = cache.stage.ptr; + } if (!ok) return 1; @@ -599,8 +616,8 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, double *d_X = cache.X.ptr, *d_Y = cache.Y.ptr, *d_Z = cache.Z.ptr; double *d_state0 = cache.state0.ptr, *d_boundary = cache.boundary.ptr; - double *d_stage = cache.stage.ptr, *d_rhs = cache.rhs.ptr; - double *bam_target = (rk_stage == 0) ? d_rhs : d_stage; + double *d_rhs = cache.rhs.ptr; + double *bam_target = (rk_stage == 0) ? d_rhs : stage_ptr; const double *bam_source = (rk_stage == 0) ? d_state0 : d_boundary; void *args[] = {&nx, &ny, &nz, &d_X, &d_Y, &d_Z, &xmin, &ymin, &zmin, &xmax, &ymax, &zmax, @@ -615,14 +632,14 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, if (ok) { - double *d_state0 = cache.state0.ptr, *d_stage = cache.stage.ptr, *d_rhs = cache.rhs.ptr; + double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr, *d_rhs = cache.rhs.ptr; void *args[] = {&n, &dT, &d_state0, &d_stage, &d_rhs, &rk_stage}; ok = launch_kernel(grid, block, (const void *)rk4_kernel, args); } if (ok && lev > 0) { - double *d_state0 = cache.state0.ptr, *d_stage = cache.stage.ptr; + double *d_state0 = cache.state0.ptr, *d_stage = stage_ptr; void *args[] = {&nx, &ny, &nz, &has_xmin, &has_ymin, &has_zmin, &has_xmax, &has_ymax, &has_zmax, @@ -632,9 +649,9 @@ int bssn_cuda_rk4_boundary_var(int *ex, double dT, if (ok) { - bssn_gpu_register_device_buffer(stage_data, cache.stage.ptr); + bssn_gpu_register_device_buffer(stage_data, stage_ptr); - cudaError_t err = cudaMemcpy(stage_data, cache.stage.ptr, bytes, cudaMemcpyDeviceToHost); + cudaError_t err = cudaMemcpy(stage_data, stage_ptr, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) stage_data", err); ok = err == cudaSuccess; } @@ -651,18 +668,30 @@ int bssn_cuda_lowerbound(int *ex, double *chi, double tinny) dim3 block(256); dim3 grid(div_up(n, static_cast(block.x))); - bool ok = copy_to_device_preferring_device(d_chi, chi, bytes); + double *device_chi = nullptr; + const double *mapped = bssn_gpu_find_device_buffer(chi); + bool ok = true; + if (mapped) + { + device_chi = const_cast(mapped); + } + else + { + ok = copy_to_device_preferring_device(d_chi, chi, bytes); + device_chi = d_chi.ptr; + } if (ok) { - double *ptr = d_chi.ptr; + double *ptr = device_chi; void *args[] = {&n, &ptr, &tinny}; ok = launch_kernel(grid, block, (const void *)lowerbound_kernel, args); } if (ok) { - cudaError_t err = cudaMemcpy(chi, d_chi.ptr, bytes, cudaMemcpyDeviceToHost); + bssn_gpu_register_device_buffer(chi, device_chi); + cudaError_t err = cudaMemcpy(chi, device_chi, bytes, cudaMemcpyDeviceToHost); if (err != cudaSuccess) report_cuda_error("cudaMemcpy(D2H) chi", err); ok = err == cudaSuccess; } diff --git a/AMSS_NCKU_source/bssn_cuda_step.C b/AMSS_NCKU_source/bssn_cuda_step.C index 95b1879..6b25691 100644 --- a/AMSS_NCKU_source/bssn_cuda_step.C +++ b/AMSS_NCKU_source/bssn_cuda_step.C @@ -110,21 +110,6 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) Block *cg = BP->data; if (myrank == cg->rank) { -#if (AGM == 0) - if (bssn_cuda_enforce_ga(cg->shape, - cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], - cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn], - cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], - cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn])) - { - cerr << "GPU enforce_ga failure: lev=" << lev - << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << "," - << cg->bbox[1] << ":" << cg->bbox[4] << "," - << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; - ERROR = 1; - } -#endif - if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_FIRST_TIME)) ERROR = 1; @@ -209,37 +194,6 @@ void bssn_class::Step_MainPath_GPU(int lev, int YN) Block *cg = BP->data; if (myrank == cg->rank) { -#if (AGM == 0) - if (bssn_cuda_enforce_ga(cg->shape, - cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], - cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], - cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], - cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn])) - { - cerr << "GPU enforce_ga failure: lev=" << lev - << " rk_stage=" << iter_count - << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << "," - << cg->bbox[1] << ":" << cg->bbox[4] << "," - << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; - ERROR = 1; - } -#elif (AGM == 1) - if (iter_count == 3 && - bssn_cuda_enforce_ga(cg->shape, - cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], - cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn], - cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], - cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn])) - { - cerr << "GPU enforce_ga failure: lev=" << lev - << " rk_stage=" << iter_count - << " bbox=(" << cg->bbox[0] << ":" << cg->bbox[3] << "," - << cg->bbox[1] << ":" << cg->bbox[4] << "," - << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; - ERROR = 1; - } -#endif - if (gpu_rhs(CALLED_BY_STEP, myrank, RHS_PARA_CALLED_THEN)) ERROR = 1; diff --git a/AMSS_NCKU_source/bssn_gpu.cu b/AMSS_NCKU_source/bssn_gpu.cu index 72aecdf..e522d48 100644 --- a/AMSS_NCKU_source/bssn_gpu.cu +++ b/AMSS_NCKU_source/bssn_gpu.cu @@ -720,16 +720,14 @@ __global__ void enforce_ga(double * trA){ } } -inline void sub_enforce_ga(int matrix_size){ - double * trA = M_ chin1; - enforce_ga<<>>(trA); - cudaMemset(trA,0,matrix_size * sizeof(double)); - cudaThreadSynchronize(); - - //cudaMemset(Mh_ gupxx,0,matrix_size * sizeof(double)); - //trA gxx,gyy,gzz gupxx,gupxy,gupxz,gupyy,gupyz,gupzz - -} +inline void sub_enforce_ga(double *trA, int matrix_size){ + enforce_ga<<>>(trA); + cudaMemset(trA,0,matrix_size * sizeof(double)); + + //cudaMemset(Mh_ gupxx,0,matrix_size * sizeof(double)); + //trA gxx,gyy,gzz gupxx,gupxy,gupxz,gupyy,gupyz,gupzz + +} __device__ volatile unsigned int global_count = 0; __global__ void test_init_matrix(){ int tid = blockIdx.x*blockDim.x+threadIdx.x; @@ -2609,7 +2607,7 @@ void destroy_meta(Meta *meta) }*/ -int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, double *Z, +int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, double *Z, double *chi, double * trK , double *dxx , double * gxy ,double *gxz ,double * dyy,double *gyz,double *dzz, double *Axx , double *Axy , double * Axz , double * Ayy , double * Ayz , double * Azz, @@ -2652,6 +2650,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, if (!prepare_gpu_rhs_cache(cache, device, ex)) return 1; Meta * meta = &cache.meta; + const int effective_co = (calledby == CALLED_BY_STEP) ? 1 : co; /* //#1--------------------init_gpu_meta(meta,matrix_size)--------------------------- @@ -3067,74 +3066,6 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, {Mh_ Sx, static_cast(matrix_size)}, {Mh_ Sy, static_cast(matrix_size)}, {Mh_ Sz, static_cast(matrix_size)}, - {Mh_ gxx, static_cast(matrix_size)}, - {Mh_ gyy, static_cast(matrix_size)}, - {Mh_ gzz, static_cast(matrix_size)}, - {Mh_ chix, static_cast(matrix_size)}, - {Mh_ chiy, static_cast(matrix_size)}, - {Mh_ chiz, static_cast(matrix_size)}, - {Mh_ gxxx, static_cast(matrix_size)}, - {Mh_ gxyx, static_cast(matrix_size)}, - {Mh_ gxzx, static_cast(matrix_size)}, - {Mh_ gyyx, static_cast(matrix_size)}, - {Mh_ gyzx, static_cast(matrix_size)}, - {Mh_ gzzx, static_cast(matrix_size)}, - {Mh_ gxxy, static_cast(matrix_size)}, - {Mh_ gxyy, static_cast(matrix_size)}, - {Mh_ gxzy, static_cast(matrix_size)}, - {Mh_ gyyy, static_cast(matrix_size)}, - {Mh_ gyzy, static_cast(matrix_size)}, - {Mh_ gzzy, static_cast(matrix_size)}, - {Mh_ gxxz, static_cast(matrix_size)}, - {Mh_ gxyz, static_cast(matrix_size)}, - {Mh_ gxzz, static_cast(matrix_size)}, - {Mh_ gyyz, static_cast(matrix_size)}, - {Mh_ gyzz, static_cast(matrix_size)}, - {Mh_ gzzz, static_cast(matrix_size)}, - {Mh_ Lapx, static_cast(matrix_size)}, - {Mh_ Lapy, static_cast(matrix_size)}, - {Mh_ Lapz, static_cast(matrix_size)}, - {Mh_ betaxx, static_cast(matrix_size)}, - {Mh_ betaxy, static_cast(matrix_size)}, - {Mh_ betaxz, static_cast(matrix_size)}, - {Mh_ betayy, static_cast(matrix_size)}, - {Mh_ betayz, static_cast(matrix_size)}, - {Mh_ betazz, static_cast(matrix_size)}, - {Mh_ betayx, static_cast(matrix_size)}, - {Mh_ betazy, static_cast(matrix_size)}, - {Mh_ betazx, static_cast(matrix_size)}, - {Mh_ Kx, static_cast(matrix_size)}, - {Mh_ Ky, static_cast(matrix_size)}, - {Mh_ Kz, static_cast(matrix_size)}, - {Mh_ Gamxx, static_cast(matrix_size)}, - {Mh_ Gamxy, static_cast(matrix_size)}, - {Mh_ Gamxz, static_cast(matrix_size)}, - {Mh_ Gamyy, static_cast(matrix_size)}, - {Mh_ Gamyz, static_cast(matrix_size)}, - {Mh_ Gamzz, static_cast(matrix_size)}, - {Mh_ Gamyx, static_cast(matrix_size)}, - {Mh_ Gamzy, static_cast(matrix_size)}, - {Mh_ Gamzx, static_cast(matrix_size)}, - {Mh_ div_beta, static_cast(matrix_size)}, - {Mh_ S, static_cast(matrix_size)}, - {Mh_ f, static_cast(matrix_size)}, - {Mh_ fxx, static_cast(matrix_size)}, - {Mh_ fxy, static_cast(matrix_size)}, - {Mh_ fxz, static_cast(matrix_size)}, - {Mh_ fyy, static_cast(matrix_size)}, - {Mh_ fyz, static_cast(matrix_size)}, - {Mh_ fzz, static_cast(matrix_size)}, - {Mh_ gupxx, static_cast(matrix_size)}, - {Mh_ gupxy, static_cast(matrix_size)}, - {Mh_ gupxz, static_cast(matrix_size)}, - {Mh_ gupyy, static_cast(matrix_size)}, - {Mh_ gupyz, static_cast(matrix_size)}, - {Mh_ gupzz, static_cast(matrix_size)}, - {Mh_ Gamxa, static_cast(matrix_size)}, - {Mh_ Gamya, static_cast(matrix_size)}, - {Mh_ Gamza, static_cast(matrix_size)}, - {Mh_ alpn1, static_cast(matrix_size)}, - {Mh_ chin1, static_cast(matrix_size)}, }; if (!zero_buffers(zero_specs, sizeof(zero_specs) / sizeof(zero_specs[0]))) return 1; @@ -3244,7 +3175,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, cudaMemcpyToSymbol(T_c,&T, sizeof(double)); cudaMemcpyToSymbol(Symmetry_c,&Symmetry, sizeof(int)); cudaMemcpyToSymbol(Lev_c,&Lev, sizeof(int)); - cudaMemcpyToSymbol(co_c,&co, sizeof(int)); + cudaMemcpyToSymbol(co_c,&effective_co, sizeof(int)); cudaMemcpyToSymbol(eps_c,&eps, sizeof(double)); double dXh = X[1] - X[0]; @@ -3341,9 +3272,9 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, /* int deviceCount; cudaGetDeviceCount(&deviceCount); cout<<"myrank is: "<>>(); @@ -3456,7 +3387,7 @@ int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,double *X, double *Y, } - if(co == 0){ + if(effective_co == 0){ compute_rhs_bssn_part7<<>>(); sub_fderivs(Mh_ Axx,Mh_ fh,Mh_ gxxx,Mh_ gxxy,Mh_ gxxz,sss);