diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C index c98bfd8..fa14a2f 100644 --- a/AMSS_NCKU_source/bssn_class.C +++ b/AMSS_NCKU_source/bssn_class.C @@ -89,6 +89,16 @@ bool bssn_cuda_use_resident_sync(int lev) #endif } +bool bssn_constraint_recompute_from_state(int lev, bool level0_cache_valid) +{ +#if USE_CUDA_BSSN + return lev > 0 || !level0_cache_valid; +#else + (void)level0_cache_valid; + return lev > 0; +#endif +} + bool bssn_cuda_sync_subset(Block *cg, int subset_count, const int *state_indices, @@ -361,6 +371,7 @@ bssn_class::bssn_class(double Couranti, double StartTimei, double TotalTimei, int a_levi, int maxli, int decni, double maxrexi, double drexi) : Courant(Couranti), StartTime(StartTimei), TotalTime(TotalTimei), DumpTime(DumpTimei), d2DumpTime(d2DumpTimei), CheckTime(CheckTimei), AnasTime(AnasTimei), + cuda_level0_constraint_cache_valid(false), Symmetry(Symmetryi), checkrun(checkruni), numepss(numepssi), numepsb(numepsbi), numepsh(numepshi), #ifdef With_AHF xc(0), yc(0), zc(0), xr(0), yr(0), zr(0), trigger(0), dTT(0), dumpid(0), @@ -2448,6 +2459,8 @@ void bssn_class::Evolve(int Steps) for (int ncount = 1; ncount < Steps + 1; ncount++) { + cuda_level0_constraint_cache_valid = false; + // special for large mass ratio consideration // if(fabs(Porg0[0][0]-Porg0[1][0])+fabs(Porg0[0][1]-Porg0[1][1])+fabs(Porg0[0][2]-Porg0[1][2])<1e-6) // { GH->levels=GH->movls; } @@ -3337,6 +3350,8 @@ void bssn_class::Step(int lev, int YN) #else const bool use_cuda_resident_sync = false; #endif + const bool need_cuda_level0_constraint_cache = + (lev == 0) && (LastConsOut + dT * pow(0.5, Mymax(0, trfls)) >= AnasTime); // new code 2013-2-15, zjcao #if (MAPBH == 1) @@ -3457,6 +3472,22 @@ void bssn_class::Step(int lev, int YN) << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; ERROR = 1; } + if (need_cuda_level0_constraint_cache) + { + double *constraint_out[7] = { + cg->fgfs[Cons_Ham->sgfn], cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn], + cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn]}; + if (bssn_cuda_download_constraint_outputs(cg->shape, constraint_out)) + { + cout << "CUDA predictor constraint download failed in domain: (" + << cg->bbox[0] << ":" << cg->bbox[3] << "," + << cg->bbox[1] << ":" << cg->bbox[4] << "," + << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl; + ERROR = 1; + } + else + cuda_level0_constraint_cache_valid = true; + } used_gpu_substep = true; used_gpu_resident_state = (keep_resident_state != 0); } @@ -7734,7 +7765,7 @@ void bssn_class::Constraint_Out() for (int lev = 0; lev < GH->levels; lev++) { // make sure the data consistent for higher levels - if (lev > 0) // if the constrait quantities can be reused from the step rhs calculation + if (bssn_constraint_recompute_from_state(lev, cuda_level0_constraint_cache_valid)) { double TRK4 = PhysTime; double ndeps = numepsb; @@ -8237,7 +8268,7 @@ void bssn_class::Interp_Constraint(bool infg) for (int lev = 0; lev < GH->levels; lev++) { // make sure the data consistent for higher levels - if (lev > 0) // if the constrait quantities can be reused from the step rhs calculation + if (bssn_constraint_recompute_from_state(lev, cuda_level0_constraint_cache_valid)) { double TRK4 = PhysTime; double ndeps = numepsb; diff --git a/AMSS_NCKU_source/bssn_class.h b/AMSS_NCKU_source/bssn_class.h index 5a8eb2a..2609238 100644 --- a/AMSS_NCKU_source/bssn_class.h +++ b/AMSS_NCKU_source/bssn_class.h @@ -45,10 +45,11 @@ public: int checkrun; char checkfilename[50]; int Steps; - double StartTime, TotalTime; - double AnasTime, DumpTime, d2DumpTime, CheckTime; - double LastAnas, LastConsOut; - double Courant; + double StartTime, TotalTime; + double AnasTime, DumpTime, d2DumpTime, CheckTime; + double LastAnas, LastConsOut; + bool cuda_level0_constraint_cache_valid; + double Courant; double numepss, numepsb, numepsh; int Symmetry; int maxl, decn; diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu index 310f014..ad31c7f 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.cu +++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu @@ -4997,6 +4997,17 @@ static void download_state_outputs(double **state_host_out, size_t all) } } +static void download_constraint_outputs(double **constraint_host_out, size_t all) +{ + const size_t bytes = all * sizeof(double); + CUDA_CHECK(cudaMemcpy(g_buf.h_stage, g_buf.slot[S_ham_Res], + (size_t)D2H_CONSTRAINT_SLOT_COUNT * bytes, + cudaMemcpyDeviceToHost)); + for (int i = 0; i < D2H_CONSTRAINT_SLOT_COUNT; ++i) { + std::memcpy(constraint_host_out[i], g_buf.h_stage + (size_t)i * all, bytes); + } +} + __global__ void kern_pack_state_region_batch(const double * __restrict__ src_mem, double * __restrict__ dst, int nx, int ny, @@ -5818,6 +5829,17 @@ int bssn_cuda_download_resident_state(void *block_tag, return 0; } +extern "C" +int bssn_cuda_download_constraint_outputs(int *ex, + double **constraint_host_out) +{ + init_gpu_dispatch(); + CUDA_CHECK(cudaSetDevice(g_dispatch.my_device)); + const size_t all = (size_t)ex[0] * ex[1] * ex[2]; + download_constraint_outputs(constraint_host_out, all); + return 0; +} + extern "C" int bssn_cuda_pack_state_region_to_host_buffer(void *block_tag, int state_index, diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.h b/AMSS_NCKU_source/bssn_rhs_cuda.h index ce8a334..55b6380 100644 --- a/AMSS_NCKU_source/bssn_rhs_cuda.h +++ b/AMSS_NCKU_source/bssn_rhs_cuda.h @@ -73,6 +73,9 @@ int bssn_cuda_download_resident_state(void *block_tag, int *ex, double **state_host_out); +int bssn_cuda_download_constraint_outputs(int *ex, + double **constraint_host_out); + int bssn_cuda_pack_state_region_to_host_buffer(void *block_tag, int state_index, double *host_buffer,