Compare commits

..

41 Commits

Author SHA1 Message Date
a0dab90bcb Switch to NVIDIA HPC Toolchain 2026-04-29 08:31:49 +08:00
c689cc8dc9 [WIP] Add CUDA support for Z4C
Rewritten done by Codex.
This still has errors, do not pick this one now.
2026-04-27 11:58:43 +08:00
60fee8f1c1 Fix Z4C C++ gauge damping ordering 2026-04-26 15:38:13 +08:00
843b116954 Add C++ Z4C RHS path and port some BSSN optimizations 2026-04-25 10:39:01 +08:00
c768e1220b Also disable cached sync for Z4C 2026-04-25 10:25:54 +08:00
02f149e2e3 Disable cached sync for BSSN-EScalar 2026-04-25 10:17:47 +08:00
422e8ec4dc Fallback BSSN-EScalar restrict/prolong path 2026-04-25 10:10:34 +08:00
c4909b9843 更新精度检查脚本加入图像比对检查
(cherry picked from commit ac82ebd889)
2026-04-25 09:40:12 +08:00
f521a97563 Fix ABE CPU version build error 2026-04-25 09:39:49 +08:00
53c55451b3 Update makefile and scripts for CUDA BSSN configuration and build commands 2026-04-25 09:19:50 +08:00
768345954f Add optional BSSN kernel profiling switches
(cherry picked from commit 9c31384b2f)
2026-04-25 08:39:43 +08:00
9a6df6438b Remove dead chi derivative setup in BSSN RHS
(cherry picked from commit e4e741caa1)
2026-04-25 08:38:01 +08:00
8e9463aa90 Localize chi Ricci intermediates in RHS
(cherry picked from commit 65e0f95f40)
2026-04-25 08:37:41 +08:00
7c6f15002e Elide dead stores in BSSN RHS hot path
(cherry picked from commit f9fbf97e64)
2026-04-25 08:37:40 +08:00
6410c62e3e Add fine-grained step timing and trim BH RHS overhead
(cherry picked from commit 968522995b)
2026-04-25 08:37:19 +08:00
11977eb82f Merge wave and mass extraction interpolation
(cherry picked from commit f3988ac8ca)
2026-04-25 08:25:34 +08:00
cce8a44fc4 Cache wave extraction angular kernels
(cherry picked from commit e4c25eb21f)
2026-04-25 08:24:36 +08:00
c589097618 Reuse mass integrand across detector radii
(cherry picked from commit 4b10519876)
2026-04-25 08:24:11 +08:00
b713e5a9be Batch constraint norm reductions
(cherry picked from commit 3a58273501)
2026-04-25 08:22:00 +08:00
0396701572 Optimize constraint refresh after regrid
(cherry picked from commit 5c65cea2f0)
2026-04-25 08:18:51 +08:00
bb20c9a876 fix ADM Constrant Violation Analysis 2026-04-15 19:19:16 +08:00
8fe60ea703 Add zero matter handling and interpolation for resident state in CUDA BSSN 2026-04-15 00:25:53 +08:00
9ab7e7c7f9 Fuse phases 5 and 6 for Gamma_rhs computation and optimize phases 8 and 9 for efficiency 2026-04-14 23:23:04 +08:00
f9119e8a2a Add resident-GA mode switch and simplify sync logic 2026-04-14 21:09:27 +08:00
726d743376 Fuse Ricci assembly and optimize trK/Aij gauge kernels 2026-04-14 19:20:12 +08:00
af344bf1e5 Add Phase-10 Ricci kernels and batch launch flow 2026-04-14 19:00:22 +08:00
7191fc0b96 Move resident sync comm buffers into StepAllocation pool 2026-04-13 21:04:44 +08:00
b3ec244cf9 Add batched first/second derivative kernels for CUDA RHS 2026-04-13 20:51:08 +08:00
e952ee8e91 Batch GA/BH subset sync with indexed GPU pack/unpack buffers 2026-04-13 20:40:09 +08:00
c5d1268dd1 Batch patch-boundary copy and gate CPU BC in GPU substeps 2026-04-13 11:52:17 +08:00
4bdfc90f22 Pass pointer tables as kernel args and skip redundant symbol uploads 2026-04-13 11:19:00 +08:00
c49a4e00c9 Batch symbd_pack/lopsided/kodiss over all state variables 2026-04-13 11:02:55 +08:00
1b3c0b80d2 Refactor CUDA step buffers to remove loop-time allocations 2026-04-13 10:33:03 +08:00
636e35bfd8 Add direct CUDA resident-state sync path and profiling hooks 2026-04-13 00:57:05 +08:00
7f2a391dd2 Cache matter fields in StepContext across RK4 substeps 2026-04-12 22:19:45 +08:00
4fa12a2009 Integrate CUDA support into RK4 substep execution 2026-04-12 22:11:44 +08:00
86a683de26 Replace legacy ABEGPU stack with ABE_CUDA backend 2026-04-12 21:19:14 +08:00
aaf7bf0a26 Merge remote-tracking branch 'origin/main' 2026-04-12 20:55:42 +08:00
9c44d1c885 fix(bssn_rhs) 2026-03-03 16:00:45 +08:00
4b9de28feb 将 Restrict/Prolong 链路里的 coarse-level Sync_cached 改为可选(默认跳过)
OutBdLow2Hi_cached 读的是 coarse owned 区域(非 coarse ghost/buffer)
回退旧行为:编译时定义 RP_SYNC_COARSE_AFTER_RESTRICT=1
2026-03-03 14:25:27 +08:00
4eb5dc4ddb 删除重复的一次 chi 一阶导计算 2026-03-03 14:23:56 +08:00
30 changed files with 24813 additions and 22783 deletions

View File

@@ -16,9 +16,9 @@ import numpy
File_directory = "GW150914" ## output file directory
Output_directory = "binary_output" ## binary data file directory
## The file directory name should not be too long
MPI_processes = 64 ## number of mpi processes used in the simulation
MPI_processes = 8 ## number of mpi processes used in the simulation
GPU_Calculation = "no" ## Use GPU or not
GPU_Calculation = "yes" ## Use GPU or not
## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
CPU_Part = 1.0
GPU_Part = 0.0

View File

@@ -174,14 +174,11 @@ import generate_macrodef
generate_macrodef.generate_macrodef_h()
print( " AMSS-NCKU macro file macrodef.h has been generated. " )
generate_macrodef.generate_macrodef_fh()
print( " AMSS-NCKU macro file macrodef.fh has been generated. " )
generate_macrodef.generate_build_config()
print( " AMSS-NCKU build config AMSS_NCKU_build.mk has been generated. " )
##################################################################
generate_macrodef.generate_macrodef_fh()
print( " AMSS-NCKU macro file macrodef.fh has been generated. " )
##################################################################
# Compile the AMSS-NCKU program according to user requirements
@@ -220,13 +217,11 @@ shutil.copytree(AMSS_NCKU_source_path, AMSS_NCKU_source_copy)
# Copy the generated macro files into the AMSS_NCKU source folder
macrodef_h_path = os.path.join(File_directory, "macrodef.h")
macrodef_fh_path = os.path.join(File_directory, "macrodef.fh")
build_config_path = os.path.join(File_directory, "AMSS_NCKU_build.mk")
shutil.copy2(macrodef_h_path, AMSS_NCKU_source_copy)
shutil.copy2(macrodef_fh_path, AMSS_NCKU_source_copy)
shutil.copy2(build_config_path, AMSS_NCKU_source_copy)
macrodef_h_path = os.path.join(File_directory, "macrodef.h")
macrodef_fh_path = os.path.join(File_directory, "macrodef.fh")
shutil.copy2(macrodef_h_path, AMSS_NCKU_source_copy)
shutil.copy2(macrodef_fh_path, AMSS_NCKU_source_copy)
# Notes on copying files:
# shutil.copy2 preserves file metadata such as modification time.
@@ -263,7 +258,7 @@ print()
if (input_data.GPU_Calculation == "no"):
ABE_file = os.path.join(AMSS_NCKU_source_copy, "ABE")
elif (input_data.GPU_Calculation == "yes"):
ABE_file = os.path.join(AMSS_NCKU_source_copy, "ABEGPU")
ABE_file = os.path.join(AMSS_NCKU_source_copy, "ABE_CUDA")
if not os.path.exists( ABE_file ):
print( )

File diff suppressed because it is too large Load Diff

View File

@@ -100,12 +100,14 @@ namespace Parallel
MyList<gridseg> **combined_dst;
int *send_lengths;
int *recv_lengths;
double **send_bufs;
double **recv_bufs;
int *send_buf_caps;
int *recv_buf_caps;
MPI_Request *reqs;
MPI_Status *stats;
double **send_bufs;
double **recv_bufs;
int *send_buf_caps;
int *recv_buf_caps;
unsigned char *send_buf_pinned;
unsigned char *recv_buf_pinned;
MPI_Request *reqs;
MPI_Status *stats;
int max_reqs;
bool lengths_valid;
int *tc_req_node;
@@ -116,10 +118,11 @@ namespace Parallel
void destroy();
};
void Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache);
void transfer_cached(MyList<gridseg> **src, MyList<gridseg> **dst,
MyList<var> *VarList1, MyList<var> *VarList2,
int Symmetry, SyncCache &cache);
void Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache);
void Sync_ensure_cache(MyList<Patch> *PatL, int Symmetry, SyncCache &cache);
void transfer_cached(MyList<gridseg> **src, MyList<gridseg> **dst,
MyList<var> *VarList1, MyList<var> *VarList2,
int Symmetry, SyncCache &cache);
struct AsyncSyncState {
int req_no;

View File

@@ -26,12 +26,20 @@ using namespace std;
#include "shellfunctions.h"
#include "cpbc.h"
#include "kodiss.h"
#include "parameters.h"
#ifdef With_AHF
#include "derivatives.h"
#include "myglobal.h"
#endif
#include "parameters.h"
#ifndef USE_CUDA_Z4C
#define USE_CUDA_Z4C 0
#endif
#if USE_CUDA_Z4C && (ABEtype == 2)
#include "z4c_rhs_cuda.h"
#endif
#ifdef With_AHF
#include "derivatives.h"
#include "myglobal.h"
#endif
//================================================================================================
@@ -167,12 +175,554 @@ Z4c_class::~Z4c_class()
#define MRBD 0 // 0: fix BD for meshrefinement level; 1: sommerfeld_bam for them; 2: sommerfeld_yo for them
#ifndef CPBC
// for sommerfeld boundary
void Z4c_class::Step(int lev, int YN)
{
double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
#ifndef CPBC
// for sommerfeld boundary
#if USE_CUDA_Z4C && (ABEtype == 2)
#ifdef WithShell
#error "USE_CUDA_Z4C resident path currently supports Cartesian non-shell Z4C only"
#endif
#if (MRBD == 2)
#error "USE_CUDA_Z4C resident path does not support MRBD == 2"
#endif
namespace {
static const int k_z4c_cuda_bh_state_indices[3] = {18, 19, 20};
bool fill_z4c_cuda_views(Block *cg, MyList<var> *vars,
double **host_views,
double *propspeeds = 0,
double *soa_flat = 0)
{
int idx = 0;
while (vars && idx < Z4C_CUDA_STATE_COUNT)
{
host_views[idx] = cg->fgfs[vars->data->sgfn];
if (propspeeds)
propspeeds[idx] = vars->data->propspeed;
if (soa_flat)
{
soa_flat[3 * idx + 0] = vars->data->SoA[0];
soa_flat[3 * idx + 1] = vars->data->SoA[1];
soa_flat[3 * idx + 2] = vars->data->SoA[2];
}
vars = vars->next;
++idx;
}
return idx == Z4C_CUDA_STATE_COUNT && vars == 0;
}
void z4c_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int myrank, bool release_ctx)
{
MyList<Patch> *Pp = PatL;
while (Pp)
{
MyList<Block> *BP = Pp->data->blb;
while (BP)
{
Block *cg = BP->data;
if (myrank == cg->rank && z4c_cuda_has_resident_state(cg))
{
double *state_out[Z4C_CUDA_STATE_COUNT];
if (!fill_z4c_cuda_views(cg, vars, state_out))
{
cout << "CUDA Z4C state list mismatch on resident state download" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
if (z4c_cuda_download_resident_state(cg, cg->shape, state_out))
{
cout << "CUDA Z4C resident state download failed" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
if (release_ctx)
z4c_cuda_release_step_ctx(cg);
}
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
}
bool z4c_cuda_patch_contains_point(Patch *patch, const double *point)
{
if (!patch)
return false;
for (int d = 0; d < dim; d++)
{
const double h = patch->getdX(d);
const double lo = patch->bbox[d] + patch->lli[d] * h;
const double hi = patch->bbox[dim + d] - patch->uui[d] * h;
if (point[d] < lo || point[d] > hi)
return false;
}
return true;
}
bool z4c_cuda_point_in_block(Patch *patch, Block *block,
const double *point, const double *DH)
{
if (!patch || !block)
return false;
for (int d = 0; d < dim; d++)
{
double llb;
double uub;
#ifdef Vertex
#ifdef Cell
#error Both Cell and Vertex are defined
#endif
llb = (feq(block->bbox[d], patch->bbox[d], DH[d] / 2))
? block->bbox[d] + patch->lli[d] * DH[d]
: block->bbox[d] + (ghost_width - 0.5) * DH[d];
uub = (feq(block->bbox[dim + d], patch->bbox[dim + d], DH[d] / 2))
? block->bbox[dim + d] - patch->uui[d] * DH[d]
: block->bbox[dim + d] - (ghost_width - 0.5) * DH[d];
#else
#ifdef Cell
llb = (feq(block->bbox[d], patch->bbox[d], DH[d] / 2))
? block->bbox[d] + patch->lli[d] * DH[d]
: block->bbox[d] + ghost_width * DH[d];
uub = (feq(block->bbox[dim + d], patch->bbox[dim + d], DH[d] / 2))
? block->bbox[dim + d] - patch->uui[d] * DH[d]
: block->bbox[dim + d] - ghost_width * DH[d];
#else
#error Not define Vertex nor Cell
#endif
#endif
if (point[d] - llb < -DH[d] / 2 || point[d] - uub > DH[d] / 2)
return false;
}
return true;
}
int z4c_cuda_interp_tile_start(const double *coords, int n, double x, double dx, int ordn)
{
if (!coords || n <= ordn)
return 0;
int cxi = int((x - coords[0]) / dx + 0.4) + 1;
int start = cxi - ordn / 2;
if (start < 0)
start = 0;
const int max_start = n - ordn;
if (start > max_start)
start = max_start;
return start;
}
bool z4c_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
int myrank,
const double *point,
var *forx, var *fory, var *forz,
int Symmetry,
double *shellf)
{
const int ordn = 2 * ghost_width;
int owner_rank = -1;
shellf[0] = shellf[1] = shellf[2] = 0.0;
MyList<Patch> *PL = PatL;
while (PL)
{
Patch *patch = PL->data;
if (!z4c_cuda_patch_contains_point(patch, point))
{
PL = PL->next;
continue;
}
double DH[dim];
for (int d = 0; d < dim; d++)
DH[d] = patch->getdX(d);
MyList<Block> *BP = patch->blb;
while (BP)
{
Block *block = BP->data;
if (z4c_cuda_point_in_block(patch, block, point, DH))
{
owner_rank = block->rank;
if (myrank == owner_rank)
{
int interp_ordn = ordn;
int interp_sym = Symmetry;
double x = point[0];
double y = point[1];
double z = point[2];
if (z4c_cuda_has_resident_state(block) &&
block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn)
{
const int sx = ordn;
const int sy = ordn;
const int sz = ordn;
const int region_all = sx * sy * sz;
const int i0 = z4c_cuda_interp_tile_start(block->X[0], block->shape[0], x, DH[0], ordn);
const int j0 = z4c_cuda_interp_tile_start(block->X[1], block->shape[1], y, DH[1], ordn);
const int k0 = z4c_cuda_interp_tile_start(block->X[2], block->shape[2], z, DH[2], ordn);
double *packed_fields = new double[3 * region_all];
var *vars[3] = {forx, fory, forz};
for (int f = 0; f < 3; f++)
{
if (z4c_cuda_pack_state_region_to_host_buffer(block,
k_z4c_cuda_bh_state_indices[f],
packed_fields + f * region_all,
block->shape,
i0, j0, k0,
sx, sy, sz) != 0)
{
delete[] packed_fields;
cout << "CUDA Z4C BH tile download failed" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
int tile_shape[3] = {sx, sy, sz};
f_global_interp(tile_shape,
block->X[0] + i0,
block->X[1] + j0,
block->X[2] + k0,
packed_fields + f * region_all,
shellf[f],
x, y, z,
interp_ordn,
vars[f]->SoA,
interp_sym);
}
delete[] packed_fields;
}
else
{
f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
block->fgfs[forx->sgfn], shellf[0],
x, y, z, interp_ordn, forx->SoA, interp_sym);
f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
block->fgfs[fory->sgfn], shellf[1],
x, y, z, interp_ordn, fory->SoA, interp_sym);
f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
block->fgfs[forz->sgfn], shellf[2],
x, y, z, interp_ordn, forz->SoA, interp_sym);
}
}
break;
}
if (BP == patch->ble)
break;
BP = BP->next;
}
if (owner_rank >= 0)
break;
PL = PL->next;
}
if (owner_rank < 0)
return false;
MPI_Bcast(shellf, 3, MPI_DOUBLE, owner_rank, MPI_COMM_WORLD);
return true;
}
bool z4c_cuda_compute_porg_rhs_resident(cgh *GH,
int ilev,
int myrank,
int BH_num,
double **BH_PS,
double **BH_RHS,
var *forx, var *fory, var *forz,
int Symmetry)
{
for (int n = 0; n < BH_num; n++)
{
double shellf[3] = {0.0, 0.0, 0.0};
int lev = ilev;
while (lev >= 0 &&
!z4c_cuda_interp_bh_point_resident(GH->PatL[lev], myrank, BH_PS[n],
forx, fory, forz, Symmetry, shellf))
{
--lev;
}
if (lev < 0)
return false;
BH_RHS[n][0] = -shellf[0];
BH_RHS[n][1] = -shellf[1];
BH_RHS[n][2] = -shellf[2];
}
return true;
}
} // namespace
#endif
void Z4c_class::Step(int lev, int YN)
{
#if USE_CUDA_Z4C && (ABEtype == 2)
double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
#ifdef With_AHF
AH_Step_Find(lev, dT_lev);
#endif
bool BB = fgt(PhysTime, StartTime, dT_lev / 2);
double ndeps = numepss;
if (lev < GH->movls)
ndeps = numepsb;
double TRK4 = PhysTime;
int iter_count = 0;
int pre = 0, cor = 1;
int ERROR = 0;
MyList<Patch> *Pp = GH->PatL[lev];
while (Pp)
{
MyList<Block> *BP = Pp->data->blb;
while (BP)
{
Block *cg = BP->data;
if (myrank == cg->rank)
{
double *state_in[Z4C_CUDA_STATE_COUNT];
double *state_out[Z4C_CUDA_STATE_COUNT];
double propspeed[Z4C_CUDA_STATE_COUNT];
double soa_flat[3 * Z4C_CUDA_STATE_COUNT];
if (!fill_z4c_cuda_views(cg, StateList, state_in, propspeed, soa_flat) ||
!fill_z4c_cuda_views(cg, SynchList_pre, state_out))
{
cout << "CUDA Z4C state list mismatch on predictor step" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
int apply_bam_bc = 0;
#if (MRBD == 0)
#if (SommerType == 0)
apply_bam_bc = (lev == 0) ? 1 : 0;
#endif
#elif (MRBD == 1)
apply_bam_bc = 1;
#endif
int keep_resident_state = 1;
int apply_enforce_ga = 0;
#if (AGM == 0)
apply_enforce_ga = 1;
#endif
if (z4c_cuda_rk4_substep(cg,
cg->shape, cg->X[0], cg->X[1], cg->X[2],
state_in, state_out,
propspeed, soa_flat, Pp->data->bbox,
dT_lev, TRK4, iter_count, apply_bam_bc,
Symmetry, lev, ndeps, pre,
keep_resident_state, apply_enforce_ga, chitiny))
{
cout << "CUDA Z4C predictor substep failed in domain: ("
<< cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
}
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
if (myrank == 0 && ErrorMonitor->outfile)
ErrorMonitor->outfile << "CUDA Z4C failed in predictor at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
if (BH_num > 0 && lev == GH->levels - 1)
{
compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count);
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg[ithBH][1], Porg_rhs[ithBH][1], iter_count);
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg[ithBH][2], Porg_rhs[ithBH][2], iter_count);
if (Symmetry > 0)
Porg[ithBH][2] = fabs(Porg[ithBH][2]);
if (Symmetry == 2)
{
Porg[ithBH][0] = fabs(Porg[ithBH][0]);
Porg[ithBH][1] = fabs(Porg[ithBH][1]);
}
}
}
if ((lev == a_lev) && (LastAnas + dT_lev >= AnasTime))
z4c_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false);
if (lev == a_lev)
AnalysisStuff(lev, dT_lev);
for (iter_count = 1; iter_count < 4; iter_count++)
{
if (iter_count == 1 || iter_count == 3)
TRK4 += dT_lev / 2;
Pp = GH->PatL[lev];
while (Pp)
{
MyList<Block> *BP = Pp->data->blb;
while (BP)
{
Block *cg = BP->data;
if (myrank == cg->rank)
{
double *state_in[Z4C_CUDA_STATE_COUNT];
double *state_out[Z4C_CUDA_STATE_COUNT];
double propspeed[Z4C_CUDA_STATE_COUNT];
double soa_flat[3 * Z4C_CUDA_STATE_COUNT];
if (!fill_z4c_cuda_views(cg, SynchList_pre, state_in, propspeed, soa_flat) ||
!fill_z4c_cuda_views(cg, SynchList_cor, state_out))
{
cout << "CUDA Z4C state list mismatch on corrector step" << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
int apply_bam_bc = 0;
#if (MRBD == 0)
#if (SommerType == 0)
apply_bam_bc = (lev == 0) ? 1 : 0;
#endif
#elif (MRBD == 1)
apply_bam_bc = 1;
#endif
int keep_resident_state = 1;
int apply_enforce_ga = 0;
#if (AGM == 0)
apply_enforce_ga = 1;
#elif (AGM == 1)
apply_enforce_ga = (iter_count == 3) ? 1 : 0;
#endif
if (z4c_cuda_rk4_substep(cg,
cg->shape, cg->X[0], cg->X[1], cg->X[2],
state_in, state_out,
propspeed, soa_flat, Pp->data->bbox,
dT_lev, TRK4, iter_count, apply_bam_bc,
Symmetry, lev, ndeps, cor,
keep_resident_state, apply_enforce_ga, chitiny))
{
cout << "CUDA Z4C corrector substep failed in domain: ("
<< cg->bbox[0] << ":" << cg->bbox[3] << ","
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
ERROR = 1;
}
}
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
{
int erh = ERROR;
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
}
if (ERROR)
{
if (myrank == 0 && ErrorMonitor->outfile)
ErrorMonitor->outfile << "CUDA Z4C failed in RK4 substep#" << iter_count
<< " at t = " << PhysTime
<< ", lev = " << lev << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
if (BH_num > 0 && lev == GH->levels - 1)
{
if (!z4c_cuda_compute_porg_rhs_resident(GH, lev, myrank, BH_num,
Porg, Porg1,
Sfx, Sfy, Sfz, Symmetry))
{
if (myrank == 0 && ErrorMonitor->outfile)
ErrorMonitor->outfile << "CUDA Z4C failed to interpolate black-hole shift at t = "
<< PhysTime << endl;
MPI_Abort(MPI_COMM_WORLD, 1);
}
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg1[ithBH][0], Porg_rhs[ithBH][0], iter_count);
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg1[ithBH][1], Porg_rhs[ithBH][1], iter_count);
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg1[ithBH][2], Porg_rhs[ithBH][2], iter_count);
if (Symmetry > 0)
Porg1[ithBH][2] = fabs(Porg1[ithBH][2]);
if (Symmetry == 2)
{
Porg1[ithBH][0] = fabs(Porg1[ithBH][0]);
Porg1[ithBH][1] = fabs(Porg1[ithBH][1]);
}
}
}
if (iter_count < 3)
{
Pp = GH->PatL[lev];
while (Pp)
{
MyList<Block> *BP = Pp->data->blb;
while (BP)
{
Block *cg = BP->data;
cg->swapList(SynchList_pre, SynchList_cor, myrank);
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
if (BH_num > 0 && lev == GH->levels - 1)
{
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{
Porg[ithBH][0] = Porg1[ithBH][0];
Porg[ithBH][1] = Porg1[ithBH][1];
Porg[ithBH][2] = Porg1[ithBH][2];
}
}
}
}
z4c_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, true);
#if (RPS == 0)
RestrictProlong(lev, YN, BB);
#endif
Pp = GH->PatL[lev];
while (Pp)
{
MyList<Block> *BP = Pp->data->blb;
while (BP)
{
Block *cg = BP->data;
cg->swapList(StateList, SynchList_cor, myrank);
cg->swapList(OldStateList, SynchList_cor, myrank);
if (BP == Pp->data->ble)
break;
BP = BP->next;
}
Pp = Pp->next;
}
if (BH_num > 0 && lev == GH->levels - 1)
{
for (int ithBH = 0; ithBH < BH_num; ithBH++)
{
Porg0[ithBH][0] = Porg1[ithBH][0];
Porg0[ithBH][1] = Porg1[ithBH][1];
Porg0[ithBH][2] = Porg1[ithBH][2];
}
}
#else
double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
#ifdef With_AHF
AH_Step_Find(lev, dT_lev);
#endif
@@ -1039,15 +1589,19 @@ void Z4c_class::Step(int lev, int YN)
{
Porg0[ithBH][0] = Porg1[ithBH][0];
Porg0[ithBH][1] = Porg1[ithBH][1];
Porg0[ithBH][2] = Porg1[ithBH][2];
}
}
}
#else
// for constraint preserving boundary (CPBC)
#ifndef WithShell
#error "CPBC only supports Shell"
#endif
Porg0[ithBH][2] = Porg1[ithBH][2];
}
}
#endif
}
#else
// for constraint preserving boundary (CPBC)
#if USE_CUDA_Z4C && (ABEtype == 2)
#error "USE_CUDA_Z4C resident path does not support CPBC"
#endif
#ifndef WithShell
#error "CPBC only supports Shell"
#endif
// 0: extroplate rhs, 1: extroplate variable
// 2: extroplate variable but before RHS calculation

View File

@@ -94,29 +94,31 @@
Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
Symmetry,Lev,eps,co)
#if (ABV == 0)
call ricci_gamma(ex, X, Y, Z, &
chi, &
dxx , gxy , gxz , dyy , gyz , dzz,&
Gamx , Gamy , Gamz , &
Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
Symmetry)
#endif
call constraint_bssn(ex, X, Y, Z,&
chi,trK, &
dxx,gxy,gxz,dyy,gyz,dzz, &
Axx,Axy,Axz,Ayy,Ayz,Azz, &
Gamx,Gamy,Gamz,&
Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
Symmetry)
if (co == 0) then
#if (ABV == 0)
call ricci_gamma(ex, X, Y, Z, &
chi, &
dxx , gxy , gxz , dyy , gyz , dzz,&
Gamx , Gamy , Gamz , &
Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
Symmetry)
#endif
call constraint_bssn(ex, X, Y, Z,&
chi,trK, &
dxx,gxy,gxz,dyy,gyz,dzz, &
Axx,Axy,Axz,Ayy,Ayz,Azz, &
Gamx,Gamy,Gamz,&
Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
Symmetry)
endif
return
@@ -226,11 +228,12 @@
call get_Z4cparameters(kappa1,kappa2,kappa3,FF,eta)
!!! sanity check
dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) &
+sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz) &
+sum(Gamx)+sum(Gamy)+sum(Gamz) &
+sum(Lap)+sum(betax)+sum(betay)+sum(betaz)+sum(dtSfx)+sum(dtSfy)+sum(dtSfz) &
!!! sanity check
#ifdef DEBUG
dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) &
+sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz) &
+sum(Gamx)+sum(Gamy)+sum(Gamz) &
+sum(Lap)+sum(betax)+sum(betay)+sum(betaz)+sum(dtSfx)+sum(dtSfy)+sum(dtSfz) &
+sum(TZ)
if(dX.ne.dX) then
if(sum(chi).ne.sum(chi))write(*,*)"Z4c_rhs.f90: find NaN in chi"
@@ -257,10 +260,11 @@
if(sum(dtSfx).ne.sum(dtSfx))write(*,*)"Z4c_rhs.f90: find NaN in dtSfx"
if(sum(dtSfy).ne.sum(dtSfy))write(*,*)"Z4c_rhs.f90: find NaN in dtSfy"
if(sum(dtSfz).ne.sum(dtSfz))write(*,*)"Z4c_rhs.f90: find NaN in dtSfz"
if(sum(TZ).ne.sum(Tz))write(*,*)"Z4c_rhs.f90: find NaN in TZ"
gont = 1
return
endif
if(sum(TZ).ne.sum(Tz))write(*,*)"Z4c_rhs.f90: find NaN in TZ"
gont = 1
return
endif
#endif
PI = dacos(-ONE)
@@ -1263,30 +1267,32 @@
endif
#if (ABV == 0)
call ricci_gamma(ex, X, Y, Z, &
chi, &
dxx , gxy , gxz , dyy , gyz , dzz,&
Gamx , Gamy , Gamz , &
Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
Symmetry)
#endif
call constraint_bssn(ex, X, Y, Z,&
chi,trK, &
dxx,gxy,gxz,dyy,gyz,dzz, &
Axx,Axy,Axz,Ayy,Ayz,Azz, &
Gamx,Gamy,Gamz,&
Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
Symmetry)
if (co == 0) then
#if (ABV == 0)
call ricci_gamma(ex, X, Y, Z, &
chi, &
dxx , gxy , gxz , dyy , gyz , dzz,&
Gamx , Gamy , Gamz , &
Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
Symmetry)
#endif
call constraint_bssn(ex, X, Y, Z,&
chi,trK, &
dxx,gxy,gxz,dyy,gyz,dzz, &
Axx,Axy,Axz,Ayy,Ayz,Azz, &
Gamx,Gamy,Gamz,&
Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
Symmetry)
endif
gont = 0

View File

@@ -121,11 +121,12 @@
call get_Z4cparameters(kappa1,kappa2,kappa3,FF,eta)
!!! sanity check
dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) &
+sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz) &
+sum(Gamx)+sum(Gamy)+sum(Gamz) &
+sum(Lap)+sum(betax)+sum(betay)+sum(betaz)+sum(dtSfx)+sum(dtSfy)+sum(dtSfz) &
!!! sanity check
#ifdef DEBUG
dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) &
+sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz) &
+sum(Gamx)+sum(Gamy)+sum(Gamz) &
+sum(Lap)+sum(betax)+sum(betay)+sum(betaz)+sum(dtSfx)+sum(dtSfy)+sum(dtSfz) &
+sum(TZ)
if(dX.ne.dX) then
if(sum(chi).ne.sum(chi))write(*,*)"Z4c_rhs_ss.f90: find NaN in chi"
@@ -152,10 +153,11 @@
if(sum(dtSfx).ne.sum(dtSfx))write(*,*)"Z4c_rhs_ss.f90: find NaN in dtSfx"
if(sum(dtSfy).ne.sum(dtSfy))write(*,*)"Z4c_rhs_ss.f90: find NaN in dtSfy"
if(sum(dtSfz).ne.sum(dtSfz))write(*,*)"Z4c_rhs_ss.f90: find NaN in dtSfz"
if(sum(TZ).ne.sum(Tz))write(*,*)"Z4c_rhs_ss.f90: find NaN in TZ"
gont = 1
return
endif
if(sum(TZ).ne.sum(Tz))write(*,*)"Z4c_rhs_ss.f90: find NaN in TZ"
gont = 1
return
endif
#endif
PI = dacos(-ONE)
@@ -1388,41 +1390,43 @@
call kodis_sh(ex,crho,sigma,R,TZ,TZ_rhs,SSS,Symmetry,eps,sst)
endif
#if (ABV == 1)
call ricci_gamma_ss(ex,crho,sigma,R,X, Y, Z, &
drhodx, drhody, drhodz, &
dsigmadx,dsigmady,dsigmadz, &
dRdx,dRdy,dRdz, &
drhodxx,drhodxy,drhodxz,drhodyy,drhodyz,drhodzz, &
dsigmadxx,dsigmadxy,dsigmadxz,dsigmadyy,dsigmadyz,dsigmadzz, &
dRdxx,dRdxy,dRdxz,dRdyy,dRdyz,dRdzz, &
chi, &
dxx , gxy , gxz , dyy , gyz , dzz,&
Gamx , Gamy , Gamz , &
Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
Symmetry,Lev,sst)
call constraint_bssn_ss(ex,crho,sigma,R,X, Y, Z, &
drhodx, drhody, drhodz, &
dsigmadx,dsigmady,dsigmadz, &
dRdx,dRdy,dRdz, &
drhodxx,drhodxy,drhodxz,drhodyy,drhodyz,drhodzz, &
dsigmadxx,dsigmadxy,dsigmadxz,dsigmadyy,dsigmadyz,dsigmadzz, &
dRdxx,dRdxy,dRdxz,dRdyy,dRdyz,dRdzz, &
chi,trK, &
dxx,gxy,gxz,dyy,gyz,dzz, &
Axx,Axy,Axz,Ayy,Ayz,Azz, &
Gamx,Gamy,Gamz,&
Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
Symmetry,Lev,sst)
#endif
if (co == 0) then
#if (ABV == 1)
call ricci_gamma_ss(ex,crho,sigma,R,X, Y, Z, &
drhodx, drhody, drhodz, &
dsigmadx,dsigmady,dsigmadz, &
dRdx,dRdy,dRdz, &
drhodxx,drhodxy,drhodxz,drhodyy,drhodyz,drhodzz, &
dsigmadxx,dsigmadxy,dsigmadxz,dsigmadyy,dsigmadyz,dsigmadzz, &
dRdxx,dRdxy,dRdxz,dRdyy,dRdyz,dRdzz, &
chi, &
dxx , gxy , gxz , dyy , gyz , dzz,&
Gamx , Gamy , Gamz , &
Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
Symmetry,Lev,sst)
#endif
call constraint_bssn_ss(ex,crho,sigma,R,X, Y, Z, &
drhodx, drhody, drhodz, &
dsigmadx,dsigmady,dsigmadz, &
dRdx,dRdy,dRdz, &
drhodxx,drhodxy,drhodxz,drhodyy,drhodyz,drhodzz, &
dsigmadxx,dsigmadxy,dsigmadxz,dsigmadyy,dsigmadyz,dsigmadzz, &
dRdxx,dRdxy,dRdxz,dRdyy,dRdyz,dRdzz, &
chi,trK, &
dxx,gxy,gxz,dyy,gyz,dzz, &
Axx,Axy,Axz,Ayy,Ayz,Azz, &
Gamx,Gamy,Gamz,&
Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
Symmetry,Lev,sst)
endif
gont = 0

View File

@@ -258,8 +258,6 @@ void bssnEM_class::Initialize()
PhysTime = StartTime;
Setup_Black_Hole_position();
}
setup_transfer_caches();
}
//================================================================================================

View File

@@ -23,14 +23,8 @@ using namespace std;
#include "rungekutta4_rout.h"
#include "sommerfeld_rout.h"
#include "getnp4.h"
#include "shellfunctions.h"
#include "parameters.h"
#if BSSN_USE_ESCALAR_C_KERNEL
#define BSSN_ESCALAR_RHS f_compute_rhs_bssn_escalar_c
#else
#define BSSN_ESCALAR_RHS f_compute_rhs_bssn_escalar
#endif
#include "shellfunctions.h"
#include "parameters.h"
#ifdef With_AHF
#include "derivatives.h"
@@ -80,8 +74,8 @@ bssnEScalar_class::bssnEScalar_class(double Couranti, double StartTimei, double
//================================================================================================
void bssnEScalar_class::Initialize()
{
void bssnEScalar_class::Initialize()
{
Sphio = new var("Sphio", ngfs++, 1, 1, 1);
Spio = new var("Spio", ngfs++, 1, 1, 1);
Sphi0 = new var("Sphi0", ngfs++, 1, 1, 1);
@@ -138,14 +132,11 @@ void bssnEScalar_class::Initialize()
}
}
GH = new cgh(0, ngfs, Symmetry, pname, checkrun, ErrorMonitor);
ConstraintRefreshLevels = new int[GH->levels];
for (int il = 0; il < GH->levels; il++)
ConstraintRefreshLevels[il] = 0;
if (checkrun)
CheckPoint->readcheck_cgh(PhysTime, GH, myrank, nprocs, Symmetry);
else
GH->compose_cgh(nprocs);
GH = new cgh(0, ngfs, Symmetry, pname, checkrun, ErrorMonitor);
if (checkrun)
CheckPoint->readcheck_cgh(PhysTime, GH, myrank, nprocs, Symmetry);
else
GH->compose_cgh(nprocs);
#ifdef WithShell
SH = new ShellPatch(0, ngfs, pname, Symmetry, myrank, ErrorMonitor);
@@ -169,14 +160,12 @@ void bssnEScalar_class::Initialize()
{
CheckPoint->read_Black_Hole_position(BH_num_input, BH_num, Porg0, Pmom, Spin, Mass, Porgbr, Porg, Porg1, Porg_rhs);
}
else
{
PhysTime = StartTime;
Setup_Black_Hole_position();
}
setup_transfer_caches();
}
else
{
PhysTime = StartTime;
Setup_Black_Hole_position();
}
}
//================================================================================================
@@ -218,10 +207,10 @@ bssnEScalar_class::~bssnEScalar_class()
// Read initial data solved by Ansorg, PRD 70, 064011 (2004)
void bssnEScalar_class::Read_Ansorg()
{
if (!checkrun)
{
void bssnEScalar_class::Read_Ansorg()
{
if (!checkrun)
{
if (myrank == 0)
cout << "Read initial data from Ansorg's solver,"
<< " please be sure the input parameters for black holes are puncture parameters!!"
@@ -238,12 +227,9 @@ void bssnEScalar_class::Read_Ansorg()
cout << "Error inputpar" << endl;
exit(0);
}
}
int BH_NM;
double *Porg_here;
double *pmom_local;
double *spin_local;
double *mass_local;
}
int BH_NM;
double *Porg_here;
// read parameter from file
{
const int LEN = 256;
@@ -283,11 +269,11 @@ void bssnEScalar_class::Read_Ansorg()
}
inf.close();
}
Porg_here = new double[3 * BH_NM];
pmom_local = new double[3 * BH_NM];
spin_local = new double[3 * BH_NM];
mass_local = new double[BH_NM];
Porg_here = new double[3 * BH_NM];
Pmom = new double[3 * BH_NM];
Spin = new double[3 * BH_NM];
Mass = new double[BH_NM];
// read parameter from file
{
const int LEN = 256;
@@ -319,37 +305,37 @@ void bssnEScalar_class::Read_Ansorg()
else if (status == 0)
continue;
if (sgrp == "BSSN" && sind < BH_NM)
{
if (skey == "Mass")
mass_local[sind] = atof(sval.c_str());
else if (skey == "Porgx")
Porg_here[sind * 3] = atof(sval.c_str());
else if (skey == "Porgy")
Porg_here[sind * 3 + 1] = atof(sval.c_str());
else if (skey == "Porgz")
Porg_here[sind * 3 + 2] = atof(sval.c_str());
else if (skey == "Spinx")
spin_local[sind * 3] = atof(sval.c_str());
else if (skey == "Spiny")
spin_local[sind * 3 + 1] = atof(sval.c_str());
else if (skey == "Spinz")
spin_local[sind * 3 + 2] = atof(sval.c_str());
else if (skey == "Pmomx")
pmom_local[sind * 3] = atof(sval.c_str());
else if (skey == "Pmomy")
pmom_local[sind * 3 + 1] = atof(sval.c_str());
else if (skey == "Pmomz")
pmom_local[sind * 3 + 2] = atof(sval.c_str());
}
}
inf.close();
if (sgrp == "BSSN" && sind < BH_NM)
{
if (skey == "Mass")
Mass[sind] = atof(sval.c_str());
else if (skey == "Porgx")
Porg_here[sind * 3] = atof(sval.c_str());
else if (skey == "Porgy")
Porg_here[sind * 3 + 1] = atof(sval.c_str());
else if (skey == "Porgz")
Porg_here[sind * 3 + 2] = atof(sval.c_str());
else if (skey == "Spinx")
Spin[sind * 3] = atof(sval.c_str());
else if (skey == "Spiny")
Spin[sind * 3 + 1] = atof(sval.c_str());
else if (skey == "Spinz")
Spin[sind * 3 + 2] = atof(sval.c_str());
else if (skey == "Pmomx")
Pmom[sind * 3] = atof(sval.c_str());
else if (skey == "Pmomy")
Pmom[sind * 3 + 1] = atof(sval.c_str());
else if (skey == "Pmomz")
Pmom[sind * 3 + 2] = atof(sval.c_str());
}
}
inf.close();
}
int order = 6;
Ansorg read_ansorg("Ansorg.psid", order);
// set initial data
for (int lev = 0; lev < GH->levels; lev++)
{
int order = 6;
Ansorg read_ansorg("Ansorg.psid", order);
// set initial data
for (int lev = 0; lev < GH->levels; lev++)
{
MyList<Patch> *Pp = GH->PatL[lev];
while (Pp)
{
@@ -372,21 +358,21 @@ void bssnEScalar_class::Read_Ansorg()
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn],
cg->fgfs[Gmx0->sgfn], cg->fgfs[Gmy0->sgfn], cg->fgfs[Gmz0->sgfn],
cg->fgfs[Lap0->sgfn],
cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn],
cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn],
cg->fgfs[Sphi0->sgfn], cg->fgfs[Spi0->sgfn],
mass_local, Porg_here, pmom_local, spin_local, BH_NM);
cg->fgfs[Lap0->sgfn],
cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn],
cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn],
cg->fgfs[Sphi0->sgfn], cg->fgfs[Spi0->sgfn],
Mass, Porg_here, Pmom, Spin, BH_NM);
}
if (BL == Pp->data->ble)
break;
BL = BL->next;
}
Pp = Pp->next;
}
}
#ifdef WithShell
// ShellPatch part
}
Pp = Pp->next;
}
}
#ifdef WithShell
// ShellPatch part
MyList<ss_patch> *Pp = SH->PatL;
while (Pp)
{
@@ -414,28 +400,25 @@ void bssnEScalar_class::Read_Ansorg()
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn],
cg->fgfs[Gmx0->sgfn], cg->fgfs[Gmy0->sgfn], cg->fgfs[Gmz0->sgfn],
cg->fgfs[Lap0->sgfn],
cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn],
cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn],
cg->fgfs[Sphi0->sgfn], cg->fgfs[Spi0->sgfn],
mass_local, Porg_here, pmom_local, spin_local, BH_NM);
cg->fgfs[Lap0->sgfn],
cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn],
cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn],
cg->fgfs[Sphi0->sgfn], cg->fgfs[Spi0->sgfn],
Mass, Porg_here, Pmom, Spin, BH_NM);
}
if (BL == Pp->data->ble)
break;
BL = BL->next;
}
Pp = Pp->next;
}
#endif
delete[] Porg_here;
delete[] pmom_local;
delete[] spin_local;
delete[] mass_local;
// dump read_in initial data
// for(int lev=0;lev<GH->levels;lev++) Parallel::Dump_Data(GH->PatL[lev],StateList,0,PhysTime,dT);
}
}
}
Pp = Pp->next;
}
#endif
delete[] Porg_here;
// dump read_in initial data
// for(int lev=0;lev<GH->levels;lev++) Parallel::Dump_Data(GH->PatL[lev],StateList,0,PhysTime,dT);
}
}
//================================================================================================
@@ -449,10 +432,10 @@ void bssnEScalar_class::Read_Ansorg()
// Read initial data solved by Pablo's Olliptic Phys.Rev.D 82 024005 (2010)
void bssnEScalar_class::Read_Pablo()
{
if (!checkrun)
{
void bssnEScalar_class::Read_Pablo()
{
if (!checkrun)
{
if (myrank == 0)
cout << "Read initial data from Pablo's solver,"
<< " please be sure the input parameters for black holes are puncture parameters!!"
@@ -469,12 +452,9 @@ void bssnEScalar_class::Read_Pablo()
cout << "Error inputpar" << endl;
exit(0);
}
}
int BH_NM;
double *Porg_here;
double *pmom_local;
double *spin_local;
double *mass_local;
}
int BH_NM;
double *Porg_here;
// read parameter from file
{
const int LEN = 256;
@@ -514,11 +494,11 @@ void bssnEScalar_class::Read_Pablo()
}
inf.close();
}
Porg_here = new double[3 * BH_NM];
pmom_local = new double[3 * BH_NM];
spin_local = new double[3 * BH_NM];
mass_local = new double[BH_NM];
Porg_here = new double[3 * BH_NM];
Pmom = new double[3 * BH_NM];
Spin = new double[3 * BH_NM];
Mass = new double[BH_NM];
// read parameter from file
{
const int LEN = 256;
@@ -550,31 +530,31 @@ void bssnEScalar_class::Read_Pablo()
else if (status == 0)
continue;
if (sgrp == "BSSN" && sind < BH_NM)
{
if (skey == "Mass")
mass_local[sind] = atof(sval.c_str());
else if (skey == "Porgx")
Porg_here[sind * 3] = atof(sval.c_str());
else if (skey == "Porgy")
Porg_here[sind * 3 + 1] = atof(sval.c_str());
else if (skey == "Porgz")
Porg_here[sind * 3 + 2] = atof(sval.c_str());
else if (skey == "Spinx")
spin_local[sind * 3] = atof(sval.c_str());
else if (skey == "Spiny")
spin_local[sind * 3 + 1] = atof(sval.c_str());
else if (skey == "Spinz")
spin_local[sind * 3 + 2] = atof(sval.c_str());
else if (skey == "Pmomx")
pmom_local[sind * 3] = atof(sval.c_str());
else if (skey == "Pmomy")
pmom_local[sind * 3 + 1] = atof(sval.c_str());
else if (skey == "Pmomz")
pmom_local[sind * 3 + 2] = atof(sval.c_str());
}
}
inf.close();
if (sgrp == "BSSN" && sind < BH_NM)
{
if (skey == "Mass")
Mass[sind] = atof(sval.c_str());
else if (skey == "Porgx")
Porg_here[sind * 3] = atof(sval.c_str());
else if (skey == "Porgy")
Porg_here[sind * 3 + 1] = atof(sval.c_str());
else if (skey == "Porgz")
Porg_here[sind * 3 + 2] = atof(sval.c_str());
else if (skey == "Spinx")
Spin[sind * 3] = atof(sval.c_str());
else if (skey == "Spiny")
Spin[sind * 3 + 1] = atof(sval.c_str());
else if (skey == "Spinz")
Spin[sind * 3 + 2] = atof(sval.c_str());
else if (skey == "Pmomx")
Pmom[sind * 3] = atof(sval.c_str());
else if (skey == "Pmomy")
Pmom[sind * 3 + 1] = atof(sval.c_str());
else if (skey == "Pmomz")
Pmom[sind * 3 + 2] = atof(sval.c_str());
}
}
inf.close();
}
bool flag = false;
int DIM = dim;
@@ -614,11 +594,11 @@ void bssnEScalar_class::Read_Pablo()
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn],
cg->fgfs[Gmx0->sgfn], cg->fgfs[Gmy0->sgfn], cg->fgfs[Gmz0->sgfn],
cg->fgfs[Lap0->sgfn],
cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn],
cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn],
cg->fgfs[Sphi0->sgfn], cg->fgfs[Spi0->sgfn],
mass_local, Porg_here, pmom_local, spin_local, BH_NM);
cg->fgfs[Lap0->sgfn],
cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn],
cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn],
cg->fgfs[Sphi0->sgfn], cg->fgfs[Spi0->sgfn],
Mass, Porg_here, Pmom, Spin, BH_NM);
}
if (BL == Pp->data->ble)
break;
@@ -678,11 +658,11 @@ void bssnEScalar_class::Read_Pablo()
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn],
cg->fgfs[Gmx0->sgfn], cg->fgfs[Gmy0->sgfn], cg->fgfs[Gmz0->sgfn],
cg->fgfs[Lap0->sgfn],
cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn],
cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn],
cg->fgfs[Sphi0->sgfn], cg->fgfs[Spi0->sgfn],
mass_local, Porg_here, pmom_local, spin_local, BH_NM);
cg->fgfs[Lap0->sgfn],
cg->fgfs[Sfx0->sgfn], cg->fgfs[Sfy0->sgfn], cg->fgfs[Sfz0->sgfn],
cg->fgfs[dtSfx0->sgfn], cg->fgfs[dtSfy0->sgfn], cg->fgfs[dtSfz0->sgfn],
cg->fgfs[Sphi0->sgfn], cg->fgfs[Spi0->sgfn],
Mass, Porg_here, Pmom, Spin, BH_NM);
}
if (BL == Pp->data->ble)
break;
@@ -704,13 +684,10 @@ void bssnEScalar_class::Read_Pablo()
Pp = Pp->next;
}
#endif
delete[] Porg_here;
delete[] pmom_local;
delete[] spin_local;
delete[] mass_local;
if (flag && myrank == 0)
MPI_Abort(MPI_COMM_WORLD, 1);
delete[] Porg_here;
if (flag && myrank == 0)
MPI_Abort(MPI_COMM_WORLD, 1);
// dump read_in initial data
for (int lev = 0; lev < GH->levels; lev++)
Parallel::Dump_Data(GH->PatL[lev], StateList, 0, PhysTime, dT);
@@ -762,10 +739,10 @@ void bssnEScalar_class::Step(int lev, int YN)
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
#endif
if (BSSN_ESCALAR_RHS(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
if (f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn],
cg->fgfs[Gmx0->sgfn], cg->fgfs[Gmy0->sgfn], cg->fgfs[Gmz0->sgfn],
@@ -1016,12 +993,11 @@ void bssnEScalar_class::Step(int lev, int YN)
}
#endif
Parallel::AsyncSyncState async_pre;
sync_predictor_start(lev, SynchList_pre, async_pre);
Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
#ifdef WithShell
if (lev == 0)
{
#ifdef WithShell
if (lev == 0)
{
clock_t prev_clock, curr_clock;
if (myrank == 0)
curr_clock = clock();
@@ -1033,10 +1009,9 @@ void bssnEScalar_class::Step(int lev, int YN)
cout << " Shell stuff synchronization used "
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
<< " seconds! " << endl;
}
}
#endif
sync_predictor_finish(lev, async_pre, SynchList_pre);
}
}
#endif
// for black hole position
if (BH_num > 0 && lev == GH->levels - 1)
@@ -1106,10 +1081,10 @@ void bssnEScalar_class::Step(int lev, int YN)
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
#endif
if (BSSN_ESCALAR_RHS(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn],
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
if (f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn],
cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn],
cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn],
cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn],
cg->fgfs[Gmx->sgfn], cg->fgfs[Gmy->sgfn], cg->fgfs[Gmz->sgfn],
@@ -1374,12 +1349,11 @@ void bssnEScalar_class::Step(int lev, int YN)
}
#endif
Parallel::AsyncSyncState async_cor;
sync_corrector_start(lev, SynchList_cor, async_cor);
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
#ifdef WithShell
if (lev == 0)
{
#ifdef WithShell
if (lev == 0)
{
clock_t prev_clock, curr_clock;
if (myrank == 0)
curr_clock = clock();
@@ -1391,10 +1365,9 @@ void bssnEScalar_class::Step(int lev, int YN)
cout << " Shell stuff synchronization used "
<< (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
<< " seconds! " << endl;
}
}
#endif
sync_corrector_finish(lev, async_cor, SynchList_cor);
}
}
#endif
// for black hole position
if (BH_num > 0 && lev == GH->levels - 1)
{
@@ -1862,14 +1835,11 @@ void bssnEScalar_class::AnalysisStuff_EScalar(int lev, double dT_lev)
//================================================================================================
void bssnEScalar_class::Interp_Constraint(bool infg)
{
if (!infg)
return;
// we do not support a_lev != 0 yet.
if (a_lev > 0)
return;
void bssnEScalar_class::Interp_Constraint()
{
// we do not support a_lev != 0 yet.
if (a_lev > 0)
return;
for (int lev = 0; lev < GH->levels; lev++)
{
@@ -1888,10 +1858,10 @@ void bssnEScalar_class::Interp_Constraint(bool infg)
if (myrank == cg->rank)
{
if (lev > 0)
BSSN_ESCALAR_RHS(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn],
cg->fgfs[Gmx0->sgfn], cg->fgfs[Gmy0->sgfn], cg->fgfs[Gmz0->sgfn],
@@ -2108,10 +2078,10 @@ void bssnEScalar_class::Constraint_Out()
if (myrank == cg->rank)
{
if (lev > 0)
BSSN_ESCALAR_RHS(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
f_compute_rhs_bssn_escalar(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn],
cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn],
cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn],
cg->fgfs[Gmx0->sgfn], cg->fgfs[Gmy0->sgfn], cg->fgfs[Gmz0->sgfn],

View File

@@ -51,7 +51,7 @@ public:
void Compute_Psi4(int lev);
void Step(int lev, int YN);
void AnalysisStuff_EScalar(int lev, double dT_lev);
void Interp_Constraint(bool infg);
void Interp_Constraint();
void Constraint_Out();
protected:

File diff suppressed because it is too large Load Diff

View File

@@ -31,19 +31,11 @@ using namespace std;
#include "surface_integral.h"
#include "checkpoint.h"
extern void setpbh(int iBHN, double **iPBH, double *iMass, int rBHN);
#ifndef BSSN_USE_TRANSFER_CACHE
#define BSSN_USE_TRANSFER_CACHE 1
#endif
#ifndef BSSN_USE_ESCALAR_C_KERNEL
#define BSSN_USE_ESCALAR_C_KERNEL 1
#endif
class bssn_class
{
public:
extern void setpbh(int iBHN, double **iPBH, double *iMass, int rBHN);
class bssn_class
{
public:
int ngfs;
int nprocs, myrank;
cgh *GH;
@@ -56,6 +48,7 @@ public:
double StartTime, TotalTime;
double AnasTime, DumpTime, d2DumpTime, CheckTime;
double LastAnas, LastConsOut;
bool cuda_level0_constraint_cache_valid;
int *ConstraintRefreshLevels;
double Courant;
double numepss, numepsb, numepsh;
@@ -175,25 +168,14 @@ public:
void Setup_KerrSchild();
void Enforce_algcon(int lev, int fg);
void testRestrict();
void testOutBd();
bool check_Stdin_Abort();
bool use_transfer_cache() const;
void setup_transfer_caches();
void invalidate_transfer_caches();
void destroy_transfer_caches();
void sync_predictor_start(int lev, MyList<var> *VarList, Parallel::AsyncSyncState &async_state);
void sync_predictor_finish(int lev, Parallel::AsyncSyncState &async_state, MyList<var> *VarList);
void sync_corrector_start(int lev, MyList<var> *VarList, Parallel::AsyncSyncState &async_state);
void sync_corrector_finish(int lev, Parallel::AsyncSyncState &async_state, MyList<var> *VarList);
void sync_evolution(int lev, MyList<var> *VarList, Parallel::SyncCache *cache_array = 0);
void restrict_evolution(int lev, MyList<var> *src_var_list, MyList<var> *dst_var_list);
void outbdlow2hi_evolution(int lev, MyList<var> *src_var_list, MyList<var> *dst_var_list);
virtual void Setup_Initial_Data_Cao();
virtual void Setup_Initial_Data_Lousto();
virtual void Initialize();
void testRestrict();
void testOutBd();
bool check_Stdin_Abort();
virtual void Setup_Initial_Data_Cao();
virtual void Setup_Initial_Data_Lousto();
virtual void Initialize();
virtual void Read_Ansorg();
virtual void Read_Pablo() {};
virtual void Compute_Psi4(int lev);

View File

@@ -1,169 +0,0 @@
#include "macrodef.h"
#include "bssn_rhs.h"
#include "share_func.h"
#include "tool.h"
#include <vector>
namespace
{
// Reuse the temporary workspace across block calls to avoid repeated heap churn
// in the EScalar wrapper. MPI ranks execute this path sequentially, so a single
// process-local buffer is sufficient here.
std::vector<double> g_escalar_tmp_store;
}
#ifdef fortran1
#define f_frpotential frpotential
#endif
#ifdef fortran2
#define f_frpotential FRPOTENTIAL
#endif
#ifdef fortran3
#define f_frpotential frpotential_
#endif
extern "C"
{
void f_frpotential(int *, double *, double *, double *);
}
int f_compute_rhs_bssn_escalar_c(int *ex, double &T,
double *X, double *Y, double *Z,
double *chi, double *trK,
double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
double *Gamx, double *Gamy, double *Gamz,
double *Lap, double *betax, double *betay, double *betaz,
double *dtSfx, double *dtSfy, double *dtSfz,
double *Sphi, double *Spi,
double *chi_rhs, double *trK_rhs,
double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
double *Sphi_rhs, double *Spi_rhs,
double *rho, double *Sx, double *Sy, double *Sz,
double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
int &Symmetry, int &Lev, double &eps, int &co)
{
const int nx = ex[0], ny = ex[1], nz = ex[2];
const int all = nx * ny * nz;
const size_t workspace_size = size_t(all) * 17;
if (g_escalar_tmp_store.size() < workspace_size)
g_escalar_tmp_store.resize(workspace_size);
double *tmp_ptr = g_escalar_tmp_store.data();
auto alloc_tmp = [&](int n = 1) -> double *
{
double *ptr = tmp_ptr;
tmp_ptr += size_t(all) * n;
return ptr;
};
double *chix = alloc_tmp(), *chiy = alloc_tmp(), *chiz = alloc_tmp();
double *Kx = alloc_tmp(), *Ky = alloc_tmp(), *Kz = alloc_tmp();
double *fxx = alloc_tmp(), *fxy = alloc_tmp(), *fxz = alloc_tmp();
double *fyy = alloc_tmp(), *fyz = alloc_tmp(), *fzz = alloc_tmp();
double *Lapx = alloc_tmp(), *Lapy = alloc_tmp(), *Lapz = alloc_tmp();
double *V = alloc_tmp(), *dVdSphi = alloc_tmp();
const double ZEO = 0.0, ONE = 1.0, TWO = 2.0, HALF = 0.5;
const double SSS[3] = {1.0, 1.0, 1.0};
fderivs(ex, chi, chix, chiy, chiz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
fderivs(ex, Lap, Lapx, Lapy, Lapz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
fderivs(ex, Sphi, Kx, Ky, Kz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
fdderivs(ex, Sphi, fxx, fxy, fxz, fyy, fyz, fzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
f_frpotential(ex, Sphi, V, dVdSphi);
for (int i = 0; i < all; ++i)
{
const double alpn1 = Lap[i] + ONE;
const double chin1 = chi[i] + ONE;
const double gxx = dxx[i] + ONE;
const double gyy = dyy[i] + ONE;
const double gzz = dzz[i] + ONE;
const double det = gxx * gyy * gzz + gxy[i] * gyz[i] * gxz[i] + gxz[i] * gxy[i] * gyz[i]
- gxz[i] * gyy * gxz[i] - gxy[i] * gxy[i] * gzz - gxx * gyz[i] * gyz[i];
const double gupxx = (gyy * gzz - gyz[i] * gyz[i]) / det;
const double gupxy = -(gxy[i] * gzz - gyz[i] * gxz[i]) / det;
const double gupxz = (gxy[i] * gyz[i] - gyy * gxz[i]) / det;
const double gupyy = (gxx * gzz - gxz[i] * gxz[i]) / det;
const double gupyz = -(gxx * gyz[i] - gxy[i] * gxz[i]) / det;
const double gupzz = (gxx * gyy - gxy[i] * gxy[i]) / det;
Sphi_rhs[i] = alpn1 * Spi[i];
Spi_rhs[i] = gupxx * fxx[i] + gupyy * fyy[i] + gupzz * fzz[i]
+ TWO * (gupxy * fxy[i] + gupxz * fxz[i] + gupyz * fyz[i])
- ((Gamx[i] + (gupxx * chix[i] + gupxy * chiy[i] + gupxz * chiz[i]) / TWO / chin1) * Kx[i]
+ (Gamy[i] + (gupxy * chix[i] + gupyy * chiy[i] + gupyz * chiz[i]) / TWO / chin1) * Ky[i]
+ (Gamz[i] + (gupxz * chix[i] + gupyz * chiy[i] + gupzz * chiz[i]) / TWO / chin1) * Kz[i]);
Spi_rhs[i] = Spi_rhs[i] * alpn1
+ gupxx * Lapx[i] * Kx[i] + gupxy * Lapx[i] * Ky[i] + gupxz * Lapx[i] * Kz[i]
+ gupxy * Lapy[i] * Kx[i] + gupyy * Lapy[i] * Ky[i] + gupyz * Lapy[i] * Kz[i]
+ gupxz * Lapz[i] * Kx[i] + gupyz * Lapz[i] * Ky[i] + gupzz * Lapz[i] * Kz[i];
Spi_rhs[i] = Spi_rhs[i] * chin1 + alpn1 * (trK[i] * Spi[i] - dVdSphi[i]);
rho[i] = chin1 * ((gupxx * Kx[i] * Kx[i] + gupyy * Ky[i] * Ky[i] + gupzz * Kz[i] * Kz[i]) * HALF
+ gupxy * Kx[i] * Ky[i] + gupxz * Kx[i] * Kz[i] + gupyz * Ky[i] * Kz[i])
+ Spi[i] * Spi[i] * HALF + V[i];
Sx[i] = -Spi[i] * Kx[i];
Sy[i] = -Spi[i] * Ky[i];
Sz[i] = -Spi[i] * Kz[i];
const double pressure = (rho[i] - Spi[i] * Spi[i]) / chin1;
Sxx[i] = Kx[i] * Kx[i] - pressure * gxx;
Sxy[i] = Kx[i] * Ky[i] - pressure * gxy[i];
Sxz[i] = Kx[i] * Kz[i] - pressure * gxz[i];
Syy[i] = Ky[i] * Ky[i] - pressure * gyy;
Syz[i] = Ky[i] * Kz[i] - pressure * gyz[i];
Szz[i] = Kz[i] * Kz[i] - pressure * gzz;
}
if (f_compute_rhs_bssn(ex, T, X, Y, Z,
chi, trK,
dxx, gxy, gxz, dyy, gyz, dzz,
Axx, Axy, Axz, Ayy, Ayz, Azz,
Gamx, Gamy, Gamz,
Lap, betax, betay, betaz,
dtSfx, dtSfy, dtSfz,
chi_rhs, trK_rhs,
gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs,
Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs,
Gamx_rhs, Gamy_rhs, Gamz_rhs,
Lap_rhs, betax_rhs, betay_rhs, betaz_rhs,
dtSfx_rhs, dtSfy_rhs, dtSfz_rhs,
rho, Sx, Sy, Sz,
Sxx, Sxy, Sxz, Syy, Syz, Szz,
Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
ham_Res, movx_Res, movy_Res, movz_Res,
Gmx_Res, Gmy_Res, Gmz_Res,
Symmetry, Lev, eps, co))
return 1;
lopsided_kodis(ex, X, Y, Z, Sphi, Sphi_rhs, betax, betay, betaz, Symmetry, SSS, eps);
lopsided_kodis(ex, X, Y, Z, Spi, Spi_rhs, betax, betay, betaz, Symmetry, SSS, eps);
for (int i = 0; i < all; ++i)
{
if (Sphi_rhs[i] != Sphi_rhs[i] || Spi_rhs[i] != Spi_rhs[i] || rho[i] != rho[i])
return 1;
}
return 0;
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,73 +0,0 @@
#ifndef BSSN_GPU_H_
#define BSSN_GPU_H_
#include "bssn_macro.h"
#include "macrodef.fh"
#define DEVICE_ID 0
// #define DEVICE_ID_BY_MPI_RANK
#define GRID_DIM 256
#define BLOCK_DIM 128
#define _FH2_(i, j, k) fh[(i) + (j) * _1D_SIZE[2] + (k) * _2D_SIZE[2]]
#define _FH3_(i, j, k) fh[(i) + (j) * _1D_SIZE[3] + (k) * _2D_SIZE[3]]
#define pow2(x) ((x) * (x))
#define TimeBetween(a, b) ((b.tv_sec - a.tv_sec) + (b.tv_usec - a.tv_usec) / 1000000.0f)
#define M_ metac.
#define Mh_ meta->
#define Ms_ metassc.
#define Msh_ metass->
// #define TIMING
#define RHS_SS_PARA int calledby, int mpi_rank, int *ex, double &T, double *crho, double *sigma, double *R, double *X, double *Y, double *Z, double *drhodx, double *drhody, double *drhodz, double *dsigmadx, double *dsigmady, double *dsigmadz, double *dRdx, double *dRdy, double *dRdz, double *drhodxx, double *drhodxy, double *drhodxz, double *drhodyy, double *drhodyz, double *drhodzz, double *dsigmadxx, double *dsigmadxy, double *dsigmadxz, double *dsigmadyy, double *dsigmadyz, double *dsigmadzz, double *dRdxx, double *dRdxy, double *dRdxz, double *dRdyy, double *dRdyz, double *dRdzz, double *chi, double *trK, double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz, double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz, double *Gamx, double *Gamy, double *Gamz, double *Lap, double *betax, double *betay, double *betaz, double *dtSfx, double *dtSfy, double *dtSfz, double *chi_rhs, double *trK_rhs, double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs, double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs, double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs, double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs, double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs, double *rho, double *Sx, double *Sy, double *Sz, double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz, double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz, double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz, double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz, double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz, double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res, double *Gmx_Res, double *Gmy_Res, double *Gmz_Res, int &Symmetry, int &Lev, double &eps, int &sst, int &co
/** main function */
int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,
double *X, double *Y, double *Z,
double *chi, double *trK,
double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
double *Gamx, double *Gamy, double *Gamz,
double *Lap, double *betax, double *betay, double *betaz,
double *dtSfx, double *dtSfy, double *dtSfz,
double *chi_rhs, double *trK_rhs,
double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
double *rho, double *Sx, double *Sy, double *Sz, double *Sxx,
double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
int &Symmetry, int &Lev, double &eps, int &co);
int gpu_rhs_ss(RHS_SS_PARA);
/** Init GPU side data in GPUMeta. */
// void init_fluid_meta_gpu(GPUMeta *gpu_meta);
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -1,210 +0,0 @@
#ifndef BSSN_GPU_CLASS_H
#define BSSN_GPU_CLASS_H
#ifdef newc
#include <iostream>
#include <iomanip>
#include <fstream>
#include <cstdlib>
#include <string>
#include <cmath>
using namespace std;
#else
#include <iostream.h>
#include <iomanip.h>
#include <fstream.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#endif
#include <mpi.h>
#include "macrodef.h"
#include "cgh.h"
#include "ShellPatch.h"
#include "misc.h"
#include "var.h"
#include "MyList.h"
#include "monitor.h"
#include "surface_integral.h"
#include "checkpoint.h"
// added by yangquan
#include "bssn_macro.h"
extern void setpbh(int iBHN, double **iPBH, double *iMass, int rBHN);
class bssn_class
{
public:
// added by yangquan
//----------------------
int gpu_num_mynode;
int cpu_core_num_mynode;
int mpi_process_num_mynode;
int my_sequence_mynode;
int mynode_id;
int use_gpu;
virtual void Step_GPU(int lev, int YN);
virtual void Get_runtime_envirment();
// virtual void Step_OPENMP(int lev,int YN);
//----------------------
int ngfs;
int nprocs, myrank;
cgh *GH;
ShellPatch *SH;
double PhysTime;
int checkrun;
char checkfilename[50];
int Steps;
double StartTime, TotalTime;
double AnasTime, DumpTime, d2DumpTime, CheckTime;
double LastAnas, LastConsOut;
double Courant;
double numepss, numepsb, numepsh;
int Symmetry;
int maxl, decn;
double maxrex, drex;
int trfls, a_lev;
double dT;
double chitiny;
double **Porg0, **Porgbr, **Porg, **Porg1, **Porg_rhs;
int BH_num, BH_num_input;
double *Mass, *Pmom, *Spin;
double ADMMass;
var *phio, *trKo;
var *gxxo, *gxyo, *gxzo, *gyyo, *gyzo, *gzzo;
var *Axxo, *Axyo, *Axzo, *Ayyo, *Ayzo, *Azzo;
var *Gmxo, *Gmyo, *Gmzo;
var *Lapo, *Sfxo, *Sfyo, *Sfzo;
var *dtSfxo, *dtSfyo, *dtSfzo;
var *phi0, *trK0;
var *gxx0, *gxy0, *gxz0, *gyy0, *gyz0, *gzz0;
var *Axx0, *Axy0, *Axz0, *Ayy0, *Ayz0, *Azz0;
var *Gmx0, *Gmy0, *Gmz0;
var *Lap0, *Sfx0, *Sfy0, *Sfz0;
var *dtSfx0, *dtSfy0, *dtSfz0;
var *phi, *trK;
var *gxx, *gxy, *gxz, *gyy, *gyz, *gzz;
var *Axx, *Axy, *Axz, *Ayy, *Ayz, *Azz;
var *Gmx, *Gmy, *Gmz;
var *Lap, *Sfx, *Sfy, *Sfz;
var *dtSfx, *dtSfy, *dtSfz;
var *phi1, *trK1;
var *gxx1, *gxy1, *gxz1, *gyy1, *gyz1, *gzz1;
var *Axx1, *Axy1, *Axz1, *Ayy1, *Ayz1, *Azz1;
var *Gmx1, *Gmy1, *Gmz1;
var *Lap1, *Sfx1, *Sfy1, *Sfz1;
var *dtSfx1, *dtSfy1, *dtSfz1;
var *phi_rhs, *trK_rhs;
var *gxx_rhs, *gxy_rhs, *gxz_rhs, *gyy_rhs, *gyz_rhs, *gzz_rhs;
var *Axx_rhs, *Axy_rhs, *Axz_rhs, *Ayy_rhs, *Ayz_rhs, *Azz_rhs;
var *Gmx_rhs, *Gmy_rhs, *Gmz_rhs;
var *Lap_rhs, *Sfx_rhs, *Sfy_rhs, *Sfz_rhs;
var *dtSfx_rhs, *dtSfy_rhs, *dtSfz_rhs;
var *rho, *Sx, *Sy, *Sz, *Sxx, *Sxy, *Sxz, *Syy, *Syz, *Szz;
var *Gamxxx, *Gamxxy, *Gamxxz, *Gamxyy, *Gamxyz, *Gamxzz;
var *Gamyxx, *Gamyxy, *Gamyxz, *Gamyyy, *Gamyyz, *Gamyzz;
var *Gamzxx, *Gamzxy, *Gamzxz, *Gamzyy, *Gamzyz, *Gamzzz;
var *Rxx, *Rxy, *Rxz, *Ryy, *Ryz, *Rzz;
var *Rpsi4, *Ipsi4;
var *t1Rpsi4, *t1Ipsi4, *t2Rpsi4, *t2Ipsi4;
var *Cons_Ham, *Cons_Px, *Cons_Py, *Cons_Pz, *Cons_Gx, *Cons_Gy, *Cons_Gz;
#ifdef Point_Psi4
var *phix, *phiy, *phiz;
var *trKx, *trKy, *trKz;
var *Axxx, *Axxy, *Axxz;
var *Axyx, *Axyy, *Axyz;
var *Axzx, *Axzy, *Axzz;
var *Ayyx, *Ayyy, *Ayyz;
var *Ayzx, *Ayzy, *Ayzz;
var *Azzx, *Azzy, *Azzz;
#endif
// FIXME: uc = StateList, up = OldStateList, upp = SynchList_cor; so never touch these three data
MyList<var> *StateList, *SynchList_pre, *SynchList_cor, *RHSList;
MyList<var> *OldStateList, *DumpList;
MyList<var> *ConstraintList;
monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
monitor *ConVMonitor;
surface_integral *Waveshell;
checkpoint *CheckPoint;
public:
bssn_class(double Couranti, double StartTimei, double TotalTimei, double DumpTimei, double d2DumpTimei, double CheckTimei, double AnasTimei,
int Symmetryi, int checkruni, char *checkfilenamei, double numepssi, double numepsbi, double numepshi,
int a_levi, int maxli, int decni, double maxrexi, double drexi);
~bssn_class();
void Evolve(int Steps);
void RecursiveStep(int lev);
#if (PSTR == 1)
void ParallelStep();
void SHStep();
#endif
void RestrictProlong(int lev, int YN, bool BB, MyList<var> *SL, MyList<var> *OL, MyList<var> *corL);
void RestrictProlong_aux(int lev, int YN, bool BB, MyList<var> *SL, MyList<var> *OL, MyList<var> *corL);
void RestrictProlong(int lev, int YN, bool BB);
void ProlongRestrict(int lev, int YN, bool BB);
void Setup_Black_Hole_position();
void compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, var *fory, var *forz, int lev);
bool read_Pablo_file(int *ext, double *datain, char *filename);
void write_Pablo_file(int *ext, double xmin, double xmax, double ymin, double ymax, double zmin, double zmax,
char *filename);
void AnalysisStuff(int lev, double dT_lev);
void Setup_KerrSchild();
void Enforce_algcon(int lev, int fg);
void testRestrict();
void testOutBd();
virtual void Setup_Initial_Data_Lousto();
virtual void Setup_Initial_Data_Cao();
virtual void Initialize();
virtual void Read_Ansorg();
virtual void Read_Pablo() {};
virtual void Compute_Psi4(int lev);
virtual void Step(int lev, int YN);
virtual void Interp_Constraint(bool infg);
virtual void Constraint_Out();
virtual void Compute_Constraint();
#ifdef With_AHF
protected:
MyList<var> *AHList, *AHDList, *GaugeList;
int AHfindevery;
double AHdumptime;
int *lastahdumpid, HN_num; // number of possible horizons
int *findeveryl;
double *xc, *yc, *zc, *xr, *yr, *zr;
bool *trigger;
double *dTT;
int *dumpid;
public:
void AH_Prepare_derivatives();
bool AH_Interp_Points(MyList<var> *VarList,
int NN, double **XX,
double *Shellf, int Symmetryi);
void AH_Step_Find(int lev, double dT_lev);
#endif
};
#endif /* BSSN_GPU_CLASS_H */

View File

@@ -63,34 +63,13 @@ extern "C"
double *, double *, double *, double *, double *, double *, // Christoffel
double *, double *, double *, double *, double *, double *, // Christoffel
double *, double *, double *, double *, double *, double *, // Ricci
double *, double *, double *, double *, double *, double *, double *, // constraint violation
int &, int &, double &, int &);
}
int f_compute_rhs_bssn_escalar_c(int *, double &, double *, double *, double *, // ex,T,X,Y,Z
double *, double *, // chi, trK
double *, double *, double *, double *, double *, double *, // gij
double *, double *, double *, double *, double *, double *, // Aij
double *, double *, double *, // Gam
double *, double *, double *, double *, double *, double *, double *, // Gauge
double *, double *, // Sphi, Spi
double *, double *, // chi, trK
double *, double *, double *, double *, double *, double *, // gij
double *, double *, double *, double *, double *, double *, // Aij
double *, double *, double *, // Gam
double *, double *, double *, double *, double *, double *, double *, // Gauge
double *, double *, // Sphi, Spi
double *, double *, double *, double *, double *, double *, double *, double *, double *, double *, // stress-energy
double *, double *, double *, double *, double *, double *, // Christoffel
double *, double *, double *, double *, double *, double *, // Christoffel
double *, double *, double *, double *, double *, double *, // Christoffel
double *, double *, double *, double *, double *, double *, // Ricci
double *, double *, double *, double *, double *, double *, double *, // constraint violation
int &, int &, double &, int &);
extern "C"
{
int f_compute_rhs_bssn_ss(int *, double &, double *, double *, double *, // ex,T,rho,sigma,R
double *, double *, double *, double *, double *, double *, double *, // constraint violation
int &, int &, double &, int &);
}
extern "C"
{
int f_compute_rhs_bssn_ss(int *, double &, double *, double *, double *, // ex,T,rho,sigma,R
double *, double *, double *, // X,Y,Z
double *, double *, double *, // drhodx,drhody,drhodz
double *, double *, double *, // dsigmadx,dsigmady,dsigmadz
@@ -117,10 +96,10 @@ extern "C"
int &, int &, double &, int &, int &);
}
extern "C"
{
int f_compute_rhs_bssn_escalar(int *, double &, double *, double *, double *, // ex,T,X,Y,Z
double *, double *, // chi, trK
extern "C"
{
int f_compute_rhs_bssn_escalar(int *, double &, double *, double *, double *, // ex,T,X,Y,Z
double *, double *, // chi, trK
double *, double *, double *, double *, double *, double *, // gij
double *, double *, double *, double *, double *, double *, // Aij
double *, double *, double *, // Gam
@@ -137,14 +116,14 @@ extern "C"
double *, double *, double *, double *, double *, double *, // Christoffel
double *, double *, double *, double *, double *, double *, // Christoffel
double *, double *, double *, double *, double *, double *, // Ricci
double *, double *, double *, double *, double *, double *, double *, // constraint violation
int &, int &, double &, int &);
}
extern "C"
{
int f_compute_rhs_bssn_escalar_ss(int *, double &, double *, double *, double *, // ex,T,rho,sigma,R
double *, double *, double *, // X,Y,Z
double *, double *, double *, double *, double *, double *, double *, // constraint violation
int &, int &, double &, int &);
}
extern "C"
{
int f_compute_rhs_bssn_escalar_ss(int *, double &, double *, double *, double *, // ex,T,rho,sigma,R
double *, double *, double *, // X,Y,Z
double *, double *, double *, // drhodx,drhody,drhodz
double *, double *, double *, // dsigmadx,dsigmady,dsigmadz
double *, double *, double *, // dRdx,dRdy,dRdz

View File

@@ -1098,12 +1098,12 @@ int f_compute_rhs_bssn(int *ex, double &T,
betaz_rhs[i] = FF * dtSfz[i];
reta[i] =
gupxx[i] * dtSfx_rhs[i] * dtSfx_rhs[i]
+ gupyy[i] * dtSfy_rhs[i] * dtSfy_rhs[i]
+ gupzz[i] * dtSfz_rhs[i] * dtSfz_rhs[i]
+ TWO * ( gupxy[i] * dtSfx_rhs[i] * dtSfy_rhs[i]
+ gupxz[i] * dtSfx_rhs[i] * dtSfz_rhs[i]
+ gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] );
gupxx[i] * chix[i] * chix[i]
+ gupyy[i] * chiy[i] * chiy[i]
+ gupzz[i] * chiz[i] * chiz[i]
+ TWO * ( gupxy[i] * chix[i] * chiy[i]
+ gupxz[i] * chix[i] * chiz[i]
+ gupyz[i] * chiy[i] * chiz[i] );
#if (GAUGE == 2)
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - sqrt(chin1[i])), 2.0 );
@@ -1116,12 +1116,12 @@ int f_compute_rhs_bssn(int *ex, double &T,
dtSfz_rhs[i] = Gamz_rhs[i] - reta[i] * dtSfz[i];
#elif (GAUGE == 4 || GAUGE == 5)
reta[i] =
gupxx[i] * dtSfx_rhs[i] * dtSfx_rhs[i]
+ gupyy[i] * dtSfy_rhs[i] * dtSfy_rhs[i]
+ gupzz[i] * dtSfz_rhs[i] * dtSfz_rhs[i]
+ TWO * ( gupxy[i] * dtSfx_rhs[i] * dtSfy_rhs[i]
+ gupxz[i] * dtSfx_rhs[i] * dtSfz_rhs[i]
+ gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] );
gupxx[i] * chix[i] * chix[i]
+ gupyy[i] * chiy[i] * chiy[i]
+ gupzz[i] * chiz[i] * chiz[i]
+ TWO * ( gupxy[i] * chix[i] * chiy[i]
+ gupxz[i] * chix[i] * chiz[i]
+ gupyz[i] * chiy[i] * chiz[i] );
#if (GAUGE == 4)
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - sqrt(chin1[i])), 2.0 );

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,127 @@
#ifndef BSSN_RHS_CUDA_H
#define BSSN_RHS_CUDA_H
#ifdef __cplusplus
extern "C" {
#endif
enum {
BSSN_CUDA_STATE_COUNT = 24,
BSSN_CUDA_MATTER_COUNT = 10
};
int f_compute_rhs_bssn(int *ex, double &T,
double *X, double *Y, double *Z,
double *chi, double *trK,
double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
double *Gamx, double *Gamy, double *Gamz,
double *Lap, double *betax, double *betay, double *betaz,
double *dtSfx, double *dtSfy, double *dtSfz,
double *chi_rhs, double *trK_rhs,
double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
double *rho, double *Sx, double *Sy, double *Sz,
double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
int &Symmetry, int &Lev, double &eps, int &co);
int bssn_cuda_rk4_substep(void *block_tag,
int *ex, double *X, double *Y, double *Z,
double **state_host_in,
double **state_host_out,
double **matter_host,
const double *propspeed,
const double *soa_flat,
const double *bbox,
double &dT,
double &T,
int &RK4,
int &apply_bam_bc,
int &Symmetry,
int &Lev,
double &eps,
int &co,
int &use_zero_matter,
int &keep_resident_state,
int &apply_enforce_ga,
double &chitiny);
int bssn_cuda_copy_state_region_to_host(void *block_tag,
int state_index,
double *host_state,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz);
int bssn_cuda_copy_state_region_from_host(void *block_tag,
int state_index,
double *host_state,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz);
int bssn_cuda_download_resident_state(void *block_tag,
int *ex,
double **state_host_out);
int bssn_cuda_download_constraint_outputs(int *ex,
double **constraint_host_out);
int bssn_cuda_pack_state_region_to_host_buffer(void *block_tag,
int state_index,
double *host_buffer,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz);
int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
int state_index,
double *host_buffer,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz);
int bssn_cuda_pack_state_batch_to_host_buffer(void *block_tag,
int state_count,
double *host_buffer,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz);
int bssn_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
int state_count,
double *host_buffer,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz);
int bssn_cuda_download_state_subset(void *block_tag,
int *ex,
int subset_count,
const int *state_indices,
double **state_host_out);
int bssn_cuda_upload_state_subset(void *block_tag,
int *ex,
int subset_count,
const int *state_indices,
double **state_host_in);
int bssn_cuda_has_resident_state(void *block_tag);
void bssn_cuda_release_step_ctx(void *block_tag);
#ifdef __cplusplus
}
#endif
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -1,69 +1,53 @@
include makefile.inc
-include AMSS_NCKU_build.mk
ABE_TYPE ?= $(shell awk '/^[[:space:]]*\#define[[:space:]]+ABEtype/ {print $$3; exit}' macrodef.h 2>/dev/null)
ifeq ($(USE_TRANSFER_CACHE),auto)
ifeq ($(ABE_TYPE),0)
EFFECTIVE_USE_TRANSFER_CACHE = 1
else
EFFECTIVE_USE_TRANSFER_CACHE = 0
endif
else
EFFECTIVE_USE_TRANSFER_CACHE = $(USE_TRANSFER_CACHE)
endif
ifeq ($(USE_CXX_ESCALAR_KERNEL),1)
ifeq ($(ABE_TYPE),1)
EFFECTIVE_USE_CXX_ESCALAR_KERNEL = 1
else
EFFECTIVE_USE_CXX_ESCALAR_KERNEL = 0
endif
else
EFFECTIVE_USE_CXX_ESCALAR_KERNEL = 0
endif
ifeq ($(EFFECTIVE_USE_CXX_ESCALAR_KERNEL),1)
ifeq ($(USE_CXX_KERNELS),0)
$(error USE_CXX_ESCALAR_KERNEL=1 requires USE_CXX_KERNELS=1 because bssn_escalar_rhs_c.C reuses the C BSSN kernel)
endif
endif
## polint(ordn=6) kernel selector:
## 1 (default): barycentric fast path
## 0 : fallback to Neville path
POLINT6_USE_BARY ?= 1
POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
TRANSFER_CACHE_FLAG = -DBSSN_USE_TRANSFER_CACHE=$(EFFECTIVE_USE_TRANSFER_CACHE)
ESCALAR_KERNEL_FLAG = -DBSSN_USE_ESCALAR_C_KERNEL=$(EFFECTIVE_USE_CXX_ESCALAR_KERNEL)
## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
## make -> opt (PGO-guided, maximum performance)
## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data)
PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
include makefile.inc
## polint(ordn=6) kernel selector:
## 1 (default): barycentric fast path
## 0 : fallback to Neville path
POLINT6_USE_BARY ?= 1
POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
## make -> opt (PGO-guided, maximum performance)
## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data)
PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
ifeq ($(TOOLCHAIN),intel)
OMP_FLAG = -qopenmp
ifeq ($(PGO_MODE),instrument)
## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) \
$(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG)
f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
else
## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \
## PGO has been turned off, now tested and found to be negative optimization
## INTERP_LB_FLAGS has been turned off too, now tested and found to be negative optimization
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) \
$(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG)
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
endif
## Intel Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-align array64byte -fpp $(MKL_INC) $(POLINT6_FLAG)
else
## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \
## PGO has been turned off, now tested and found to be negative optimization
## INTERP_LB_FLAGS has been turned off too, now tested and found to be negative optimization
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
-align array64byte -fpp $(MKL_INC) $(POLINT6_FLAG)
endif
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=$(TP_PROFDATA) \
-Dfortran3 -Dnewc $(MKL_INC)
else
## NVHPC defaults: mpicc/mpicxx/mpifort wrappers
## PGO_MODE is ignored in this branch.
OMP_FLAG = -mp
CXXAPPFLAGS = -O3 -tp=host -Mcache_align -Mfma \
-Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
f90appflags = -O3 -tp=host -Mcache_align -Mfma -Mpreprocess \
$(MKL_INC) $(POLINT6_FLAG)
TP_OPTFLAGS = -O3 -tp=host -Mcache_align -Mfma \
-Dfortran3 -Dnewc $(MKL_INC)
endif
.SUFFIXES: .o .f90 .C .for .cu
@@ -79,12 +63,20 @@ endif
.cu.o:
$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
# CUDA rewrite of BSSN RHS (drop-in replacement for bssn_rhs_c + stencil helpers)
bssn_rhs_cuda.o: bssn_rhs_cuda.cu bssn_rhs.h macrodef.h
$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
# CUDA rewrite of Z4C Cartesian RHS
z4c_rhs_cuda.o: z4c_rhs_cuda.cu z4c_rhs_cuda.h bssn_rhs.h macrodef.h ricci_gamma.h
$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
# C rewrite of BSSN RHS kernel and helpers
bssn_rhs_c.o: bssn_rhs_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
fderivs_c.o: fderivs_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
bssn_rhs_c.o: bssn_rhs_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
fderivs_c.o: fderivs_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
fdderivs_c.o: fdderivs_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
@@ -92,48 +84,83 @@ fdderivs_c.o: fdderivs_c.C
kodiss_c.o: kodiss_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
lopsided_c.o: lopsided_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
lopsided_kodis_c.o: lopsided_kodis_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
#interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
# ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
lopsided_c.o: lopsided_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-fprofile-instr-use=$(TP_PROFDATA) \
-Dfortran3 -Dnewc -I${MKLROOT}/include
lopsided_kodis_c.o: lopsided_kodis_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
z4c_rhs_c.o: z4c_rhs_c.C
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
#interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
# ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
TwoPunctures.o: TwoPunctures.C
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
${CXX} $(TP_OPTFLAGS) $(OMP_FLAG) -c $< -o $@
TwoPunctureABE.o: TwoPunctureABE.C
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
${CXX} $(TP_OPTFLAGS) $(OMP_FLAG) -c $< -o $@
# Input files
## CUDA BSSN RHS switch
## 1 : use the rewritten CUDA bssn_rhs backend
## 0 : keep the normal CPU/Fortran selection below
USE_CUDA_BSSN ?= 0
USE_CUDA_Z4C ?= 0
CXXAPPFLAGS += -DUSE_CUDA_BSSN=$(USE_CUDA_BSSN)
CUDA_APP_FLAGS += -DUSE_CUDA_BSSN=$(USE_CUDA_BSSN)
CXXAPPFLAGS += -DUSE_CUDA_Z4C=$(USE_CUDA_Z4C)
CUDA_APP_FLAGS += -DUSE_CUDA_Z4C=$(USE_CUDA_Z4C)
## Kernel implementation switch (set USE_CXX_KERNELS=0 to fall back to Fortran)
ifeq ($(USE_CXX_KERNELS),0)
# Fortran mode: no C rewrite files; bssn_rhs.o is included via F90FILES below
CFILES =
else
# C++ mode (default): C rewrite of bssn/bssn-escalar rhs and helper kernels
CFILES = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o lopsided_kodis_c.o
ifeq ($(EFFECTIVE_USE_CXX_ESCALAR_KERNEL),1)
CFILES += bssn_escalar_rhs_c.o
endif
endif
## RK4 kernel switch (independent from USE_CXX_KERNELS)
ifeq ($(USE_CXX_RK4),1)
CFILES += rungekutta4_rout_c.o
RK4_F90_OBJ =
else
RK4_F90_OBJ = rungekutta4_rout.o
endif
ifeq ($(USE_CXX_KERNELS),0)
# Fortran mode: no C rewrite files; bssn_rhs.o is included via F90FILES below
CFILES_CPU =
else
# C++ mode (default): C rewrite of bssn_rhs and helper kernels
CFILES_CPU = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o lopsided_kodis_c.o
endif
CFILES_CUDA_BSSN = bssn_rhs_cuda.o
ifeq ($(USE_CUDA_BSSN),1)
CFILES = $(CFILES_CUDA_BSSN)
else
CFILES = $(CFILES_CPU)
endif
ifeq ($(USE_CUDA_Z4C),1)
CFILES += z4c_rhs_cuda.o
Z4C_F90_OBJ =
else ifeq ($(USE_CXX_Z4C_KERNELS),1)
CFILES += z4c_rhs_c.o
Z4C_F90_OBJ =
else
Z4C_F90_OBJ = Z4c_rhs.o
endif
## RK4 kernel switch (independent from USE_CXX_KERNELS)
ifeq ($(USE_CXX_RK4),1)
RK4_C_OBJ = rungekutta4_rout_c.o
RK4_F90_OBJ =
else
RK4_C_OBJ =
RK4_F90_OBJ = rungekutta4_rout.o
endif
CFILES += $(RK4_C_OBJ)
ABE_CUDA_CFILES = $(CFILES_CUDA_BSSN) z4c_rhs_cuda.o $(RK4_C_OBJ)
ABE_LDLIBS = $(LDLIBS)
ifeq ($(USE_CUDA_BSSN),1)
ABE_LDLIBS += -lcudart $(CUDA_LIB_PATH)
endif
ifeq ($(USE_CUDA_Z4C),1)
ABE_LDLIBS += -lcudart $(CUDA_LIB_PATH)
endif
C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
cgh.o bssn_class.o surface_integral.o ShellPatch.o\
@@ -142,7 +169,7 @@ C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o
C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
#C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
cgh.o surface_integral.o ShellPatch.o\
bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
@@ -150,13 +177,13 @@ C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o
NullShellPatch2_Evo.o \
bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o
F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
prolongrestrict_cell.o prolongrestrict_vertex.o\
$(RK4_F90_OBJ) diff_new.o kodiss.o kodiss_sh.o\
lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\
fadmquantites_bssn.o Z4c_rhs.o Z4c_rhs_ss.o point_diff_new_sh.o\
F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
prolongrestrict_cell.o prolongrestrict_vertex.o\
$(RK4_F90_OBJ) diff_new.o kodiss.o kodiss_sh.o\
lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\
fadmquantites_bssn.o $(Z4C_F90_OBJ) Z4c_rhs_ss.o point_diff_new_sh.o\
cpbc.o getnp4old.o NullEvol.o initial_null.o initial_maxwell.o\
getnpem2.o empart.o NullNews.o fourdcurvature.o\
bssn2adm.o adm_constraint.o adm_ricci_gamma.o\
@@ -180,10 +207,10 @@ initial_guess.o Newton.o Jacobian.o ilucg.o IntPnts0.o IntPnts.o
TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o
CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o
#CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o
# file dependences
$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh
$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(ABE_CUDA_CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh
$(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\
@@ -194,7 +221,7 @@ $(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
empart.h NullNews.h kodiss.h Parallel_bam.h ricci_gamma.h\
initial_null2.h NullShellPatch2.h
$(C++FILES_GPU): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
#$(C++FILES_GPU): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\
rungekutta4_rout.h var.h bssn_rhs.h sommerfeld_rout.h\
cgh.h surface_integral.h ShellPatch.h shellfunctions.h perf.h\
@@ -206,7 +233,7 @@ $(C++FILES_GPU): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h
$(AHFDOBJS): cctk.h cctk_Config.h cctk_Types.h cctk_Constants.h myglobal.h
$(C++FILES) $(C++FILES_GPU) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.h
$(C++FILES) $(C++FILES_GPU) $(CFILES) $(ABE_CUDA_CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.h
TwoPunctureFILES: TwoPunctures.h
@@ -216,13 +243,18 @@ misc.o : zbesh.o
# projects
ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS)
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(ABE_LDLIBS)
ABE_CUDA: USE_CUDA_BSSN=1
ABE_CUDA: USE_CUDA_Z4C=1
ABE_CUDA: $(C++FILES) $(ABE_CUDA_CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(ABE_CUDA_CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS) -lcudart $(CUDA_LIB_PATH)
ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
#ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
# $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
TwoPunctureABE: $(TwoPunctureFILES)
$(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS)
$(CLINKER) $(TP_OPTFLAGS) $(OMP_FLAG) -o $@ $(TwoPunctureFILES) $(LDLIBS)
clean:
rm *.o ABE ABEGPU TwoPunctureABE make.log -f
rm *.o ABE ABE_CUDA ABEGPU TwoPunctureABE make.log -f

View File

@@ -1,28 +1,7 @@
## GCC version (commented out)
## filein = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
## filein = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
## LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
## Intel oneAPI version with oneMKL (Optimized for performance)
filein = -I/usr/include/ -I${MKLROOT}/include
## Using sequential MKL (OpenMP disabled for better single-threaded performance)
## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
## Memory allocator switch
## 1 (default) : link Intel oneTBB allocator (libtbbmalloc)
## 0 : use system default allocator (ptmalloc)
USE_TBBMALLOC ?= 1
TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
ifneq ($(wildcard $(TBBMALLOC_SO)),)
TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
else
TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
endif
ifeq ($(USE_TBBMALLOC),1)
LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
endif
## Toolchain selection
## nvhpc : NVIDIA HPC SDK + CUDA-aware MPI (default)
## intel : Intel oneAPI toolchain (legacy path)
TOOLCHAIN ?= nvhpc
## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
## opt : (default) maximum performance with PGO profile-guided optimization
@@ -43,35 +22,70 @@ else
INTERP_LB_FLAGS =
endif
MKLROOT ?= /home/intel/oneapi/mkl/latest
MKL_LIBDIR ?= $(MKLROOT)/lib/intel64
MKL_INC ?= -I$(MKLROOT)/include
NVHPC_ROOT ?= /home/nvidia/hpc_sdk/Linux_x86_64/25.11
CUDA_HOME ?= $(NVHPC_ROOT)/cuda
CUDA_ARCH ?= sm_80
## Kernel implementation switch
## 1 (default) : use C++ rewrite of bssn_rhs and helper kernels (faster)
## 0 : fall back to original Fortran kernels
USE_CXX_KERNELS ?= 1
## BSSN-EScalar RHS switch
## 1 (default) : use BSSN-EScalar C wrapper on the normal patch path
## 0 : keep the original Fortran BSSN-EScalar RHS for precision-safe runs
## Note: this requires USE_CXX_KERNELS=1 because the wrapper reuses the C BSSN kernel.
USE_CXX_ESCALAR_KERNEL ?= 1
## Cached transfer switch
## auto (default): enable for BSSN vacuum, keep other paths on the safe uncached path
## 1 : force cached Sync/Restrict/OutBd transfer on evolution hot paths
## 0 : force the original uncached transfer path
USE_TRANSFER_CACHE ?= auto
## Z4C Cartesian RHS kernel switch
## 1 (default) : use C++ rewrite of Z4c_rhs (main Cartesian path faster)
## 0 : use original Fortran Z4c_rhs.o
USE_CXX_Z4C_KERNELS ?= 1
## RK4 kernel implementation switch
## 1 (default) : use C/C++ rewrite of rungekutta4_rout (for optimization experiments)
## 0 : use original Fortran rungekutta4_rout.o
USE_CXX_RK4 ?= 1
## Memory allocator switch
## 1 (default) : link Intel oneTBB allocator (libtbbmalloc)
## 0 : use system default allocator (ptmalloc)
USE_TBBMALLOC ?= 1
TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
ifneq ($(wildcard $(TBBMALLOC_SO)),)
TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
else
TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
endif
ifeq ($(TOOLCHAIN),intel)
f90 = ifx
f77 = ifx
CXX = icpx
CC = icx
CLINKER = mpiicpx
filein = -I/usr/include/ $(MKL_INC) -I$(CUDA_HOME)/include
LDLIBS = -L$(MKL_LIBDIR) -Wl,-rpath,$(MKL_LIBDIR) \
-lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
-lifcore -limf -liomp5 -lpthread -lm -ldl \
-L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64 -lcuda -lcudart
else ifeq ($(TOOLCHAIN),nvhpc)
f90 = mpifort
f77 = mpifort
CXX = mpicxx
CC = mpicc
CLINKER = mpicxx
Cu = nvcc
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
#CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc
filein = -I/usr/include/ $(MKL_INC) -I$(CUDA_HOME)/include
LDLIBS = -L$(MKL_LIBDIR) -Wl,-rpath,$(MKL_LIBDIR) \
-lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
-lpthread -lm -ldl \
-L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64 -lcuda -lcudart \
-fortranlibs
endif
ifeq ($(USE_TBBMALLOC),1)
LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
endif
Cu = $(NVHPC_ROOT)/compilers/bin/nvcc
CUDA_LIB_PATH = -L$(CUDA_HOME)/lib64 -I$(CUDA_HOME)/include
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc -arch=$(CUDA_ARCH)

View File

@@ -0,0 +1,725 @@
#include "macrodef.h"
#include "bssn_rhs.h"
#include "fmisc.h"
#include "ricci_gamma.h"
#include "share_func.h"
#include "tool.h"
#include <vector>
#ifdef fortran1
#define f_constraint_bssn constraint_bssn
#define f_z4c_rhs_point z4c_rhs_point
#endif
#ifdef fortran2
#define f_constraint_bssn CONSTRAINT_BSSN
#define f_z4c_rhs_point Z4C_RHS_POINT
#endif
#ifdef fortran3
#define f_constraint_bssn constraint_bssn_
#define f_z4c_rhs_point z4c_rhs_point_
#endif
extern "C" void f_constraint_bssn(int *, double *, double *, double *,
double *, double *,
double *, double *, double *, double *, double *, double *,
double *, double *, double *, double *, double *, double *,
double *, double *, double *,
double *, double *, double *, double *, double *, double *, double *, double *, double *, double *,
double *, double *, double *, double *, double *, double *,
double *, double *, double *, double *, double *, double *,
double *, double *, double *, double *, double *, double *,
double *, double *, double *, double *, double *, double *, double *, double *,
double *, double *, double *,
int &);
extern "C" void f_z4c_rhs_point(
double &A11,
double &A12,
double &A13,
double &A22,
double &A23,
double &A33,
double &alpha,
double &B1,
double &B2,
double &B3,
double &beta1,
double &beta2,
double &beta3,
double &chi,
double &chiDivFloor,
double &da1,
double &dA111,
double &dA112,
double &dA113,
double &dA122,
double &dA123,
double &dA133,
double &da2,
double &dA211,
double &dA212,
double &dA213,
double &dA222,
double &dA223,
double &dA233,
double &da3,
double &dA311,
double &dA312,
double &dA313,
double &dA322,
double &dA323,
double &dA333,
double &db11,
double &dB11,
double &db12,
double &dB12,
double &db13,
double &dB13,
double &db21,
double &dB21,
double &db22,
double &dB22,
double &db23,
double &dB23,
double &db31,
double &dB31,
double &db32,
double &dB32,
double &db33,
double &dB33,
double &dchi1,
double &dchi2,
double &dchi3,
double &dda11,
double &dda12,
double &dda13,
double &dda22,
double &dda23,
double &dda33,
double &ddb111,
double &ddb112,
double &ddb113,
double &ddb121,
double &ddb122,
double &ddb123,
double &ddb131,
double &ddb132,
double &ddb133,
double &ddb221,
double &ddb222,
double &ddb223,
double &ddb231,
double &ddb232,
double &ddb233,
double &ddb331,
double &ddb332,
double &ddb333,
double &ddchi11,
double &ddchi12,
double &ddchi13,
double &ddchi22,
double &ddchi23,
double &ddchi33,
double &deldelg1111,
double &deldelg1112,
double &deldelg1113,
double &deldelg1122,
double &deldelg1123,
double &deldelg1133,
double &deldelg1211,
double &deldelg1212,
double &deldelg1213,
double &deldelg1222,
double &deldelg1223,
double &deldelg1233,
double &deldelg1311,
double &deldelg1312,
double &deldelg1313,
double &deldelg1322,
double &deldelg1323,
double &deldelg1333,
double &deldelg2211,
double &deldelg2212,
double &deldelg2213,
double &deldelg2222,
double &deldelg2223,
double &deldelg2233,
double &deldelg2311,
double &deldelg2312,
double &deldelg2313,
double &deldelg2322,
double &deldelg2323,
double &deldelg2333,
double &deldelg3311,
double &deldelg3312,
double &deldelg3313,
double &deldelg3322,
double &deldelg3323,
double &deldelg3333,
double &delG11,
double &delg111,
double &delg112,
double &delg113,
double &delG12,
double &delg122,
double &delg123,
double &delG13,
double &delg133,
double &delG21,
double &delg211,
double &delg212,
double &delg213,
double &delG22,
double &delg222,
double &delg223,
double &delG23,
double &delg233,
double &delG31,
double &delg311,
double &delg312,
double &delg313,
double &delG32,
double &delg322,
double &delg323,
double &delG33,
double &delg333,
double &dKhat1,
double &dKhat2,
double &dKhat3,
double &dTheta1,
double &dTheta2,
double &dTheta3,
double &G1,
double &g11,
double &g12,
double &g13,
double &G2,
double &g22,
double &g23,
double &G3,
double &g33,
double &kappa1,
double &kappa2,
double &Khat,
double &rA11,
double &rA12,
double &rA13,
double &rA22,
double &rA23,
double &rA33,
double &rchi,
double &rG1,
double &rg11,
double &rg12,
double &rg13,
double &rG2,
double &rg22,
double &rg23,
double &rG3,
double &rg33,
double &rKhat,
double &rTheta,
double &Theta);
static inline void z4c_contract_gamma(
const double gxx, const double gxy, const double gxz,
const double gyy, const double gyz, const double gzz,
const double gxxx, const double gxyx, const double gxzx,
const double gyyx, const double gyzx, const double gzzx,
const double gxxy, const double gxyy, const double gxzy,
const double gyyy, const double gyzy, const double gzzy,
const double gxxz, const double gxyz, const double gxzz,
const double gyyz, const double gyzz, const double gzzz,
double &Gamxa, double &Gamya, double &Gamza)
{
double det = gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz -
gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz;
const double gupxx = (gyy * gzz - gyz * gyz) / det;
const double gupxy = -(gxy * gzz - gyz * gxz) / det;
const double gupxz = (gxy * gyz - gyy * gxz) / det;
const double gupyy = (gxx * gzz - gxz * gxz) / det;
const double gupyz = -(gxx * gyz - gxy * gxz) / det;
const double gupzz = (gxx * gyy - gxy * gxy) / det;
const double Gamxxx = 0.5 * (gupxx * gxxx + gupxy * (2.0 * gxyx - gxxy) + gupxz * (2.0 * gxzx - gxxz));
const double Gamyxx = 0.5 * (gupxy * gxxx + gupyy * (2.0 * gxyx - gxxy) + gupyz * (2.0 * gxzx - gxxz));
const double Gamzxx = 0.5 * (gupxz * gxxx + gupyz * (2.0 * gxyx - gxxy) + gupzz * (2.0 * gxzx - gxxz));
const double Gamxyy = 0.5 * (gupxx * (2.0 * gxyy - gyyx) + gupxy * gyyy + gupxz * (2.0 * gyzy - gyyz));
const double Gamyyy = 0.5 * (gupxy * (2.0 * gxyy - gyyx) + gupyy * gyyy + gupyz * (2.0 * gyzy - gyyz));
const double Gamzyy = 0.5 * (gupxz * (2.0 * gxyy - gyyx) + gupyz * gyyy + gupzz * (2.0 * gyzy - gyyz));
const double Gamxzz = 0.5 * (gupxx * (2.0 * gxzz - gzzx) + gupxy * (2.0 * gyzz - gzzy) + gupxz * gzzz);
const double Gamyzz = 0.5 * (gupxy * (2.0 * gxzz - gzzx) + gupyy * (2.0 * gyzz - gzzy) + gupyz * gzzz);
const double Gamzzz = 0.5 * (gupxz * (2.0 * gxzz - gzzx) + gupyz * (2.0 * gyzz - gzzy) + gupzz * gzzz);
const double Gamxxy = 0.5 * (gupxx * gxxy + gupxy * gyyx + gupxz * (gxzy + gyzx - gxyz));
const double Gamyxy = 0.5 * (gupxy * gxxy + gupyy * gyyx + gupyz * (gxzy + gyzx - gxyz));
const double Gamzxy = 0.5 * (gupxz * gxxy + gupyz * gyyx + gupzz * (gxzy + gyzx - gxyz));
const double Gamxxz = 0.5 * (gupxx * gxxz + gupxy * (gxyz + gyzx - gxzy) + gupxz * gzzx);
const double Gamyxz = 0.5 * (gupxy * gxxz + gupyy * (gxyz + gyzx - gxzy) + gupyz * gzzx);
const double Gamzxz = 0.5 * (gupxz * gxxz + gupyz * (gxyz + gyzx - gxzy) + gupzz * gzzx);
const double Gamxyz = 0.5 * (gupxx * (gxyz + gxzy - gyzx) + gupxy * gyyz + gupxz * gzzy);
const double Gamyyz = 0.5 * (gupxy * (gxyz + gxzy - gyzx) + gupyy * gyyz + gupyz * gzzy);
const double Gamzyz = 0.5 * (gupxz * (gxyz + gxzy - gyzx) + gupyz * gyyz + gupzz * gzzy);
Gamxa = gupxx * Gamxxx + gupyy * Gamxyy + gupzz * Gamxzz +
2.0 * (gupxy * Gamxxy + gupxz * Gamxxz + gupyz * Gamxyz);
Gamya = gupxx * Gamyxx + gupyy * Gamyyy + gupzz * Gamyzz +
2.0 * (gupxy * Gamyxy + gupxz * Gamyxz + gupyz * Gamyyz);
Gamza = gupxx * Gamzxx + gupyy * Gamzyy + gupzz * Gamzzz +
2.0 * (gupxy * Gamzxy + gupxz * Gamzxz + gupyz * Gamzyz);
}
static int compute_rhs_z4c_cartesian(
int *ex, double &T, double *X, double *Y, double *Z,
double *chi_state, double *chi_constraints, double *trK,
double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
double *Gamx, double *Gamy, double *Gamz,
double *Lap, double *betax, double *betay, double *betaz,
double *dtSfx, double *dtSfy, double *dtSfz,
double *TZ,
double *chi_rhs, double *trK_rhs,
double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
double *TZ_rhs,
double *rho, double *Sx, double *Sy, double *Sz,
double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
double *Hcon, double *Mxcon, double *Mycon, double *Mzcon, double *Gmxcon, double *Gmycon, double *Gmzcon,
int &Symmetry, int &Lev, double &eps, int &co)
{
(void)T;
const int nx = ex[0];
const int ny = ex[1];
const int nz = ex[2];
const int all = nx * ny * nz;
double alpn1[all], chin1[all], gxx[all], gyy[all], gzz[all];
double chix[all], chiy[all], chiz[all], chixx[all], chixy[all], chixz[all], chiyy[all], chiyz[all], chizz[all];
double gxxx[all], gxyx[all], gxzx[all], gyyx[all], gyzx[all], gzzx[all];
double gxxy[all], gxyy[all], gxzy[all], gyyy[all], gyzy[all], gzzy[all];
double gxxz[all], gxyz[all], gxzz[all], gyyz[all], gyzz[all], gzzz[all];
double gxxxx[all], gxxxy[all], gxxxz[all], gxxyy[all], gxxyz[all], gxxzz[all];
double gxyxx[all], gxyxy[all], gxyxz[all], gxyyy[all], gxyyz[all], gxyzz[all];
double gxzxx[all], gxzxy[all], gxzxz[all], gxzyy[all], gxzyz[all], gxzzz[all];
double gyyxx[all], gyyxy[all], gyyxz[all], gyyyy[all], gyyyz[all], gyyzz[all];
double gyzxx[all], gyzxy[all], gyzxz[all], gyzyy[all], gyzyz[all], gyzzz[all];
double gzzxx[all], gzzxy[all], gzzxz[all], gzzyy[all], gzzyz[all], gzzzz[all];
double Lapx[all], Lapy[all], Lapz[all], Lapxx[all], Lapxy[all], Lapxz[all], Lapyy[all], Lapyz[all], Lapzz[all];
double betaxx[all], betaxy[all], betaxz[all], betayx[all], betayy[all], betayz[all], betazx[all], betazy[all], betazz[all];
double dBxx[all], dBxy[all], dBxz[all], dByx[all], dByy[all], dByz[all], dBzx[all], dBzy[all], dBzz[all];
double sfxxx[all], sfxxy[all], sfxxz[all], sfxyy[all], sfxyz[all], sfxzz[all];
double sfyxx[all], sfyxy[all], sfyxz[all], sfyyy[all], sfyyz[all], sfyzz[all];
double sfzxx[all], sfzxy[all], sfzxz[all], sfzyy[all], sfzyz[all], sfzzz[all];
double Gamxx[all], Gamxy[all], Gamxz[all], Gamyx[all], Gamyy[all], Gamyz[all], Gamzx[all], Gamzy[all], Gamzz[all];
double Kx[all], Ky[all], Kz[all], TZx[all], TZy[all], TZz[all];
double Axxx[all], Axxy[all], Axxz[all], Axyx[all], Axyy[all], Axyz[all];
double Axzx[all], Axzy[all], Axzz[all], Ayyx[all], Ayyy[all], Ayyz[all];
double Ayzx[all], Ayzy[all], Ayzz[all], Azzx[all], Azzy[all], Azzz[all];
const double SSS[3] = {1.0, 1.0, 1.0};
const double AAS[3] = {-1.0, -1.0, 1.0};
const double ASA[3] = {-1.0, 1.0, -1.0};
const double SAA[3] = {1.0, -1.0, -1.0};
const double ASS[3] = {-1.0, 1.0, 1.0};
const double SAS[3] = {1.0, -1.0, 1.0};
const double SSA[3] = {1.0, 1.0, -1.0};
const double ONE = 1.0;
const double TWO = 2.0;
const double ZEO = 0.0;
double chiDivfloor = 1.0e-5;
double kappa1 = 2.0e-2;
double kappa2 = 0.0;
double FF = 0.75;
double eta = 2.0;
for (int idx = 0; idx < all; ++idx)
{
alpn1[idx] = Lap[idx] + ONE;
chin1[idx] = chi_state[idx] + ONE;
gxx[idx] = dxx[idx] + ONE;
gyy[idx] = dyy[idx] + ONE;
gzz[idx] = dzz[idx] + ONE;
}
fderivs(ex, betax, betaxx, betaxy, betaxz, X, Y, Z, -1.0, 1.0, 1.0, Symmetry, Lev);
fderivs(ex, betay, betayx, betayy, betayz, X, Y, Z, 1.0, -1.0, 1.0, Symmetry, Lev);
fderivs(ex, betaz, betazx, betazy, betazz, X, Y, Z, 1.0, 1.0, -1.0, Symmetry, Lev);
fderivs(ex, dtSfx, dBxx, dBxy, dBxz, X, Y, Z, -1.0, 1.0, 1.0, Symmetry, Lev);
fderivs(ex, dtSfy, dByx, dByy, dByz, X, Y, Z, 1.0, -1.0, 1.0, Symmetry, Lev);
fderivs(ex, dtSfz, dBzx, dBzy, dBzz, X, Y, Z, 1.0, 1.0, -1.0, Symmetry, Lev);
fderivs(ex, chi_state, chix, chiy, chiz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
fderivs(ex, dxx, gxxx, gxxy, gxxz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
fderivs(ex, gxy, gxyx, gxyy, gxyz, X, Y, Z, -1.0, -1.0, 1.0, Symmetry, Lev);
fderivs(ex, gxz, gxzx, gxzy, gxzz, X, Y, Z, -1.0, 1.0, -1.0, Symmetry, Lev);
fderivs(ex, dyy, gyyx, gyyy, gyyz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
fderivs(ex, gyz, gyzx, gyzy, gyzz, X, Y, Z, 1.0, -1.0, -1.0, Symmetry, Lev);
fderivs(ex, dzz, gzzx, gzzy, gzzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
fdderivs(ex, dxx, gxxxx, gxxxy, gxxxz, gxxyy, gxxyz, gxxzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
fdderivs(ex, dyy, gyyxx, gyyxy, gyyxz, gyyyy, gyyyz, gyyzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
fdderivs(ex, dzz, gzzxx, gzzxy, gzzxz, gzzyy, gzzyz, gzzzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
fdderivs(ex, gxy, gxyxx, gxyxy, gxyxz, gxyyy, gxyyz, gxyzz, X, Y, Z, -1.0, -1.0, 1.0, Symmetry, Lev);
fdderivs(ex, gxz, gxzxx, gxzxy, gxzxz, gxzyy, gxzyz, gxzzz, X, Y, Z, -1.0, 1.0, -1.0, Symmetry, Lev);
fdderivs(ex, gyz, gyzxx, gyzxy, gyzxz, gyzyy, gyzyz, gyzzz, X, Y, Z, 1.0, -1.0, -1.0, Symmetry, Lev);
fderivs(ex, Gamx, Gamxx, Gamxy, Gamxz, X, Y, Z, -1.0, 1.0, 1.0, Symmetry, Lev);
fderivs(ex, Gamy, Gamyx, Gamyy, Gamyz, X, Y, Z, 1.0, -1.0, 1.0, Symmetry, Lev);
fderivs(ex, Gamz, Gamzx, Gamzy, Gamzz, X, Y, Z, 1.0, 1.0, -1.0, Symmetry, Lev);
fderivs(ex, Lap, Lapx, Lapy, Lapz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
fderivs(ex, trK, Kx, Ky, Kz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
fderivs(ex, TZ, TZx, TZy, TZz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
fdderivs(ex, betax, sfxxx, sfxxy, sfxxz, sfxyy, sfxyz, sfxzz, X, Y, Z, -1.0, 1.0, 1.0, Symmetry, Lev);
fdderivs(ex, betay, sfyxx, sfyxy, sfyxz, sfyyy, sfyyz, sfyzz, X, Y, Z, 1.0, -1.0, 1.0, Symmetry, Lev);
fdderivs(ex, betaz, sfzxx, sfzxy, sfzxz, sfzyy, sfzyz, sfzzz, X, Y, Z, 1.0, 1.0, -1.0, Symmetry, Lev);
fdderivs(ex, chi_state, chixx, chixy, chixz, chiyy, chiyz, chizz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
fdderivs(ex, Lap, Lapxx, Lapxy, Lapxz, Lapyy, Lapyz, Lapzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
fderivs(ex, Axx, Axxx, Axxy, Axxz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
fderivs(ex, Axy, Axyx, Axyy, Axyz, X, Y, Z, -1.0, -1.0, 1.0, Symmetry, Lev);
fderivs(ex, Axz, Axzx, Axzy, Axzz, X, Y, Z, -1.0, 1.0, -1.0, Symmetry, Lev);
fderivs(ex, Ayy, Ayyx, Ayyy, Ayyz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
fderivs(ex, Ayz, Ayzx, Ayzy, Ayzz, X, Y, Z, 1.0, -1.0, -1.0, Symmetry, Lev);
fderivs(ex, Azz, Azzx, Azzy, Azzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
for (int idx = 0; idx < all; ++idx)
{
double point_kappa1 = 0.0;
f_z4c_rhs_point(
Axx[idx], Axy[idx], Axz[idx], Ayy[idx], Ayz[idx], Azz[idx],
alpn1[idx], dtSfx[idx], dtSfy[idx], dtSfz[idx],
betax[idx], betay[idx], betaz[idx],
chin1[idx], chiDivfloor,
Lapx[idx],
Axxx[idx], Axyx[idx], Axzx[idx], Ayyx[idx], Ayzx[idx], Azzx[idx],
Lapy[idx],
Axxy[idx], Axyy[idx], Axzy[idx], Ayyy[idx], Ayzy[idx], Azzy[idx],
Lapz[idx],
Axxz[idx], Axyz[idx], Axzz[idx], Ayyz[idx], Ayzz[idx], Azzz[idx],
betaxx[idx], dBxx[idx], betayx[idx], dByx[idx], betazx[idx], dBzx[idx],
betaxy[idx], dBxy[idx], betayy[idx], dByy[idx], betazy[idx], dBzy[idx],
betaxz[idx], dBxz[idx], betayz[idx], dByz[idx], betazz[idx], dBzz[idx],
chix[idx], chiy[idx], chiz[idx],
Lapxx[idx], Lapxy[idx], Lapxz[idx], Lapyy[idx], Lapyz[idx], Lapzz[idx],
sfxxx[idx], sfyxx[idx], sfzxx[idx],
sfxxy[idx], sfyxy[idx], sfzxy[idx],
sfxxz[idx], sfyxz[idx], sfzxz[idx],
sfxyy[idx], sfyyy[idx], sfzyy[idx],
sfxyz[idx], sfyyz[idx], sfzyz[idx],
sfxzz[idx], sfyzz[idx], sfzzz[idx],
chixx[idx], chixy[idx], chixz[idx], chiyy[idx], chiyz[idx], chizz[idx],
gxxxx[idx], gxyxx[idx], gxzxx[idx], gyyxx[idx], gyzxx[idx], gzzxx[idx],
gxxxy[idx], gxyxy[idx], gxzxy[idx], gyyxy[idx], gyzxy[idx], gzzxy[idx],
gxxxz[idx], gxyxz[idx], gxzxz[idx], gyyxz[idx], gyzxz[idx], gzzxz[idx],
gxxyy[idx], gxyyy[idx], gxzyy[idx], gyyyy[idx], gyzyy[idx], gzzyy[idx],
gxxyz[idx], gxyyz[idx], gxzyz[idx], gyyyz[idx], gyzyz[idx], gzzyz[idx],
gxxzz[idx], gxyzz[idx], gxzzz[idx], gyyzz[idx], gyzzz[idx], gzzzz[idx],
Gamxx[idx], gxxx[idx], gxyx[idx], gxzx[idx],
Gamyx[idx], gyyx[idx], gyzx[idx],
Gamzx[idx], gzzx[idx],
Gamxy[idx], gxxy[idx], gxyy[idx], gxzy[idx],
Gamyy[idx], gyyy[idx], gyzy[idx],
Gamzy[idx], gzzy[idx],
Gamxz[idx], gxxz[idx], gxyz[idx], gxzz[idx],
Gamyz[idx], gyyz[idx], gyzz[idx],
Gamzz[idx], gzzz[idx],
Kx[idx], Ky[idx], Kz[idx],
TZx[idx], TZy[idx], TZz[idx],
Gamx[idx], gxx[idx], gxy[idx], gxz[idx],
Gamy[idx], gyy[idx], gyz[idx],
Gamz[idx], gzz[idx],
point_kappa1, kappa2,
trK[idx],
Axx_rhs[idx], Axy_rhs[idx], Axz_rhs[idx], Ayy_rhs[idx], Ayz_rhs[idx], Azz_rhs[idx],
chi_rhs[idx],
Gamx_rhs[idx], gxx_rhs[idx], gxy_rhs[idx], gxz_rhs[idx],
Gamy_rhs[idx], gyy_rhs[idx], gyz_rhs[idx],
Gamz_rhs[idx], gzz_rhs[idx], trK_rhs[idx], TZ_rhs[idx], TZ[idx]);
}
for (int idx = 0; idx < all; ++idx)
Lap_rhs[idx] = -TWO * alpn1[idx] * trK[idx];
#if (GAUGE == 0)
for (int idx = 0; idx < all; ++idx)
{
betax_rhs[idx] = FF * dtSfx[idx];
betay_rhs[idx] = FF * dtSfy[idx];
betaz_rhs[idx] = FF * dtSfz[idx];
dtSfx_rhs[idx] = Gamx_rhs[idx] - eta * dtSfx[idx];
dtSfy_rhs[idx] = Gamy_rhs[idx] - eta * dtSfy[idx];
dtSfz_rhs[idx] = Gamz_rhs[idx] - eta * dtSfz[idx];
}
#elif (GAUGE == 1)
for (int idx = 0; idx < all; ++idx)
{
betax_rhs[idx] = Gamx[idx] - eta * betax[idx];
betay_rhs[idx] = Gamy[idx] - eta * betay[idx];
betaz_rhs[idx] = Gamz[idx] - eta * betaz[idx];
dtSfx_rhs[idx] = ZEO;
dtSfy_rhs[idx] = ZEO;
dtSfz_rhs[idx] = ZEO;
}
#else
#error "z4c_rhs_c.C currently supports GAUGE == 0 or GAUGE == 1 for Z4C"
#endif
lopsided(ex, X, Y, Z, gxx, gxx_rhs, betax, betay, betaz, Symmetry, SSS);
lopsided(ex, X, Y, Z, gxy, gxy_rhs, betax, betay, betaz, Symmetry, AAS);
lopsided(ex, X, Y, Z, gxz, gxz_rhs, betax, betay, betaz, Symmetry, ASA);
lopsided(ex, X, Y, Z, gyy, gyy_rhs, betax, betay, betaz, Symmetry, SSS);
lopsided(ex, X, Y, Z, gyz, gyz_rhs, betax, betay, betaz, Symmetry, SAA);
lopsided(ex, X, Y, Z, gzz, gzz_rhs, betax, betay, betaz, Symmetry, SSS);
lopsided(ex, X, Y, Z, Axx, Axx_rhs, betax, betay, betaz, Symmetry, SSS);
lopsided(ex, X, Y, Z, Axy, Axy_rhs, betax, betay, betaz, Symmetry, AAS);
lopsided(ex, X, Y, Z, Axz, Axz_rhs, betax, betay, betaz, Symmetry, ASA);
lopsided(ex, X, Y, Z, Ayy, Ayy_rhs, betax, betay, betaz, Symmetry, SSS);
lopsided(ex, X, Y, Z, Ayz, Ayz_rhs, betax, betay, betaz, Symmetry, SAA);
lopsided(ex, X, Y, Z, Azz, Azz_rhs, betax, betay, betaz, Symmetry, SSS);
lopsided(ex, X, Y, Z, chi_state, chi_rhs, betax, betay, betaz, Symmetry, SSS);
lopsided(ex, X, Y, Z, trK, trK_rhs, betax, betay, betaz, Symmetry, SSS);
lopsided(ex, X, Y, Z, Gamx, Gamx_rhs, betax, betay, betaz, Symmetry, ASS);
lopsided(ex, X, Y, Z, Gamy, Gamy_rhs, betax, betay, betaz, Symmetry, SAS);
lopsided(ex, X, Y, Z, Gamz, Gamz_rhs, betax, betay, betaz, Symmetry, SSA);
lopsided(ex, X, Y, Z, Lap, Lap_rhs, betax, betay, betaz, Symmetry, SSS);
lopsided(ex, X, Y, Z, betax, betax_rhs, betax, betay, betaz, Symmetry, ASS);
lopsided(ex, X, Y, Z, betay, betay_rhs, betax, betay, betaz, Symmetry, SAS);
lopsided(ex, X, Y, Z, betaz, betaz_rhs, betax, betay, betaz, Symmetry, SSA);
#if (GAUGE == 0)
lopsided(ex, X, Y, Z, dtSfx, dtSfx_rhs, betax, betay, betaz, Symmetry, ASS);
lopsided(ex, X, Y, Z, dtSfy, dtSfy_rhs, betax, betay, betaz, Symmetry, SAS);
lopsided(ex, X, Y, Z, dtSfz, dtSfz_rhs, betax, betay, betaz, Symmetry, SSA);
#endif
lopsided(ex, X, Y, Z, TZ, TZ_rhs, betax, betay, betaz, Symmetry, SSS);
for (int idx = 0; idx < all; ++idx)
{
double Gamxa = 0.0, Gamya = 0.0, Gamza = 0.0;
z4c_contract_gamma(
gxx[idx], gxy[idx], gxz[idx], gyy[idx], gyz[idx], gzz[idx],
gxxx[idx], gxyx[idx], gxzx[idx], gyyx[idx], gyzx[idx], gzzx[idx],
gxxy[idx], gxyy[idx], gxzy[idx], gyyy[idx], gyzy[idx], gzzy[idx],
gxxz[idx], gxyz[idx], gxzz[idx], gyyz[idx], gyzz[idx], gzzz[idx],
Gamxa, Gamya, Gamza);
TZ_rhs[idx] -= alpn1[idx] * (TWO + kappa2) * kappa1 * TZ[idx];
trK_rhs[idx] += alpn1[idx] * kappa1 * (ONE - kappa2) * TZ[idx];
Gamx_rhs[idx] -= TWO * alpn1[idx] * kappa1 * (Gamx[idx] - Gamxa);
Gamy_rhs[idx] -= TWO * alpn1[idx] * kappa1 * (Gamy[idx] - Gamya);
Gamz_rhs[idx] -= TWO * alpn1[idx] * kappa1 * (Gamz[idx] - Gamza);
}
if (eps > 0.0)
{
kodis(ex, X, Y, Z, chi_state, chi_rhs, SSS, Symmetry, eps);
kodis(ex, X, Y, Z, trK, trK_rhs, SSS, Symmetry, eps);
kodis(ex, X, Y, Z, gxx, gxx_rhs, SSS, Symmetry, eps);
kodis(ex, X, Y, Z, gxy, gxy_rhs, AAS, Symmetry, eps);
kodis(ex, X, Y, Z, gxz, gxz_rhs, ASA, Symmetry, eps);
kodis(ex, X, Y, Z, gyy, gyy_rhs, SSS, Symmetry, eps);
kodis(ex, X, Y, Z, gyz, gyz_rhs, SAA, Symmetry, eps);
kodis(ex, X, Y, Z, gzz, gzz_rhs, SSS, Symmetry, eps);
kodis(ex, X, Y, Z, Axx, Axx_rhs, SSS, Symmetry, eps);
kodis(ex, X, Y, Z, Axy, Axy_rhs, AAS, Symmetry, eps);
kodis(ex, X, Y, Z, Axz, Axz_rhs, ASA, Symmetry, eps);
kodis(ex, X, Y, Z, Ayy, Ayy_rhs, SSS, Symmetry, eps);
kodis(ex, X, Y, Z, Ayz, Ayz_rhs, SAA, Symmetry, eps);
kodis(ex, X, Y, Z, Azz, Azz_rhs, SSS, Symmetry, eps);
kodis(ex, X, Y, Z, Gamx, Gamx_rhs, ASS, Symmetry, eps);
kodis(ex, X, Y, Z, Gamy, Gamy_rhs, SAS, Symmetry, eps);
kodis(ex, X, Y, Z, Gamz, Gamz_rhs, SSA, Symmetry, eps);
kodis(ex, X, Y, Z, Lap, Lap_rhs, SSS, Symmetry, eps);
kodis(ex, X, Y, Z, betax, betax_rhs, ASS, Symmetry, eps);
kodis(ex, X, Y, Z, betay, betay_rhs, SAS, Symmetry, eps);
kodis(ex, X, Y, Z, betaz, betaz_rhs, SSA, Symmetry, eps);
#if (GAUGE == 0)
kodis(ex, X, Y, Z, dtSfx, dtSfx_rhs, ASS, Symmetry, eps);
kodis(ex, X, Y, Z, dtSfy, dtSfy_rhs, SAS, Symmetry, eps);
kodis(ex, X, Y, Z, dtSfz, dtSfz_rhs, SSA, Symmetry, eps);
#endif
kodis(ex, X, Y, Z, TZ, TZ_rhs, SSS, Symmetry, eps);
}
if (co == 0)
{
#if (ABV == 0)
f_ricci_gamma(ex, X, Y, Z,
chi_constraints,
dxx, gxy, gxz, dyy, gyz, dzz,
Gamx, Gamy, Gamz,
Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
Symmetry);
#endif
f_constraint_bssn(ex, X, Y, Z,
chi_constraints, trK,
dxx, gxy, gxz, dyy, gyz, dzz,
Axx, Axy, Axz, Ayy, Ayz, Azz,
Gamx, Gamy, Gamz,
Lap, betax, betay, betaz, rho, Sx, Sy, Sz,
Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
Hcon, Mxcon, Mycon, Mzcon, Gmxcon, Gmycon, Gmzcon,
Symmetry);
}
return 0;
}
extern "C" int f_compute_rhs_Z4c(int *ex, double &T,
double *X, double *Y, double *Z,
double *chi, double *trK,
double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
double *Gamx, double *Gamy, double *Gamz,
double *Lap, double *betax, double *betay, double *betaz,
double *dtSfx, double *dtSfy, double *dtSfz,
double *TZ,
double *chi_rhs, double *trK_rhs,
double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
double *TZ_rhs,
double *rho, double *Sx, double *Sy, double *Sz,
double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
double *Hcon, double *Mxcon, double *Mycon, double *Mzcon, double *Gmxcon, double *Gmycon, double *Gmzcon,
int &Symmetry, int &Lev, double &eps, int &co)
{
return compute_rhs_z4c_cartesian(
ex, T, X, Y, Z,
chi, chi, trK,
dxx, gxy, gxz, dyy, gyz, dzz,
Axx, Axy, Axz, Ayy, Ayz, Azz,
Gamx, Gamy, Gamz,
Lap, betax, betay, betaz,
dtSfx, dtSfy, dtSfz,
TZ,
chi_rhs, trK_rhs,
gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs,
Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs,
Gamx_rhs, Gamy_rhs, Gamz_rhs,
Lap_rhs, betax_rhs, betay_rhs, betaz_rhs,
dtSfx_rhs, dtSfy_rhs, dtSfz_rhs,
TZ_rhs,
rho, Sx, Sy, Sz,
Sxx, Sxy, Sxz, Syy, Syz, Szz,
Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
Hcon, Mxcon, Mycon, Mzcon, Gmxcon, Gmycon, Gmzcon,
Symmetry, Lev, eps, co);
}
extern "C" int f_compute_rhs_Z4cnot(int *ex, double &T,
double *X, double *Y, double *Z,
double *chi, double *trK,
double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
double *Gamx, double *Gamy, double *Gamz,
double *Lap, double *betax, double *betay, double *betaz,
double *dtSfx, double *dtSfy, double *dtSfz,
double *TZ,
double *chi_rhs, double *trK_rhs,
double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
double *TZ_rhs,
double *rho, double *Sx, double *Sy, double *Sz,
double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
double *Hcon, double *Mxcon, double *Mycon, double *Mzcon, double *Gmxcon, double *Gmycon, double *Gmzcon,
int &Symmetry, int &Lev, double &eps, int &co, double &chitiny)
{
const int all = ex[0] * ex[1] * ex[2];
std::vector<double> chi_clamped(chi, chi + all);
f_lowerboundset(ex, chi_clamped.data(), chitiny);
const int ret = compute_rhs_z4c_cartesian(
ex, T, X, Y, Z,
chi_clamped.data(), chi, trK,
dxx, gxy, gxz, dyy, gyz, dzz,
Axx, Axy, Axz, Ayy, Ayz, Azz,
Gamx, Gamy, Gamz,
Lap, betax, betay, betaz,
dtSfx, dtSfy, dtSfz,
TZ,
chi_rhs, trK_rhs,
gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs,
Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs,
Gamx_rhs, Gamy_rhs, Gamz_rhs,
Lap_rhs, betax_rhs, betay_rhs, betaz_rhs,
dtSfx_rhs, dtSfy_rhs, dtSfz_rhs,
TZ_rhs,
rho, Sx, Sy, Sz,
Sxx, Sxy, Sxz, Syy, Syz, Szz,
Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
Hcon, Mxcon, Mycon, Mzcon, Gmxcon, Gmycon, Gmzcon,
Symmetry, Lev, eps, co);
if (ret != 0 || co != 0)
return ret;
#if (ABV == 0)
f_ricci_gamma(ex, X, Y, Z,
chi,
dxx, gxy, gxz, dyy, gyz, dzz,
Gamx, Gamy, Gamz,
Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
Symmetry);
#endif
f_constraint_bssn(ex, X, Y, Z,
chi, trK,
dxx, gxy, gxz, dyy, gyz, dzz,
Axx, Axy, Axz, Ayy, Ayz, Azz,
Gamx, Gamy, Gamz,
Lap, betax, betay, betaz, rho, Sx, Sy, Sz,
Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
Hcon, Mxcon, Mycon, Mzcon, Gmxcon, Gmycon, Gmzcon,
Symmetry);
return ret;
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,83 @@
#ifndef Z4C_RHS_CUDA_H
#define Z4C_RHS_CUDA_H
#ifdef __cplusplus
extern "C" {
#endif
enum {
Z4C_CUDA_STATE_COUNT = 25
};
int z4c_cuda_rk4_substep(void *block_tag,
int *ex, double *X, double *Y, double *Z,
double **state_host_in,
double **state_host_out,
const double *propspeed,
const double *soa_flat,
const double *bbox,
double &dT,
double &T,
int &RK4,
int &apply_bam_bc,
int &Symmetry,
int &Lev,
double &eps,
int &co,
int &keep_resident_state,
int &apply_enforce_ga,
double &chitiny);
int z4c_cuda_download_resident_state(void *block_tag,
int *ex,
double **state_host_out);
int z4c_cuda_pack_state_region_to_host_buffer(void *block_tag,
int state_index,
double *host_buffer,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz);
int z4c_cuda_unpack_state_region_from_host_buffer(void *block_tag,
int state_index,
double *host_buffer,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz);
int z4c_cuda_pack_state_batch_to_host_buffer(void *block_tag,
int state_count,
double *host_buffer,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz);
int z4c_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
int state_count,
double *host_buffer,
int *ex,
int i0, int j0, int k0,
int sx, int sy, int sz);
int z4c_cuda_download_state_subset(void *block_tag,
int *ex,
int subset_count,
const int *state_indices,
double **state_host_out);
int z4c_cuda_upload_state_subset(void *block_tag,
int *ex,
int subset_count,
const int *state_indices,
double **state_host_in);
int z4c_cuda_has_resident_state(void *block_tag);
void z4c_cuda_release_step_ctx(void *block_tag);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -1,211 +0,0 @@
# BSSN Build Config Migration
This note records the build-configuration fix needed when replacing
`AMSS_NCKU_Input.py` or `generate_macrodef.py` with a newer upstream version.
## Problem
`AMSS_NCKU_source/macrodef.h` is not the authoritative file used by normal
runs. `AMSS_NCKU_Program.py` first generates macro files under
`input_data.File_directory`, copies `AMSS_NCKU_source` to
`<File_directory>/AMSS_NCKU_source_copy`, then copies the generated macro files
into that copied source tree and compiles there.
Therefore, makefile logic must not depend only on the stale
`AMSS_NCKU_source/macrodef.h`. The actual equation path must be passed to the
copied build tree from the same generation step that creates `macrodef.h`.
The performance regression was caused by compiling/linking the
`BSSN-EScalar` C wrapper into BSSN vacuum builds. For BSSN vacuum (`ABEtype=0`),
the build must use:
```make
BSSN_USE_TRANSFER_CACHE=1
BSSN_USE_ESCALAR_C_KERNEL=0
```
and must not link `bssn_escalar_rhs_c.o`.
## Required Migration Steps
### 1. Add an ABE type helper in `generate_macrodef.py`
Add a helper that maps `input_data.Equation_Class` to the numeric `ABEtype`.
Use the same mapping as `macrodef.h`:
```python
def get_abe_type():
if ( input_data.Equation_Class == "BSSN" ):
return 0
elif ( input_data.Equation_Class == "BSSN-EScalar" ):
return 1
elif ( input_data.Equation_Class == "BSSN-EM" ):
return 3
elif ( input_data.Equation_Class == "Z4C" ):
return 2
else:
raise ValueError("Equation_Class setting error!!!")
```
Update `generate_macrodef_h()` to print `#define ABEtype {get_abe_type()}`
instead of duplicating the if/elif mapping.
### 2. Generate a makefile fragment
In `generate_macrodef.py`, add:
```python
def generate_build_config():
file1 = open(os.path.join(input_data.File_directory, "AMSS_NCKU_build.mk"), "w")
print("# Generated by generate_macrodef.py; do not edit manually.", file=file1)
print(f"ABE_TYPE := {get_abe_type()}", file=file1)
file1.close()
```
This file is the build-time authority for the equation path.
### 3. Call and copy the generated build config
In `AMSS_NCKU_Program.py`, after generating `macrodef.h` and `macrodef.fh`, call:
```python
generate_macrodef.generate_build_config()
print(" AMSS-NCKU build config AMSS_NCKU_build.mk has been generated. ")
```
When copying generated files into `AMSS_NCKU_source_copy`, also copy:
```python
build_config_path = os.path.join(File_directory, "AMSS_NCKU_build.mk")
shutil.copy2(build_config_path, AMSS_NCKU_source_copy)
```
### 4. Make the source makefile consume the generated config
At the top of `AMSS_NCKU_source/makefile`, after `include makefile.inc`, add:
```make
-include AMSS_NCKU_build.mk
ABE_TYPE ?= $(shell awk '/^[[:space:]]*\#define[[:space:]]+ABEtype/ {print $$3; exit}' macrodef.h 2>/dev/null)
```
The generated `AMSS_NCKU_build.mk` is used during normal Python-driven builds.
The fallback keeps manual source-tree builds usable.
### 5. Gate path-specific build options by `ABE_TYPE`
Use effective build switches:
```make
ifeq ($(USE_TRANSFER_CACHE),auto)
ifeq ($(ABE_TYPE),0)
EFFECTIVE_USE_TRANSFER_CACHE = 1
else
EFFECTIVE_USE_TRANSFER_CACHE = 0
endif
else
EFFECTIVE_USE_TRANSFER_CACHE = $(USE_TRANSFER_CACHE)
endif
ifeq ($(USE_CXX_ESCALAR_KERNEL),1)
ifeq ($(ABE_TYPE),1)
EFFECTIVE_USE_CXX_ESCALAR_KERNEL = 1
else
EFFECTIVE_USE_CXX_ESCALAR_KERNEL = 0
endif
else
EFFECTIVE_USE_CXX_ESCALAR_KERNEL = 0
endif
TRANSFER_CACHE_FLAG = -DBSSN_USE_TRANSFER_CACHE=$(EFFECTIVE_USE_TRANSFER_CACHE)
ESCALAR_KERNEL_FLAG = -DBSSN_USE_ESCALAR_C_KERNEL=$(EFFECTIVE_USE_CXX_ESCALAR_KERNEL)
```
Only add `bssn_escalar_rhs_c.o` when the effective EScalar C kernel switch is
enabled:
```make
ifeq ($(EFFECTIVE_USE_CXX_ESCALAR_KERNEL),1)
CFILES += bssn_escalar_rhs_c.o
endif
```
### 6. Use safe transfer-cache default
In `AMSS_NCKU_source/makefile.inc`, keep:
```make
USE_TRANSFER_CACHE ?= auto
```
With the effective switch logic above, this enables cached transfer for BSSN
vacuum while keeping non-BSSN paths on the uncached path by default.
## Verification Checklist
Run these checks after migrating:
```bash
python3 -c "import generate_macrodef; generate_macrodef.generate_build_config()"
cat GW150914/AMSS_NCKU_build.mk
```
For BSSN, the generated file should contain:
```make
ABE_TYPE := 0
```
Dry-run the copied or source makefile:
```bash
make -n -B INTERP_LB_MODE=off ABE | grep -E 'BSSN_USE_TRANSFER_CACHE|BSSN_USE_ESCALAR_C_KERNEL|bssn_escalar_rhs_c'
```
Expected BSSN result:
```text
-DBSSN_USE_TRANSFER_CACHE=1 -DBSSN_USE_ESCALAR_C_KERNEL=0
```
and no `bssn_escalar_rhs_c.o` in the final link command.
Run the full workflow:
```bash
python3 AMSS_NCKU_Program.py
```
For the 10-step BSSN test, compare coordinate output:
```bash
python3 - <<'PY'
from pathlib import Path
old = Path('../GW150914-06457/AMSS_NCKU_output/bssn_BH.dat')
new = Path('GW150914/AMSS_NCKU_output/bssn_BH.dat')
def rows(path):
out = []
for line in path.read_text().splitlines():
if not line.strip() or line.lstrip().startswith('#'):
continue
out.append([float(x) for x in line.split()])
return out
ro, rn = rows(old), rows(new)
n = min(len(ro), len(rn))
max_abs = 0.0
for i in range(n):
for a, b in zip(ro[i], rn[i]):
max_abs = max(max_abs, abs(a - b))
print(f"old_rows={len(ro)} new_rows={len(rn)} compared_rows={n}")
print(f"max_abs_diff={max_abs:.17g}")
PY
```
For the validated migration, the first 10 rows matched exactly:
```text
max_abs_diff=0
```

View File

@@ -12,37 +12,6 @@ import os
import AMSS_NCKU_Input as input_data ## import program input file
##################################################################
def get_abe_type():
if ( input_data.Equation_Class == "BSSN" ):
return 0
elif ( input_data.Equation_Class == "BSSN-EScalar" ):
return 1
elif ( input_data.Equation_Class == "BSSN-EM" ):
return 3
elif ( input_data.Equation_Class == "Z4C" ):
return 2
else:
raise ValueError("Equation_Class setting error!!!")
##################################################################
## Generate the makefile fragment used by the copied source tree.
## The source-tree macrodef.h is not authoritative because macro files
## are regenerated under File_directory for each run.
def generate_build_config():
file1 = open( os.path.join(input_data.File_directory, "AMSS_NCKU_build.mk"), "w")
print( "# Generated by generate_macrodef.py; do not edit manually.", file=file1 )
print( f"ABE_TYPE := {get_abe_type()}", file=file1 )
file1.close()
##################################################################
## Generate the macro file macrodef.h according to user settings
@@ -89,10 +58,19 @@ def generate_macrodef_h():
# 2: Z4c vacuum
# 3: coupled to Maxwell field
try:
print( f"#define ABEtype {get_abe_type()}", file=file1 )
print( file=file1 )
except ValueError:
if ( input_data.Equation_Class == "BSSN" ):
print( "#define ABEtype 0", file=file1 )
print( file=file1 )
elif ( input_data.Equation_Class == "BSSN-EScalar" ):
print( "#define ABEtype 1", file=file1 )
print( file=file1 )
elif ( input_data.Equation_Class == "BSSN-EM" ):
print( "#define ABEtype 3", file=file1 )
print( file=file1 )
elif ( input_data.Equation_Class == "Z4C" ):
print( "#define ABEtype 2", file=file1 )
print( file=file1 )
else:
print( "Equation_Class setting error!!!" )
print()
print( "# Equation type #define ABEtype setting error!!!", file=file1 )
@@ -226,7 +204,7 @@ def generate_macrodef_h():
# use GPU or not
if ( input_data.GPU_Calculation == "yes"):
print( "#define USE_GPU", file=file1 )
print( "//#define USE_GPU", file=file1 )
print( file=file1 )
elif ( input_data.GPU_Calculation == "no"):
print( "//#define USE_GPU", file=file1 )

View File

@@ -70,9 +70,9 @@ def makefile_ABE():
## Build command with CPU binding to nohz_full cores
if (input_data.GPU_Calculation == "no"):
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=off ABE"
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=off USE_CUDA_BSSN=0 USE_CUDA_Z4C=0 ABE"
elif (input_data.GPU_Calculation == "yes"):
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU"
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=off USE_CUDA_BSSN=1 USE_CUDA_Z4C=1 ABE_CUDA"
else:
print( " CPU/GPU numerical calculation setting is wrong " )
print( )
@@ -151,7 +151,7 @@ def run_ABE():
#mpi_command = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
mpi_command_outfile = "ABE_out.log"
elif (input_data.GPU_Calculation == "yes"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE_CUDA"
mpi_command_outfile = "ABEGPU_out.log"
## Execute the MPI command and stream output