Compare commits
11 Commits
chb-cuda-n
...
legacy
| Author | SHA1 | Date | |
|---|---|---|---|
| 3f3f16e881 | |||
| 9c31384b2f | |||
| e4e741caa1 | |||
| 65e0f95f40 | |||
| f9fbf97e64 | |||
| 968522995b | |||
| f3988ac8ca | |||
| e4c25eb21f | |||
| 4b10519876 | |||
| 3a58273501 | |||
| 5c65cea2f0 |
4
.gitignore
vendored
4
.gitignore
vendored
@@ -1,6 +1,6 @@
|
||||
__pycache__
|
||||
GW150914
|
||||
GW150914*
|
||||
GW150914-origin
|
||||
docs
|
||||
*.tmp
|
||||
.codex
|
||||
|
||||
|
||||
@@ -16,9 +16,9 @@ import numpy
|
||||
File_directory = "GW150914" ## output file directory
|
||||
Output_directory = "binary_output" ## binary data file directory
|
||||
## The file directory name should not be too long
|
||||
MPI_processes = 8 ## number of mpi processes used in the simulation
|
||||
MPI_processes = 64 ## number of mpi processes used in the simulation
|
||||
|
||||
GPU_Calculation = "yes" ## Use GPU or not
|
||||
GPU_Calculation = "no" ## Use GPU or not
|
||||
## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
|
||||
CPU_Part = 1.0
|
||||
GPU_Part = 0.0
|
||||
|
||||
@@ -258,7 +258,7 @@ print()
|
||||
if (input_data.GPU_Calculation == "no"):
|
||||
ABE_file = os.path.join(AMSS_NCKU_source_copy, "ABE")
|
||||
elif (input_data.GPU_Calculation == "yes"):
|
||||
ABE_file = os.path.join(AMSS_NCKU_source_copy, "ABE_CUDA")
|
||||
ABE_file = os.path.join(AMSS_NCKU_source_copy, "ABEGPU")
|
||||
|
||||
if not os.path.exists( ABE_file ):
|
||||
print( )
|
||||
|
||||
@@ -2,18 +2,13 @@
|
||||
"""
|
||||
AMSS-NCKU GW150914 Simulation Regression Test Script (Comprehensive Version)
|
||||
|
||||
Verification Requirements:
|
||||
1. RMS errors < 1% for:
|
||||
- 3D Vector Total RMS
|
||||
- X Component RMS
|
||||
- Y Component RMS
|
||||
- Z Component RMS
|
||||
2. ADM constraint violation < 2 (Grid Level 0)
|
||||
3. The following figure PDFs must match GW150914-origin exactly after rasterization:
|
||||
- ADM_Constraint_Grid_Level_0.pdf
|
||||
- BH_Trajectory_21_XY.pdf
|
||||
- BH_Trajectory_XY.pdf
|
||||
The script also reports the percentage of differing pixels for each figure.
|
||||
Verification Requirements:
|
||||
1. RMS errors < 1% for:
|
||||
- 3D Vector Total RMS
|
||||
- X Component RMS
|
||||
- Y Component RMS
|
||||
- Z Component RMS
|
||||
2. ADM constraint violation < 2 (Grid Level 0)
|
||||
|
||||
RMS Calculation Method:
|
||||
- Computes trajectory deviation on the XY plane independently for BH1 and BH2
|
||||
@@ -25,13 +20,9 @@ Default: output_dir = GW150914/AMSS_NCKU_output
|
||||
Reference: GW150914-origin (baseline simulation)
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import sys
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
from PIL import Image
|
||||
import numpy as np
|
||||
import sys
|
||||
import os
|
||||
|
||||
# ANSI Color Codes
|
||||
class Color:
|
||||
@@ -58,143 +49,17 @@ def load_bh_trajectory(filepath):
|
||||
}
|
||||
|
||||
|
||||
def load_constraint_data(filepath):
|
||||
"""Load constraint violation data"""
|
||||
data = []
|
||||
def load_constraint_data(filepath):
|
||||
"""Load constraint violation data"""
|
||||
data = []
|
||||
with open(filepath, 'r') as f:
|
||||
for line in f:
|
||||
if line.startswith('#'):
|
||||
continue
|
||||
parts = line.split()
|
||||
if len(parts) >= 8:
|
||||
data.append([float(x) for x in parts[:8]])
|
||||
return np.array(data)
|
||||
|
||||
|
||||
def resolve_figure_dir(path):
|
||||
"""Resolve the sibling figure directory from an output or figure path."""
|
||||
normalized = os.path.normpath(path)
|
||||
if os.path.basename(normalized) == "figure":
|
||||
return normalized
|
||||
return os.path.join(os.path.dirname(normalized), "figure")
|
||||
|
||||
|
||||
def render_pdf_to_images(pdf_path, dpi=150):
|
||||
"""Render a PDF to RGB images using Ghostscript."""
|
||||
gs_path = shutil.which("gs")
|
||||
if gs_path is None:
|
||||
raise RuntimeError("Ghostscript executable 'gs' was not found in PATH")
|
||||
|
||||
with tempfile.TemporaryDirectory(prefix="amss_verify_pdf_") as temp_dir:
|
||||
output_pattern = os.path.join(temp_dir, "page-%03d.ppm")
|
||||
cmd = [
|
||||
gs_path,
|
||||
"-q",
|
||||
"-dSAFER",
|
||||
"-dBATCH",
|
||||
"-dNOPAUSE",
|
||||
"-sDEVICE=ppmraw",
|
||||
f"-r{dpi}",
|
||||
f"-o{output_pattern}",
|
||||
pdf_path
|
||||
]
|
||||
|
||||
try:
|
||||
subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
|
||||
except subprocess.CalledProcessError as exc:
|
||||
message = exc.stderr.strip() or str(exc)
|
||||
raise RuntimeError(f"Failed to render PDF '{pdf_path}': {message}") from exc
|
||||
|
||||
ppm_files = sorted(
|
||||
os.path.join(temp_dir, filename)
|
||||
for filename in os.listdir(temp_dir)
|
||||
if filename.endswith(".ppm")
|
||||
)
|
||||
|
||||
if not ppm_files:
|
||||
raise RuntimeError(f"No rendered pages were produced for '{pdf_path}'")
|
||||
|
||||
images = []
|
||||
for ppm_file in ppm_files:
|
||||
with Image.open(ppm_file) as img:
|
||||
images.append(np.array(img.convert("RGB"), dtype=np.uint8))
|
||||
|
||||
return images
|
||||
|
||||
|
||||
def compare_rendered_pages(ref_img, target_img):
|
||||
"""Return (different_pixels, total_pixels) for two rendered RGB pages."""
|
||||
ref_h, ref_w = ref_img.shape[:2]
|
||||
tgt_h, tgt_w = target_img.shape[:2]
|
||||
total_pixels = max(ref_h, tgt_h) * max(ref_w, tgt_w)
|
||||
|
||||
if ref_h == tgt_h and ref_w == tgt_w:
|
||||
different_pixels = int(np.count_nonzero(np.any(ref_img != target_img, axis=2)))
|
||||
return different_pixels, total_pixels
|
||||
|
||||
diff_mask = np.ones((max(ref_h, tgt_h), max(ref_w, tgt_w)), dtype=bool)
|
||||
overlap_h = min(ref_h, tgt_h)
|
||||
overlap_w = min(ref_w, tgt_w)
|
||||
overlap_diff = np.any(ref_img[:overlap_h, :overlap_w] != target_img[:overlap_h, :overlap_w], axis=2)
|
||||
diff_mask[:overlap_h, :overlap_w] = overlap_diff
|
||||
different_pixels = int(np.count_nonzero(diff_mask))
|
||||
return different_pixels, total_pixels
|
||||
|
||||
|
||||
def compare_pdf_images(ref_pdf, target_pdf, dpi=150, threshold_percent=0.001):
|
||||
"""Compare two PDFs by rasterizing them and counting differing pixels."""
|
||||
ref_pages = render_pdf_to_images(ref_pdf, dpi=dpi)
|
||||
target_pages = render_pdf_to_images(target_pdf, dpi=dpi)
|
||||
|
||||
total_pixels = 0
|
||||
different_pixels = 0
|
||||
max_pages = max(len(ref_pages), len(target_pages))
|
||||
|
||||
for page_idx in range(max_pages):
|
||||
if page_idx < len(ref_pages) and page_idx < len(target_pages):
|
||||
page_diff, page_total = compare_rendered_pages(ref_pages[page_idx], target_pages[page_idx])
|
||||
else:
|
||||
existing_page = ref_pages[page_idx] if page_idx < len(ref_pages) else target_pages[page_idx]
|
||||
page_total = existing_page.shape[0] * existing_page.shape[1]
|
||||
page_diff = page_total
|
||||
|
||||
total_pixels += page_total
|
||||
different_pixels += page_diff
|
||||
|
||||
diff_percent = (different_pixels / total_pixels * 100.0) if total_pixels else 0.0
|
||||
return {
|
||||
"different_pixels": different_pixels,
|
||||
"total_pixels": total_pixels,
|
||||
"diff_percent": diff_percent,
|
||||
"pages_ref": len(ref_pages),
|
||||
"pages_target": len(target_pages),
|
||||
"passed": diff_percent < threshold_percent
|
||||
}
|
||||
|
||||
|
||||
def compare_required_figures(reference_figure_dir, target_figure_dir):
|
||||
"""Compare the required GW150914 figure PDFs."""
|
||||
figure_names = [
|
||||
"ADM_Constraint_Grid_Level_0.pdf",
|
||||
"BH_Trajectory_21_XY.pdf",
|
||||
"BH_Trajectory_XY.pdf"
|
||||
]
|
||||
|
||||
results = []
|
||||
for figure_name in figure_names:
|
||||
ref_pdf = os.path.join(reference_figure_dir, figure_name)
|
||||
target_pdf = os.path.join(target_figure_dir, figure_name)
|
||||
|
||||
if not os.path.exists(ref_pdf):
|
||||
raise FileNotFoundError(f"Reference figure not found: {ref_pdf}")
|
||||
if not os.path.exists(target_pdf):
|
||||
raise FileNotFoundError(f"Target figure not found: {target_pdf}")
|
||||
|
||||
comparison = compare_pdf_images(ref_pdf, target_pdf)
|
||||
comparison["name"] = figure_name
|
||||
results.append(comparison)
|
||||
|
||||
return results
|
||||
data.append([float(x) for x in parts[:8]])
|
||||
return np.array(data)
|
||||
|
||||
def calculate_all_rms_errors(bh_data_ref, bh_data_target):
|
||||
"""
|
||||
@@ -300,7 +165,7 @@ def print_rms_results(rms_dict, error, threshold=1.0):
|
||||
|
||||
return all_passed
|
||||
|
||||
def print_constraint_results(results, threshold=2.0):
|
||||
def print_constraint_results(results, threshold=2.0):
|
||||
print(f"\n{Color.BOLD}2. ADM Constraint Violation Analysis (Grid Level 0){Color.RESET}")
|
||||
print("-" * 65)
|
||||
|
||||
@@ -315,49 +180,22 @@ def print_constraint_results(results, threshold=2.0):
|
||||
print(f"\n Maximum violation: {results['max_violation']:.6f}")
|
||||
print(f" Requirement: < {threshold}")
|
||||
print(f" Status: {get_status_text(passed)}")
|
||||
|
||||
return passed
|
||||
|
||||
|
||||
def print_figure_results(results, threshold_percent=0.001):
|
||||
print(f"\n{Color.BOLD}3. Figure Pixel Comparison (PDF Rasterization){Color.RESET}")
|
||||
print("-" * 65)
|
||||
print(f" Requirement: < {threshold_percent:.3f}% differing pixels\n")
|
||||
|
||||
all_passed = True
|
||||
for result in results:
|
||||
passed = result["passed"]
|
||||
all_passed = all_passed and passed
|
||||
status = get_status_text(passed)
|
||||
print(f" {result['name']:32}: {result['diff_percent']:10.6f}% | Status: {status}")
|
||||
|
||||
if result["pages_ref"] != result["pages_target"]:
|
||||
print(f" {'':32} pages(ref/target): {result['pages_ref']}/{result['pages_target']}")
|
||||
|
||||
return all_passed
|
||||
|
||||
|
||||
def print_figure_error(error_message):
|
||||
print(f"\n{Color.BOLD}3. Figure Pixel Comparison (PDF Rasterization){Color.RESET}")
|
||||
print("-" * 65)
|
||||
print(f" {Color.RED}Error: {error_message}{Color.RESET}")
|
||||
return False
|
||||
|
||||
|
||||
def print_summary(rms_passed, constraint_passed, figure_passed):
|
||||
print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
|
||||
print(Color.BOLD + "Verification Summary" + Color.RESET)
|
||||
print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
|
||||
|
||||
all_passed = rms_passed and constraint_passed and figure_passed
|
||||
|
||||
res_rms = get_status_text(rms_passed)
|
||||
res_con = get_status_text(constraint_passed)
|
||||
res_fig = get_status_text(figure_passed)
|
||||
|
||||
print(f" [1] Comprehensive RMS check: {res_rms}")
|
||||
print(f" [2] ADM constraint check: {res_con}")
|
||||
print(f" [3] Figure pixel comparison: {res_fig}")
|
||||
|
||||
return passed
|
||||
|
||||
|
||||
def print_summary(rms_passed, constraint_passed):
|
||||
print("\n" + Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
|
||||
print(Color.BOLD + "Verification Summary" + Color.RESET)
|
||||
print(Color.BLUE + Color.BOLD + "=" * 65 + Color.RESET)
|
||||
|
||||
all_passed = rms_passed and constraint_passed
|
||||
|
||||
res_rms = get_status_text(rms_passed)
|
||||
res_con = get_status_text(constraint_passed)
|
||||
|
||||
print(f" [1] Comprehensive RMS check: {res_rms}")
|
||||
print(f" [2] ADM constraint check: {res_con}")
|
||||
|
||||
final_status = f"{Color.GREEN}{Color.BOLD}ALL CHECKS PASSED{Color.RESET}" if all_passed else f"{Color.RED}{Color.BOLD}SOME CHECKS FAILED{Color.RESET}"
|
||||
print(f"\n Overall result: {final_status}")
|
||||
@@ -372,14 +210,12 @@ def main():
|
||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
target_dir = os.path.join(script_dir, "GW150914/AMSS_NCKU_output")
|
||||
|
||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
reference_dir = os.path.join(script_dir, "GW150914-origin/AMSS_NCKU_output")
|
||||
target_figure_dir = resolve_figure_dir(target_dir)
|
||||
reference_figure_dir = os.path.join(script_dir, "GW150914-origin/figure")
|
||||
|
||||
bh_file_ref = os.path.join(reference_dir, "bssn_BH.dat")
|
||||
bh_file_target = os.path.join(target_dir, "bssn_BH.dat")
|
||||
constraint_file = os.path.join(target_dir, "bssn_constraint.dat")
|
||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
reference_dir = os.path.join(script_dir, "GW150914-origin/AMSS_NCKU_output")
|
||||
|
||||
bh_file_ref = os.path.join(reference_dir, "bssn_BH.dat")
|
||||
bh_file_target = os.path.join(target_dir, "bssn_BH.dat")
|
||||
constraint_file = os.path.join(target_dir, "bssn_constraint.dat")
|
||||
|
||||
if not os.path.exists(bh_file_ref):
|
||||
print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Baseline trajectory file not found: {bh_file_ref}")
|
||||
@@ -391,11 +227,9 @@ def main():
|
||||
print(f"{Color.RED}{Color.BOLD}Error:{Color.RESET} Constraint data file not found: {constraint_file}")
|
||||
sys.exit(1)
|
||||
|
||||
print_header()
|
||||
print(f"\n{Color.BOLD}Reference (Baseline):{Color.RESET} {Color.BLUE}{reference_dir}{Color.RESET}")
|
||||
print(f"{Color.BOLD}Target (Optimized): {Color.RESET} {Color.BLUE}{target_dir}{Color.RESET}")
|
||||
print(f"{Color.BOLD}Reference Figures: {Color.RESET} {Color.BLUE}{reference_figure_dir}{Color.RESET}")
|
||||
print(f"{Color.BOLD}Target Figures: {Color.RESET} {Color.BLUE}{target_figure_dir}{Color.RESET}")
|
||||
print_header()
|
||||
print(f"\n{Color.BOLD}Reference (Baseline):{Color.RESET} {Color.BLUE}{reference_dir}{Color.RESET}")
|
||||
print(f"{Color.BOLD}Target (Optimized): {Color.RESET} {Color.BLUE}{target_dir}{Color.RESET}")
|
||||
|
||||
bh_data_ref = load_bh_trajectory(bh_file_ref)
|
||||
bh_data_target = load_bh_trajectory(bh_file_target)
|
||||
@@ -405,18 +239,12 @@ def main():
|
||||
rms_dict, error = calculate_all_rms_errors(bh_data_ref, bh_data_target)
|
||||
rms_passed = print_rms_results(rms_dict, error)
|
||||
|
||||
# Output constraint results
|
||||
constraint_results = analyze_constraint_violation(constraint_data)
|
||||
constraint_passed = print_constraint_results(constraint_results)
|
||||
|
||||
try:
|
||||
figure_results = compare_required_figures(reference_figure_dir, target_figure_dir)
|
||||
figure_passed = print_figure_results(figure_results)
|
||||
except (FileNotFoundError, RuntimeError) as exc:
|
||||
figure_passed = print_figure_error(str(exc))
|
||||
|
||||
all_passed = print_summary(rms_passed, constraint_passed, figure_passed)
|
||||
sys.exit(0 if all_passed else 1)
|
||||
# Output constraint results
|
||||
constraint_results = analyze_constraint_violation(constraint_data)
|
||||
constraint_passed = print_constraint_results(constraint_results)
|
||||
|
||||
all_passed = print_summary(rms_passed, constraint_passed)
|
||||
sys.exit(0 if all_passed else 1)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
@@ -37,51 +37,56 @@ close(77)
|
||||
end program checkFFT
|
||||
#endif
|
||||
|
||||
!-------------
|
||||
! Optimized FFT using Intel oneMKL DFTI
|
||||
! Mathematical equivalence: Standard DFT definition
|
||||
! Forward (isign=1): X[k] = sum_{n=0}^{N-1} x[n] * exp(-2*pi*i*k*n/N)
|
||||
! Backward (isign=-1): X[k] = sum_{n=0}^{N-1} x[n] * exp(+2*pi*i*k*n/N)
|
||||
! Input/Output: dataa is interleaved complex array [Re(0),Im(0),Re(1),Im(1),...]
|
||||
!-------------
|
||||
SUBROUTINE four1(dataa,nn,isign)
|
||||
use MKL_DFTI
|
||||
implicit none
|
||||
INTEGER, intent(in) :: isign, nn
|
||||
DOUBLE PRECISION, dimension(2*nn), intent(inout) :: dataa
|
||||
|
||||
type(DFTI_DESCRIPTOR), pointer :: desc
|
||||
integer :: status
|
||||
|
||||
! Create DFTI descriptor for 1D complex-to-complex transform
|
||||
status = DftiCreateDescriptor(desc, DFTI_DOUBLE, DFTI_COMPLEX, 1, nn)
|
||||
if (status /= 0) return
|
||||
|
||||
! Set input/output storage as interleaved complex (default)
|
||||
status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_INPLACE)
|
||||
if (status /= 0) then
|
||||
status = DftiFreeDescriptor(desc)
|
||||
return
|
||||
endif
|
||||
|
||||
! Commit the descriptor
|
||||
status = DftiCommitDescriptor(desc)
|
||||
if (status /= 0) then
|
||||
status = DftiFreeDescriptor(desc)
|
||||
return
|
||||
endif
|
||||
|
||||
! Execute FFT based on direction
|
||||
if (isign == 1) then
|
||||
! Forward FFT: exp(-2*pi*i*k*n/N)
|
||||
status = DftiComputeForward(desc, dataa)
|
||||
else
|
||||
! Backward FFT: exp(+2*pi*i*k*n/N)
|
||||
status = DftiComputeBackward(desc, dataa)
|
||||
endif
|
||||
|
||||
! Free descriptor
|
||||
status = DftiFreeDescriptor(desc)
|
||||
|
||||
return
|
||||
END SUBROUTINE four1
|
||||
SUBROUTINE four1(dataa,nn,isign)
|
||||
implicit none
|
||||
INTEGER::isign,nn
|
||||
double precision,dimension(2*nn)::dataa
|
||||
INTEGER::i,istep,j,m,mmax,n
|
||||
double precision::tempi,tempr
|
||||
DOUBLE PRECISION::theta,wi,wpi,wpr,wr,wtemp
|
||||
n=2*nn
|
||||
j=1
|
||||
do i=1,n,2
|
||||
if(j.gt.i)then
|
||||
tempr=dataa(j)
|
||||
tempi=dataa(j+1)
|
||||
dataa(j)=dataa(i)
|
||||
dataa(j+1)=dataa(i+1)
|
||||
dataa(i)=tempr
|
||||
dataa(i+1)=tempi
|
||||
endif
|
||||
m=nn
|
||||
1 if ((m.ge.2).and.(j.gt.m)) then
|
||||
j=j-m
|
||||
m=m/2
|
||||
goto 1
|
||||
endif
|
||||
j=j+m
|
||||
enddo
|
||||
mmax=2
|
||||
2 if (n.gt.mmax) then
|
||||
istep=2*mmax
|
||||
theta=6.28318530717959d0/(isign*mmax)
|
||||
wpr=-2.d0*sin(0.5d0*theta)**2
|
||||
wpi=sin(theta)
|
||||
wr=1.d0
|
||||
wi=0.d0
|
||||
do m=1,mmax,2
|
||||
do i=m,n,istep
|
||||
j=i+mmax
|
||||
tempr=sngl(wr)*dataa(j)-sngl(wi)*dataa(j+1)
|
||||
tempi=sngl(wr)*dataa(j+1)+sngl(wi)*dataa(j)
|
||||
dataa(j)=dataa(i)-tempr
|
||||
dataa(j+1)=dataa(i+1)-tempi
|
||||
dataa(i)=dataa(i)+tempr
|
||||
dataa(i+1)=dataa(i+1)+tempi
|
||||
enddo
|
||||
wtemp=wr
|
||||
wr=wr*wpr-wi*wpi+wr
|
||||
wi=wi*wpr+wtemp*wpi+wi
|
||||
enddo
|
||||
mmax=istep
|
||||
goto 2
|
||||
endif
|
||||
return
|
||||
END SUBROUTINE four1
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -100,14 +100,12 @@ namespace Parallel
|
||||
MyList<gridseg> **combined_dst;
|
||||
int *send_lengths;
|
||||
int *recv_lengths;
|
||||
double **send_bufs;
|
||||
double **recv_bufs;
|
||||
int *send_buf_caps;
|
||||
int *recv_buf_caps;
|
||||
unsigned char *send_buf_pinned;
|
||||
unsigned char *recv_buf_pinned;
|
||||
MPI_Request *reqs;
|
||||
MPI_Status *stats;
|
||||
double **send_bufs;
|
||||
double **recv_bufs;
|
||||
int *send_buf_caps;
|
||||
int *recv_buf_caps;
|
||||
MPI_Request *reqs;
|
||||
MPI_Status *stats;
|
||||
int max_reqs;
|
||||
bool lengths_valid;
|
||||
int *tc_req_node;
|
||||
@@ -118,11 +116,10 @@ namespace Parallel
|
||||
void destroy();
|
||||
};
|
||||
|
||||
void Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache);
|
||||
void Sync_ensure_cache(MyList<Patch> *PatL, int Symmetry, SyncCache &cache);
|
||||
void transfer_cached(MyList<gridseg> **src, MyList<gridseg> **dst,
|
||||
MyList<var> *VarList1, MyList<var> *VarList2,
|
||||
int Symmetry, SyncCache &cache);
|
||||
void Sync_cached(MyList<Patch> *PatL, MyList<var> *VarList, int Symmetry, SyncCache &cache);
|
||||
void transfer_cached(MyList<gridseg> **src, MyList<gridseg> **dst,
|
||||
MyList<var> *VarList1, MyList<var> *VarList2,
|
||||
int Symmetry, SyncCache &cache);
|
||||
|
||||
struct AsyncSyncState {
|
||||
int req_no;
|
||||
|
||||
@@ -25,9 +25,23 @@ using namespace std;
|
||||
#include <math.h>
|
||||
#include <complex.h>
|
||||
#endif
|
||||
|
||||
#include "TwoPunctures.h"
|
||||
#include <mkl_cblas.h>
|
||||
|
||||
#include "TwoPunctures.h"
|
||||
|
||||
extern "C" {
|
||||
double cblas_ddot(const int, const double *, const int, const double *, const int);
|
||||
double cblas_dnrm2(const int, const double *, const int);
|
||||
void cblas_dgemm(const int, const int, const int,
|
||||
const int, const int, const int,
|
||||
const double, const double *, const int,
|
||||
const double *, const int, const double,
|
||||
double *, const int);
|
||||
}
|
||||
|
||||
enum {
|
||||
CblasRowMajor = 101,
|
||||
CblasNoTrans = 111
|
||||
};
|
||||
|
||||
TwoPunctures::TwoPunctures(double mp, double mm, double b,
|
||||
double P_plusx, double P_plusy, double P_plusz,
|
||||
|
||||
@@ -26,20 +26,12 @@ using namespace std;
|
||||
#include "shellfunctions.h"
|
||||
#include "cpbc.h"
|
||||
#include "kodiss.h"
|
||||
#include "parameters.h"
|
||||
|
||||
#ifndef USE_CUDA_Z4C
|
||||
#define USE_CUDA_Z4C 0
|
||||
#endif
|
||||
|
||||
#if USE_CUDA_Z4C && (ABEtype == 2)
|
||||
#include "z4c_rhs_cuda.h"
|
||||
#endif
|
||||
|
||||
#ifdef With_AHF
|
||||
#include "derivatives.h"
|
||||
#include "myglobal.h"
|
||||
#endif
|
||||
#include "parameters.h"
|
||||
|
||||
#ifdef With_AHF
|
||||
#include "derivatives.h"
|
||||
#include "myglobal.h"
|
||||
#endif
|
||||
|
||||
//================================================================================================
|
||||
|
||||
@@ -175,554 +167,12 @@ Z4c_class::~Z4c_class()
|
||||
|
||||
#define MRBD 0 // 0: fix BD for meshrefinement level; 1: sommerfeld_bam for them; 2: sommerfeld_yo for them
|
||||
|
||||
#ifndef CPBC
|
||||
// for sommerfeld boundary
|
||||
|
||||
#if USE_CUDA_Z4C && (ABEtype == 2)
|
||||
#ifdef WithShell
|
||||
#error "USE_CUDA_Z4C resident path currently supports Cartesian non-shell Z4C only"
|
||||
#endif
|
||||
#if (MRBD == 2)
|
||||
#error "USE_CUDA_Z4C resident path does not support MRBD == 2"
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
|
||||
static const int k_z4c_cuda_bh_state_indices[3] = {18, 19, 20};
|
||||
|
||||
bool fill_z4c_cuda_views(Block *cg, MyList<var> *vars,
|
||||
double **host_views,
|
||||
double *propspeeds = 0,
|
||||
double *soa_flat = 0)
|
||||
{
|
||||
int idx = 0;
|
||||
while (vars && idx < Z4C_CUDA_STATE_COUNT)
|
||||
{
|
||||
host_views[idx] = cg->fgfs[vars->data->sgfn];
|
||||
if (propspeeds)
|
||||
propspeeds[idx] = vars->data->propspeed;
|
||||
if (soa_flat)
|
||||
{
|
||||
soa_flat[3 * idx + 0] = vars->data->SoA[0];
|
||||
soa_flat[3 * idx + 1] = vars->data->SoA[1];
|
||||
soa_flat[3 * idx + 2] = vars->data->SoA[2];
|
||||
}
|
||||
vars = vars->next;
|
||||
++idx;
|
||||
}
|
||||
return idx == Z4C_CUDA_STATE_COUNT && vars == 0;
|
||||
}
|
||||
|
||||
void z4c_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int myrank, bool release_ctx)
|
||||
{
|
||||
MyList<Patch> *Pp = PatL;
|
||||
while (Pp)
|
||||
{
|
||||
MyList<Block> *BP = Pp->data->blb;
|
||||
while (BP)
|
||||
{
|
||||
Block *cg = BP->data;
|
||||
if (myrank == cg->rank && z4c_cuda_has_resident_state(cg))
|
||||
{
|
||||
double *state_out[Z4C_CUDA_STATE_COUNT];
|
||||
if (!fill_z4c_cuda_views(cg, vars, state_out))
|
||||
{
|
||||
cout << "CUDA Z4C state list mismatch on resident state download" << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
if (z4c_cuda_download_resident_state(cg, cg->shape, state_out))
|
||||
{
|
||||
cout << "CUDA Z4C resident state download failed" << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
if (release_ctx)
|
||||
z4c_cuda_release_step_ctx(cg);
|
||||
}
|
||||
if (BP == Pp->data->ble)
|
||||
break;
|
||||
BP = BP->next;
|
||||
}
|
||||
Pp = Pp->next;
|
||||
}
|
||||
}
|
||||
|
||||
bool z4c_cuda_patch_contains_point(Patch *patch, const double *point)
|
||||
{
|
||||
if (!patch)
|
||||
return false;
|
||||
for (int d = 0; d < dim; d++)
|
||||
{
|
||||
const double h = patch->getdX(d);
|
||||
const double lo = patch->bbox[d] + patch->lli[d] * h;
|
||||
const double hi = patch->bbox[dim + d] - patch->uui[d] * h;
|
||||
if (point[d] < lo || point[d] > hi)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool z4c_cuda_point_in_block(Patch *patch, Block *block,
|
||||
const double *point, const double *DH)
|
||||
{
|
||||
if (!patch || !block)
|
||||
return false;
|
||||
for (int d = 0; d < dim; d++)
|
||||
{
|
||||
double llb;
|
||||
double uub;
|
||||
#ifdef Vertex
|
||||
#ifdef Cell
|
||||
#error Both Cell and Vertex are defined
|
||||
#endif
|
||||
llb = (feq(block->bbox[d], patch->bbox[d], DH[d] / 2))
|
||||
? block->bbox[d] + patch->lli[d] * DH[d]
|
||||
: block->bbox[d] + (ghost_width - 0.5) * DH[d];
|
||||
uub = (feq(block->bbox[dim + d], patch->bbox[dim + d], DH[d] / 2))
|
||||
? block->bbox[dim + d] - patch->uui[d] * DH[d]
|
||||
: block->bbox[dim + d] - (ghost_width - 0.5) * DH[d];
|
||||
#else
|
||||
#ifdef Cell
|
||||
llb = (feq(block->bbox[d], patch->bbox[d], DH[d] / 2))
|
||||
? block->bbox[d] + patch->lli[d] * DH[d]
|
||||
: block->bbox[d] + ghost_width * DH[d];
|
||||
uub = (feq(block->bbox[dim + d], patch->bbox[dim + d], DH[d] / 2))
|
||||
? block->bbox[dim + d] - patch->uui[d] * DH[d]
|
||||
: block->bbox[dim + d] - ghost_width * DH[d];
|
||||
#else
|
||||
#error Not define Vertex nor Cell
|
||||
#endif
|
||||
#endif
|
||||
if (point[d] - llb < -DH[d] / 2 || point[d] - uub > DH[d] / 2)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
int z4c_cuda_interp_tile_start(const double *coords, int n, double x, double dx, int ordn)
|
||||
{
|
||||
if (!coords || n <= ordn)
|
||||
return 0;
|
||||
int cxi = int((x - coords[0]) / dx + 0.4) + 1;
|
||||
int start = cxi - ordn / 2;
|
||||
if (start < 0)
|
||||
start = 0;
|
||||
const int max_start = n - ordn;
|
||||
if (start > max_start)
|
||||
start = max_start;
|
||||
return start;
|
||||
}
|
||||
|
||||
bool z4c_cuda_interp_bh_point_resident(MyList<Patch> *PatL,
|
||||
int myrank,
|
||||
const double *point,
|
||||
var *forx, var *fory, var *forz,
|
||||
int Symmetry,
|
||||
double *shellf)
|
||||
{
|
||||
const int ordn = 2 * ghost_width;
|
||||
int owner_rank = -1;
|
||||
|
||||
shellf[0] = shellf[1] = shellf[2] = 0.0;
|
||||
|
||||
MyList<Patch> *PL = PatL;
|
||||
while (PL)
|
||||
{
|
||||
Patch *patch = PL->data;
|
||||
if (!z4c_cuda_patch_contains_point(patch, point))
|
||||
{
|
||||
PL = PL->next;
|
||||
continue;
|
||||
}
|
||||
|
||||
double DH[dim];
|
||||
for (int d = 0; d < dim; d++)
|
||||
DH[d] = patch->getdX(d);
|
||||
|
||||
MyList<Block> *BP = patch->blb;
|
||||
while (BP)
|
||||
{
|
||||
Block *block = BP->data;
|
||||
if (z4c_cuda_point_in_block(patch, block, point, DH))
|
||||
{
|
||||
owner_rank = block->rank;
|
||||
if (myrank == owner_rank)
|
||||
{
|
||||
int interp_ordn = ordn;
|
||||
int interp_sym = Symmetry;
|
||||
double x = point[0];
|
||||
double y = point[1];
|
||||
double z = point[2];
|
||||
|
||||
if (z4c_cuda_has_resident_state(block) &&
|
||||
block->shape[0] >= ordn && block->shape[1] >= ordn && block->shape[2] >= ordn)
|
||||
{
|
||||
const int sx = ordn;
|
||||
const int sy = ordn;
|
||||
const int sz = ordn;
|
||||
const int region_all = sx * sy * sz;
|
||||
const int i0 = z4c_cuda_interp_tile_start(block->X[0], block->shape[0], x, DH[0], ordn);
|
||||
const int j0 = z4c_cuda_interp_tile_start(block->X[1], block->shape[1], y, DH[1], ordn);
|
||||
const int k0 = z4c_cuda_interp_tile_start(block->X[2], block->shape[2], z, DH[2], ordn);
|
||||
double *packed_fields = new double[3 * region_all];
|
||||
var *vars[3] = {forx, fory, forz};
|
||||
for (int f = 0; f < 3; f++)
|
||||
{
|
||||
if (z4c_cuda_pack_state_region_to_host_buffer(block,
|
||||
k_z4c_cuda_bh_state_indices[f],
|
||||
packed_fields + f * region_all,
|
||||
block->shape,
|
||||
i0, j0, k0,
|
||||
sx, sy, sz) != 0)
|
||||
{
|
||||
delete[] packed_fields;
|
||||
cout << "CUDA Z4C BH tile download failed" << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
int tile_shape[3] = {sx, sy, sz};
|
||||
f_global_interp(tile_shape,
|
||||
block->X[0] + i0,
|
||||
block->X[1] + j0,
|
||||
block->X[2] + k0,
|
||||
packed_fields + f * region_all,
|
||||
shellf[f],
|
||||
x, y, z,
|
||||
interp_ordn,
|
||||
vars[f]->SoA,
|
||||
interp_sym);
|
||||
}
|
||||
delete[] packed_fields;
|
||||
}
|
||||
else
|
||||
{
|
||||
f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
|
||||
block->fgfs[forx->sgfn], shellf[0],
|
||||
x, y, z, interp_ordn, forx->SoA, interp_sym);
|
||||
f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
|
||||
block->fgfs[fory->sgfn], shellf[1],
|
||||
x, y, z, interp_ordn, fory->SoA, interp_sym);
|
||||
f_global_interp(block->shape, block->X[0], block->X[1], block->X[2],
|
||||
block->fgfs[forz->sgfn], shellf[2],
|
||||
x, y, z, interp_ordn, forz->SoA, interp_sym);
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (BP == patch->ble)
|
||||
break;
|
||||
BP = BP->next;
|
||||
}
|
||||
|
||||
if (owner_rank >= 0)
|
||||
break;
|
||||
PL = PL->next;
|
||||
}
|
||||
|
||||
if (owner_rank < 0)
|
||||
return false;
|
||||
|
||||
MPI_Bcast(shellf, 3, MPI_DOUBLE, owner_rank, MPI_COMM_WORLD);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool z4c_cuda_compute_porg_rhs_resident(cgh *GH,
|
||||
int ilev,
|
||||
int myrank,
|
||||
int BH_num,
|
||||
double **BH_PS,
|
||||
double **BH_RHS,
|
||||
var *forx, var *fory, var *forz,
|
||||
int Symmetry)
|
||||
{
|
||||
for (int n = 0; n < BH_num; n++)
|
||||
{
|
||||
double shellf[3] = {0.0, 0.0, 0.0};
|
||||
int lev = ilev;
|
||||
while (lev >= 0 &&
|
||||
!z4c_cuda_interp_bh_point_resident(GH->PatL[lev], myrank, BH_PS[n],
|
||||
forx, fory, forz, Symmetry, shellf))
|
||||
{
|
||||
--lev;
|
||||
}
|
||||
if (lev < 0)
|
||||
return false;
|
||||
BH_RHS[n][0] = -shellf[0];
|
||||
BH_RHS[n][1] = -shellf[1];
|
||||
BH_RHS[n][2] = -shellf[2];
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
#endif
|
||||
|
||||
void Z4c_class::Step(int lev, int YN)
|
||||
{
|
||||
#if USE_CUDA_Z4C && (ABEtype == 2)
|
||||
double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
|
||||
#ifdef With_AHF
|
||||
AH_Step_Find(lev, dT_lev);
|
||||
#endif
|
||||
bool BB = fgt(PhysTime, StartTime, dT_lev / 2);
|
||||
double ndeps = numepss;
|
||||
if (lev < GH->movls)
|
||||
ndeps = numepsb;
|
||||
double TRK4 = PhysTime;
|
||||
int iter_count = 0;
|
||||
int pre = 0, cor = 1;
|
||||
int ERROR = 0;
|
||||
|
||||
MyList<Patch> *Pp = GH->PatL[lev];
|
||||
while (Pp)
|
||||
{
|
||||
MyList<Block> *BP = Pp->data->blb;
|
||||
while (BP)
|
||||
{
|
||||
Block *cg = BP->data;
|
||||
if (myrank == cg->rank)
|
||||
{
|
||||
double *state_in[Z4C_CUDA_STATE_COUNT];
|
||||
double *state_out[Z4C_CUDA_STATE_COUNT];
|
||||
double propspeed[Z4C_CUDA_STATE_COUNT];
|
||||
double soa_flat[3 * Z4C_CUDA_STATE_COUNT];
|
||||
if (!fill_z4c_cuda_views(cg, StateList, state_in, propspeed, soa_flat) ||
|
||||
!fill_z4c_cuda_views(cg, SynchList_pre, state_out))
|
||||
{
|
||||
cout << "CUDA Z4C state list mismatch on predictor step" << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
int apply_bam_bc = 0;
|
||||
#if (MRBD == 0)
|
||||
#if (SommerType == 0)
|
||||
apply_bam_bc = (lev == 0) ? 1 : 0;
|
||||
#endif
|
||||
#elif (MRBD == 1)
|
||||
apply_bam_bc = 1;
|
||||
#endif
|
||||
int keep_resident_state = 1;
|
||||
int apply_enforce_ga = 0;
|
||||
#if (AGM == 0)
|
||||
apply_enforce_ga = 1;
|
||||
#endif
|
||||
if (z4c_cuda_rk4_substep(cg,
|
||||
cg->shape, cg->X[0], cg->X[1], cg->X[2],
|
||||
state_in, state_out,
|
||||
propspeed, soa_flat, Pp->data->bbox,
|
||||
dT_lev, TRK4, iter_count, apply_bam_bc,
|
||||
Symmetry, lev, ndeps, pre,
|
||||
keep_resident_state, apply_enforce_ga, chitiny))
|
||||
{
|
||||
cout << "CUDA Z4C predictor substep failed in domain: ("
|
||||
<< cg->bbox[0] << ":" << cg->bbox[3] << ","
|
||||
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
|
||||
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
||||
ERROR = 1;
|
||||
}
|
||||
}
|
||||
if (BP == Pp->data->ble)
|
||||
break;
|
||||
BP = BP->next;
|
||||
}
|
||||
Pp = Pp->next;
|
||||
}
|
||||
|
||||
{
|
||||
int erh = ERROR;
|
||||
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
|
||||
}
|
||||
if (ERROR)
|
||||
{
|
||||
if (myrank == 0 && ErrorMonitor->outfile)
|
||||
ErrorMonitor->outfile << "CUDA Z4C failed in predictor at t = " << PhysTime
|
||||
<< ", lev = " << lev << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
|
||||
Parallel::Sync(GH->PatL[lev], SynchList_pre, Symmetry);
|
||||
|
||||
if (BH_num > 0 && lev == GH->levels - 1)
|
||||
{
|
||||
compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
|
||||
for (int ithBH = 0; ithBH < BH_num; ithBH++)
|
||||
{
|
||||
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count);
|
||||
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg[ithBH][1], Porg_rhs[ithBH][1], iter_count);
|
||||
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg[ithBH][2], Porg_rhs[ithBH][2], iter_count);
|
||||
if (Symmetry > 0)
|
||||
Porg[ithBH][2] = fabs(Porg[ithBH][2]);
|
||||
if (Symmetry == 2)
|
||||
{
|
||||
Porg[ithBH][0] = fabs(Porg[ithBH][0]);
|
||||
Porg[ithBH][1] = fabs(Porg[ithBH][1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ((lev == a_lev) && (LastAnas + dT_lev >= AnasTime))
|
||||
z4c_cuda_download_level_state(GH->PatL[lev], SynchList_pre, myrank, false);
|
||||
if (lev == a_lev)
|
||||
AnalysisStuff(lev, dT_lev);
|
||||
|
||||
for (iter_count = 1; iter_count < 4; iter_count++)
|
||||
{
|
||||
if (iter_count == 1 || iter_count == 3)
|
||||
TRK4 += dT_lev / 2;
|
||||
Pp = GH->PatL[lev];
|
||||
while (Pp)
|
||||
{
|
||||
MyList<Block> *BP = Pp->data->blb;
|
||||
while (BP)
|
||||
{
|
||||
Block *cg = BP->data;
|
||||
if (myrank == cg->rank)
|
||||
{
|
||||
double *state_in[Z4C_CUDA_STATE_COUNT];
|
||||
double *state_out[Z4C_CUDA_STATE_COUNT];
|
||||
double propspeed[Z4C_CUDA_STATE_COUNT];
|
||||
double soa_flat[3 * Z4C_CUDA_STATE_COUNT];
|
||||
if (!fill_z4c_cuda_views(cg, SynchList_pre, state_in, propspeed, soa_flat) ||
|
||||
!fill_z4c_cuda_views(cg, SynchList_cor, state_out))
|
||||
{
|
||||
cout << "CUDA Z4C state list mismatch on corrector step" << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
int apply_bam_bc = 0;
|
||||
#if (MRBD == 0)
|
||||
#if (SommerType == 0)
|
||||
apply_bam_bc = (lev == 0) ? 1 : 0;
|
||||
#endif
|
||||
#elif (MRBD == 1)
|
||||
apply_bam_bc = 1;
|
||||
#endif
|
||||
int keep_resident_state = 1;
|
||||
int apply_enforce_ga = 0;
|
||||
#if (AGM == 0)
|
||||
apply_enforce_ga = 1;
|
||||
#elif (AGM == 1)
|
||||
apply_enforce_ga = (iter_count == 3) ? 1 : 0;
|
||||
#endif
|
||||
if (z4c_cuda_rk4_substep(cg,
|
||||
cg->shape, cg->X[0], cg->X[1], cg->X[2],
|
||||
state_in, state_out,
|
||||
propspeed, soa_flat, Pp->data->bbox,
|
||||
dT_lev, TRK4, iter_count, apply_bam_bc,
|
||||
Symmetry, lev, ndeps, cor,
|
||||
keep_resident_state, apply_enforce_ga, chitiny))
|
||||
{
|
||||
cout << "CUDA Z4C corrector substep failed in domain: ("
|
||||
<< cg->bbox[0] << ":" << cg->bbox[3] << ","
|
||||
<< cg->bbox[1] << ":" << cg->bbox[4] << ","
|
||||
<< cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
|
||||
ERROR = 1;
|
||||
}
|
||||
}
|
||||
if (BP == Pp->data->ble)
|
||||
break;
|
||||
BP = BP->next;
|
||||
}
|
||||
Pp = Pp->next;
|
||||
}
|
||||
|
||||
{
|
||||
int erh = ERROR;
|
||||
MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
|
||||
}
|
||||
if (ERROR)
|
||||
{
|
||||
if (myrank == 0 && ErrorMonitor->outfile)
|
||||
ErrorMonitor->outfile << "CUDA Z4C failed in RK4 substep#" << iter_count
|
||||
<< " at t = " << PhysTime
|
||||
<< ", lev = " << lev << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
|
||||
Parallel::Sync(GH->PatL[lev], SynchList_cor, Symmetry);
|
||||
|
||||
if (BH_num > 0 && lev == GH->levels - 1)
|
||||
{
|
||||
if (!z4c_cuda_compute_porg_rhs_resident(GH, lev, myrank, BH_num,
|
||||
Porg, Porg1,
|
||||
Sfx, Sfy, Sfz, Symmetry))
|
||||
{
|
||||
if (myrank == 0 && ErrorMonitor->outfile)
|
||||
ErrorMonitor->outfile << "CUDA Z4C failed to interpolate black-hole shift at t = "
|
||||
<< PhysTime << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
}
|
||||
for (int ithBH = 0; ithBH < BH_num; ithBH++)
|
||||
{
|
||||
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg1[ithBH][0], Porg_rhs[ithBH][0], iter_count);
|
||||
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg1[ithBH][1], Porg_rhs[ithBH][1], iter_count);
|
||||
f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg1[ithBH][2], Porg_rhs[ithBH][2], iter_count);
|
||||
if (Symmetry > 0)
|
||||
Porg1[ithBH][2] = fabs(Porg1[ithBH][2]);
|
||||
if (Symmetry == 2)
|
||||
{
|
||||
Porg1[ithBH][0] = fabs(Porg1[ithBH][0]);
|
||||
Porg1[ithBH][1] = fabs(Porg1[ithBH][1]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (iter_count < 3)
|
||||
{
|
||||
Pp = GH->PatL[lev];
|
||||
while (Pp)
|
||||
{
|
||||
MyList<Block> *BP = Pp->data->blb;
|
||||
while (BP)
|
||||
{
|
||||
Block *cg = BP->data;
|
||||
cg->swapList(SynchList_pre, SynchList_cor, myrank);
|
||||
if (BP == Pp->data->ble)
|
||||
break;
|
||||
BP = BP->next;
|
||||
}
|
||||
Pp = Pp->next;
|
||||
}
|
||||
if (BH_num > 0 && lev == GH->levels - 1)
|
||||
{
|
||||
for (int ithBH = 0; ithBH < BH_num; ithBH++)
|
||||
{
|
||||
Porg[ithBH][0] = Porg1[ithBH][0];
|
||||
Porg[ithBH][1] = Porg1[ithBH][1];
|
||||
Porg[ithBH][2] = Porg1[ithBH][2];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
z4c_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, true);
|
||||
|
||||
#if (RPS == 0)
|
||||
RestrictProlong(lev, YN, BB);
|
||||
#endif
|
||||
|
||||
Pp = GH->PatL[lev];
|
||||
while (Pp)
|
||||
{
|
||||
MyList<Block> *BP = Pp->data->blb;
|
||||
while (BP)
|
||||
{
|
||||
Block *cg = BP->data;
|
||||
cg->swapList(StateList, SynchList_cor, myrank);
|
||||
cg->swapList(OldStateList, SynchList_cor, myrank);
|
||||
if (BP == Pp->data->ble)
|
||||
break;
|
||||
BP = BP->next;
|
||||
}
|
||||
Pp = Pp->next;
|
||||
}
|
||||
if (BH_num > 0 && lev == GH->levels - 1)
|
||||
{
|
||||
for (int ithBH = 0; ithBH < BH_num; ithBH++)
|
||||
{
|
||||
Porg0[ithBH][0] = Porg1[ithBH][0];
|
||||
Porg0[ithBH][1] = Porg1[ithBH][1];
|
||||
Porg0[ithBH][2] = Porg1[ithBH][2];
|
||||
}
|
||||
}
|
||||
#else
|
||||
double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
|
||||
#ifndef CPBC
|
||||
// for sommerfeld boundary
|
||||
|
||||
void Z4c_class::Step(int lev, int YN)
|
||||
{
|
||||
double dT_lev = dT * pow(0.5, Mymax(lev, trfls));
|
||||
#ifdef With_AHF
|
||||
AH_Step_Find(lev, dT_lev);
|
||||
#endif
|
||||
@@ -1589,19 +1039,15 @@ void Z4c_class::Step(int lev, int YN)
|
||||
{
|
||||
Porg0[ithBH][0] = Porg1[ithBH][0];
|
||||
Porg0[ithBH][1] = Porg1[ithBH][1];
|
||||
Porg0[ithBH][2] = Porg1[ithBH][2];
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
#else
|
||||
// for constraint preserving boundary (CPBC)
|
||||
#if USE_CUDA_Z4C && (ABEtype == 2)
|
||||
#error "USE_CUDA_Z4C resident path does not support CPBC"
|
||||
#endif
|
||||
#ifndef WithShell
|
||||
#error "CPBC only supports Shell"
|
||||
#endif
|
||||
Porg0[ithBH][2] = Porg1[ithBH][2];
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
// for constraint preserving boundary (CPBC)
|
||||
#ifndef WithShell
|
||||
#error "CPBC only supports Shell"
|
||||
#endif
|
||||
|
||||
// 0: extroplate rhs, 1: extroplate variable
|
||||
// 2: extroplate variable but before RHS calculation
|
||||
|
||||
@@ -94,31 +94,29 @@
|
||||
Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
|
||||
Symmetry,Lev,eps,co)
|
||||
|
||||
if (co == 0) then
|
||||
#if (ABV == 0)
|
||||
call ricci_gamma(ex, X, Y, Z, &
|
||||
chi, &
|
||||
dxx , gxy , gxz , dyy , gyz , dzz,&
|
||||
Gamx , Gamy , Gamz , &
|
||||
Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
|
||||
Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
|
||||
Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
|
||||
Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
|
||||
Symmetry)
|
||||
#endif
|
||||
call constraint_bssn(ex, X, Y, Z,&
|
||||
chi,trK, &
|
||||
dxx,gxy,gxz,dyy,gyz,dzz, &
|
||||
Axx,Axy,Axz,Ayy,Ayz,Azz, &
|
||||
Gamx,Gamy,Gamz,&
|
||||
Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
|
||||
Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
|
||||
Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
|
||||
Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
|
||||
Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
|
||||
Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
|
||||
Symmetry)
|
||||
endif
|
||||
#if (ABV == 0)
|
||||
call ricci_gamma(ex, X, Y, Z, &
|
||||
chi, &
|
||||
dxx , gxy , gxz , dyy , gyz , dzz,&
|
||||
Gamx , Gamy , Gamz , &
|
||||
Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
|
||||
Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
|
||||
Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
|
||||
Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
|
||||
Symmetry)
|
||||
#endif
|
||||
call constraint_bssn(ex, X, Y, Z,&
|
||||
chi,trK, &
|
||||
dxx,gxy,gxz,dyy,gyz,dzz, &
|
||||
Axx,Axy,Axz,Ayy,Ayz,Azz, &
|
||||
Gamx,Gamy,Gamz,&
|
||||
Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
|
||||
Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
|
||||
Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
|
||||
Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
|
||||
Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
|
||||
Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
|
||||
Symmetry)
|
||||
|
||||
return
|
||||
|
||||
@@ -228,12 +226,11 @@
|
||||
|
||||
call get_Z4cparameters(kappa1,kappa2,kappa3,FF,eta)
|
||||
|
||||
!!! sanity check
|
||||
#ifdef DEBUG
|
||||
dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) &
|
||||
+sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz) &
|
||||
+sum(Gamx)+sum(Gamy)+sum(Gamz) &
|
||||
+sum(Lap)+sum(betax)+sum(betay)+sum(betaz)+sum(dtSfx)+sum(dtSfy)+sum(dtSfz) &
|
||||
!!! sanity check
|
||||
dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) &
|
||||
+sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz) &
|
||||
+sum(Gamx)+sum(Gamy)+sum(Gamz) &
|
||||
+sum(Lap)+sum(betax)+sum(betay)+sum(betaz)+sum(dtSfx)+sum(dtSfy)+sum(dtSfz) &
|
||||
+sum(TZ)
|
||||
if(dX.ne.dX) then
|
||||
if(sum(chi).ne.sum(chi))write(*,*)"Z4c_rhs.f90: find NaN in chi"
|
||||
@@ -260,11 +257,10 @@
|
||||
if(sum(dtSfx).ne.sum(dtSfx))write(*,*)"Z4c_rhs.f90: find NaN in dtSfx"
|
||||
if(sum(dtSfy).ne.sum(dtSfy))write(*,*)"Z4c_rhs.f90: find NaN in dtSfy"
|
||||
if(sum(dtSfz).ne.sum(dtSfz))write(*,*)"Z4c_rhs.f90: find NaN in dtSfz"
|
||||
if(sum(TZ).ne.sum(Tz))write(*,*)"Z4c_rhs.f90: find NaN in TZ"
|
||||
gont = 1
|
||||
return
|
||||
endif
|
||||
#endif
|
||||
if(sum(TZ).ne.sum(Tz))write(*,*)"Z4c_rhs.f90: find NaN in TZ"
|
||||
gont = 1
|
||||
return
|
||||
endif
|
||||
|
||||
PI = dacos(-ONE)
|
||||
|
||||
@@ -1267,32 +1263,30 @@
|
||||
|
||||
endif
|
||||
|
||||
if (co == 0) then
|
||||
#if (ABV == 0)
|
||||
call ricci_gamma(ex, X, Y, Z, &
|
||||
chi, &
|
||||
dxx , gxy , gxz , dyy , gyz , dzz,&
|
||||
Gamx , Gamy , Gamz , &
|
||||
Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
|
||||
Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
|
||||
Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
|
||||
Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
|
||||
Symmetry)
|
||||
#endif
|
||||
|
||||
call constraint_bssn(ex, X, Y, Z,&
|
||||
chi,trK, &
|
||||
dxx,gxy,gxz,dyy,gyz,dzz, &
|
||||
Axx,Axy,Axz,Ayy,Ayz,Azz, &
|
||||
Gamx,Gamy,Gamz,&
|
||||
Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
|
||||
Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
|
||||
Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
|
||||
Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
|
||||
Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
|
||||
Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
|
||||
Symmetry)
|
||||
endif
|
||||
#if (ABV == 0)
|
||||
call ricci_gamma(ex, X, Y, Z, &
|
||||
chi, &
|
||||
dxx , gxy , gxz , dyy , gyz , dzz,&
|
||||
Gamx , Gamy , Gamz , &
|
||||
Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
|
||||
Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
|
||||
Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
|
||||
Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
|
||||
Symmetry)
|
||||
#endif
|
||||
|
||||
call constraint_bssn(ex, X, Y, Z,&
|
||||
chi,trK, &
|
||||
dxx,gxy,gxz,dyy,gyz,dzz, &
|
||||
Axx,Axy,Axz,Ayy,Ayz,Azz, &
|
||||
Gamx,Gamy,Gamz,&
|
||||
Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
|
||||
Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
|
||||
Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
|
||||
Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
|
||||
Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
|
||||
Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
|
||||
Symmetry)
|
||||
|
||||
gont = 0
|
||||
|
||||
|
||||
@@ -121,12 +121,11 @@
|
||||
|
||||
call get_Z4cparameters(kappa1,kappa2,kappa3,FF,eta)
|
||||
|
||||
!!! sanity check
|
||||
#ifdef DEBUG
|
||||
dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) &
|
||||
+sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz) &
|
||||
+sum(Gamx)+sum(Gamy)+sum(Gamz) &
|
||||
+sum(Lap)+sum(betax)+sum(betay)+sum(betaz)+sum(dtSfx)+sum(dtSfy)+sum(dtSfz) &
|
||||
!!! sanity check
|
||||
dX = sum(chi)+sum(trK)+sum(dxx)+sum(gxy)+sum(gxz)+sum(dyy)+sum(gyz)+sum(dzz) &
|
||||
+sum(Axx)+sum(Axy)+sum(Axz)+sum(Ayy)+sum(Ayz)+sum(Azz) &
|
||||
+sum(Gamx)+sum(Gamy)+sum(Gamz) &
|
||||
+sum(Lap)+sum(betax)+sum(betay)+sum(betaz)+sum(dtSfx)+sum(dtSfy)+sum(dtSfz) &
|
||||
+sum(TZ)
|
||||
if(dX.ne.dX) then
|
||||
if(sum(chi).ne.sum(chi))write(*,*)"Z4c_rhs_ss.f90: find NaN in chi"
|
||||
@@ -153,11 +152,10 @@
|
||||
if(sum(dtSfx).ne.sum(dtSfx))write(*,*)"Z4c_rhs_ss.f90: find NaN in dtSfx"
|
||||
if(sum(dtSfy).ne.sum(dtSfy))write(*,*)"Z4c_rhs_ss.f90: find NaN in dtSfy"
|
||||
if(sum(dtSfz).ne.sum(dtSfz))write(*,*)"Z4c_rhs_ss.f90: find NaN in dtSfz"
|
||||
if(sum(TZ).ne.sum(Tz))write(*,*)"Z4c_rhs_ss.f90: find NaN in TZ"
|
||||
gont = 1
|
||||
return
|
||||
endif
|
||||
#endif
|
||||
if(sum(TZ).ne.sum(Tz))write(*,*)"Z4c_rhs_ss.f90: find NaN in TZ"
|
||||
gont = 1
|
||||
return
|
||||
endif
|
||||
|
||||
PI = dacos(-ONE)
|
||||
|
||||
@@ -1390,43 +1388,41 @@
|
||||
call kodis_sh(ex,crho,sigma,R,TZ,TZ_rhs,SSS,Symmetry,eps,sst)
|
||||
endif
|
||||
|
||||
if (co == 0) then
|
||||
#if (ABV == 1)
|
||||
call ricci_gamma_ss(ex,crho,sigma,R,X, Y, Z, &
|
||||
drhodx, drhody, drhodz, &
|
||||
dsigmadx,dsigmady,dsigmadz, &
|
||||
dRdx,dRdy,dRdz, &
|
||||
drhodxx,drhodxy,drhodxz,drhodyy,drhodyz,drhodzz, &
|
||||
dsigmadxx,dsigmadxy,dsigmadxz,dsigmadyy,dsigmadyz,dsigmadzz, &
|
||||
dRdxx,dRdxy,dRdxz,dRdyy,dRdyz,dRdzz, &
|
||||
chi, &
|
||||
dxx , gxy , gxz , dyy , gyz , dzz,&
|
||||
Gamx , Gamy , Gamz , &
|
||||
Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
|
||||
Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
|
||||
Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
|
||||
Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
|
||||
Symmetry,Lev,sst)
|
||||
#endif
|
||||
call constraint_bssn_ss(ex,crho,sigma,R,X, Y, Z, &
|
||||
drhodx, drhody, drhodz, &
|
||||
dsigmadx,dsigmady,dsigmadz, &
|
||||
dRdx,dRdy,dRdz, &
|
||||
drhodxx,drhodxy,drhodxz,drhodyy,drhodyz,drhodzz, &
|
||||
dsigmadxx,dsigmadxy,dsigmadxz,dsigmadyy,dsigmadyz,dsigmadzz, &
|
||||
dRdxx,dRdxy,dRdxz,dRdyy,dRdyz,dRdzz, &
|
||||
chi,trK, &
|
||||
dxx,gxy,gxz,dyy,gyz,dzz, &
|
||||
Axx,Axy,Axz,Ayy,Ayz,Azz, &
|
||||
Gamx,Gamy,Gamz,&
|
||||
Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
|
||||
Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
|
||||
Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
|
||||
Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
|
||||
Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
|
||||
Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
|
||||
Symmetry,Lev,sst)
|
||||
endif
|
||||
#if (ABV == 1)
|
||||
call ricci_gamma_ss(ex,crho,sigma,R,X, Y, Z, &
|
||||
drhodx, drhody, drhodz, &
|
||||
dsigmadx,dsigmady,dsigmadz, &
|
||||
dRdx,dRdy,dRdz, &
|
||||
drhodxx,drhodxy,drhodxz,drhodyy,drhodyz,drhodzz, &
|
||||
dsigmadxx,dsigmadxy,dsigmadxz,dsigmadyy,dsigmadyz,dsigmadzz, &
|
||||
dRdxx,dRdxy,dRdxz,dRdyy,dRdyz,dRdzz, &
|
||||
chi, &
|
||||
dxx , gxy , gxz , dyy , gyz , dzz,&
|
||||
Gamx , Gamy , Gamz , &
|
||||
Gamxxx,Gamxxy,Gamxxz,Gamxyy,Gamxyz,Gamxzz,&
|
||||
Gamyxx,Gamyxy,Gamyxz,Gamyyy,Gamyyz,Gamyzz,&
|
||||
Gamzxx,Gamzxy,Gamzxz,Gamzyy,Gamzyz,Gamzzz,&
|
||||
Rxx,Rxy,Rxz,Ryy,Ryz,Rzz,&
|
||||
Symmetry,Lev,sst)
|
||||
call constraint_bssn_ss(ex,crho,sigma,R,X, Y, Z, &
|
||||
drhodx, drhody, drhodz, &
|
||||
dsigmadx,dsigmady,dsigmadz, &
|
||||
dRdx,dRdy,dRdz, &
|
||||
drhodxx,drhodxy,drhodxz,drhodyy,drhodyz,drhodzz, &
|
||||
dsigmadxx,dsigmadxy,dsigmadxz,dsigmadyy,dsigmadyz,dsigmadzz, &
|
||||
dRdxx,dRdxy,dRdxz,dRdyy,dRdyz,dRdzz, &
|
||||
chi,trK, &
|
||||
dxx,gxy,gxz,dyy,gyz,dzz, &
|
||||
Axx,Axy,Axz,Ayy,Ayz,Azz, &
|
||||
Gamx,Gamy,Gamz,&
|
||||
Lap,betax,betay,betaz,rho,Sx,Sy,Sz,&
|
||||
Gamxxx, Gamxxy, Gamxxz,Gamxyy, Gamxyz, Gamxzz, &
|
||||
Gamyxx, Gamyxy, Gamyxz,Gamyyy, Gamyyz, Gamyzz, &
|
||||
Gamzxx, Gamzxy, Gamzxz,Gamzyy, Gamzyz, Gamzzz, &
|
||||
Rxx,Rxy,Rxz,Ryy,Ryz,Rzz, &
|
||||
Hcon,Mxcon,Mycon,Mzcon,Gmxcon,Gmycon,Gmzcon, &
|
||||
Symmetry,Lev,sst)
|
||||
#endif
|
||||
|
||||
gont = 0
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -48,7 +48,6 @@ public:
|
||||
double StartTime, TotalTime;
|
||||
double AnasTime, DumpTime, d2DumpTime, CheckTime;
|
||||
double LastAnas, LastConsOut;
|
||||
bool cuda_level0_constraint_cache_valid;
|
||||
int *ConstraintRefreshLevels;
|
||||
double Courant;
|
||||
double numepss, numepsb, numepsh;
|
||||
|
||||
2908
AMSS_NCKU_source/bssn_gpu.cu
Normal file
2908
AMSS_NCKU_source/bssn_gpu.cu
Normal file
File diff suppressed because it is too large
Load Diff
73
AMSS_NCKU_source/bssn_gpu.h
Normal file
73
AMSS_NCKU_source/bssn_gpu.h
Normal file
@@ -0,0 +1,73 @@
|
||||
|
||||
#ifndef BSSN_GPU_H_
|
||||
#define BSSN_GPU_H_
|
||||
#include "bssn_macro.h"
|
||||
#include "macrodef.fh"
|
||||
|
||||
#define DEVICE_ID 0
|
||||
// #define DEVICE_ID_BY_MPI_RANK
|
||||
#define GRID_DIM 256
|
||||
#define BLOCK_DIM 128
|
||||
|
||||
#define _FH2_(i, j, k) fh[(i) + (j) * _1D_SIZE[2] + (k) * _2D_SIZE[2]]
|
||||
#define _FH3_(i, j, k) fh[(i) + (j) * _1D_SIZE[3] + (k) * _2D_SIZE[3]]
|
||||
#define pow2(x) ((x) * (x))
|
||||
#define TimeBetween(a, b) ((b.tv_sec - a.tv_sec) + (b.tv_usec - a.tv_usec) / 1000000.0f)
|
||||
#define M_ metac.
|
||||
#define Mh_ meta->
|
||||
#define Ms_ metassc.
|
||||
#define Msh_ metass->
|
||||
|
||||
// #define TIMING
|
||||
|
||||
#define RHS_SS_PARA int calledby, int mpi_rank, int *ex, double &T, double *crho, double *sigma, double *R, double *X, double *Y, double *Z, double *drhodx, double *drhody, double *drhodz, double *dsigmadx, double *dsigmady, double *dsigmadz, double *dRdx, double *dRdy, double *dRdz, double *drhodxx, double *drhodxy, double *drhodxz, double *drhodyy, double *drhodyz, double *drhodzz, double *dsigmadxx, double *dsigmadxy, double *dsigmadxz, double *dsigmadyy, double *dsigmadyz, double *dsigmadzz, double *dRdxx, double *dRdxy, double *dRdxz, double *dRdyy, double *dRdyz, double *dRdzz, double *chi, double *trK, double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz, double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz, double *Gamx, double *Gamy, double *Gamz, double *Lap, double *betax, double *betay, double *betaz, double *dtSfx, double *dtSfy, double *dtSfz, double *chi_rhs, double *trK_rhs, double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs, double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs, double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs, double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs, double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs, double *rho, double *Sx, double *Sy, double *Sz, double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz, double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz, double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz, double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz, double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz, double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res, double *Gmx_Res, double *Gmy_Res, double *Gmz_Res, int &Symmetry, int &Lev, double &eps, int &sst, int &co
|
||||
|
||||
/** main function */
|
||||
int gpu_rhs(int calledby, int mpi_rank, int *ex, double &T,
|
||||
double *X, double *Y, double *Z,
|
||||
|
||||
double *chi, double *trK,
|
||||
|
||||
double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
|
||||
|
||||
double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
|
||||
|
||||
double *Gamx, double *Gamy, double *Gamz,
|
||||
|
||||
double *Lap, double *betax, double *betay, double *betaz,
|
||||
|
||||
double *dtSfx, double *dtSfy, double *dtSfz,
|
||||
|
||||
double *chi_rhs, double *trK_rhs,
|
||||
|
||||
double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
|
||||
|
||||
double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
|
||||
|
||||
double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
|
||||
|
||||
double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
|
||||
|
||||
double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
|
||||
|
||||
double *rho, double *Sx, double *Sy, double *Sz, double *Sxx,
|
||||
double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
|
||||
|
||||
double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
|
||||
|
||||
double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
|
||||
|
||||
double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
|
||||
|
||||
double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
|
||||
|
||||
double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
|
||||
double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
|
||||
int &Symmetry, int &Lev, double &eps, int &co);
|
||||
|
||||
int gpu_rhs_ss(RHS_SS_PARA);
|
||||
|
||||
/** Init GPU side data in GPUMeta. */
|
||||
// void init_fluid_meta_gpu(GPUMeta *gpu_meta);
|
||||
|
||||
#endif
|
||||
7790
AMSS_NCKU_source/bssn_gpu_class.C
Normal file
7790
AMSS_NCKU_source/bssn_gpu_class.C
Normal file
File diff suppressed because it is too large
Load Diff
210
AMSS_NCKU_source/bssn_gpu_class.h
Normal file
210
AMSS_NCKU_source/bssn_gpu_class.h
Normal file
@@ -0,0 +1,210 @@
|
||||
|
||||
#ifndef BSSN_GPU_CLASS_H
|
||||
#define BSSN_GPU_CLASS_H
|
||||
|
||||
#ifdef newc
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
#include <fstream>
|
||||
#include <cstdlib>
|
||||
#include <string>
|
||||
#include <cmath>
|
||||
using namespace std;
|
||||
#else
|
||||
#include <iostream.h>
|
||||
#include <iomanip.h>
|
||||
#include <fstream.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#endif
|
||||
|
||||
#include <mpi.h>
|
||||
|
||||
#include "macrodef.h"
|
||||
#include "cgh.h"
|
||||
#include "ShellPatch.h"
|
||||
#include "misc.h"
|
||||
#include "var.h"
|
||||
#include "MyList.h"
|
||||
#include "monitor.h"
|
||||
#include "surface_integral.h"
|
||||
#include "checkpoint.h"
|
||||
|
||||
// added by yangquan
|
||||
#include "bssn_macro.h"
|
||||
|
||||
extern void setpbh(int iBHN, double **iPBH, double *iMass, int rBHN);
|
||||
|
||||
class bssn_class
|
||||
{
|
||||
public:
|
||||
// added by yangquan
|
||||
//----------------------
|
||||
int gpu_num_mynode;
|
||||
int cpu_core_num_mynode;
|
||||
int mpi_process_num_mynode;
|
||||
int my_sequence_mynode;
|
||||
int mynode_id;
|
||||
int use_gpu;
|
||||
|
||||
virtual void Step_GPU(int lev, int YN);
|
||||
virtual void Get_runtime_envirment();
|
||||
// virtual void Step_OPENMP(int lev,int YN);
|
||||
//----------------------
|
||||
|
||||
int ngfs;
|
||||
int nprocs, myrank;
|
||||
cgh *GH;
|
||||
ShellPatch *SH;
|
||||
double PhysTime;
|
||||
|
||||
int checkrun;
|
||||
char checkfilename[50];
|
||||
int Steps;
|
||||
double StartTime, TotalTime;
|
||||
double AnasTime, DumpTime, d2DumpTime, CheckTime;
|
||||
double LastAnas, LastConsOut;
|
||||
double Courant;
|
||||
double numepss, numepsb, numepsh;
|
||||
int Symmetry;
|
||||
int maxl, decn;
|
||||
double maxrex, drex;
|
||||
int trfls, a_lev;
|
||||
|
||||
double dT;
|
||||
double chitiny;
|
||||
|
||||
double **Porg0, **Porgbr, **Porg, **Porg1, **Porg_rhs;
|
||||
int BH_num, BH_num_input;
|
||||
double *Mass, *Pmom, *Spin;
|
||||
double ADMMass;
|
||||
|
||||
var *phio, *trKo;
|
||||
var *gxxo, *gxyo, *gxzo, *gyyo, *gyzo, *gzzo;
|
||||
var *Axxo, *Axyo, *Axzo, *Ayyo, *Ayzo, *Azzo;
|
||||
var *Gmxo, *Gmyo, *Gmzo;
|
||||
var *Lapo, *Sfxo, *Sfyo, *Sfzo;
|
||||
var *dtSfxo, *dtSfyo, *dtSfzo;
|
||||
|
||||
var *phi0, *trK0;
|
||||
var *gxx0, *gxy0, *gxz0, *gyy0, *gyz0, *gzz0;
|
||||
var *Axx0, *Axy0, *Axz0, *Ayy0, *Ayz0, *Azz0;
|
||||
var *Gmx0, *Gmy0, *Gmz0;
|
||||
var *Lap0, *Sfx0, *Sfy0, *Sfz0;
|
||||
var *dtSfx0, *dtSfy0, *dtSfz0;
|
||||
|
||||
var *phi, *trK;
|
||||
var *gxx, *gxy, *gxz, *gyy, *gyz, *gzz;
|
||||
var *Axx, *Axy, *Axz, *Ayy, *Ayz, *Azz;
|
||||
var *Gmx, *Gmy, *Gmz;
|
||||
var *Lap, *Sfx, *Sfy, *Sfz;
|
||||
var *dtSfx, *dtSfy, *dtSfz;
|
||||
|
||||
var *phi1, *trK1;
|
||||
var *gxx1, *gxy1, *gxz1, *gyy1, *gyz1, *gzz1;
|
||||
var *Axx1, *Axy1, *Axz1, *Ayy1, *Ayz1, *Azz1;
|
||||
var *Gmx1, *Gmy1, *Gmz1;
|
||||
var *Lap1, *Sfx1, *Sfy1, *Sfz1;
|
||||
var *dtSfx1, *dtSfy1, *dtSfz1;
|
||||
|
||||
var *phi_rhs, *trK_rhs;
|
||||
var *gxx_rhs, *gxy_rhs, *gxz_rhs, *gyy_rhs, *gyz_rhs, *gzz_rhs;
|
||||
var *Axx_rhs, *Axy_rhs, *Axz_rhs, *Ayy_rhs, *Ayz_rhs, *Azz_rhs;
|
||||
var *Gmx_rhs, *Gmy_rhs, *Gmz_rhs;
|
||||
var *Lap_rhs, *Sfx_rhs, *Sfy_rhs, *Sfz_rhs;
|
||||
var *dtSfx_rhs, *dtSfy_rhs, *dtSfz_rhs;
|
||||
|
||||
var *rho, *Sx, *Sy, *Sz, *Sxx, *Sxy, *Sxz, *Syy, *Syz, *Szz;
|
||||
|
||||
var *Gamxxx, *Gamxxy, *Gamxxz, *Gamxyy, *Gamxyz, *Gamxzz;
|
||||
var *Gamyxx, *Gamyxy, *Gamyxz, *Gamyyy, *Gamyyz, *Gamyzz;
|
||||
var *Gamzxx, *Gamzxy, *Gamzxz, *Gamzyy, *Gamzyz, *Gamzzz;
|
||||
|
||||
var *Rxx, *Rxy, *Rxz, *Ryy, *Ryz, *Rzz;
|
||||
|
||||
var *Rpsi4, *Ipsi4;
|
||||
var *t1Rpsi4, *t1Ipsi4, *t2Rpsi4, *t2Ipsi4;
|
||||
|
||||
var *Cons_Ham, *Cons_Px, *Cons_Py, *Cons_Pz, *Cons_Gx, *Cons_Gy, *Cons_Gz;
|
||||
|
||||
#ifdef Point_Psi4
|
||||
var *phix, *phiy, *phiz;
|
||||
var *trKx, *trKy, *trKz;
|
||||
var *Axxx, *Axxy, *Axxz;
|
||||
var *Axyx, *Axyy, *Axyz;
|
||||
var *Axzx, *Axzy, *Axzz;
|
||||
var *Ayyx, *Ayyy, *Ayyz;
|
||||
var *Ayzx, *Ayzy, *Ayzz;
|
||||
var *Azzx, *Azzy, *Azzz;
|
||||
#endif
|
||||
// FIXME: uc = StateList, up = OldStateList, upp = SynchList_cor; so never touch these three data
|
||||
MyList<var> *StateList, *SynchList_pre, *SynchList_cor, *RHSList;
|
||||
MyList<var> *OldStateList, *DumpList;
|
||||
MyList<var> *ConstraintList;
|
||||
|
||||
monitor *ErrorMonitor, *Psi4Monitor, *BHMonitor, *MAPMonitor;
|
||||
monitor *ConVMonitor;
|
||||
surface_integral *Waveshell;
|
||||
checkpoint *CheckPoint;
|
||||
|
||||
public:
|
||||
bssn_class(double Couranti, double StartTimei, double TotalTimei, double DumpTimei, double d2DumpTimei, double CheckTimei, double AnasTimei,
|
||||
int Symmetryi, int checkruni, char *checkfilenamei, double numepssi, double numepsbi, double numepshi,
|
||||
int a_levi, int maxli, int decni, double maxrexi, double drexi);
|
||||
~bssn_class();
|
||||
|
||||
void Evolve(int Steps);
|
||||
void RecursiveStep(int lev);
|
||||
#if (PSTR == 1)
|
||||
void ParallelStep();
|
||||
void SHStep();
|
||||
#endif
|
||||
void RestrictProlong(int lev, int YN, bool BB, MyList<var> *SL, MyList<var> *OL, MyList<var> *corL);
|
||||
void RestrictProlong_aux(int lev, int YN, bool BB, MyList<var> *SL, MyList<var> *OL, MyList<var> *corL);
|
||||
void RestrictProlong(int lev, int YN, bool BB);
|
||||
void ProlongRestrict(int lev, int YN, bool BB);
|
||||
void Setup_Black_Hole_position();
|
||||
void compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, var *fory, var *forz, int lev);
|
||||
bool read_Pablo_file(int *ext, double *datain, char *filename);
|
||||
void write_Pablo_file(int *ext, double xmin, double xmax, double ymin, double ymax, double zmin, double zmax,
|
||||
char *filename);
|
||||
void AnalysisStuff(int lev, double dT_lev);
|
||||
void Setup_KerrSchild();
|
||||
void Enforce_algcon(int lev, int fg);
|
||||
|
||||
void testRestrict();
|
||||
void testOutBd();
|
||||
|
||||
virtual void Setup_Initial_Data_Lousto();
|
||||
virtual void Setup_Initial_Data_Cao();
|
||||
virtual void Initialize();
|
||||
virtual void Read_Ansorg();
|
||||
virtual void Read_Pablo() {};
|
||||
virtual void Compute_Psi4(int lev);
|
||||
virtual void Step(int lev, int YN);
|
||||
virtual void Interp_Constraint(bool infg);
|
||||
virtual void Constraint_Out();
|
||||
virtual void Compute_Constraint();
|
||||
|
||||
#ifdef With_AHF
|
||||
protected:
|
||||
MyList<var> *AHList, *AHDList, *GaugeList;
|
||||
int AHfindevery;
|
||||
double AHdumptime;
|
||||
int *lastahdumpid, HN_num; // number of possible horizons
|
||||
int *findeveryl;
|
||||
double *xc, *yc, *zc, *xr, *yr, *zr;
|
||||
bool *trigger;
|
||||
double *dTT;
|
||||
int *dumpid;
|
||||
|
||||
public:
|
||||
void AH_Prepare_derivatives();
|
||||
bool AH_Interp_Points(MyList<var> *VarList,
|
||||
int NN, double **XX,
|
||||
double *Shellf, int Symmetryi);
|
||||
void AH_Step_Find(int lev, double dT_lev);
|
||||
#endif
|
||||
};
|
||||
#endif /* BSSN_GPU_CLASS_H */
|
||||
@@ -1098,12 +1098,12 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
betaz_rhs[i] = FF * dtSfz[i];
|
||||
|
||||
reta[i] =
|
||||
gupxx[i] * chix[i] * chix[i]
|
||||
+ gupyy[i] * chiy[i] * chiy[i]
|
||||
+ gupzz[i] * chiz[i] * chiz[i]
|
||||
+ TWO * ( gupxy[i] * chix[i] * chiy[i]
|
||||
+ gupxz[i] * chix[i] * chiz[i]
|
||||
+ gupyz[i] * chiy[i] * chiz[i] );
|
||||
gupxx[i] * dtSfx_rhs[i] * dtSfx_rhs[i]
|
||||
+ gupyy[i] * dtSfy_rhs[i] * dtSfy_rhs[i]
|
||||
+ gupzz[i] * dtSfz_rhs[i] * dtSfz_rhs[i]
|
||||
+ TWO * ( gupxy[i] * dtSfx_rhs[i] * dtSfy_rhs[i]
|
||||
+ gupxz[i] * dtSfx_rhs[i] * dtSfz_rhs[i]
|
||||
+ gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] );
|
||||
|
||||
#if (GAUGE == 2)
|
||||
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - sqrt(chin1[i])), 2.0 );
|
||||
@@ -1116,12 +1116,12 @@ int f_compute_rhs_bssn(int *ex, double &T,
|
||||
dtSfz_rhs[i] = Gamz_rhs[i] - reta[i] * dtSfz[i];
|
||||
#elif (GAUGE == 4 || GAUGE == 5)
|
||||
reta[i] =
|
||||
gupxx[i] * chix[i] * chix[i]
|
||||
+ gupyy[i] * chiy[i] * chiy[i]
|
||||
+ gupzz[i] * chiz[i] * chiz[i]
|
||||
+ TWO * ( gupxy[i] * chix[i] * chiy[i]
|
||||
+ gupxz[i] * chix[i] * chiz[i]
|
||||
+ gupyz[i] * chiy[i] * chiz[i] );
|
||||
gupxx[i] * dtSfx_rhs[i] * dtSfx_rhs[i]
|
||||
+ gupyy[i] * dtSfy_rhs[i] * dtSfy_rhs[i]
|
||||
+ gupzz[i] * dtSfz_rhs[i] * dtSfz_rhs[i]
|
||||
+ TWO * ( gupxy[i] * dtSfx_rhs[i] * dtSfy_rhs[i]
|
||||
+ gupxz[i] * dtSfx_rhs[i] * dtSfz_rhs[i]
|
||||
+ gupyz[i] * dtSfy_rhs[i] * dtSfz_rhs[i] );
|
||||
|
||||
#if (GAUGE == 4)
|
||||
reta[i] = 1.31 / 2.0 * sqrt( reta[i] / chin1[i] ) / pow( (ONE - sqrt(chin1[i])), 2.0 );
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,127 +0,0 @@
|
||||
#ifndef BSSN_RHS_CUDA_H
|
||||
#define BSSN_RHS_CUDA_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
enum {
|
||||
BSSN_CUDA_STATE_COUNT = 24,
|
||||
BSSN_CUDA_MATTER_COUNT = 10
|
||||
};
|
||||
|
||||
int f_compute_rhs_bssn(int *ex, double &T,
|
||||
double *X, double *Y, double *Z,
|
||||
double *chi, double *trK,
|
||||
double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
|
||||
double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
|
||||
double *Gamx, double *Gamy, double *Gamz,
|
||||
double *Lap, double *betax, double *betay, double *betaz,
|
||||
double *dtSfx, double *dtSfy, double *dtSfz,
|
||||
double *chi_rhs, double *trK_rhs,
|
||||
double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
|
||||
double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
|
||||
double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
|
||||
double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
|
||||
double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
|
||||
double *rho, double *Sx, double *Sy, double *Sz,
|
||||
double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
|
||||
double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
|
||||
double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
|
||||
double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
|
||||
double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
|
||||
double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
|
||||
double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
|
||||
int &Symmetry, int &Lev, double &eps, int &co);
|
||||
|
||||
int bssn_cuda_rk4_substep(void *block_tag,
|
||||
int *ex, double *X, double *Y, double *Z,
|
||||
double **state_host_in,
|
||||
double **state_host_out,
|
||||
double **matter_host,
|
||||
const double *propspeed,
|
||||
const double *soa_flat,
|
||||
const double *bbox,
|
||||
double &dT,
|
||||
double &T,
|
||||
int &RK4,
|
||||
int &apply_bam_bc,
|
||||
int &Symmetry,
|
||||
int &Lev,
|
||||
double &eps,
|
||||
int &co,
|
||||
int &use_zero_matter,
|
||||
int &keep_resident_state,
|
||||
int &apply_enforce_ga,
|
||||
double &chitiny);
|
||||
|
||||
int bssn_cuda_copy_state_region_to_host(void *block_tag,
|
||||
int state_index,
|
||||
double *host_state,
|
||||
int *ex,
|
||||
int i0, int j0, int k0,
|
||||
int sx, int sy, int sz);
|
||||
|
||||
int bssn_cuda_copy_state_region_from_host(void *block_tag,
|
||||
int state_index,
|
||||
double *host_state,
|
||||
int *ex,
|
||||
int i0, int j0, int k0,
|
||||
int sx, int sy, int sz);
|
||||
|
||||
int bssn_cuda_download_resident_state(void *block_tag,
|
||||
int *ex,
|
||||
double **state_host_out);
|
||||
|
||||
int bssn_cuda_download_constraint_outputs(int *ex,
|
||||
double **constraint_host_out);
|
||||
|
||||
int bssn_cuda_pack_state_region_to_host_buffer(void *block_tag,
|
||||
int state_index,
|
||||
double *host_buffer,
|
||||
int *ex,
|
||||
int i0, int j0, int k0,
|
||||
int sx, int sy, int sz);
|
||||
|
||||
int bssn_cuda_unpack_state_region_from_host_buffer(void *block_tag,
|
||||
int state_index,
|
||||
double *host_buffer,
|
||||
int *ex,
|
||||
int i0, int j0, int k0,
|
||||
int sx, int sy, int sz);
|
||||
|
||||
int bssn_cuda_pack_state_batch_to_host_buffer(void *block_tag,
|
||||
int state_count,
|
||||
double *host_buffer,
|
||||
int *ex,
|
||||
int i0, int j0, int k0,
|
||||
int sx, int sy, int sz);
|
||||
|
||||
int bssn_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
|
||||
int state_count,
|
||||
double *host_buffer,
|
||||
int *ex,
|
||||
int i0, int j0, int k0,
|
||||
int sx, int sy, int sz);
|
||||
|
||||
int bssn_cuda_download_state_subset(void *block_tag,
|
||||
int *ex,
|
||||
int subset_count,
|
||||
const int *state_indices,
|
||||
double **state_host_out);
|
||||
|
||||
int bssn_cuda_upload_state_subset(void *block_tag,
|
||||
int *ex,
|
||||
int subset_count,
|
||||
const int *state_indices,
|
||||
double **state_host_in);
|
||||
|
||||
int bssn_cuda_has_resident_state(void *block_tag);
|
||||
|
||||
void bssn_cuda_release_step_ctx(void *block_tag);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
1942
AMSS_NCKU_source/bssn_step_gpu.C
Normal file
1942
AMSS_NCKU_source/bssn_step_gpu.C
Normal file
File diff suppressed because it is too large
Load Diff
@@ -17,68 +17,106 @@ using namespace std;
|
||||
#include <math.h>
|
||||
#endif
|
||||
|
||||
// Intel oneMKL LAPACK interface
|
||||
#include <mkl_lapacke.h>
|
||||
/* Linear equation solution using Intel oneMKL LAPACK.
|
||||
a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input
|
||||
containing the right-hand side vectors. On output a is
|
||||
replaced by its matrix inverse, and b is replaced by the
|
||||
corresponding set of solution vectors.
|
||||
|
||||
Mathematical equivalence:
|
||||
Solves: A * x = b => x = A^(-1) * b
|
||||
Original Gauss-Jordan and LAPACK dgesv/dgetri produce identical results
|
||||
within numerical precision. */
|
||||
|
||||
int gaussj(double *a, double *b, int n)
|
||||
{
|
||||
// Allocate pivot array and workspace
|
||||
lapack_int *ipiv = new lapack_int[n];
|
||||
lapack_int info;
|
||||
|
||||
// Make a copy of matrix a for solving (dgesv modifies it to LU form)
|
||||
double *a_copy = new double[n * n];
|
||||
for (int i = 0; i < n * n; i++) {
|
||||
a_copy[i] = a[i];
|
||||
}
|
||||
|
||||
// Step 1: Solve linear system A*x = b using LU decomposition
|
||||
// LAPACKE_dgesv uses column-major by default, but we use row-major
|
||||
info = LAPACKE_dgesv(LAPACK_ROW_MAJOR, n, 1, a_copy, n, ipiv, b, 1);
|
||||
|
||||
if (info != 0) {
|
||||
cout << "gaussj: Singular Matrix (dgesv info=" << info << ")" << endl;
|
||||
delete[] ipiv;
|
||||
delete[] a_copy;
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Step 2: Compute matrix inverse A^(-1) using LU factorization
|
||||
// First do LU factorization of original matrix a
|
||||
info = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, n, n, a, n, ipiv);
|
||||
|
||||
if (info != 0) {
|
||||
cout << "gaussj: Singular Matrix (dgetrf info=" << info << ")" << endl;
|
||||
delete[] ipiv;
|
||||
delete[] a_copy;
|
||||
return 1;
|
||||
}
|
||||
|
||||
// Then compute inverse from LU factorization
|
||||
info = LAPACKE_dgetri(LAPACK_ROW_MAJOR, n, a, n, ipiv);
|
||||
|
||||
if (info != 0) {
|
||||
cout << "gaussj: Singular Matrix (dgetri info=" << info << ")" << endl;
|
||||
delete[] ipiv;
|
||||
delete[] a_copy;
|
||||
return 1;
|
||||
}
|
||||
|
||||
delete[] ipiv;
|
||||
delete[] a_copy;
|
||||
|
||||
return 0;
|
||||
}
|
||||
/* Linear equation solution by Gauss-Jordan elimination.
|
||||
a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input
|
||||
containing the right-hand side vectors. On output a is
|
||||
replaced by its matrix inverse, and b is replaced by the
|
||||
corresponding set of solution vectors. */
|
||||
|
||||
int gaussj(double *a, double *b, int n)
|
||||
{
|
||||
double swap;
|
||||
|
||||
int *indxc, *indxr, *ipiv;
|
||||
indxc = new int[n];
|
||||
indxr = new int[n];
|
||||
ipiv = new int[n];
|
||||
|
||||
int i, icol, irow, j, k, l, ll;
|
||||
double big, dum, pivinv;
|
||||
|
||||
for (j = 0; j < n; j++)
|
||||
ipiv[j] = 0;
|
||||
for (i = 0; i < n; i++)
|
||||
{
|
||||
big = 0.0;
|
||||
for (j = 0; j < n; j++)
|
||||
if (ipiv[j] != 1)
|
||||
for (k = 0; k < n; k++)
|
||||
{
|
||||
if (ipiv[k] == 0)
|
||||
{
|
||||
if (fabs(a[j * n + k]) >= big)
|
||||
{
|
||||
big = fabs(a[j * n + k]);
|
||||
irow = j;
|
||||
icol = k;
|
||||
}
|
||||
}
|
||||
else if (ipiv[k] > 1)
|
||||
{
|
||||
cout << "gaussj: Singular Matrix-1" << endl;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
ipiv[icol] = ipiv[icol] + 1;
|
||||
if (irow != icol)
|
||||
{
|
||||
for (l = 0; l < n; l++)
|
||||
{
|
||||
swap = a[irow * n + l];
|
||||
a[irow * n + l] = a[icol * n + l];
|
||||
a[icol * n + l] = swap;
|
||||
}
|
||||
|
||||
swap = b[irow];
|
||||
b[irow] = b[icol];
|
||||
b[icol] = swap;
|
||||
}
|
||||
|
||||
indxr[i] = irow;
|
||||
indxc[i] = icol;
|
||||
|
||||
if (a[icol * n + icol] == 0.0)
|
||||
{
|
||||
cout << "gaussj: Singular Matrix-2" << endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
pivinv = 1.0 / a[icol * n + icol];
|
||||
a[icol * n + icol] = 1.0;
|
||||
for (l = 0; l < n; l++)
|
||||
a[icol * n + l] *= pivinv;
|
||||
b[icol] *= pivinv;
|
||||
for (ll = 0; ll < n; ll++)
|
||||
if (ll != icol)
|
||||
{
|
||||
dum = a[ll * n + icol];
|
||||
a[ll * n + icol] = 0.0;
|
||||
for (l = 0; l < n; l++)
|
||||
a[ll * n + l] -= a[icol * n + l] * dum;
|
||||
b[ll] -= b[icol] * dum;
|
||||
}
|
||||
}
|
||||
|
||||
for (l = n - 1; l >= 0; l--)
|
||||
{
|
||||
if (indxr[l] != indxc[l])
|
||||
for (k = 0; k < n; k++)
|
||||
{
|
||||
swap = a[k * n + indxr[l]];
|
||||
a[k * n + indxr[l]] = a[k * n + indxc[l]];
|
||||
a[k * n + indxc[l]] = swap;
|
||||
}
|
||||
}
|
||||
|
||||
delete[] indxc;
|
||||
delete[] indxr;
|
||||
delete[] ipiv;
|
||||
|
||||
return 0;
|
||||
}
|
||||
// for check usage
|
||||
/*
|
||||
int main()
|
||||
|
||||
@@ -1,53 +1,24 @@
|
||||
|
||||
|
||||
include makefile.inc
|
||||
|
||||
## polint(ordn=6) kernel selector:
|
||||
## 1 (default): barycentric fast path
|
||||
## 0 : fallback to Neville path
|
||||
POLINT6_USE_BARY ?= 1
|
||||
POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
|
||||
|
||||
## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt)
|
||||
## make -> opt (PGO-guided, maximum performance)
|
||||
## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data)
|
||||
PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
|
||||
|
||||
ifeq ($(TOOLCHAIN),intel)
|
||||
OMP_FLAG = -qopenmp
|
||||
|
||||
ifeq ($(PGO_MODE),instrument)
|
||||
## Intel Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
|
||||
CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
|
||||
-Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
|
||||
f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
|
||||
-align array64byte -fpp $(MKL_INC) $(POLINT6_FLAG)
|
||||
else
|
||||
## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \
|
||||
## PGO has been turned off, now tested and found to be negative optimization
|
||||
## INTERP_LB_FLAGS has been turned off too, now tested and found to be negative optimization
|
||||
|
||||
|
||||
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||
-Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
|
||||
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||
-align array64byte -fpp $(MKL_INC) $(POLINT6_FLAG)
|
||||
endif
|
||||
|
||||
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||
-fprofile-instr-use=$(TP_PROFDATA) \
|
||||
-Dfortran3 -Dnewc $(MKL_INC)
|
||||
else
|
||||
## NVHPC defaults: mpicc/mpicxx/mpifort wrappers
|
||||
## PGO_MODE is ignored in this branch.
|
||||
OMP_FLAG = -mp
|
||||
CXXAPPFLAGS = -O3 -tp=host -Mcache_align -Mfma \
|
||||
-Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
|
||||
f90appflags = -O3 -tp=host -Mcache_align -Mfma -Mpreprocess \
|
||||
$(MKL_INC) $(POLINT6_FLAG)
|
||||
TP_OPTFLAGS = -O3 -tp=host -Mcache_align -Mfma \
|
||||
-Dfortran3 -Dnewc $(MKL_INC)
|
||||
endif
|
||||
include makefile.inc
|
||||
|
||||
## polint(ordn=6) kernel selector:
|
||||
## 1 (default): barycentric fast path
|
||||
## 0 : fallback to Neville path
|
||||
POLINT6_USE_BARY ?= 1
|
||||
POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
|
||||
|
||||
## Legacy GNU/OpenMPI flags
|
||||
CXXBASEFLAGS = -O3 -march=native -Wno-deprecated -Dfortran3 -Dnewc $(INTERP_LB_FLAGS)
|
||||
F90BASEFLAGS = -O3 -march=native -cpp -fallow-argument-mismatch $(POLINT6_FLAG)
|
||||
|
||||
ifeq ($(PGO_MODE),instrument)
|
||||
CXXAPPFLAGS = $(CXXBASEFLAGS)
|
||||
f90appflags = $(F90BASEFLAGS)
|
||||
else
|
||||
CXXAPPFLAGS = $(CXXBASEFLAGS)
|
||||
f90appflags = $(F90BASEFLAGS)
|
||||
endif
|
||||
|
||||
.SUFFIXES: .o .f90 .C .for .cu
|
||||
|
||||
@@ -63,14 +34,6 @@ endif
|
||||
.cu.o:
|
||||
$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
|
||||
|
||||
# CUDA rewrite of BSSN RHS (drop-in replacement for bssn_rhs_c + stencil helpers)
|
||||
bssn_rhs_cuda.o: bssn_rhs_cuda.cu bssn_rhs.h macrodef.h
|
||||
$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
|
||||
|
||||
# CUDA rewrite of Z4C Cartesian RHS
|
||||
z4c_rhs_cuda.o: z4c_rhs_cuda.cu z4c_rhs_cuda.h bssn_rhs.h macrodef.h ricci_gamma.h
|
||||
$(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH)
|
||||
|
||||
# C rewrite of BSSN RHS kernel and helpers
|
||||
bssn_rhs_c.o: bssn_rhs_c.C
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
@@ -84,83 +47,42 @@ fdderivs_c.o: fdderivs_c.C
|
||||
kodiss_c.o: kodiss_c.C
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
lopsided_c.o: lopsided_c.C
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
lopsided_c.o: lopsided_c.C
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
lopsided_kodis_c.o: lopsided_kodis_c.C
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
#interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
|
||||
# ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
lopsided_kodis_c.o: lopsided_kodis_c.C
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
z4c_rhs_c.o: z4c_rhs_c.C
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
#interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
|
||||
# ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
TwoPunctures.o: TwoPunctures.C
|
||||
${CXX} $(TP_OPTFLAGS) $(OMP_FLAG) -c $< -o $@
|
||||
|
||||
TwoPunctureABE.o: TwoPunctureABE.C
|
||||
${CXX} $(TP_OPTFLAGS) $(OMP_FLAG) -c $< -o $@
|
||||
## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
|
||||
TP_OPTFLAGS = $(CXXBASEFLAGS) $(TP_OPENMP_FLAGS)
|
||||
|
||||
TwoPunctures.o: TwoPunctures.C
|
||||
${CXX} $(TP_OPTFLAGS) -c $< -o $@
|
||||
|
||||
TwoPunctureABE.o: TwoPunctureABE.C
|
||||
${CXX} $(TP_OPTFLAGS) -c $< -o $@
|
||||
|
||||
# Input files
|
||||
|
||||
## CUDA BSSN RHS switch
|
||||
## 1 : use the rewritten CUDA bssn_rhs backend
|
||||
## 0 : keep the normal CPU/Fortran selection below
|
||||
USE_CUDA_BSSN ?= 0
|
||||
USE_CUDA_Z4C ?= 0
|
||||
|
||||
CXXAPPFLAGS += -DUSE_CUDA_BSSN=$(USE_CUDA_BSSN)
|
||||
CUDA_APP_FLAGS += -DUSE_CUDA_BSSN=$(USE_CUDA_BSSN)
|
||||
CXXAPPFLAGS += -DUSE_CUDA_Z4C=$(USE_CUDA_Z4C)
|
||||
CUDA_APP_FLAGS += -DUSE_CUDA_Z4C=$(USE_CUDA_Z4C)
|
||||
|
||||
## Kernel implementation switch (set USE_CXX_KERNELS=0 to fall back to Fortran)
|
||||
ifeq ($(USE_CXX_KERNELS),0)
|
||||
# Fortran mode: no C rewrite files; bssn_rhs.o is included via F90FILES below
|
||||
CFILES_CPU =
|
||||
else
|
||||
# C++ mode (default): C rewrite of bssn_rhs and helper kernels
|
||||
CFILES_CPU = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o lopsided_kodis_c.o
|
||||
endif
|
||||
|
||||
CFILES_CUDA_BSSN = bssn_rhs_cuda.o
|
||||
|
||||
ifeq ($(USE_CUDA_BSSN),1)
|
||||
CFILES = $(CFILES_CUDA_BSSN)
|
||||
else
|
||||
CFILES = $(CFILES_CPU)
|
||||
endif
|
||||
|
||||
ifeq ($(USE_CUDA_Z4C),1)
|
||||
CFILES += z4c_rhs_cuda.o
|
||||
Z4C_F90_OBJ =
|
||||
else ifeq ($(USE_CXX_Z4C_KERNELS),1)
|
||||
CFILES += z4c_rhs_c.o
|
||||
Z4C_F90_OBJ =
|
||||
else
|
||||
Z4C_F90_OBJ = Z4c_rhs.o
|
||||
endif
|
||||
|
||||
## RK4 kernel switch (independent from USE_CXX_KERNELS)
|
||||
ifeq ($(USE_CXX_RK4),1)
|
||||
RK4_C_OBJ = rungekutta4_rout_c.o
|
||||
RK4_F90_OBJ =
|
||||
else
|
||||
RK4_C_OBJ =
|
||||
RK4_F90_OBJ = rungekutta4_rout.o
|
||||
endif
|
||||
|
||||
CFILES += $(RK4_C_OBJ)
|
||||
ABE_CUDA_CFILES = $(CFILES_CUDA_BSSN) z4c_rhs_cuda.o $(RK4_C_OBJ)
|
||||
|
||||
ABE_LDLIBS = $(LDLIBS)
|
||||
ifeq ($(USE_CUDA_BSSN),1)
|
||||
ABE_LDLIBS += -lcudart $(CUDA_LIB_PATH)
|
||||
endif
|
||||
ifeq ($(USE_CUDA_Z4C),1)
|
||||
ABE_LDLIBS += -lcudart $(CUDA_LIB_PATH)
|
||||
endif
|
||||
ifeq ($(USE_CXX_KERNELS),0)
|
||||
# Fortran mode: no C rewrite files; bssn_rhs.o is included via F90FILES below
|
||||
CFILES =
|
||||
else
|
||||
# C++ mode (default): C rewrite of bssn_rhs and helper kernels
|
||||
CFILES = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o lopsided_kodis_c.o
|
||||
endif
|
||||
|
||||
## RK4 kernel switch (independent from USE_CXX_KERNELS)
|
||||
ifeq ($(USE_CXX_RK4),1)
|
||||
CFILES += rungekutta4_rout_c.o
|
||||
RK4_F90_OBJ =
|
||||
else
|
||||
RK4_F90_OBJ = rungekutta4_rout.o
|
||||
endif
|
||||
|
||||
C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
|
||||
cgh.o bssn_class.o surface_integral.o ShellPatch.o\
|
||||
@@ -169,7 +91,7 @@ C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
|
||||
Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\
|
||||
NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o
|
||||
|
||||
#C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
|
||||
C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
|
||||
cgh.o surface_integral.o ShellPatch.o\
|
||||
bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\
|
||||
bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\
|
||||
@@ -177,13 +99,13 @@ C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\
|
||||
NullShellPatch2_Evo.o \
|
||||
bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o
|
||||
|
||||
F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
|
||||
prolongrestrict_cell.o prolongrestrict_vertex.o\
|
||||
$(RK4_F90_OBJ) diff_new.o kodiss.o kodiss_sh.o\
|
||||
lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
|
||||
shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
|
||||
getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\
|
||||
fadmquantites_bssn.o $(Z4C_F90_OBJ) Z4c_rhs_ss.o point_diff_new_sh.o\
|
||||
F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\
|
||||
prolongrestrict_cell.o prolongrestrict_vertex.o\
|
||||
$(RK4_F90_OBJ) diff_new.o kodiss.o kodiss_sh.o\
|
||||
lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\
|
||||
shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\
|
||||
getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\
|
||||
fadmquantites_bssn.o Z4c_rhs.o Z4c_rhs_ss.o point_diff_new_sh.o\
|
||||
cpbc.o getnp4old.o NullEvol.o initial_null.o initial_maxwell.o\
|
||||
getnpem2.o empart.o NullNews.o fourdcurvature.o\
|
||||
bssn2adm.o adm_constraint.o adm_ricci_gamma.o\
|
||||
@@ -207,10 +129,10 @@ initial_guess.o Newton.o Jacobian.o ilucg.o IntPnts0.o IntPnts.o
|
||||
|
||||
TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o
|
||||
|
||||
#CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o
|
||||
CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o
|
||||
|
||||
# file dependences
|
||||
$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(ABE_CUDA_CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh
|
||||
$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh
|
||||
|
||||
$(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
|
||||
misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\
|
||||
@@ -221,7 +143,7 @@ $(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
|
||||
empart.h NullNews.h kodiss.h Parallel_bam.h ricci_gamma.h\
|
||||
initial_null2.h NullShellPatch2.h
|
||||
|
||||
#$(C++FILES_GPU): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
|
||||
$(C++FILES_GPU): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
|
||||
misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\
|
||||
rungekutta4_rout.h var.h bssn_rhs.h sommerfeld_rout.h\
|
||||
cgh.h surface_integral.h ShellPatch.h shellfunctions.h perf.h\
|
||||
@@ -233,7 +155,7 @@ $(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\
|
||||
|
||||
$(AHFDOBJS): cctk.h cctk_Config.h cctk_Types.h cctk_Constants.h myglobal.h
|
||||
|
||||
$(C++FILES) $(C++FILES_GPU) $(CFILES) $(ABE_CUDA_CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.h
|
||||
$(C++FILES) $(C++FILES_GPU) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.h
|
||||
|
||||
TwoPunctureFILES: TwoPunctures.h
|
||||
|
||||
@@ -243,18 +165,13 @@ misc.o : zbesh.o
|
||||
|
||||
# projects
|
||||
ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
|
||||
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(ABE_LDLIBS)
|
||||
|
||||
ABE_CUDA: USE_CUDA_BSSN=1
|
||||
ABE_CUDA: USE_CUDA_Z4C=1
|
||||
ABE_CUDA: $(C++FILES) $(ABE_CUDA_CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS)
|
||||
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(ABE_CUDA_CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS) -lcudart $(CUDA_LIB_PATH)
|
||||
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS)
|
||||
|
||||
#ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
|
||||
# $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
|
||||
ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES)
|
||||
$(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS)
|
||||
|
||||
TwoPunctureABE: $(TwoPunctureFILES)
|
||||
$(CLINKER) $(TP_OPTFLAGS) $(OMP_FLAG) -o $@ $(TwoPunctureFILES) $(LDLIBS)
|
||||
TwoPunctureABE: $(TwoPunctureFILES)
|
||||
$(CLINKER) $(TP_OPTFLAGS) -o $@ $(TwoPunctureFILES) $(LDLIBS)
|
||||
|
||||
clean:
|
||||
rm *.o ABE ABE_CUDA ABEGPU TwoPunctureABE make.log -f
|
||||
rm *.o ABE ABEGPU TwoPunctureABE make.log -f
|
||||
|
||||
94
AMSS_NCKU_source/makefile.inc
Executable file → Normal file
94
AMSS_NCKU_source/makefile.inc
Executable file → Normal file
@@ -1,12 +1,27 @@
|
||||
## Toolchain selection
|
||||
## nvhpc : NVIDIA HPC SDK + CUDA-aware MPI (default)
|
||||
## intel : Intel oneAPI toolchain (legacy path)
|
||||
TOOLCHAIN ?= nvhpc
|
||||
## Legacy GNU/OpenMPI toolchain configuration
|
||||
|
||||
## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
|
||||
## opt : (default) maximum performance with PGO profile-guided optimization
|
||||
## instrument : PGO Phase 1 instrumentation to collect fresh profile data
|
||||
PGO_MODE ?= opt
|
||||
## OpenMPI wrappers are installed but may not be on PATH.
|
||||
OMPI_BIN ?= /usr/lib64/openmpi/bin
|
||||
|
||||
## Wrapper compilers
|
||||
f90 = $(OMPI_BIN)/mpifort
|
||||
f77 = $(OMPI_BIN)/mpifort
|
||||
CXX = $(OMPI_BIN)/mpicxx
|
||||
CC = $(OMPI_BIN)/mpicc
|
||||
CLINKER = $(OMPI_BIN)/mpicxx
|
||||
|
||||
## Extra include flags are not needed when using the OpenMPI wrappers.
|
||||
filein =
|
||||
|
||||
## BLAS/LAPACK backend:
|
||||
## OpenBLAS on this system provides BLAS, CBLAS and LAPACK symbols.
|
||||
BLAS_LAPACK_LIB ?= /lib64/libopenblaso.so.0
|
||||
LDLIBS = $(BLAS_LAPACK_LIB) -lgfortran -lpthread -lm -ldl
|
||||
|
||||
## PGO build mode switch
|
||||
## off : default legacy GNU build without PGO
|
||||
## instrument : accepted for compatibility, currently same as off
|
||||
PGO_MODE ?= off
|
||||
|
||||
## Interp_Points load balance profiling mode
|
||||
## off : (default) no load balance instrumentation
|
||||
@@ -22,70 +37,19 @@ else
|
||||
INTERP_LB_FLAGS =
|
||||
endif
|
||||
|
||||
MKLROOT ?= /home/intel/oneapi/mkl/latest
|
||||
MKL_LIBDIR ?= $(MKLROOT)/lib/intel64
|
||||
MKL_INC ?= -I$(MKLROOT)/include
|
||||
|
||||
NVHPC_ROOT ?= /home/nvidia/hpc_sdk/Linux_x86_64/25.11
|
||||
CUDA_HOME ?= $(NVHPC_ROOT)/cuda
|
||||
CUDA_ARCH ?= sm_80
|
||||
|
||||
## Kernel implementation switch
|
||||
## 1 (default) : use C++ rewrite of bssn_rhs and helper kernels (faster)
|
||||
## 0 : fall back to original Fortran kernels
|
||||
USE_CXX_KERNELS ?= 1
|
||||
|
||||
## Z4C Cartesian RHS kernel switch
|
||||
## 1 (default) : use C++ rewrite of Z4c_rhs (main Cartesian path faster)
|
||||
## 0 : use original Fortran Z4c_rhs.o
|
||||
USE_CXX_Z4C_KERNELS ?= 1
|
||||
|
||||
## RK4 kernel implementation switch
|
||||
## 1 (default) : use C/C++ rewrite of rungekutta4_rout (for optimization experiments)
|
||||
## 1 (default) : use C/C++ rewrite of rungekutta4_rout
|
||||
## 0 : use original Fortran rungekutta4_rout.o
|
||||
USE_CXX_RK4 ?= 1
|
||||
|
||||
## Memory allocator switch
|
||||
## 1 (default) : link Intel oneTBB allocator (libtbbmalloc)
|
||||
## 0 : use system default allocator (ptmalloc)
|
||||
USE_TBBMALLOC ?= 1
|
||||
TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
|
||||
ifneq ($(wildcard $(TBBMALLOC_SO)),)
|
||||
TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
|
||||
else
|
||||
TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
|
||||
endif
|
||||
## OpenMP is only used for TwoPunctures on the legacy toolchain.
|
||||
TP_OPENMP_FLAGS ?= -fopenmp
|
||||
|
||||
ifeq ($(TOOLCHAIN),intel)
|
||||
f90 = ifx
|
||||
f77 = ifx
|
||||
CXX = icpx
|
||||
CC = icx
|
||||
CLINKER = mpiicpx
|
||||
filein = -I/usr/include/ $(MKL_INC) -I$(CUDA_HOME)/include
|
||||
LDLIBS = -L$(MKL_LIBDIR) -Wl,-rpath,$(MKL_LIBDIR) \
|
||||
-lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
|
||||
-lifcore -limf -liomp5 -lpthread -lm -ldl \
|
||||
-L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64 -lcuda -lcudart
|
||||
else ifeq ($(TOOLCHAIN),nvhpc)
|
||||
f90 = mpifort
|
||||
f77 = mpifort
|
||||
CXX = mpicxx
|
||||
CC = mpicc
|
||||
CLINKER = mpicxx
|
||||
|
||||
filein = -I/usr/include/ $(MKL_INC) -I$(CUDA_HOME)/include
|
||||
LDLIBS = -L$(MKL_LIBDIR) -Wl,-rpath,$(MKL_LIBDIR) \
|
||||
-lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
|
||||
-lpthread -lm -ldl \
|
||||
-L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64 -lcuda -lcudart \
|
||||
-fortranlibs
|
||||
endif
|
||||
|
||||
ifeq ($(USE_TBBMALLOC),1)
|
||||
LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
|
||||
endif
|
||||
|
||||
Cu = $(NVHPC_ROOT)/compilers/bin/nvcc
|
||||
CUDA_LIB_PATH = -L$(CUDA_HOME)/lib64 -I$(CUDA_HOME)/include
|
||||
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc -arch=$(CUDA_ARCH)
|
||||
Cu = nvcc
|
||||
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
|
||||
CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc
|
||||
|
||||
@@ -1,725 +0,0 @@
|
||||
#include "macrodef.h"
|
||||
#include "bssn_rhs.h"
|
||||
#include "fmisc.h"
|
||||
#include "ricci_gamma.h"
|
||||
#include "share_func.h"
|
||||
#include "tool.h"
|
||||
#include <vector>
|
||||
|
||||
#ifdef fortran1
|
||||
#define f_constraint_bssn constraint_bssn
|
||||
#define f_z4c_rhs_point z4c_rhs_point
|
||||
#endif
|
||||
#ifdef fortran2
|
||||
#define f_constraint_bssn CONSTRAINT_BSSN
|
||||
#define f_z4c_rhs_point Z4C_RHS_POINT
|
||||
#endif
|
||||
#ifdef fortran3
|
||||
#define f_constraint_bssn constraint_bssn_
|
||||
#define f_z4c_rhs_point z4c_rhs_point_
|
||||
#endif
|
||||
|
||||
extern "C" void f_constraint_bssn(int *, double *, double *, double *,
|
||||
double *, double *,
|
||||
double *, double *, double *, double *, double *, double *,
|
||||
double *, double *, double *, double *, double *, double *,
|
||||
double *, double *, double *,
|
||||
double *, double *, double *, double *, double *, double *, double *, double *, double *, double *,
|
||||
double *, double *, double *, double *, double *, double *,
|
||||
double *, double *, double *, double *, double *, double *,
|
||||
double *, double *, double *, double *, double *, double *,
|
||||
double *, double *, double *, double *, double *, double *, double *, double *,
|
||||
double *, double *, double *,
|
||||
int &);
|
||||
|
||||
extern "C" void f_z4c_rhs_point(
|
||||
double &A11,
|
||||
double &A12,
|
||||
double &A13,
|
||||
double &A22,
|
||||
double &A23,
|
||||
double &A33,
|
||||
double &alpha,
|
||||
double &B1,
|
||||
double &B2,
|
||||
double &B3,
|
||||
double &beta1,
|
||||
double &beta2,
|
||||
double &beta3,
|
||||
double &chi,
|
||||
double &chiDivFloor,
|
||||
double &da1,
|
||||
double &dA111,
|
||||
double &dA112,
|
||||
double &dA113,
|
||||
double &dA122,
|
||||
double &dA123,
|
||||
double &dA133,
|
||||
double &da2,
|
||||
double &dA211,
|
||||
double &dA212,
|
||||
double &dA213,
|
||||
double &dA222,
|
||||
double &dA223,
|
||||
double &dA233,
|
||||
double &da3,
|
||||
double &dA311,
|
||||
double &dA312,
|
||||
double &dA313,
|
||||
double &dA322,
|
||||
double &dA323,
|
||||
double &dA333,
|
||||
double &db11,
|
||||
double &dB11,
|
||||
double &db12,
|
||||
double &dB12,
|
||||
double &db13,
|
||||
double &dB13,
|
||||
double &db21,
|
||||
double &dB21,
|
||||
double &db22,
|
||||
double &dB22,
|
||||
double &db23,
|
||||
double &dB23,
|
||||
double &db31,
|
||||
double &dB31,
|
||||
double &db32,
|
||||
double &dB32,
|
||||
double &db33,
|
||||
double &dB33,
|
||||
double &dchi1,
|
||||
double &dchi2,
|
||||
double &dchi3,
|
||||
double &dda11,
|
||||
double &dda12,
|
||||
double &dda13,
|
||||
double &dda22,
|
||||
double &dda23,
|
||||
double &dda33,
|
||||
double &ddb111,
|
||||
double &ddb112,
|
||||
double &ddb113,
|
||||
double &ddb121,
|
||||
double &ddb122,
|
||||
double &ddb123,
|
||||
double &ddb131,
|
||||
double &ddb132,
|
||||
double &ddb133,
|
||||
double &ddb221,
|
||||
double &ddb222,
|
||||
double &ddb223,
|
||||
double &ddb231,
|
||||
double &ddb232,
|
||||
double &ddb233,
|
||||
double &ddb331,
|
||||
double &ddb332,
|
||||
double &ddb333,
|
||||
double &ddchi11,
|
||||
double &ddchi12,
|
||||
double &ddchi13,
|
||||
double &ddchi22,
|
||||
double &ddchi23,
|
||||
double &ddchi33,
|
||||
double &deldelg1111,
|
||||
double &deldelg1112,
|
||||
double &deldelg1113,
|
||||
double &deldelg1122,
|
||||
double &deldelg1123,
|
||||
double &deldelg1133,
|
||||
double &deldelg1211,
|
||||
double &deldelg1212,
|
||||
double &deldelg1213,
|
||||
double &deldelg1222,
|
||||
double &deldelg1223,
|
||||
double &deldelg1233,
|
||||
double &deldelg1311,
|
||||
double &deldelg1312,
|
||||
double &deldelg1313,
|
||||
double &deldelg1322,
|
||||
double &deldelg1323,
|
||||
double &deldelg1333,
|
||||
double &deldelg2211,
|
||||
double &deldelg2212,
|
||||
double &deldelg2213,
|
||||
double &deldelg2222,
|
||||
double &deldelg2223,
|
||||
double &deldelg2233,
|
||||
double &deldelg2311,
|
||||
double &deldelg2312,
|
||||
double &deldelg2313,
|
||||
double &deldelg2322,
|
||||
double &deldelg2323,
|
||||
double &deldelg2333,
|
||||
double &deldelg3311,
|
||||
double &deldelg3312,
|
||||
double &deldelg3313,
|
||||
double &deldelg3322,
|
||||
double &deldelg3323,
|
||||
double &deldelg3333,
|
||||
double &delG11,
|
||||
double &delg111,
|
||||
double &delg112,
|
||||
double &delg113,
|
||||
double &delG12,
|
||||
double &delg122,
|
||||
double &delg123,
|
||||
double &delG13,
|
||||
double &delg133,
|
||||
double &delG21,
|
||||
double &delg211,
|
||||
double &delg212,
|
||||
double &delg213,
|
||||
double &delG22,
|
||||
double &delg222,
|
||||
double &delg223,
|
||||
double &delG23,
|
||||
double &delg233,
|
||||
double &delG31,
|
||||
double &delg311,
|
||||
double &delg312,
|
||||
double &delg313,
|
||||
double &delG32,
|
||||
double &delg322,
|
||||
double &delg323,
|
||||
double &delG33,
|
||||
double &delg333,
|
||||
double &dKhat1,
|
||||
double &dKhat2,
|
||||
double &dKhat3,
|
||||
double &dTheta1,
|
||||
double &dTheta2,
|
||||
double &dTheta3,
|
||||
double &G1,
|
||||
double &g11,
|
||||
double &g12,
|
||||
double &g13,
|
||||
double &G2,
|
||||
double &g22,
|
||||
double &g23,
|
||||
double &G3,
|
||||
double &g33,
|
||||
double &kappa1,
|
||||
double &kappa2,
|
||||
double &Khat,
|
||||
double &rA11,
|
||||
double &rA12,
|
||||
double &rA13,
|
||||
double &rA22,
|
||||
double &rA23,
|
||||
double &rA33,
|
||||
double &rchi,
|
||||
double &rG1,
|
||||
double &rg11,
|
||||
double &rg12,
|
||||
double &rg13,
|
||||
double &rG2,
|
||||
double &rg22,
|
||||
double &rg23,
|
||||
double &rG3,
|
||||
double &rg33,
|
||||
double &rKhat,
|
||||
double &rTheta,
|
||||
double &Theta);
|
||||
|
||||
static inline void z4c_contract_gamma(
|
||||
const double gxx, const double gxy, const double gxz,
|
||||
const double gyy, const double gyz, const double gzz,
|
||||
const double gxxx, const double gxyx, const double gxzx,
|
||||
const double gyyx, const double gyzx, const double gzzx,
|
||||
const double gxxy, const double gxyy, const double gxzy,
|
||||
const double gyyy, const double gyzy, const double gzzy,
|
||||
const double gxxz, const double gxyz, const double gxzz,
|
||||
const double gyyz, const double gyzz, const double gzzz,
|
||||
double &Gamxa, double &Gamya, double &Gamza)
|
||||
{
|
||||
double det = gxx * gyy * gzz + gxy * gyz * gxz + gxz * gxy * gyz -
|
||||
gxz * gyy * gxz - gxy * gxy * gzz - gxx * gyz * gyz;
|
||||
const double gupxx = (gyy * gzz - gyz * gyz) / det;
|
||||
const double gupxy = -(gxy * gzz - gyz * gxz) / det;
|
||||
const double gupxz = (gxy * gyz - gyy * gxz) / det;
|
||||
const double gupyy = (gxx * gzz - gxz * gxz) / det;
|
||||
const double gupyz = -(gxx * gyz - gxy * gxz) / det;
|
||||
const double gupzz = (gxx * gyy - gxy * gxy) / det;
|
||||
|
||||
const double Gamxxx = 0.5 * (gupxx * gxxx + gupxy * (2.0 * gxyx - gxxy) + gupxz * (2.0 * gxzx - gxxz));
|
||||
const double Gamyxx = 0.5 * (gupxy * gxxx + gupyy * (2.0 * gxyx - gxxy) + gupyz * (2.0 * gxzx - gxxz));
|
||||
const double Gamzxx = 0.5 * (gupxz * gxxx + gupyz * (2.0 * gxyx - gxxy) + gupzz * (2.0 * gxzx - gxxz));
|
||||
|
||||
const double Gamxyy = 0.5 * (gupxx * (2.0 * gxyy - gyyx) + gupxy * gyyy + gupxz * (2.0 * gyzy - gyyz));
|
||||
const double Gamyyy = 0.5 * (gupxy * (2.0 * gxyy - gyyx) + gupyy * gyyy + gupyz * (2.0 * gyzy - gyyz));
|
||||
const double Gamzyy = 0.5 * (gupxz * (2.0 * gxyy - gyyx) + gupyz * gyyy + gupzz * (2.0 * gyzy - gyyz));
|
||||
|
||||
const double Gamxzz = 0.5 * (gupxx * (2.0 * gxzz - gzzx) + gupxy * (2.0 * gyzz - gzzy) + gupxz * gzzz);
|
||||
const double Gamyzz = 0.5 * (gupxy * (2.0 * gxzz - gzzx) + gupyy * (2.0 * gyzz - gzzy) + gupyz * gzzz);
|
||||
const double Gamzzz = 0.5 * (gupxz * (2.0 * gxzz - gzzx) + gupyz * (2.0 * gyzz - gzzy) + gupzz * gzzz);
|
||||
|
||||
const double Gamxxy = 0.5 * (gupxx * gxxy + gupxy * gyyx + gupxz * (gxzy + gyzx - gxyz));
|
||||
const double Gamyxy = 0.5 * (gupxy * gxxy + gupyy * gyyx + gupyz * (gxzy + gyzx - gxyz));
|
||||
const double Gamzxy = 0.5 * (gupxz * gxxy + gupyz * gyyx + gupzz * (gxzy + gyzx - gxyz));
|
||||
|
||||
const double Gamxxz = 0.5 * (gupxx * gxxz + gupxy * (gxyz + gyzx - gxzy) + gupxz * gzzx);
|
||||
const double Gamyxz = 0.5 * (gupxy * gxxz + gupyy * (gxyz + gyzx - gxzy) + gupyz * gzzx);
|
||||
const double Gamzxz = 0.5 * (gupxz * gxxz + gupyz * (gxyz + gyzx - gxzy) + gupzz * gzzx);
|
||||
|
||||
const double Gamxyz = 0.5 * (gupxx * (gxyz + gxzy - gyzx) + gupxy * gyyz + gupxz * gzzy);
|
||||
const double Gamyyz = 0.5 * (gupxy * (gxyz + gxzy - gyzx) + gupyy * gyyz + gupyz * gzzy);
|
||||
const double Gamzyz = 0.5 * (gupxz * (gxyz + gxzy - gyzx) + gupyz * gyyz + gupzz * gzzy);
|
||||
|
||||
Gamxa = gupxx * Gamxxx + gupyy * Gamxyy + gupzz * Gamxzz +
|
||||
2.0 * (gupxy * Gamxxy + gupxz * Gamxxz + gupyz * Gamxyz);
|
||||
Gamya = gupxx * Gamyxx + gupyy * Gamyyy + gupzz * Gamyzz +
|
||||
2.0 * (gupxy * Gamyxy + gupxz * Gamyxz + gupyz * Gamyyz);
|
||||
Gamza = gupxx * Gamzxx + gupyy * Gamzyy + gupzz * Gamzzz +
|
||||
2.0 * (gupxy * Gamzxy + gupxz * Gamzxz + gupyz * Gamzyz);
|
||||
}
|
||||
|
||||
static int compute_rhs_z4c_cartesian(
|
||||
int *ex, double &T, double *X, double *Y, double *Z,
|
||||
double *chi_state, double *chi_constraints, double *trK,
|
||||
double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
|
||||
double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
|
||||
double *Gamx, double *Gamy, double *Gamz,
|
||||
double *Lap, double *betax, double *betay, double *betaz,
|
||||
double *dtSfx, double *dtSfy, double *dtSfz,
|
||||
double *TZ,
|
||||
double *chi_rhs, double *trK_rhs,
|
||||
double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
|
||||
double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
|
||||
double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
|
||||
double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
|
||||
double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
|
||||
double *TZ_rhs,
|
||||
double *rho, double *Sx, double *Sy, double *Sz,
|
||||
double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
|
||||
double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
|
||||
double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
|
||||
double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
|
||||
double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
|
||||
double *Hcon, double *Mxcon, double *Mycon, double *Mzcon, double *Gmxcon, double *Gmycon, double *Gmzcon,
|
||||
int &Symmetry, int &Lev, double &eps, int &co)
|
||||
{
|
||||
(void)T;
|
||||
|
||||
const int nx = ex[0];
|
||||
const int ny = ex[1];
|
||||
const int nz = ex[2];
|
||||
const int all = nx * ny * nz;
|
||||
|
||||
double alpn1[all], chin1[all], gxx[all], gyy[all], gzz[all];
|
||||
double chix[all], chiy[all], chiz[all], chixx[all], chixy[all], chixz[all], chiyy[all], chiyz[all], chizz[all];
|
||||
double gxxx[all], gxyx[all], gxzx[all], gyyx[all], gyzx[all], gzzx[all];
|
||||
double gxxy[all], gxyy[all], gxzy[all], gyyy[all], gyzy[all], gzzy[all];
|
||||
double gxxz[all], gxyz[all], gxzz[all], gyyz[all], gyzz[all], gzzz[all];
|
||||
double gxxxx[all], gxxxy[all], gxxxz[all], gxxyy[all], gxxyz[all], gxxzz[all];
|
||||
double gxyxx[all], gxyxy[all], gxyxz[all], gxyyy[all], gxyyz[all], gxyzz[all];
|
||||
double gxzxx[all], gxzxy[all], gxzxz[all], gxzyy[all], gxzyz[all], gxzzz[all];
|
||||
double gyyxx[all], gyyxy[all], gyyxz[all], gyyyy[all], gyyyz[all], gyyzz[all];
|
||||
double gyzxx[all], gyzxy[all], gyzxz[all], gyzyy[all], gyzyz[all], gyzzz[all];
|
||||
double gzzxx[all], gzzxy[all], gzzxz[all], gzzyy[all], gzzyz[all], gzzzz[all];
|
||||
double Lapx[all], Lapy[all], Lapz[all], Lapxx[all], Lapxy[all], Lapxz[all], Lapyy[all], Lapyz[all], Lapzz[all];
|
||||
double betaxx[all], betaxy[all], betaxz[all], betayx[all], betayy[all], betayz[all], betazx[all], betazy[all], betazz[all];
|
||||
double dBxx[all], dBxy[all], dBxz[all], dByx[all], dByy[all], dByz[all], dBzx[all], dBzy[all], dBzz[all];
|
||||
double sfxxx[all], sfxxy[all], sfxxz[all], sfxyy[all], sfxyz[all], sfxzz[all];
|
||||
double sfyxx[all], sfyxy[all], sfyxz[all], sfyyy[all], sfyyz[all], sfyzz[all];
|
||||
double sfzxx[all], sfzxy[all], sfzxz[all], sfzyy[all], sfzyz[all], sfzzz[all];
|
||||
double Gamxx[all], Gamxy[all], Gamxz[all], Gamyx[all], Gamyy[all], Gamyz[all], Gamzx[all], Gamzy[all], Gamzz[all];
|
||||
double Kx[all], Ky[all], Kz[all], TZx[all], TZy[all], TZz[all];
|
||||
double Axxx[all], Axxy[all], Axxz[all], Axyx[all], Axyy[all], Axyz[all];
|
||||
double Axzx[all], Axzy[all], Axzz[all], Ayyx[all], Ayyy[all], Ayyz[all];
|
||||
double Ayzx[all], Ayzy[all], Ayzz[all], Azzx[all], Azzy[all], Azzz[all];
|
||||
|
||||
const double SSS[3] = {1.0, 1.0, 1.0};
|
||||
const double AAS[3] = {-1.0, -1.0, 1.0};
|
||||
const double ASA[3] = {-1.0, 1.0, -1.0};
|
||||
const double SAA[3] = {1.0, -1.0, -1.0};
|
||||
const double ASS[3] = {-1.0, 1.0, 1.0};
|
||||
const double SAS[3] = {1.0, -1.0, 1.0};
|
||||
const double SSA[3] = {1.0, 1.0, -1.0};
|
||||
|
||||
const double ONE = 1.0;
|
||||
const double TWO = 2.0;
|
||||
const double ZEO = 0.0;
|
||||
double chiDivfloor = 1.0e-5;
|
||||
|
||||
double kappa1 = 2.0e-2;
|
||||
double kappa2 = 0.0;
|
||||
double FF = 0.75;
|
||||
double eta = 2.0;
|
||||
|
||||
for (int idx = 0; idx < all; ++idx)
|
||||
{
|
||||
alpn1[idx] = Lap[idx] + ONE;
|
||||
chin1[idx] = chi_state[idx] + ONE;
|
||||
gxx[idx] = dxx[idx] + ONE;
|
||||
gyy[idx] = dyy[idx] + ONE;
|
||||
gzz[idx] = dzz[idx] + ONE;
|
||||
}
|
||||
|
||||
fderivs(ex, betax, betaxx, betaxy, betaxz, X, Y, Z, -1.0, 1.0, 1.0, Symmetry, Lev);
|
||||
fderivs(ex, betay, betayx, betayy, betayz, X, Y, Z, 1.0, -1.0, 1.0, Symmetry, Lev);
|
||||
fderivs(ex, betaz, betazx, betazy, betazz, X, Y, Z, 1.0, 1.0, -1.0, Symmetry, Lev);
|
||||
fderivs(ex, dtSfx, dBxx, dBxy, dBxz, X, Y, Z, -1.0, 1.0, 1.0, Symmetry, Lev);
|
||||
fderivs(ex, dtSfy, dByx, dByy, dByz, X, Y, Z, 1.0, -1.0, 1.0, Symmetry, Lev);
|
||||
fderivs(ex, dtSfz, dBzx, dBzy, dBzz, X, Y, Z, 1.0, 1.0, -1.0, Symmetry, Lev);
|
||||
fderivs(ex, chi_state, chix, chiy, chiz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
|
||||
fderivs(ex, dxx, gxxx, gxxy, gxxz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
|
||||
fderivs(ex, gxy, gxyx, gxyy, gxyz, X, Y, Z, -1.0, -1.0, 1.0, Symmetry, Lev);
|
||||
fderivs(ex, gxz, gxzx, gxzy, gxzz, X, Y, Z, -1.0, 1.0, -1.0, Symmetry, Lev);
|
||||
fderivs(ex, dyy, gyyx, gyyy, gyyz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
|
||||
fderivs(ex, gyz, gyzx, gyzy, gyzz, X, Y, Z, 1.0, -1.0, -1.0, Symmetry, Lev);
|
||||
fderivs(ex, dzz, gzzx, gzzy, gzzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
|
||||
|
||||
fdderivs(ex, dxx, gxxxx, gxxxy, gxxxz, gxxyy, gxxyz, gxxzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
|
||||
fdderivs(ex, dyy, gyyxx, gyyxy, gyyxz, gyyyy, gyyyz, gyyzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
|
||||
fdderivs(ex, dzz, gzzxx, gzzxy, gzzxz, gzzyy, gzzyz, gzzzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
|
||||
fdderivs(ex, gxy, gxyxx, gxyxy, gxyxz, gxyyy, gxyyz, gxyzz, X, Y, Z, -1.0, -1.0, 1.0, Symmetry, Lev);
|
||||
fdderivs(ex, gxz, gxzxx, gxzxy, gxzxz, gxzyy, gxzyz, gxzzz, X, Y, Z, -1.0, 1.0, -1.0, Symmetry, Lev);
|
||||
fdderivs(ex, gyz, gyzxx, gyzxy, gyzxz, gyzyy, gyzyz, gyzzz, X, Y, Z, 1.0, -1.0, -1.0, Symmetry, Lev);
|
||||
|
||||
fderivs(ex, Gamx, Gamxx, Gamxy, Gamxz, X, Y, Z, -1.0, 1.0, 1.0, Symmetry, Lev);
|
||||
fderivs(ex, Gamy, Gamyx, Gamyy, Gamyz, X, Y, Z, 1.0, -1.0, 1.0, Symmetry, Lev);
|
||||
fderivs(ex, Gamz, Gamzx, Gamzy, Gamzz, X, Y, Z, 1.0, 1.0, -1.0, Symmetry, Lev);
|
||||
|
||||
fderivs(ex, Lap, Lapx, Lapy, Lapz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
|
||||
fderivs(ex, trK, Kx, Ky, Kz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
|
||||
fderivs(ex, TZ, TZx, TZy, TZz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
|
||||
|
||||
fdderivs(ex, betax, sfxxx, sfxxy, sfxxz, sfxyy, sfxyz, sfxzz, X, Y, Z, -1.0, 1.0, 1.0, Symmetry, Lev);
|
||||
fdderivs(ex, betay, sfyxx, sfyxy, sfyxz, sfyyy, sfyyz, sfyzz, X, Y, Z, 1.0, -1.0, 1.0, Symmetry, Lev);
|
||||
fdderivs(ex, betaz, sfzxx, sfzxy, sfzxz, sfzyy, sfzyz, sfzzz, X, Y, Z, 1.0, 1.0, -1.0, Symmetry, Lev);
|
||||
|
||||
fdderivs(ex, chi_state, chixx, chixy, chixz, chiyy, chiyz, chizz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
|
||||
fdderivs(ex, Lap, Lapxx, Lapxy, Lapxz, Lapyy, Lapyz, Lapzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
|
||||
|
||||
fderivs(ex, Axx, Axxx, Axxy, Axxz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
|
||||
fderivs(ex, Axy, Axyx, Axyy, Axyz, X, Y, Z, -1.0, -1.0, 1.0, Symmetry, Lev);
|
||||
fderivs(ex, Axz, Axzx, Axzy, Axzz, X, Y, Z, -1.0, 1.0, -1.0, Symmetry, Lev);
|
||||
fderivs(ex, Ayy, Ayyx, Ayyy, Ayyz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
|
||||
fderivs(ex, Ayz, Ayzx, Ayzy, Ayzz, X, Y, Z, 1.0, -1.0, -1.0, Symmetry, Lev);
|
||||
fderivs(ex, Azz, Azzx, Azzy, Azzz, X, Y, Z, 1.0, 1.0, 1.0, Symmetry, Lev);
|
||||
|
||||
for (int idx = 0; idx < all; ++idx)
|
||||
{
|
||||
double point_kappa1 = 0.0;
|
||||
f_z4c_rhs_point(
|
||||
Axx[idx], Axy[idx], Axz[idx], Ayy[idx], Ayz[idx], Azz[idx],
|
||||
alpn1[idx], dtSfx[idx], dtSfy[idx], dtSfz[idx],
|
||||
betax[idx], betay[idx], betaz[idx],
|
||||
chin1[idx], chiDivfloor,
|
||||
Lapx[idx],
|
||||
Axxx[idx], Axyx[idx], Axzx[idx], Ayyx[idx], Ayzx[idx], Azzx[idx],
|
||||
Lapy[idx],
|
||||
Axxy[idx], Axyy[idx], Axzy[idx], Ayyy[idx], Ayzy[idx], Azzy[idx],
|
||||
Lapz[idx],
|
||||
Axxz[idx], Axyz[idx], Axzz[idx], Ayyz[idx], Ayzz[idx], Azzz[idx],
|
||||
betaxx[idx], dBxx[idx], betayx[idx], dByx[idx], betazx[idx], dBzx[idx],
|
||||
betaxy[idx], dBxy[idx], betayy[idx], dByy[idx], betazy[idx], dBzy[idx],
|
||||
betaxz[idx], dBxz[idx], betayz[idx], dByz[idx], betazz[idx], dBzz[idx],
|
||||
chix[idx], chiy[idx], chiz[idx],
|
||||
Lapxx[idx], Lapxy[idx], Lapxz[idx], Lapyy[idx], Lapyz[idx], Lapzz[idx],
|
||||
sfxxx[idx], sfyxx[idx], sfzxx[idx],
|
||||
sfxxy[idx], sfyxy[idx], sfzxy[idx],
|
||||
sfxxz[idx], sfyxz[idx], sfzxz[idx],
|
||||
sfxyy[idx], sfyyy[idx], sfzyy[idx],
|
||||
sfxyz[idx], sfyyz[idx], sfzyz[idx],
|
||||
sfxzz[idx], sfyzz[idx], sfzzz[idx],
|
||||
chixx[idx], chixy[idx], chixz[idx], chiyy[idx], chiyz[idx], chizz[idx],
|
||||
gxxxx[idx], gxyxx[idx], gxzxx[idx], gyyxx[idx], gyzxx[idx], gzzxx[idx],
|
||||
gxxxy[idx], gxyxy[idx], gxzxy[idx], gyyxy[idx], gyzxy[idx], gzzxy[idx],
|
||||
gxxxz[idx], gxyxz[idx], gxzxz[idx], gyyxz[idx], gyzxz[idx], gzzxz[idx],
|
||||
gxxyy[idx], gxyyy[idx], gxzyy[idx], gyyyy[idx], gyzyy[idx], gzzyy[idx],
|
||||
gxxyz[idx], gxyyz[idx], gxzyz[idx], gyyyz[idx], gyzyz[idx], gzzyz[idx],
|
||||
gxxzz[idx], gxyzz[idx], gxzzz[idx], gyyzz[idx], gyzzz[idx], gzzzz[idx],
|
||||
Gamxx[idx], gxxx[idx], gxyx[idx], gxzx[idx],
|
||||
Gamyx[idx], gyyx[idx], gyzx[idx],
|
||||
Gamzx[idx], gzzx[idx],
|
||||
Gamxy[idx], gxxy[idx], gxyy[idx], gxzy[idx],
|
||||
Gamyy[idx], gyyy[idx], gyzy[idx],
|
||||
Gamzy[idx], gzzy[idx],
|
||||
Gamxz[idx], gxxz[idx], gxyz[idx], gxzz[idx],
|
||||
Gamyz[idx], gyyz[idx], gyzz[idx],
|
||||
Gamzz[idx], gzzz[idx],
|
||||
Kx[idx], Ky[idx], Kz[idx],
|
||||
TZx[idx], TZy[idx], TZz[idx],
|
||||
Gamx[idx], gxx[idx], gxy[idx], gxz[idx],
|
||||
Gamy[idx], gyy[idx], gyz[idx],
|
||||
Gamz[idx], gzz[idx],
|
||||
point_kappa1, kappa2,
|
||||
trK[idx],
|
||||
Axx_rhs[idx], Axy_rhs[idx], Axz_rhs[idx], Ayy_rhs[idx], Ayz_rhs[idx], Azz_rhs[idx],
|
||||
chi_rhs[idx],
|
||||
Gamx_rhs[idx], gxx_rhs[idx], gxy_rhs[idx], gxz_rhs[idx],
|
||||
Gamy_rhs[idx], gyy_rhs[idx], gyz_rhs[idx],
|
||||
Gamz_rhs[idx], gzz_rhs[idx], trK_rhs[idx], TZ_rhs[idx], TZ[idx]);
|
||||
}
|
||||
|
||||
for (int idx = 0; idx < all; ++idx)
|
||||
Lap_rhs[idx] = -TWO * alpn1[idx] * trK[idx];
|
||||
|
||||
#if (GAUGE == 0)
|
||||
for (int idx = 0; idx < all; ++idx)
|
||||
{
|
||||
betax_rhs[idx] = FF * dtSfx[idx];
|
||||
betay_rhs[idx] = FF * dtSfy[idx];
|
||||
betaz_rhs[idx] = FF * dtSfz[idx];
|
||||
dtSfx_rhs[idx] = Gamx_rhs[idx] - eta * dtSfx[idx];
|
||||
dtSfy_rhs[idx] = Gamy_rhs[idx] - eta * dtSfy[idx];
|
||||
dtSfz_rhs[idx] = Gamz_rhs[idx] - eta * dtSfz[idx];
|
||||
}
|
||||
#elif (GAUGE == 1)
|
||||
for (int idx = 0; idx < all; ++idx)
|
||||
{
|
||||
betax_rhs[idx] = Gamx[idx] - eta * betax[idx];
|
||||
betay_rhs[idx] = Gamy[idx] - eta * betay[idx];
|
||||
betaz_rhs[idx] = Gamz[idx] - eta * betaz[idx];
|
||||
dtSfx_rhs[idx] = ZEO;
|
||||
dtSfy_rhs[idx] = ZEO;
|
||||
dtSfz_rhs[idx] = ZEO;
|
||||
}
|
||||
#else
|
||||
#error "z4c_rhs_c.C currently supports GAUGE == 0 or GAUGE == 1 for Z4C"
|
||||
#endif
|
||||
|
||||
lopsided(ex, X, Y, Z, gxx, gxx_rhs, betax, betay, betaz, Symmetry, SSS);
|
||||
lopsided(ex, X, Y, Z, gxy, gxy_rhs, betax, betay, betaz, Symmetry, AAS);
|
||||
lopsided(ex, X, Y, Z, gxz, gxz_rhs, betax, betay, betaz, Symmetry, ASA);
|
||||
lopsided(ex, X, Y, Z, gyy, gyy_rhs, betax, betay, betaz, Symmetry, SSS);
|
||||
lopsided(ex, X, Y, Z, gyz, gyz_rhs, betax, betay, betaz, Symmetry, SAA);
|
||||
lopsided(ex, X, Y, Z, gzz, gzz_rhs, betax, betay, betaz, Symmetry, SSS);
|
||||
|
||||
lopsided(ex, X, Y, Z, Axx, Axx_rhs, betax, betay, betaz, Symmetry, SSS);
|
||||
lopsided(ex, X, Y, Z, Axy, Axy_rhs, betax, betay, betaz, Symmetry, AAS);
|
||||
lopsided(ex, X, Y, Z, Axz, Axz_rhs, betax, betay, betaz, Symmetry, ASA);
|
||||
lopsided(ex, X, Y, Z, Ayy, Ayy_rhs, betax, betay, betaz, Symmetry, SSS);
|
||||
lopsided(ex, X, Y, Z, Ayz, Ayz_rhs, betax, betay, betaz, Symmetry, SAA);
|
||||
lopsided(ex, X, Y, Z, Azz, Azz_rhs, betax, betay, betaz, Symmetry, SSS);
|
||||
|
||||
lopsided(ex, X, Y, Z, chi_state, chi_rhs, betax, betay, betaz, Symmetry, SSS);
|
||||
lopsided(ex, X, Y, Z, trK, trK_rhs, betax, betay, betaz, Symmetry, SSS);
|
||||
|
||||
lopsided(ex, X, Y, Z, Gamx, Gamx_rhs, betax, betay, betaz, Symmetry, ASS);
|
||||
lopsided(ex, X, Y, Z, Gamy, Gamy_rhs, betax, betay, betaz, Symmetry, SAS);
|
||||
lopsided(ex, X, Y, Z, Gamz, Gamz_rhs, betax, betay, betaz, Symmetry, SSA);
|
||||
|
||||
lopsided(ex, X, Y, Z, Lap, Lap_rhs, betax, betay, betaz, Symmetry, SSS);
|
||||
lopsided(ex, X, Y, Z, betax, betax_rhs, betax, betay, betaz, Symmetry, ASS);
|
||||
lopsided(ex, X, Y, Z, betay, betay_rhs, betax, betay, betaz, Symmetry, SAS);
|
||||
lopsided(ex, X, Y, Z, betaz, betaz_rhs, betax, betay, betaz, Symmetry, SSA);
|
||||
#if (GAUGE == 0)
|
||||
lopsided(ex, X, Y, Z, dtSfx, dtSfx_rhs, betax, betay, betaz, Symmetry, ASS);
|
||||
lopsided(ex, X, Y, Z, dtSfy, dtSfy_rhs, betax, betay, betaz, Symmetry, SAS);
|
||||
lopsided(ex, X, Y, Z, dtSfz, dtSfz_rhs, betax, betay, betaz, Symmetry, SSA);
|
||||
#endif
|
||||
lopsided(ex, X, Y, Z, TZ, TZ_rhs, betax, betay, betaz, Symmetry, SSS);
|
||||
|
||||
for (int idx = 0; idx < all; ++idx)
|
||||
{
|
||||
double Gamxa = 0.0, Gamya = 0.0, Gamza = 0.0;
|
||||
z4c_contract_gamma(
|
||||
gxx[idx], gxy[idx], gxz[idx], gyy[idx], gyz[idx], gzz[idx],
|
||||
gxxx[idx], gxyx[idx], gxzx[idx], gyyx[idx], gyzx[idx], gzzx[idx],
|
||||
gxxy[idx], gxyy[idx], gxzy[idx], gyyy[idx], gyzy[idx], gzzy[idx],
|
||||
gxxz[idx], gxyz[idx], gxzz[idx], gyyz[idx], gyzz[idx], gzzz[idx],
|
||||
Gamxa, Gamya, Gamza);
|
||||
|
||||
TZ_rhs[idx] -= alpn1[idx] * (TWO + kappa2) * kappa1 * TZ[idx];
|
||||
trK_rhs[idx] += alpn1[idx] * kappa1 * (ONE - kappa2) * TZ[idx];
|
||||
Gamx_rhs[idx] -= TWO * alpn1[idx] * kappa1 * (Gamx[idx] - Gamxa);
|
||||
Gamy_rhs[idx] -= TWO * alpn1[idx] * kappa1 * (Gamy[idx] - Gamya);
|
||||
Gamz_rhs[idx] -= TWO * alpn1[idx] * kappa1 * (Gamz[idx] - Gamza);
|
||||
}
|
||||
|
||||
if (eps > 0.0)
|
||||
{
|
||||
kodis(ex, X, Y, Z, chi_state, chi_rhs, SSS, Symmetry, eps);
|
||||
kodis(ex, X, Y, Z, trK, trK_rhs, SSS, Symmetry, eps);
|
||||
kodis(ex, X, Y, Z, gxx, gxx_rhs, SSS, Symmetry, eps);
|
||||
kodis(ex, X, Y, Z, gxy, gxy_rhs, AAS, Symmetry, eps);
|
||||
kodis(ex, X, Y, Z, gxz, gxz_rhs, ASA, Symmetry, eps);
|
||||
kodis(ex, X, Y, Z, gyy, gyy_rhs, SSS, Symmetry, eps);
|
||||
kodis(ex, X, Y, Z, gyz, gyz_rhs, SAA, Symmetry, eps);
|
||||
kodis(ex, X, Y, Z, gzz, gzz_rhs, SSS, Symmetry, eps);
|
||||
kodis(ex, X, Y, Z, Axx, Axx_rhs, SSS, Symmetry, eps);
|
||||
kodis(ex, X, Y, Z, Axy, Axy_rhs, AAS, Symmetry, eps);
|
||||
kodis(ex, X, Y, Z, Axz, Axz_rhs, ASA, Symmetry, eps);
|
||||
kodis(ex, X, Y, Z, Ayy, Ayy_rhs, SSS, Symmetry, eps);
|
||||
kodis(ex, X, Y, Z, Ayz, Ayz_rhs, SAA, Symmetry, eps);
|
||||
kodis(ex, X, Y, Z, Azz, Azz_rhs, SSS, Symmetry, eps);
|
||||
kodis(ex, X, Y, Z, Gamx, Gamx_rhs, ASS, Symmetry, eps);
|
||||
kodis(ex, X, Y, Z, Gamy, Gamy_rhs, SAS, Symmetry, eps);
|
||||
kodis(ex, X, Y, Z, Gamz, Gamz_rhs, SSA, Symmetry, eps);
|
||||
kodis(ex, X, Y, Z, Lap, Lap_rhs, SSS, Symmetry, eps);
|
||||
kodis(ex, X, Y, Z, betax, betax_rhs, ASS, Symmetry, eps);
|
||||
kodis(ex, X, Y, Z, betay, betay_rhs, SAS, Symmetry, eps);
|
||||
kodis(ex, X, Y, Z, betaz, betaz_rhs, SSA, Symmetry, eps);
|
||||
#if (GAUGE == 0)
|
||||
kodis(ex, X, Y, Z, dtSfx, dtSfx_rhs, ASS, Symmetry, eps);
|
||||
kodis(ex, X, Y, Z, dtSfy, dtSfy_rhs, SAS, Symmetry, eps);
|
||||
kodis(ex, X, Y, Z, dtSfz, dtSfz_rhs, SSA, Symmetry, eps);
|
||||
#endif
|
||||
kodis(ex, X, Y, Z, TZ, TZ_rhs, SSS, Symmetry, eps);
|
||||
}
|
||||
|
||||
if (co == 0)
|
||||
{
|
||||
#if (ABV == 0)
|
||||
f_ricci_gamma(ex, X, Y, Z,
|
||||
chi_constraints,
|
||||
dxx, gxy, gxz, dyy, gyz, dzz,
|
||||
Gamx, Gamy, Gamz,
|
||||
Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
|
||||
Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
|
||||
Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
|
||||
Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
|
||||
Symmetry);
|
||||
#endif
|
||||
f_constraint_bssn(ex, X, Y, Z,
|
||||
chi_constraints, trK,
|
||||
dxx, gxy, gxz, dyy, gyz, dzz,
|
||||
Axx, Axy, Axz, Ayy, Ayz, Azz,
|
||||
Gamx, Gamy, Gamz,
|
||||
Lap, betax, betay, betaz, rho, Sx, Sy, Sz,
|
||||
Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
|
||||
Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
|
||||
Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
|
||||
Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
|
||||
Hcon, Mxcon, Mycon, Mzcon, Gmxcon, Gmycon, Gmzcon,
|
||||
Symmetry);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern "C" int f_compute_rhs_Z4c(int *ex, double &T,
|
||||
double *X, double *Y, double *Z,
|
||||
double *chi, double *trK,
|
||||
double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
|
||||
double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
|
||||
double *Gamx, double *Gamy, double *Gamz,
|
||||
double *Lap, double *betax, double *betay, double *betaz,
|
||||
double *dtSfx, double *dtSfy, double *dtSfz,
|
||||
double *TZ,
|
||||
double *chi_rhs, double *trK_rhs,
|
||||
double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
|
||||
double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
|
||||
double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
|
||||
double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
|
||||
double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
|
||||
double *TZ_rhs,
|
||||
double *rho, double *Sx, double *Sy, double *Sz,
|
||||
double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
|
||||
double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
|
||||
double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
|
||||
double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
|
||||
double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
|
||||
double *Hcon, double *Mxcon, double *Mycon, double *Mzcon, double *Gmxcon, double *Gmycon, double *Gmzcon,
|
||||
int &Symmetry, int &Lev, double &eps, int &co)
|
||||
{
|
||||
return compute_rhs_z4c_cartesian(
|
||||
ex, T, X, Y, Z,
|
||||
chi, chi, trK,
|
||||
dxx, gxy, gxz, dyy, gyz, dzz,
|
||||
Axx, Axy, Axz, Ayy, Ayz, Azz,
|
||||
Gamx, Gamy, Gamz,
|
||||
Lap, betax, betay, betaz,
|
||||
dtSfx, dtSfy, dtSfz,
|
||||
TZ,
|
||||
chi_rhs, trK_rhs,
|
||||
gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs,
|
||||
Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs,
|
||||
Gamx_rhs, Gamy_rhs, Gamz_rhs,
|
||||
Lap_rhs, betax_rhs, betay_rhs, betaz_rhs,
|
||||
dtSfx_rhs, dtSfy_rhs, dtSfz_rhs,
|
||||
TZ_rhs,
|
||||
rho, Sx, Sy, Sz,
|
||||
Sxx, Sxy, Sxz, Syy, Syz, Szz,
|
||||
Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
|
||||
Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
|
||||
Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
|
||||
Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
|
||||
Hcon, Mxcon, Mycon, Mzcon, Gmxcon, Gmycon, Gmzcon,
|
||||
Symmetry, Lev, eps, co);
|
||||
}
|
||||
|
||||
extern "C" int f_compute_rhs_Z4cnot(int *ex, double &T,
|
||||
double *X, double *Y, double *Z,
|
||||
double *chi, double *trK,
|
||||
double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
|
||||
double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
|
||||
double *Gamx, double *Gamy, double *Gamz,
|
||||
double *Lap, double *betax, double *betay, double *betaz,
|
||||
double *dtSfx, double *dtSfy, double *dtSfz,
|
||||
double *TZ,
|
||||
double *chi_rhs, double *trK_rhs,
|
||||
double *gxx_rhs, double *gxy_rhs, double *gxz_rhs, double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
|
||||
double *Axx_rhs, double *Axy_rhs, double *Axz_rhs, double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
|
||||
double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
|
||||
double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
|
||||
double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
|
||||
double *TZ_rhs,
|
||||
double *rho, double *Sx, double *Sy, double *Sz,
|
||||
double *Sxx, double *Sxy, double *Sxz, double *Syy, double *Syz, double *Szz,
|
||||
double *Gamxxx, double *Gamxxy, double *Gamxxz, double *Gamxyy, double *Gamxyz, double *Gamxzz,
|
||||
double *Gamyxx, double *Gamyxy, double *Gamyxz, double *Gamyyy, double *Gamyyz, double *Gamyzz,
|
||||
double *Gamzxx, double *Gamzxy, double *Gamzxz, double *Gamzyy, double *Gamzyz, double *Gamzzz,
|
||||
double *Rxx, double *Rxy, double *Rxz, double *Ryy, double *Ryz, double *Rzz,
|
||||
double *Hcon, double *Mxcon, double *Mycon, double *Mzcon, double *Gmxcon, double *Gmycon, double *Gmzcon,
|
||||
int &Symmetry, int &Lev, double &eps, int &co, double &chitiny)
|
||||
{
|
||||
const int all = ex[0] * ex[1] * ex[2];
|
||||
std::vector<double> chi_clamped(chi, chi + all);
|
||||
f_lowerboundset(ex, chi_clamped.data(), chitiny);
|
||||
|
||||
const int ret = compute_rhs_z4c_cartesian(
|
||||
ex, T, X, Y, Z,
|
||||
chi_clamped.data(), chi, trK,
|
||||
dxx, gxy, gxz, dyy, gyz, dzz,
|
||||
Axx, Axy, Axz, Ayy, Ayz, Azz,
|
||||
Gamx, Gamy, Gamz,
|
||||
Lap, betax, betay, betaz,
|
||||
dtSfx, dtSfy, dtSfz,
|
||||
TZ,
|
||||
chi_rhs, trK_rhs,
|
||||
gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs,
|
||||
Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs,
|
||||
Gamx_rhs, Gamy_rhs, Gamz_rhs,
|
||||
Lap_rhs, betax_rhs, betay_rhs, betaz_rhs,
|
||||
dtSfx_rhs, dtSfy_rhs, dtSfz_rhs,
|
||||
TZ_rhs,
|
||||
rho, Sx, Sy, Sz,
|
||||
Sxx, Sxy, Sxz, Syy, Syz, Szz,
|
||||
Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
|
||||
Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
|
||||
Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
|
||||
Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
|
||||
Hcon, Mxcon, Mycon, Mzcon, Gmxcon, Gmycon, Gmzcon,
|
||||
Symmetry, Lev, eps, co);
|
||||
|
||||
if (ret != 0 || co != 0)
|
||||
return ret;
|
||||
|
||||
#if (ABV == 0)
|
||||
f_ricci_gamma(ex, X, Y, Z,
|
||||
chi,
|
||||
dxx, gxy, gxz, dyy, gyz, dzz,
|
||||
Gamx, Gamy, Gamz,
|
||||
Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
|
||||
Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
|
||||
Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
|
||||
Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
|
||||
Symmetry);
|
||||
#endif
|
||||
f_constraint_bssn(ex, X, Y, Z,
|
||||
chi, trK,
|
||||
dxx, gxy, gxz, dyy, gyz, dzz,
|
||||
Axx, Axy, Axz, Ayy, Ayz, Azz,
|
||||
Gamx, Gamy, Gamz,
|
||||
Lap, betax, betay, betaz, rho, Sx, Sy, Sz,
|
||||
Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
|
||||
Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
|
||||
Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
|
||||
Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
|
||||
Hcon, Mxcon, Mycon, Mzcon, Gmxcon, Gmycon, Gmzcon,
|
||||
Symmetry);
|
||||
return ret;
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,83 +0,0 @@
|
||||
#ifndef Z4C_RHS_CUDA_H
|
||||
#define Z4C_RHS_CUDA_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
enum {
|
||||
Z4C_CUDA_STATE_COUNT = 25
|
||||
};
|
||||
|
||||
int z4c_cuda_rk4_substep(void *block_tag,
|
||||
int *ex, double *X, double *Y, double *Z,
|
||||
double **state_host_in,
|
||||
double **state_host_out,
|
||||
const double *propspeed,
|
||||
const double *soa_flat,
|
||||
const double *bbox,
|
||||
double &dT,
|
||||
double &T,
|
||||
int &RK4,
|
||||
int &apply_bam_bc,
|
||||
int &Symmetry,
|
||||
int &Lev,
|
||||
double &eps,
|
||||
int &co,
|
||||
int &keep_resident_state,
|
||||
int &apply_enforce_ga,
|
||||
double &chitiny);
|
||||
|
||||
int z4c_cuda_download_resident_state(void *block_tag,
|
||||
int *ex,
|
||||
double **state_host_out);
|
||||
|
||||
int z4c_cuda_pack_state_region_to_host_buffer(void *block_tag,
|
||||
int state_index,
|
||||
double *host_buffer,
|
||||
int *ex,
|
||||
int i0, int j0, int k0,
|
||||
int sx, int sy, int sz);
|
||||
|
||||
int z4c_cuda_unpack_state_region_from_host_buffer(void *block_tag,
|
||||
int state_index,
|
||||
double *host_buffer,
|
||||
int *ex,
|
||||
int i0, int j0, int k0,
|
||||
int sx, int sy, int sz);
|
||||
|
||||
int z4c_cuda_pack_state_batch_to_host_buffer(void *block_tag,
|
||||
int state_count,
|
||||
double *host_buffer,
|
||||
int *ex,
|
||||
int i0, int j0, int k0,
|
||||
int sx, int sy, int sz);
|
||||
|
||||
int z4c_cuda_unpack_state_batch_from_host_buffer(void *block_tag,
|
||||
int state_count,
|
||||
double *host_buffer,
|
||||
int *ex,
|
||||
int i0, int j0, int k0,
|
||||
int sx, int sy, int sz);
|
||||
|
||||
int z4c_cuda_download_state_subset(void *block_tag,
|
||||
int *ex,
|
||||
int subset_count,
|
||||
const int *state_indices,
|
||||
double **state_host_out);
|
||||
|
||||
int z4c_cuda_upload_state_subset(void *block_tag,
|
||||
int *ex,
|
||||
int subset_count,
|
||||
const int *state_indices,
|
||||
double **state_host_in);
|
||||
|
||||
int z4c_cuda_has_resident_state(void *block_tag);
|
||||
|
||||
void z4c_cuda_release_step_ctx(void *block_tag);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
12
README.md
12
README.md
@@ -93,11 +93,13 @@ Here, we take the Ubuntu 22.04 system as an example
|
||||
|
||||
#### How to use AMSS-NCKU
|
||||
|
||||
0. Setting the parameters for compilation
|
||||
|
||||
Modify the makefile.inc file in the AMSS_NCKU_source directory and change the settings according to your computer.
|
||||
|
||||
The settings for the Ubuntu 22.04 system do not need to be modified.
|
||||
0. Setting the parameters for compilation
|
||||
|
||||
Modify the makefile.inc file in the AMSS_NCKU_source directory and change the settings according to your computer.
|
||||
|
||||
The default configuration in this branch uses GNU compilers through the OpenMPI wrappers under `/usr/lib64/openmpi/bin`.
|
||||
|
||||
If your OpenMPI installation is in another location, update `OMPI_BIN` in `AMSS_NCKU_source/makefile.inc` or export `AMSS_OPENMPI_BIN` before running the Python launcher.
|
||||
|
||||
1. Enter the AMSS-NCKU Python code folder and modify the input.
|
||||
|
||||
|
||||
@@ -204,7 +204,7 @@ def generate_macrodef_h():
|
||||
# use GPU or not
|
||||
|
||||
if ( input_data.GPU_Calculation == "yes"):
|
||||
print( "//#define USE_GPU", file=file1 )
|
||||
print( "#define USE_GPU", file=file1 )
|
||||
print( file=file1 )
|
||||
elif ( input_data.GPU_Calculation == "no"):
|
||||
print( "//#define USE_GPU", file=file1 )
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
|
||||
|
||||
import AMSS_NCKU_Input as input_data
|
||||
import os
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
@@ -52,6 +53,8 @@ NUMACTL_CPU_BIND = get_last_n_cores_per_socket(n=32)
|
||||
|
||||
## Build parallelism: match the number of bound cores
|
||||
BUILD_JOBS = 64
|
||||
OPENMPI_BIN = os.environ.get("AMSS_OPENMPI_BIN", "/usr/lib64/openmpi/bin")
|
||||
MPI_RUNNER = os.path.join(OPENMPI_BIN, "mpirun")
|
||||
|
||||
|
||||
##################################################################
|
||||
@@ -70,9 +73,9 @@ def makefile_ABE():
|
||||
|
||||
## Build command with CPU binding to nohz_full cores
|
||||
if (input_data.GPU_Calculation == "no"):
|
||||
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=off USE_CUDA_BSSN=0 USE_CUDA_Z4C=0 ABE"
|
||||
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=off ABE"
|
||||
elif (input_data.GPU_Calculation == "yes"):
|
||||
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=off USE_CUDA_BSSN=1 USE_CUDA_Z4C=1 ABE_CUDA"
|
||||
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} ABEGPU"
|
||||
else:
|
||||
print( " CPU/GPU numerical calculation setting is wrong " )
|
||||
print( )
|
||||
@@ -147,11 +150,11 @@ def run_ABE():
|
||||
## Define the command to run; cast other values to strings as needed
|
||||
|
||||
if (input_data.GPU_Calculation == "no"):
|
||||
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
|
||||
mpi_command = NUMACTL_CPU_BIND + " " + MPI_RUNNER + " -np " + str(input_data.MPI_processes) + " ./ABE"
|
||||
#mpi_command = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
|
||||
mpi_command_outfile = "ABE_out.log"
|
||||
elif (input_data.GPU_Calculation == "yes"):
|
||||
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE_CUDA"
|
||||
mpi_command = NUMACTL_CPU_BIND + " " + MPI_RUNNER + " -np " + str(input_data.MPI_processes) + " ./ABEGPU"
|
||||
mpi_command_outfile = "ABEGPU_out.log"
|
||||
|
||||
## Execute the MPI command and stream output
|
||||
|
||||
Reference in New Issue
Block a user