Compare commits
5 Commits
cjy-falcon
...
asc26-temp
| Author | SHA1 | Date | |
|---|---|---|---|
| dd5b7561c1 | |||
| a99534d2f3 | |||
| f2264989d8 | |||
| a0b43bae04 | |||
| c7a48ebe7e |
559
AMSS_NCKU_GPUCheck.py
Normal file
559
AMSS_NCKU_GPUCheck.py
Normal file
@@ -0,0 +1,559 @@
|
||||
#!/usr/bin/env python3
|
||||
#
|
||||
# Current most stable GPU-branch baseline:
|
||||
# GPU_Calculation="yes"
|
||||
# Equation_Class="BSSN"
|
||||
# Initial_Data_Method="Ansorg-TwoPuncture"
|
||||
# puncture_data_set="Manually"
|
||||
# basic_grid_set="Patch"
|
||||
# grid_center_set="Cell"
|
||||
# Symmetry="equatorial-symmetry"
|
||||
# Time_Evolution_Method="runge-kutta-45"
|
||||
# Finite_Diffenence_Method="4th-order"
|
||||
# boundary_choice="BAM-choice"
|
||||
# gauge_choice=0
|
||||
# tetrad_type=2
|
||||
# AHF_Find="no"
|
||||
# devide_factor=2.0
|
||||
# static_grid_type="Linear"
|
||||
# moving_grid_type="Linear"
|
||||
# AMSS_Z4C_MRBD=0
|
||||
# Do not enable AMSS_CUDA_BH_INTERP_RESIDENT unless a dedicated
|
||||
# CPU/GPU trajectory comparison has been run for that configuration.
|
||||
"""
|
||||
Check whether AMSS_NCKU_Input.py is suitable for the current GPU branch.
|
||||
|
||||
Usage:
|
||||
python3 AMSS_NCKU_GPUCheck.py
|
||||
python3 AMSS_NCKU_GPUCheck.py -f /path/to/AMSS_NCKU_Input.py
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import importlib.util
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterable, List, Sequence
|
||||
|
||||
|
||||
# Option values the checker recognises.  Membership here only means the value
# is known; check_input() may still attach a warning or a risk to it.
SUPPORTED_EQUATIONS = {
    "BSSN",
    "BSSN-EScalar",
    "BSSN-EM",
    "Z4C",
}
SUPPORTED_INITIAL_DATA = {"Ansorg-TwoPuncture", "Lousto-Analytical", "Cao-Analytical", "KerrSchild-Analytical"}
SUPPORTED_SYMMETRIES = {"no-symmetry", "equatorial-symmetry", "octant-symmetry"}
SUPPORTED_GRIDS = {
    "Patch",
    "Shell-Patch",
}
SUPPORTED_CENTERS = {
    "Cell",
    "Vertex",
}
SUPPORTED_FD = {
    "2nd-order",
    "4th-order",
    "6th-order",
    "8th-order",
}
# gauge_choice accepts the integers 0..7, tetrad_type the integers 0..2.
SUPPORTED_GAUGES = set(range(8))
SUPPORTED_TETRADS = set(range(3))
SUPPORTED_AHF = {
    "yes",
    "no",
}
SUPPORTED_BOUNDARIES = {
    "BAM-choice",
    "Shibata-choice",
}
SUPPORTED_PUNCTURE_DATA = {
    "Manually",
    "Automatically-BBH",
}
|
||||
|
||||
# Option values of the configuration listed in the header comment as the most
# stable GPU-branch baseline.  stable_baseline_differences() compares an input
# module against this mapping.
STABLE_BASELINE = dict(
    GPU_Calculation="yes",
    Equation_Class="BSSN",
    Initial_Data_Method="Ansorg-TwoPuncture",
    puncture_data_set="Manually",
    basic_grid_set="Patch",
    grid_center_set="Cell",
    Symmetry="equatorial-symmetry",
    Time_Evolution_Method="runge-kutta-45",
    Finite_Diffenence_Method="4th-order",
    boundary_choice="BAM-choice",
    gauge_choice=0,
    tetrad_type=2,
    AHF_Find="no",
    devide_factor=2.0,
    static_grid_type="Linear",
    moving_grid_type="Linear",
    AMSS_Z4C_MRBD=0,
)
|
||||
|
||||
|
||||
@dataclass
class CheckResult:
    """Collects the findings produced while checking one input module.

    Findings are kept in three separate lists (warnings, risks, notes).
    ``ok`` starts out True and is cleared by the first recorded risk;
    warnings and notes never change it.
    """

    # False as soon as add_risk() has been called at least once.
    ok: bool = True
    # One message list per severity; each instance gets its own lists.
    warnings: List[str] = field(default_factory=list)
    risks: List[str] = field(default_factory=list)
    notes: List[str] = field(default_factory=list)

    def add_warning(self, msg: str) -> None:
        """Append *msg* to the warnings; ``ok`` is left unchanged."""
        self.warnings += [msg]

    def add_risk(self, msg: str) -> None:
        """Append *msg* to the risks and mark the result as not ok."""
        self.risks += [msg]
        self.ok = False

    def add_note(self, msg: str) -> None:
        """Append *msg* to the notes; ``ok`` is left unchanged."""
        self.notes += [msg]

    def extend_notes(self, messages: Iterable[str]) -> None:
        """Append every message from *messages* to the notes, in order."""
        self.notes += list(messages)
|
||||
|
||||
|
||||
def load_input_module(path: Path):
    """Execute the Python file at *path* and return it as a module object.

    The file is loaded under the module name ``amss_ncku_input``; this
    function does not add the module to ``sys.modules``.

    Raises:
        RuntimeError: if importlib cannot produce a spec with a loader for
            *path*.  Exceptions raised while executing the file propagate
            unchanged.
    """
    module_spec = importlib.util.spec_from_file_location("amss_ncku_input", str(path))
    loader = None if module_spec is None else module_spec.loader
    if loader is None:
        raise RuntimeError(f"cannot load input module from {path}")

    loaded = importlib.util.module_from_spec(module_spec)
    loader.exec_module(loaded)
    return loaded
|
||||
|
||||
|
||||
def get_attr(mod: Any, name: str, default: Any = None) -> Any:
    """Return attribute *name* of *mod*, or *default* if it is not defined."""
    try:
        return getattr(mod, name)
    except AttributeError:
        return default
|
||||
|
||||
|
||||
def as_text(value: Any) -> str:
    """Return *value* as a string with surrounding whitespace removed.

    Non-string values are converted with ``str()`` first.
    """
    text = value if isinstance(value, str) else str(value)
    return text.strip()
|
||||
|
||||
|
||||
def as_lower_text(value: Any) -> str:
    """Return the as_text() form of *value* converted to lower case."""
    stripped = as_text(value)
    return stripped.lower()
|
||||
|
||||
|
||||
def as_float(value: Any, default: float | None = None) -> float | None:
    """Convert *value* to float, returning *default* when that is impossible.

    Args:
        value: Any object; typically a number or numeric string taken from
            the input module.
        default: Returned when ``float(value)`` fails.

    Returns:
        The converted float, or *default* if the conversion raised
        TypeError, ValueError or OverflowError.
    """
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: integers too large for a float (e.g. 10**400) must
        # be reported as "not a usable number" rather than crash the checker.
        return default
|
||||
|
||||
|
||||
def as_int(value: Any, default: int | None = None) -> int | None:
    """Convert *value* to int, returning *default* when that is impossible.

    Floats are accepted only when they hold an integral value (``7.0``);
    a non-integral, NaN or infinite float yields *default* instead of being
    truncated, so callers validating "must be an integer" options do not
    silently accept values such as ``7.5``.

    Args:
        value: Any object; typically a number or numeric string taken from
            the input module.
        default: Returned when *value* cannot be interpreted as an integer.

    Returns:
        The converted int, or *default*.
    """
    # is_integer() is False for NaN and +/-inf as well as for fractions.
    if isinstance(value, float) and not value.is_integer():
        return default
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return default
|
||||
|
||||
|
||||
def sequence_len(value: Any) -> int | None:
    """Return ``len(value)``, or None when *value* has no length."""
    try:
        size = len(value)
    except TypeError:
        size = None
    return size
|
||||
|
||||
|
||||
def sequence_values(value: Any) -> List[float] | None:
    """Return the items of *value* converted to floats.

    None is returned when *value* is not iterable or when any item cannot
    be converted with ``float()``.
    """
    converted: List[float] = []
    try:
        for item in value:
            converted.append(float(item))
    except (TypeError, ValueError):
        return None
    return converted
|
||||
|
||||
|
||||
def approx_equal(a: Any, b: float, tol: float = 1.0e-12) -> bool:
    """Return True when *a* converts to a float within *tol* of *b*.

    Values that as_float() cannot convert compare as not equal.
    """
    number = as_float(a)
    if number is None:
        return False
    return abs(number - b) <= tol
|
||||
|
||||
|
||||
def env_truthy(name: str) -> bool:
    """Return True when environment variable *name* holds an affirmative word.

    The value is stripped and compared case-insensitively against
    ``1``, ``yes``, ``y``, ``true``, ``on``, ``enable`` and ``enabled``.
    An unset variable is treated as False.
    """
    raw = os.environ.get(name)
    if raw is None:
        return False
    affirmative = ("1", "yes", "y", "true", "on", "enable", "enabled")
    return raw.strip().lower() in affirmative
|
||||
|
||||
|
||||
def stable_baseline_differences(mod: Any) -> List[str]:
    """List the options of *mod* that differ from STABLE_BASELINE.

    Options that *mod* does not define are skipped.  Values are normalised
    the same way check_input() reads them, so that a value check_input()
    accepts as the baseline value is not reported as a difference here:

    * float baselines are compared with approx_equal();
    * string baselines are compared after as_text() stripping, and
      GPU_Calculation / AHF_Find additionally ignore case;
    * everything else is compared with ``!=``.

    Returns:
        One ``"name=actual (stable baseline: expected)"`` string per
        differing option, in STABLE_BASELINE order.  *actual* is shown
        as it appears in the input module, not in normalised form.
    """
    # Options that check_input() lower-cases before comparing.
    case_insensitive = {"GPU_Calculation", "AHF_Find"}

    diffs: List[str] = []
    for name, expected in STABLE_BASELINE.items():
        if not hasattr(mod, name):
            continue
        actual = get_attr(mod, name, None)
        if isinstance(expected, float):
            differs = not approx_equal(actual, expected)
        elif isinstance(expected, str):
            text = as_text(actual)
            if name in case_insensitive:
                text = text.lower()
            differs = text != expected
        else:
            differs = actual != expected
        if differs:
            diffs.append(f"{name}={actual!r} (stable baseline: {expected!r})")
    return diffs
|
||||
|
||||
|
||||
def add_membership_check(
    r: CheckResult,
    name: str,
    value: Any,
    supported: Sequence[Any] | set[Any],
    *,
    risk_message: str | None = None,
    note_message: str | None = None,
) -> None:
    """Record a risk on *r* unless *value* is one of *supported*.

    Args:
        r: Result object receiving the finding.
        name: Option name used in the default risk message.
        value: Value read from the input module.
        supported: Collection of accepted values.
        risk_message: Text of the risk; defaults to
            ``"Unsupported {name}: {value!r}"``.
        note_message: If given (and non-empty), added as a note when the
            value is supported.
    """
    try:
        is_supported = value in supported
    except TypeError:
        # An unhashable value (e.g. a list) cannot be looked up in a set;
        # it is certainly not one of the supported scalars, so report it
        # instead of letting the TypeError abort the whole check.
        is_supported = False

    if not is_supported:
        r.add_risk(risk_message or f"Unsupported {name}: {value!r}")
    elif note_message:
        r.add_note(note_message)
|
||||
|
||||
|
||||
def check_positive_int(r: CheckResult, name: str, value: Any) -> None:
    """Add a risk to *r* unless *value* parses (via as_int) to an int > 0."""
    number = as_int(value)
    is_positive = number is not None and number > 0
    if not is_positive:
        r.add_risk(f"{name} must be a positive integer; got {value!r}")
|
||||
|
||||
|
||||
def check_nonnegative_number(r: CheckResult, name: str, value: Any) -> None:
    """Add a risk to *r* unless *value* parses (via as_float) to a number >= 0.

    NaN is rejected as well: it is neither negative nor non-negative, so the
    test is written as "not >= 0" rather than "< 0".
    """
    parsed = as_float(value)
    if parsed is None or not parsed >= 0.0:
        r.add_risk(f"{name} must be a non-negative number; got {value!r}")
|
||||
|
||||
|
||||
def check_grid_geometry(r: CheckResult, mod: Any, grid: str) -> None:
    """Validate the AMR grid options of *mod* and record problems on *r*.

    Checked, in this order:

    * the level/grid-number options are positive integers;
    * static/moving/refinement/analysis levels are consistent with
      ``grid_level``;
    * ``largest_box_xyz_max`` / ``largest_box_xyz_min`` hold three numbers,
      with min < max per axis (and a cubic max box for Shell-Patch);
    * ``devide_factor`` is 2.0 and both grid types are ``'Linear'``;
    * for Shell-Patch, ``shell_grid_number`` holds three positive numbers.

    Args:
        r: Result object receiving the findings (all are risks).
        mod: Loaded input module.
        grid: Stripped value of ``basic_grid_set``.
    """
    grid_level = as_int(get_attr(mod, "grid_level", None))
    static_grid_level = as_int(get_attr(mod, "static_grid_level", None))
    moving_grid_level = as_int(get_attr(mod, "moving_grid_level", None))
    refinement_level = as_int(get_attr(mod, "refinement_level", None))
    analysis_level = as_int(get_attr(mod, "analysis_level", 0))

    for name in (
        "grid_level",
        "static_grid_level",
        "moving_grid_level",
        "static_grid_number",
        "moving_grid_number",
        "quarter_sphere_number",
    ):
        check_positive_int(r, name, get_attr(mod, name, None))

    # Cross-checks between levels only make sense once both operands parsed.
    if grid_level is not None and static_grid_level is not None:
        if static_grid_level > grid_level:
            r.add_risk("static_grid_level cannot exceed grid_level.")
        if moving_grid_level is not None and moving_grid_level != grid_level - static_grid_level:
            r.add_risk(
                "moving_grid_level should equal grid_level - static_grid_level; "
                f"got {moving_grid_level}, expected {grid_level - static_grid_level}."
            )
    if grid_level is not None:
        if refinement_level is None or refinement_level < 0 or refinement_level > grid_level:
            r.add_risk(f"refinement_level must be in [0, grid_level]; got {refinement_level!r}")
        if analysis_level is None or analysis_level < 0 or analysis_level >= grid_level:
            r.add_risk(f"analysis_level must be in [0, grid_level); got {analysis_level!r}")

    largest_max = sequence_values(get_attr(mod, "largest_box_xyz_max", None))
    largest_min = sequence_values(get_attr(mod, "largest_box_xyz_min", None))
    max_is_triplet = largest_max is not None and len(largest_max) == 3
    if not max_is_triplet:
        r.add_risk("largest_box_xyz_max must contain three numeric values.")
    elif any(v <= 0.0 for v in largest_max):
        r.add_risk(f"largest_box_xyz_max values must be positive; got {largest_max!r}")
    if largest_min is None or len(largest_min) != 3:
        r.add_risk("largest_box_xyz_min must contain three numeric values.")
    elif max_is_triplet:
        for idx, (lo, hi) in enumerate(zip(largest_min, largest_max)):
            if lo >= hi:
                r.add_risk(
                    f"largest_box_xyz_min[{idx}] must be smaller than largest_box_xyz_max[{idx}]."
                )

    if grid == "Shell-Patch" and max_is_triplet:
        if max(largest_max) - min(largest_max) > 1.0e-12:
            r.add_risk("Shell-Patch requires a cubic largest_box_xyz_max.")

    if not approx_equal(get_attr(mod, "devide_factor", None), 2.0):
        r.add_risk("devide_factor must remain 2.0; the AMR code documents only this ratio as supported.")
    if as_text(get_attr(mod, "static_grid_type", "")) != "Linear":
        r.add_risk("static_grid_type must remain 'Linear'.")
    if as_text(get_attr(mod, "moving_grid_type", "")) != "Linear":
        r.add_risk("moving_grid_type must remain 'Linear'.")

    shell_shape = sequence_values(get_attr(mod, "shell_grid_number", None))
    if grid == "Shell-Patch":
        if shell_shape is None or len(shell_shape) != 3:
            r.add_risk("Shell-Patch requires shell_grid_number with three numeric values.")
        # "1 <= v < inf" flags the same finite values as the former
        # ``int(v) <= 0`` test, but also flags NaN/inf instead of raising
        # ValueError/OverflowError from int().
        elif any(not 1.0 <= v < float("inf") for v in shell_shape):
            r.add_risk(f"shell_grid_number values must be positive; got {shell_shape!r}")
|
||||
|
||||
|
||||
def check_punctures(r: CheckResult, mod: Any, init: str, puncture_data: str) -> None:
    """Validate the puncture (black hole) options of *mod*.

    Args:
        r: Result object receiving the findings.
        mod: Loaded input module.
        init: Stripped value of ``Initial_Data_Method``.
        puncture_data: Stripped value of ``puncture_data_set``.

    The per-puncture arrays ``position_BH``, ``parameter_BH``,
    ``dimensionless_spin_BH`` and ``momentum_BH`` must each have
    ``puncture_number`` rows of three values.  Nothing else is checked when
    ``puncture_number`` itself is invalid.
    """
    raw_number = get_attr(mod, "puncture_number", None)
    puncture_number = as_int(raw_number)
    if puncture_number is None or puncture_number <= 0:
        # Report what the input file actually contains, not the parse result.
        r.add_risk(f"puncture_number must be a positive integer; got {raw_number!r}")
        return

    if init == "Ansorg-TwoPuncture" and puncture_number != 2:
        r.add_warning(
            "Ansorg-TwoPuncture is validated on the GPU branch mainly for puncture_number=2."
        )
    # NOTE(review): source indentation was lost; this check is assumed to be
    # independent of the Ansorg-TwoPuncture check above.
    if puncture_data == "Automatically-BBH":
        r.add_risk("puncture_data_set='Automatically-BBH' is documented as still developing.")

    for name in ("position_BH", "parameter_BH", "dimensionless_spin_BH", "momentum_BH"):
        value = get_attr(mod, name, None)
        outer = sequence_len(value)
        if outer != puncture_number:
            r.add_risk(f"{name} must have puncture_number rows; got {outer!r}.")
            continue
        for idx in range(puncture_number):
            try:
                row = value[idx]
            except (IndexError, KeyError, TypeError):
                # Sized but not indexable by 0..n-1 (e.g. a set or a dict
                # with other keys): treat the row as missing.
                row = None
            if sequence_len(row) != 3:
                r.add_risk(f"{name}[{idx}] must contain three values.")
                break

    if init == "Ansorg-TwoPuncture":
        for name in ("parameter_BH", "position_BH", "momentum_BH"):
            if get_attr(mod, name, None) is None:
                r.add_risk(f"Ansorg-TwoPuncture requires {name}.")
|
||||
|
||||
|
||||
def check_output_and_time(r: CheckResult, mod: Any) -> None:
    """Validate the evolution-time, output and wave-extraction options.

    All findings are recorded on *r* as risks:

    * the time/output intervals, Courant factor and dissipation must be
      non-negative numbers and ``Evolution_Step_Number`` a positive integer;
    * ``Start_Evolution_Time`` must be numeric and smaller than
      ``Final_Evolution_Time``;
    * ``GW_L_max``, ``GW_M_max`` and ``Detector_Number`` must be positive
      integers;
    * ``Detector_Rmin`` and ``Detector_Rmax`` must be positive with
      Rmax > Rmin.
    """
    nonnegative_options = (
        "Final_Evolution_Time",
        "Check_Time",
        "Dump_Time",
        "D2_Dump_Time",
        "Analysis_Time",
        "Courant_Factor",
        "Dissipation",
    )
    for option in nonnegative_options:
        check_nonnegative_number(r, option, get_attr(mod, option, None))
    check_positive_int(r, "Evolution_Step_Number", get_attr(mod, "Evolution_Step_Number", None))

    t_start = as_float(get_attr(mod, "Start_Evolution_Time", None))
    t_final = as_float(get_attr(mod, "Final_Evolution_Time", None))
    if t_start is not None:
        if t_final is not None and t_final <= t_start:
            r.add_risk("Final_Evolution_Time must be greater than Start_Evolution_Time.")
    else:
        r.add_risk("Start_Evolution_Time must be numeric.")

    for option in ("GW_L_max", "GW_M_max", "Detector_Number"):
        check_positive_int(r, option, get_attr(mod, option, None))

    r_min = as_float(get_attr(mod, "Detector_Rmin", None))
    r_max = as_float(get_attr(mod, "Detector_Rmax", None))
    if r_min is None or r_min <= 0.0:
        r.add_risk(f"Detector_Rmin must be positive; got {r_min!r}")
    if r_max is None or r_max <= 0.0:
        r.add_risk(f"Detector_Rmax must be positive; got {r_max!r}")
    both_parsed = r_min is not None and r_max is not None
    if both_parsed and r_max <= r_min:
        r.add_risk("Detector_Rmax must be greater than Detector_Rmin.")
|
||||
|
||||
|
||||
def check_equation_specific(r: CheckResult, mod: Any, eq: str, grid: str, fd: str) -> None:
    """Record the findings that depend on the chosen ``Equation_Class``.

    Args:
        r: Result object receiving the findings.
        mod: Loaded input module (only read for BSSN-EScalar options).
        eq: Stripped value of ``Equation_Class``.
        grid: Stripped value of ``basic_grid_set``.
        fd: Stripped value of ``Finite_Diffenence_Method``.

    Equation classes other than the four handled below produce no finding
    here.
    """
    if eq == "BSSN":
        r.add_note("Equation_Class=BSSN is the current validated GPU baseline.")
        return

    if eq == "BSSN-EScalar":
        r.add_warning("BSSN-EScalar has a CUDA path, but it is less broadly validated than BSSN.")
        fr_choice = as_int(get_attr(mod, "FR_Choice", None))
        if fr_choice not in {1, 2, 3, 4, 5}:
            r.add_risk(f"FR_Choice must be one of 1..5 for BSSN-EScalar; got {fr_choice!r}")
        fr_a2 = get_attr(mod, "FR_a2", None)
        if approx_equal(fr_a2, 0.0):
            r.add_risk("CUDA BSSN-EScalar requires nonzero FR_a2.")
        elif not approx_equal(fr_a2, 3.0):
            r.add_warning("CUDA BSSN-EScalar now passes FR_a2 to the kernel, but non-3.0 values need CPU/GPU regression.")
        for option in ("FR_l2", "FR_phi0", "FR_r0", "FR_sigma0"):
            check_nonnegative_number(r, option, get_attr(mod, option, None))
        return

    if eq == "BSSN-EM":
        r.add_warning(
            "BSSN-EM is accepted by the build, but this checker cannot certify its physics/output "
            "without a CPU/GPU regression run."
        )
        if fd == "8th-order":
            r.add_note("BSSN-EM with 8th-order enables extra CUDA AMR batching defaults.")
        return

    if eq == "Z4C":
        r.add_warning(
            "Z4C has CUDA support, but the resident path and Shell/CPBC combinations are more constrained."
        )
        if grid != "Patch":
            r.add_warning("Z4C+Shell-Patch uses CPBC/Shell logic and is not the stable BSSN baseline.")
        else:
            r.add_warning("Z4C+Patch avoids Shell CPBC, but still needs a dedicated regression test.")
|
||||
|
||||
|
||||
def check_runtime_environment(r: CheckResult, mod: Any, eq: str, grid: str, fd: str) -> None:
    """Record findings about runtime switches and build-time macros.

    Args:
        r: Result object receiving the findings.
        mod: Loaded input module (read for ``AMSS_Z4C_MRBD``).
        eq: Stripped value of ``Equation_Class``.
        grid: Stripped value of ``basic_grid_set``.
        fd: Stripped value of ``Finite_Diffenence_Method``.

    ``AMSS_CUDA_BH_INTERP_RESIDENT`` is read from the process environment;
    ``AMSS_Z4C_MRBD`` is read from *mod* (default 0) and must be 0, 1 or 2.
    """
    if env_truthy("AMSS_CUDA_BH_INTERP_RESIDENT"):
        r.add_risk(
            "AMSS_CUDA_BH_INTERP_RESIDENT is enabled in the environment; this option previously caused "
            "late-time trajectory drift and should stay off unless explicitly revalidated."
        )
    else:
        r.add_note("AMSS_CUDA_BH_INTERP_RESIDENT is not enabled; this matches the fixed stable default.")

    # NOTE(review): source indentation was lost; the three checks below are
    # assumed to be independent of each other.
    if eq in {"BSSN", "BSSN-EScalar", "Z4C"}:
        r.add_note("makefile_and_run.py will default AMSS_CUDA_AMR_RESTRICT_DEVICE=1 for this equation.")
    if fd in {"2nd-order", "8th-order"}:
        r.add_warning(
            f"{fd} disables some interpolation/CUDA-aware MPI fast paths by default; validate performance and output."
        )
    if grid == "Shell-Patch":
        r.add_warning(
            "Shell-Patch changes runtime defaults and MPI process handling; use at least the script-adjusted 4 MPI ranks."
        )

    # Parse without a fallback so that a value that is not an integer at all
    # is reported instead of being silently treated as 0.
    raw_mrbd = get_attr(mod, "AMSS_Z4C_MRBD", 0)
    z4c_mrbd = as_int(raw_mrbd)
    if z4c_mrbd is None:
        r.add_risk(f"AMSS_Z4C_MRBD must be 0, 1, or 2; got {raw_mrbd!r}")
    elif z4c_mrbd not in {0, 1, 2}:
        r.add_risk(f"AMSS_Z4C_MRBD must be 0, 1, or 2; got {z4c_mrbd!r}")
    elif eq == "Z4C" and z4c_mrbd == 2:
        r.add_risk("Z4C GPU resident path does not support AMSS_Z4C_MRBD=2.")
    elif eq == "Z4C" and z4c_mrbd in {0, 1}:
        r.add_note(f"Z4C will build with AMSS_Z4C_MRBD={z4c_mrbd}.")
|
||||
|
||||
|
||||
def check_stable_profile(r: CheckResult, mod: Any) -> None:
    """Report whether *mod* matches the documented stable GPU baseline.

    Adds a note when stable_baseline_differences() finds nothing, otherwise
    a single warning listing every difference separated by "; ".
    """
    diffs = stable_baseline_differences(mod)
    if diffs:
        summary = "; ".join(diffs)
        r.add_warning(
            "This input differs from the documented most stable GPU baseline: " + summary
        )
    else:
        r.add_note("This input matches the documented most stable GPU baseline.")
|
||||
|
||||
|
||||
def check_input(mod: Any) -> CheckResult:
    """Run every GPU-branch check on the loaded input module *mod*.

    The function returns early, without running the detailed checks, when
    ``GPU_Calculation`` is "no" (a note is added) or is neither "yes" nor
    "no" (a risk is added).  Otherwise it validates the individual options
    and then delegates to the check_* helpers.

    Returns:
        A CheckResult holding all notes, warnings and risks.
    """
    r = CheckResult()

    gpu_text = as_lower_text(get_attr(mod, "GPU_Calculation", "no"))
    eq = as_text(get_attr(mod, "Equation_Class", ""))
    init = as_text(get_attr(mod, "Initial_Data_Method", ""))
    symmetry = as_text(get_attr(mod, "Symmetry", ""))
    time_method = as_text(get_attr(mod, "Time_Evolution_Method", ""))
    grid = as_text(get_attr(mod, "basic_grid_set", ""))
    center = as_text(get_attr(mod, "grid_center_set", ""))
    fd = as_text(get_attr(mod, "Finite_Diffenence_Method", ""))
    # gauge_choice and tetrad_type are checked as given, without conversion.
    gauge = get_attr(mod, "gauge_choice", None)
    tetrad = get_attr(mod, "tetrad_type", None)
    ahf = as_text(get_attr(mod, "AHF_Find", "no")).lower()
    boundary = as_text(get_attr(mod, "boundary_choice", ""))
    puncture_data = as_text(get_attr(mod, "puncture_data_set", ""))
    cpu_part = get_attr(mod, "CPU_Part", None)
    gpu_part = get_attr(mod, "GPU_Part", None)

    if gpu_text not in {"yes", "no"}:
        # Stop here without the "GPU_Calculation=no" note: the value is
        # invalid, not "no", and that note would misdescribe the input.
        r.add_risk(f"GPU_Calculation must be 'yes' or 'no'; got {get_attr(mod, 'GPU_Calculation', None)!r}")
        return r
    if gpu_text == "no":
        r.add_note("GPU_Calculation=no; this check only targets the GPU branch.")
        return r

    r.add_note("GPU_Calculation=yes detected.")

    add_membership_check(r, "Equation_Class", eq, SUPPORTED_EQUATIONS)
    add_membership_check(r, "Symmetry", symmetry, SUPPORTED_SYMMETRIES)
    add_membership_check(r, "Initial_Data_Method", init, SUPPORTED_INITIAL_DATA)
    add_membership_check(r, "basic_grid_set", grid, SUPPORTED_GRIDS)
    add_membership_check(r, "grid_center_set", center, SUPPORTED_CENTERS)
    add_membership_check(r, "Finite_Diffenence_Method", fd, SUPPORTED_FD)
    add_membership_check(r, "gauge_choice", gauge, SUPPORTED_GAUGES)
    add_membership_check(r, "tetrad_type", tetrad, SUPPORTED_TETRADS)
    add_membership_check(r, "AHF_Find", ahf, SUPPORTED_AHF)
    add_membership_check(r, "boundary_choice", boundary, SUPPORTED_BOUNDARIES)
    add_membership_check(r, "puncture_data_set", puncture_data, SUPPORTED_PUNCTURE_DATA)

    if init != "Ansorg-TwoPuncture":
        r.add_risk(
            f"Initial_Data_Method={init!r} is not validated as safe on this GPU branch; "
            "the stable path is Ansorg-TwoPuncture."
        )
    else:
        r.add_note("Initial_Data_Method=Ansorg-TwoPuncture is supported.")

    if time_method != "runge-kutta-45":
        r.add_risk(f"Only Time_Evolution_Method='runge-kutta-45' is supported; got {time_method!r}.")
    if grid == "Patch":
        r.add_note("basic_grid_set=Patch is the current stable GPU grid path.")
    elif grid == "Shell-Patch":
        r.add_warning("basic_grid_set=Shell-Patch has GPU support but is outside the stable BSSN baseline.")
    if center == "Vertex":
        r.add_warning("grid_center_set=Vertex is compiled by macros, but the stable GPU baseline is Cell.")
    if symmetry != "equatorial-symmetry":
        r.add_warning("The stable validation case uses equatorial-symmetry; other symmetries need regression tests.")
    if fd != "4th-order":
        r.add_warning("The stable validation case uses 4th-order finite differences.")
    # Tuple membership compares with ==, so an unhashable gauge value (e.g. a
    # list) yields the warning instead of raising TypeError as a set would.
    if gauge not in (0, 1):
        r.add_warning("Input comments recommend gauge_choice 0 or 1; other gauges need dedicated validation.")
    if tetrad != 2:
        r.add_warning("Input comments recommend tetrad_type=2; other tetrads affect wave extraction conventions.")

    if ahf == "yes":
        r.add_warning("AHF_Find=yes is supported by macros, but it is outside the current stable GPU baseline.")

    if boundary == "Shibata-choice":
        r.add_risk("Shibata-choice is not faithfully distinguished in the current macro generator; it maps to the BAM branch.")
    elif boundary == "BAM-choice":
        r.add_note("boundary_choice=BAM-choice is supported.")

    if cpu_part is not None or gpu_part is not None:
        r.add_warning("CPU_Part/GPU_Part are printed and propagated, but they do not control a real mixed CPU/GPU split in this branch.")

    check_output_and_time(r, mod)
    check_grid_geometry(r, mod, grid)
    check_punctures(r, mod, init, puncture_data)
    check_equation_specific(r, mod, eq, grid, fd)
    check_runtime_environment(r, mod, eq, grid, fd)
    check_stable_profile(r, mod)

    return r
|
||||
|
||||
|
||||
def main() -> int:
    """Command-line entry point: check one input file and print a report.

    Returns the process exit status:
        0 - no risks were found (warnings may have been printed);
        1 - at least one risk was found;
        2 - the input file is missing or could not be loaded.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "-f",
        "--file",
        "--input",
        dest="input_file",
        default="AMSS_NCKU_Input.py",
        help="path to AMSS_NCKU_Input.py",
    )
    options = cli.parse_args()

    path = Path(options.input_file).resolve()
    if not path.exists():
        print(f"ERROR: input file not found: {path}")
        return 2

    # Top-level boundary: any failure while executing the input file is
    # reported as a load error rather than a traceback.
    try:
        mod = load_input_module(path)
    except Exception as exc:
        print(f"ERROR: failed to load input file: {exc}")
        return 2

    result = check_input(mod)

    # Echo the key options as (name, fallback shown when undefined) pairs.
    summary_fields = (
        ("GPU_Calculation", "no"),
        ("Symmetry", ""),
        ("Equation_Class", ""),
        ("Initial_Data_Method", ""),
        ("puncture_data_set", ""),
        ("basic_grid_set", ""),
        ("grid_center_set", ""),
        ("Finite_Diffenence_Method", ""),
        ("gauge_choice", ""),
        ("tetrad_type", ""),
        ("boundary_choice", ""),
        ("AHF_Find", ""),
        ("AMSS_Z4C_MRBD", 0),
    )
    print(f"Input: {path}")
    for option, fallback in summary_fields:
        print(f"{option}: {get_attr(mod, option, fallback)}")
    print("")

    findings = (
        ("NOTE", result.notes),
        ("WARNING", result.warnings),
        ("RISK", result.risks),
    )
    for label, messages in findings:
        for msg in messages:
            print(f"{label}: {msg}")

    print("")
    if result.risks:
        print("Verdict: review the risks above before running.")
        return 1

    if result.warnings:
        print("Verdict: runnable on the current GPU branch, but keep the warnings in mind.")
    else:
        print("Verdict: OK to run on the current GPU branch.")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
||||
@@ -13,15 +13,31 @@ import numpy
|
||||
|
||||
## Setting MPI processes and the output file directory
|
||||
|
||||
File_directory = "GW150914" ## output file directory
|
||||
File_directory = "case3" ## output file directory
|
||||
Output_directory = "binary_output" ## binary data file directory
|
||||
## The file directory name should not be too long
|
||||
MPI_processes = 2 ## number of mpi processes used in the simulation
|
||||
|
||||
GPU_Calculation = "yes" ## Use GPU or not
|
||||
## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
|
||||
CPU_Part = 1.0
|
||||
GPU_Part = 0.0
|
||||
MPI_processes = 2 ## number of mpi processes used in the simulation
|
||||
|
||||
GPU_Calculation = "yes" ## Use GPU or not
|
||||
## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
|
||||
CPU_Part = 1.0
|
||||
GPU_Part = 0.0
|
||||
|
||||
## Aggressive runtime overrides for fastest low-accuracy GPU runs.
|
||||
AMSS_EVOLVE_TIMING = 0
|
||||
AMSS_ANALYSIS_MAP_EVERY = 1000000000
|
||||
AMSS_INTERP_FAST = 1
|
||||
AMSS_INTERP_GPU = 1
|
||||
AMSS_CUDA_AWARE_MPI = 1
|
||||
AMSS_CUDA_RESIDENT_SYNC = 1
|
||||
AMSS_CUDA_BSSN_RESIDENT_SYNC = 1
|
||||
AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP = 1
|
||||
AMSS_CUDA_KEEP_ALL_LEVELS = 1
|
||||
AMSS_CUDA_AMR_RESTRICT_DEVICE = 1
|
||||
AMSS_CUDA_AMR_RESTRICT_BATCH = 1
|
||||
AMSS_CUDA_DEVICE_SEGMENT_BATCH = 1
|
||||
AMSS_CUDA_UNCACHED_DEVICE_BUFFERS = 1
|
||||
AMSS_CUDA_AMR_HOST_STAGED = 1
|
||||
|
||||
#################################################
|
||||
|
||||
@@ -45,14 +61,14 @@ Finite_Diffenence_Method = "4th-order" ## finite-difference method:
|
||||
## Setting the time evolutionary information
|
||||
|
||||
Start_Evolution_Time = 0.0 ## start evolution time t0
|
||||
Final_Evolution_Time = 1000.0 ## final evolution time t1
|
||||
Check_Time = 100.0
|
||||
Dump_Time = 100.0 ## time inteval dT for dumping binary data
|
||||
D2_Dump_Time = 100.0 ## dump the ascii data for 2d surface after dT'
|
||||
Analysis_Time = 0.1 ## dump the puncture position and GW psi4 after dT"
|
||||
Evolution_Step_Number = 10000000 ## stop the calculation after the maximal step number
|
||||
Courant_Factor = 0.5 ## Courant Factor
|
||||
Dissipation = 0.15 ## Kreiss-Oliger Dissipation Strength
|
||||
Final_Evolution_Time = 200.0 ## final evolution time t1
|
||||
Check_Time = 1000000000.0
|
||||
Dump_Time = 1000000000.0 ## time inteval dT for dumping binary data
|
||||
D2_Dump_Time = 1000000000.0 ## dump the ascii data for 2d surface after dT'
|
||||
Analysis_Time = 1000000000.0 ## dump the puncture position and GW psi4 after dT"
|
||||
Evolution_Step_Number = 1000000 ## stop the calculation after the maximal step number
|
||||
Courant_Factor = 0.8 ## Courant Factor
|
||||
Dissipation = 0.15 ## Kreiss-Oliger Dissipation Strength
|
||||
|
||||
#################################################
|
||||
|
||||
@@ -64,22 +80,22 @@ Dissipation = 0.15 ## Kreiss-Oliger Dissipation S
|
||||
basic_grid_set = "Patch" ## grid structure: choose "Patch" or "Shell-Patch"
|
||||
grid_center_set = "Cell" ## grid center: chose "Cell" or "Vertex"
|
||||
|
||||
grid_level = 9 ## total number of AMR grid levels
|
||||
static_grid_level = 5 ## number of AMR static grid levels
|
||||
moving_grid_level = grid_level - static_grid_level ## number of AMR moving grid levels
|
||||
|
||||
analysis_level = 0
|
||||
refinement_level = 3 ## time refinement start from this grid level
|
||||
grid_level = 7 ## total number of AMR grid levels
|
||||
static_grid_level = 4 ## number of AMR static grid levels
|
||||
moving_grid_level = grid_level - static_grid_level ## number of AMR moving grid levels
|
||||
|
||||
analysis_level = 0
|
||||
refinement_level = 2 ## time refinement start from this grid level
|
||||
|
||||
largest_box_xyz_max = [320.0, 320.0, 320.0] ## scale of the largest box
|
||||
## not necessary to be cubic for "Patch" grid structure
|
||||
## need to be a cubic box for "Shell-Patch" grid structure
|
||||
largest_box_xyz_min = - numpy.array(largest_box_xyz_max)
|
||||
|
||||
static_grid_number = 96 ## grid points of each static AMR grid (in x direction)
|
||||
## (grid points in y and z directions are automatically adjusted)
|
||||
moving_grid_number = 48 ## grid points of each moving AMR grid
|
||||
shell_grid_number = [32, 32, 100] ## grid points of Shell-Patch grid
|
||||
static_grid_number = 64 ## grid points of each static AMR grid (in x direction)
|
||||
## (grid points in y and z directions are automatically adjusted)
|
||||
moving_grid_number = 32 ## grid points of each moving AMR grid
|
||||
shell_grid_number = [32, 32, 100] ## grid points of Shell-Patch grid
|
||||
## in (phi, theta, r) direction
|
||||
devide_factor = 2.0 ## resolution between different grid levels dh0/dh1, only support 2.0 now
|
||||
|
||||
@@ -87,7 +103,7 @@ devide_factor = 2.0 ## resolution between diffe
|
||||
static_grid_type = 'Linear' ## AMR static grid structure , only supports "Linear"
|
||||
moving_grid_type = 'Linear' ## AMR moving grid structure , only supports "Linear"
|
||||
|
||||
quarter_sphere_number = 96 ## grid number of 1/4 s pher ical surface
|
||||
quarter_sphere_number = 16 ## grid number of 1/4 s pher ical surface
|
||||
## (which is needed for evaluating the spherical surface integral)
|
||||
|
||||
#################################################
|
||||
@@ -110,15 +126,15 @@ puncture_data_set = "Manually" ## Method to give Punct
|
||||
|
||||
## initial orbital distance and ellipticity for BBHs system
|
||||
## ( needed for "Automatically-BBH" case , not affect the "Manually" case )
|
||||
Distance = 10.0
|
||||
Distance = 12.0
|
||||
e0 = 0.0
|
||||
|
||||
## black hole parameter (M Q* a*)
|
||||
parameter_BH[0] = [ 36.0/(36.0+29.0), 0.0, +0.31 ]
|
||||
parameter_BH[1] = [ 29.0/(36.0+29.0), 0.0, -0.46 ]
|
||||
parameter_BH[0] = [ 0.5, 0.0, 0.0 ]
|
||||
parameter_BH[1] = [ 0.5, 0.0, 0.0 ]
|
||||
## dimensionless spin in each direction
|
||||
dimensionless_spin_BH[0] = [ 0.0, 0.0, +0.31 ]
|
||||
dimensionless_spin_BH[1] = [ 0.0, 0.0, -0.46 ]
|
||||
dimensionless_spin_BH[0] = [ 0.0, 0.0, 0.0 ]
|
||||
dimensionless_spin_BH[1] = [ 0.0, 0.0, 0.0 ]
|
||||
|
||||
## use Brugmann's convention
|
||||
## -----0-----> y
|
||||
@@ -129,13 +145,13 @@ dimensionless_spin_BH[1] = [ 0.0, 0.0, -0.46 ]
|
||||
## If puncture_data_set is chosen to be "Manually", it is necessary to set the position and momentum of each puncture manually
|
||||
|
||||
## initial position for each puncture
|
||||
position_BH[0] = [ 0.0, 10.0*29.0/(36.0+29.0), 0.0 ]
|
||||
position_BH[1] = [ 0.0, -10.0*36.0/(36.0+29.0), 0.0 ]
|
||||
position_BH[0] = [ 0.0, 6.0, 0.0 ]
|
||||
position_BH[1] = [ 0.0, -6.0, 0.0 ]
|
||||
|
||||
## initial mumentum for each puncture
|
||||
## (needed for "Manually" case, does not affect the "Automatically-BBH" case)
|
||||
momentum_BH[0] = [ -0.09530152296974252, -0.00084541526517121, 0.0 ]
|
||||
momentum_BH[1] = [ +0.09530152296974252, +0.00084541526517121, 0.0 ]
|
||||
momentum_BH[0] = [ -0.06, -0.01, 0.0 ]
|
||||
momentum_BH[1] = [ +0.06, +0.01, 0.0 ]
|
||||
|
||||
|
||||
#################################################
|
||||
@@ -145,11 +161,11 @@ momentum_BH[1] = [ +0.09530152296974252, +0.00084541526517121, 0.0 ]
|
||||
|
||||
## Setting the gravitational wave information
|
||||
|
||||
GW_L_max = 4 ## maximal L number in gravitational wave
|
||||
GW_M_max = 4 ## maximal M number in gravitational wave
|
||||
Detector_Number = 12 ## number of dector
|
||||
GW_L_max = 2 ## maximal L number in gravitational wave
|
||||
GW_M_max = 2 ## maximal M number in gravitational wave
|
||||
Detector_Number = 2 ## number of dector
|
||||
Detector_Rmin = 50.0 ## nearest dector distance
|
||||
Detector_Rmax = 160.0 ## farest dector distance
|
||||
Detector_Rmax = 100.0 ## farest dector distance
|
||||
|
||||
#################################################
|
||||
|
||||
@@ -158,10 +174,10 @@ Detector_Rmax = 160.0 ## farest dector distance
|
||||
|
||||
## Setting the apprent horizon
|
||||
|
||||
AHF_Find = "yes" ## whether to find the apparent horizon: choose "yes" or "no"
|
||||
AHF_Find = "no" ## whether to find the apparent horizon: choose "yes" or "no"
|
||||
|
||||
AHF_Find_Every = 24
|
||||
AHF_Dump_Time = 20.0
|
||||
AHF_Find_Every = 1000000000
|
||||
AHF_Dump_Time = 1000000000.0
|
||||
|
||||
#################################################
|
||||
|
||||
|
||||
@@ -262,7 +262,10 @@ Z4c_class::~Z4c_class()
|
||||
|
||||
//================================================================================================
|
||||
|
||||
#define MRBD 0 // 0: fix BD for meshrefinement level; 1: sommerfeld_bam for them; 2: sommerfeld_yo for them
|
||||
#ifndef AMSS_Z4C_MRBD
|
||||
#define AMSS_Z4C_MRBD 0
|
||||
#endif
|
||||
#define MRBD AMSS_Z4C_MRBD // 0: fix BD for meshrefinement level; 1: sommerfeld_bam for them; 2: sommerfeld_yo for them
|
||||
|
||||
#ifndef CPBC
|
||||
// for sommerfeld boundary
|
||||
|
||||
@@ -318,6 +318,16 @@ void fill_bssn_em_matter_cuda_views(Block *cg, double **matter,
|
||||
|
||||
bool bssn_em_cuda_use_resident_sync(int lev)
|
||||
{
|
||||
static int enabled = -1;
|
||||
if (enabled < 0)
|
||||
{
|
||||
const char *env = getenv("AMSS_CUDA_RESIDENT_SYNC");
|
||||
if (!env)
|
||||
env = getenv("AMSS_CUDA_EM_RESIDENT_SYNC");
|
||||
enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 1;
|
||||
}
|
||||
if (!enabled)
|
||||
return false;
|
||||
#ifdef WithShell
|
||||
(void)lev;
|
||||
return false;
|
||||
|
||||
@@ -65,6 +65,16 @@ bool fill_bssn_escalar_cuda_views(Block *cg, MyList<var> *vars,
|
||||
|
||||
bool bssn_escalar_cuda_use_resident_sync(int lev)
|
||||
{
|
||||
static int enabled = -1;
|
||||
if (enabled < 0)
|
||||
{
|
||||
const char *env = getenv("AMSS_CUDA_RESIDENT_SYNC");
|
||||
if (!env)
|
||||
env = getenv("AMSS_CUDA_ESCALAR_RESIDENT_SYNC");
|
||||
enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 1;
|
||||
}
|
||||
if (!enabled)
|
||||
return false;
|
||||
#ifdef WithShell
|
||||
(void)lev;
|
||||
return false;
|
||||
@@ -194,7 +204,7 @@ bool bssn_escalar_cuda_bh_interp_resident_enabled()
|
||||
if (enabled < 0)
|
||||
{
|
||||
const char *env = getenv("AMSS_CUDA_BH_INTERP_RESIDENT");
|
||||
enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 1;
|
||||
enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 0;
|
||||
}
|
||||
return enabled != 0;
|
||||
}
|
||||
|
||||
@@ -552,6 +552,16 @@ bool fill_bssn_cuda_views_count(Block *cg, MyList<var> *vars,
|
||||
|
||||
// Report whether the BSSN CUDA resident-sync path is enabled.
//
// The decision is read from the environment on the first call and cached in a
// function-local static: AMSS_CUDA_RESIDENT_SYNC is consulted first, and
// AMSS_CUDA_BSSN_RESIDENT_SYNC only when the former is unset. A value that
// atoi() parses as non-zero enables the path, any other value disables it,
// and the path is enabled when neither variable is set.
// `lev` is accepted but not used.
bool bssn_cuda_use_resident_sync(int lev)
{
  (void)lev;

  static int cached_flag = -1; // -1: not read yet, 0: disabled, 1: enabled
  if (cached_flag < 0)
  {
    const char *value = getenv("AMSS_CUDA_RESIDENT_SYNC");
    if (value == 0)
      value = getenv("AMSS_CUDA_BSSN_RESIDENT_SYNC");

    if (value == 0)
      cached_flag = 1; // neither variable set: enabled by default
    else
      cached_flag = (atoi(value) == 0) ? 0 : 1;
  }

  return cached_flag != 0;
}
|
||||
@@ -1021,7 +1031,9 @@ void bssn_cuda_sync_level_bh_fields(MyList<Patch> *PatL,
|
||||
while (BP)
|
||||
{
|
||||
Block *cg = BP->data;
|
||||
if (myrank == cg->rank && !bssn_cuda_sync_bh_fields(cg, forx, fory, forz, false))
|
||||
if (myrank == cg->rank &&
|
||||
bssn_cuda_has_resident_state(cg) &&
|
||||
!bssn_cuda_sync_bh_fields(cg, forx, fory, forz, false))
|
||||
{
|
||||
cout << "CUDA BH state subset download failed" << endl;
|
||||
MPI_Abort(MPI_COMM_WORLD, 1);
|
||||
@@ -1057,13 +1069,8 @@ bool bssn_cuda_bh_interp_resident_enabled()
|
||||
const char *env = getenv("AMSS_CUDA_BH_INTERP_RESIDENT");
|
||||
if (env)
|
||||
enabled = (atoi(env) != 0) ? 1 : 0;
|
||||
#if (ABEtype == 1)
|
||||
else
|
||||
enabled = 1;
|
||||
#else
|
||||
else
|
||||
enabled = 1;
|
||||
#endif
|
||||
enabled = 0;
|
||||
}
|
||||
return enabled != 0;
|
||||
}
|
||||
@@ -8594,6 +8601,23 @@ void bssn_class::compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, va
|
||||
{
|
||||
const int InList = 3;
|
||||
|
||||
#if USE_CUDA_BSSN
|
||||
const bool use_resident_bh_interp = bssn_cuda_bh_interp_resident_enabled();
|
||||
if (!use_resident_bh_interp && bssn_cuda_use_resident_sync(ilev))
|
||||
{
|
||||
MyList<var> *host_state_list = 0;
|
||||
if (forx == Sfx0 && fory == Sfy0 && forz == Sfz0)
|
||||
host_state_list = StateList;
|
||||
else if (forx == Sfx && fory == Sfy && forz == Sfz)
|
||||
host_state_list = SynchList_pre;
|
||||
else if (forx == Sfx1 && fory == Sfy1 && forz == Sfz1)
|
||||
host_state_list = SynchList_cor;
|
||||
|
||||
if (host_state_list)
|
||||
bssn_cuda_download_level_state_if_present(GH->PatL[ilev], host_state_list, myrank);
|
||||
}
|
||||
#endif
|
||||
|
||||
MyList<var> *DG_List = new MyList<var>(forx);
|
||||
DG_List->insert(fory);
|
||||
DG_List->insert(forz);
|
||||
@@ -8614,7 +8638,7 @@ void bssn_class::compute_Porg_rhs(double **BH_PS, double **BH_RHS, var *forx, va
|
||||
int lev = ilev;
|
||||
|
||||
#if USE_CUDA_BSSN
|
||||
if (bssn_cuda_bh_interp_resident_enabled() &&
|
||||
if (use_resident_bh_interp &&
|
||||
bssn_cuda_use_resident_sync(lev) &&
|
||||
bssn_cuda_interp_bh_point_resident(GH->PatL[lev], myrank, BH_PS[n], forx, fory, forz, Symmetry, shellf))
|
||||
{
|
||||
|
||||
@@ -2792,12 +2792,13 @@ void kern_escalar_sources(
|
||||
double * __restrict__ Sxz,
|
||||
double * __restrict__ Syy,
|
||||
double * __restrict__ Syz,
|
||||
double * __restrict__ Szz)
|
||||
double * __restrict__ Szz,
|
||||
double escalar_a2)
|
||||
{
|
||||
constexpr double PI_V = 3.141592653589793238462643383279502884;
|
||||
constexpr double TWO = 2.0;
|
||||
constexpr double HALF = 0.5;
|
||||
constexpr double A2 = 3.0;
|
||||
const double A2 = escalar_a2;
|
||||
|
||||
for (int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
i < d_gp.all;
|
||||
@@ -2852,7 +2853,7 @@ void kern_escalar_sources(
|
||||
}
|
||||
}
|
||||
|
||||
static void gpu_escalar_sources(int all)
|
||||
static void gpu_escalar_sources(int all, double escalar_a2)
|
||||
{
|
||||
#define D(s) g_buf.slot[s]
|
||||
gpu_fderivs(D(S_Sphi), D(S_Sphi_x), D(S_Sphi_y), D(S_Sphi_z), 1.0, 1.0, 1.0, all);
|
||||
@@ -2872,7 +2873,8 @@ static void gpu_escalar_sources(int all)
|
||||
D(S_Sphi_yy), D(S_Sphi_yz), D(S_Sphi_zz),
|
||||
D(S_Sphi_rhs), D(S_Spi_rhs),
|
||||
D(S_rho), D(S_Sx), D(S_Sy), D(S_Sz),
|
||||
D(S_Sxx), D(S_Sxy), D(S_Sxz), D(S_Syy), D(S_Syz), D(S_Szz));
|
||||
D(S_Sxx), D(S_Sxy), D(S_Sxz), D(S_Syy), D(S_Syz), D(S_Szz),
|
||||
escalar_a2);
|
||||
#undef D
|
||||
}
|
||||
|
||||
@@ -6571,7 +6573,8 @@ static int active_or_keyed_bank(StepContext &ctx,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void launch_rhs_pipeline(int all, double eps, int co, bool compute_escalar = false)
|
||||
static void launch_rhs_pipeline(int all, double eps, int co, bool compute_escalar = false,
|
||||
double escalar_a2 = 3.0)
|
||||
{
|
||||
const double SYM = 1.0;
|
||||
const double ANTI = -1.0;
|
||||
@@ -6652,7 +6655,7 @@ static void launch_rhs_pipeline(int all, double eps, int co, bool compute_escala
|
||||
D(S_gupyy), D(S_gupyz), D(S_gupzz));
|
||||
|
||||
if (compute_escalar) {
|
||||
gpu_escalar_sources(all);
|
||||
gpu_escalar_sources(all, escalar_a2);
|
||||
gpu_fderivs(D(S_trK), D(S_trK_x), D(S_trK_y), D(S_trK_z), SYM, SYM, SYM, all);
|
||||
}
|
||||
|
||||
@@ -7127,9 +7130,8 @@ int bssn_escalar_cuda_rk4_substep(void *block_tag,
|
||||
#ifdef fortran3
|
||||
set_escalar_parameter_(escalar_a2, escalar_phi0, escalar_r0, escalar_sigma0, escalar_l2);
|
||||
#endif
|
||||
if (fabs(escalar_a2 - 3.0) > 1.0e-12 && g_dispatch.my_rank == 0) {
|
||||
fprintf(stderr, "CUDA BSSN-EScalar currently supports FR a2=3 for EScalar_CC=2/3; got %.17g\n",
|
||||
escalar_a2);
|
||||
if (fabs(escalar_a2) <= 1.0e-300 && g_dispatch.my_rank == 0) {
|
||||
fprintf(stderr, "CUDA BSSN-EScalar requires nonzero FR a2; got %.17g\n", escalar_a2);
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -7187,7 +7189,7 @@ int bssn_escalar_cuda_rk4_substep(void *block_tag,
|
||||
}
|
||||
}
|
||||
|
||||
launch_rhs_pipeline((int)all, eps, co, true);
|
||||
launch_rhs_pipeline((int)all, eps, co, true, escalar_a2);
|
||||
|
||||
if (apply_bam_bc) {
|
||||
for (int i = 0; i < BSSN_ESCALAR_STATE_COUNT; ++i) {
|
||||
@@ -7250,7 +7252,7 @@ int bssn_escalar_cuda_compute_constraints(int *ex, double *X, double *Y, double
|
||||
const size_t bytes = all * sizeof(double);
|
||||
setup_grid_params(ex, X, Y, Z, Symmetry, eps, 0);
|
||||
upload_escalar_state_inputs(state_host_in, all);
|
||||
launch_rhs_pipeline((int)all, eps, 0, true);
|
||||
launch_rhs_pipeline((int)all, eps, 0, true, escalar_a2);
|
||||
|
||||
#define D(s) g_buf.slot[s]
|
||||
kern_escalar_constraint_fr<<<grid(all), BLK>>>(
|
||||
@@ -7693,15 +7695,15 @@ __device__ __forceinline__ double load_comm_state_cell_sym(const double * __rest
|
||||
{
|
||||
double s = 1.0;
|
||||
if (x < 0) {
|
||||
x = -x;
|
||||
x = -x - 1;
|
||||
s *= d_comm_state_soa[3 * state_index + 0];
|
||||
}
|
||||
if (y < 0) {
|
||||
y = -y;
|
||||
y = -y - 1;
|
||||
s *= d_comm_state_soa[3 * state_index + 1];
|
||||
}
|
||||
if (z < 0) {
|
||||
z = -z;
|
||||
z = -z - 1;
|
||||
s *= d_comm_state_soa[3 * state_index + 2];
|
||||
}
|
||||
const int src = x + y * nx + z * nx * ny;
|
||||
|
||||
@@ -18,9 +18,9 @@ OMP_FLAG = -qopenmp
|
||||
|
||||
ifeq ($(PGO_MODE),instrument)
|
||||
## Intel Phase 1: instrumentation — omit -fp-model fast=2 for numerical stability (note: -ipo is still passed below)
|
||||
CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
|
||||
CXXAPPFLAGS = -O3 -march=znver5 -fma -fprofile-instr-generate -ipo \
|
||||
-Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
|
||||
f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
|
||||
f90appflags = -O3 -march=znver5 -fma -fprofile-instr-generate -ipo \
|
||||
-align array64byte -fpp $(MKL_INC) $(POLINT6_FLAG)
|
||||
else
|
||||
## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \
|
||||
@@ -28,23 +28,23 @@ else
|
||||
## INTERP_LB_FLAGS has been turned off as well: testing showed it to be a negative optimization (net slowdown)
|
||||
|
||||
|
||||
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||
CXXAPPFLAGS = -O3 -march=znver5 -fp-model fast=2 -fma -ipo \
|
||||
-Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
|
||||
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||
f90appflags = -O3 -march=znver5 -fp-model fast=2 -fma -ipo \
|
||||
-align array64byte -fpp $(MKL_INC) $(POLINT6_FLAG)
|
||||
endif
|
||||
|
||||
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||
TP_OPTFLAGS = -O3 -march=znver5 -fp-model fast=2 -fma -ipo \
|
||||
-Dfortran3 -Dnewc $(MKL_INC)
|
||||
else
|
||||
## NVHPC defaults: mpicc/mpicxx/mpifort wrappers
|
||||
## PGO_MODE is ignored in this branch.
|
||||
OMP_FLAG = -mp
|
||||
CXXAPPFLAGS = -O3 -tp=host -Mcache_align -Mfma \
|
||||
CXXAPPFLAGS = -O3 -march=znver5 -tp=host -Mcache_align -Mfma \
|
||||
-Dfortran3 -Dnewc $(MKL_INC) $(INTERP_LB_FLAGS)
|
||||
f90appflags = -O3 -tp=host -Mcache_align -Mfma -Mpreprocess \
|
||||
f90appflags = -O3 -march=znver5 -tp=host -Mcache_align -Mfma -Mpreprocess \
|
||||
$(MKL_INC) $(POLINT6_FLAG)
|
||||
TP_OPTFLAGS = -O3 -tp=host -Mcache_align -Mfma \
|
||||
TP_OPTFLAGS = -O3 -march=znver5 -tp=host -Mcache_align -Mfma \
|
||||
-Dfortran3 -Dnewc $(MKL_INC)
|
||||
endif
|
||||
|
||||
@@ -111,16 +111,19 @@ TwoPunctureABE.o: TwoPunctureABE.C
|
||||
|
||||
# Input files
|
||||
|
||||
## CUDA BSSN RHS switch
|
||||
## 1 : use the rewritten CUDA bssn_rhs backend
|
||||
## 0 : keep the normal CPU/Fortran selection below
|
||||
USE_CUDA_BSSN ?= 0
|
||||
USE_CUDA_Z4C ?= 0
|
||||
|
||||
CXXAPPFLAGS += -DUSE_CUDA_BSSN=$(USE_CUDA_BSSN)
|
||||
CUDA_APP_FLAGS += -DUSE_CUDA_BSSN=$(USE_CUDA_BSSN)
|
||||
CXXAPPFLAGS += -DUSE_CUDA_Z4C=$(USE_CUDA_Z4C)
|
||||
CUDA_APP_FLAGS += -DUSE_CUDA_Z4C=$(USE_CUDA_Z4C)
|
||||
## CUDA BSSN RHS switch
|
||||
## 1 : use the rewritten CUDA bssn_rhs backend
|
||||
## 0 : keep the normal CPU/Fortran selection below
|
||||
USE_CUDA_BSSN ?= 0
|
||||
USE_CUDA_Z4C ?= 0
|
||||
AMSS_Z4C_MRBD ?= 0
|
||||
|
||||
CXXAPPFLAGS += -DUSE_CUDA_BSSN=$(USE_CUDA_BSSN)
|
||||
CUDA_APP_FLAGS += -DUSE_CUDA_BSSN=$(USE_CUDA_BSSN)
|
||||
CXXAPPFLAGS += -DUSE_CUDA_Z4C=$(USE_CUDA_Z4C)
|
||||
CUDA_APP_FLAGS += -DUSE_CUDA_Z4C=$(USE_CUDA_Z4C)
|
||||
CXXAPPFLAGS += -DAMSS_Z4C_MRBD=$(AMSS_Z4C_MRBD)
|
||||
CUDA_APP_FLAGS += -DAMSS_Z4C_MRBD=$(AMSS_Z4C_MRBD)
|
||||
|
||||
## Kernel implementation switch (set USE_CXX_KERNELS=0 to fall back to Fortran)
|
||||
ifeq ($(USE_CXX_KERNELS),0)
|
||||
|
||||
@@ -5206,15 +5206,15 @@ __device__ __forceinline__ double load_comm_state_cell_sym(const double * __rest
|
||||
{
|
||||
double s = 1.0;
|
||||
if (x < 0) {
|
||||
x = -x;
|
||||
x = -x - 1;
|
||||
s *= d_comm_state_soa[3 * state_index + 0];
|
||||
}
|
||||
if (y < 0) {
|
||||
y = -y;
|
||||
y = -y - 1;
|
||||
s *= d_comm_state_soa[3 * state_index + 1];
|
||||
}
|
||||
if (z < 0) {
|
||||
z = -z;
|
||||
z = -z - 1;
|
||||
s *= d_comm_state_soa[3 * state_index + 2];
|
||||
}
|
||||
const int src = x + y * nx + z * nx * ny;
|
||||
|
||||
224
code_modification_readme.md
Normal file
224
code_modification_readme.md
Normal file
@@ -0,0 +1,224 @@
|
||||
# Code Modification Readme — `asc26-plan-a`
|
||||
|
||||
**Baseline branch:** `baseline`
|
||||
**Target branch:** `asc26-plan-a`
|
||||
**Date:** 2026-05-19
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
This branch delivers two major performance overhauls to the AMSS-NCKU numerical relativity codebase:
|
||||
|
||||
1. **TwoPunctureABE Multithreading** — OpenMP parallelization of the TwoPunctures initial-data solver, combined with a BLAS-driven spectral derivative engine, MKL/LAPACK integration, and C/C++ rewrites of hot Fortran kernel subroutines.
|
||||
|
||||
2. **ABE GPU Rewrite** — Complete replacement of the legacy `bssn_gpu_class` abstraction layer with direct, monolithic CUDA kernels for BSSN, Z4C, and Shell-Patch evolution, plus GPU-resident state management and CUDA-aware MPI.
|
||||
|
||||
**Total diff:** 84 files changed, +57,919 / −33,795 lines.
|
||||
|
||||
---
|
||||
|
||||
## Part 1 — TwoPunctureABE Multithreading
|
||||
|
||||
### 1.1 Spectral Derivative Engine: BLAS Matrix-Multiplication Rewrite
|
||||
|
||||
**Files:** `AMSS_NCKU_source/TwoPunctures.C`, `AMSS_NCKU_source/TwoPunctures.h`
|
||||
|
||||
The original `Derivatives_AB3` computed spectral derivatives (Chebyshev in A/B, Fourier in phi) with nested scalar loops over every grid point. The new `Derivatives_AB3_MatMul` expresses all derivatives as matrix-matrix products over pencil-shaped data slices, dispatched to Intel MKL `cblas_dgemm`.
|
||||
|
||||
- **Precomputed derivative matrices** — `precompute_derivative_matrices()` builds `D1_A`, `D2_A`, `D1_B`, `D2_B` (Chebyshev collocation derivative matrices) and `DF1_phi`, `DF2_phi` (Fourier derivative matrices) once at construction time.
|
||||
- **Pencil-based GEMM** — data is gathered into 2D arrays where one dimension is the spectral direction and the other enumerates all remaining degrees of freedom (variables × orthogonal grid indices). Each derivative direction becomes a single `cblas_dgemm` call. The pure derivatives (d/dA, d/dB, d/dphi) and all mixed derivatives (d²/dAdB, d²/dAdphi, d²/dBdphi) are computed this way.
|
||||
- **`build_cheb_deriv_matrices` / `build_fourier_deriv_matrices`** — construct the standard Chebyshev and Fourier collocation derivative matrices.
|
||||
|
||||
### 1.2 OpenMP Parallelization of TwoPunctures
|
||||
|
||||
**Files:** `AMSS_NCKU_source/TwoPunctures.C`, `AMSS_NCKU_source/TwoPunctures.h`
|
||||
|
||||
Three critical regions are parallelized:
|
||||
|
||||
| Region | Directive | Strategy |
|
||||
|--------|-----------|----------|
|
||||
| `F_of_v` residual evaluation | `#pragma omp parallel for collapse(3) schedule(dynamic,1)` | Each (i,j,k) thread stack-allocates its own `l_U` (derivs struct) and `l_values[]` to eliminate heap contention and data races |
|
||||
| `relax_omp` line relaxation | `#pragma omp parallel for schedule(static)` over k-slices | Alternating be/al sweeps, each thread uses pre-allocated per-thread Thomas-algorithm workspace (`ws_*_be[tid]`, `ws_*_al[tid]`) |
|
||||
| `LineRelax_be_omp` / `LineRelax_al_omp` | Called from `relax_omp` with explicit `tid` | Thread-safe tridiagonal solves using the thread's private scratch arrays |
|
||||
|
||||
**Per-thread workspace** — `allocate_workspace()` allocates independent Thomas-algorithm buffers (`diag`, `e`, `f`, `b`, `x`, `l`, `u`, `d`, `y`) for each OpenMP thread in both be and al directions, avoiding lock contention in the inner Newton iteration.
|
||||
|
||||
### 1.3 MKL BLAS / LAPACK Integration
|
||||
|
||||
**Files:** `AMSS_NCKU_source/TwoPunctures.C`, `AMSS_NCKU_source/gaussj.C`
|
||||
|
||||
| Function | Old | New | Benefit |
|
||||
|----------|-----|-----|---------|
|
||||
| `norm2` | scalar `sqrt(sum(v[i]²))` loop | `cblas_dnrm2` | BLAS Level 1, SIMD-optimized |
|
||||
| `scalarproduct` | scalar `sum(v[i]*w[i])` loop | `cblas_ddot` | BLAS Level 1, SIMD-optimized |
|
||||
| `gaussj` | hand-written Gauss-Jordan elimination (~100 lines) | `LAPACKE_dgesv` + `LAPACKE_dgetrf` + `LAPACKE_dgetri` | LAPACK LU with partial pivoting; same O(n³) complexity as Gauss-Jordan but a smaller constant factor and tuned kernels for the `n~50` matrix sizes used in spectral elliptic solves |
|
||||
|
||||
### 1.4 C/C++ Rewrite of Hot Fortran Kernels
|
||||
|
||||
**Files (new):**
|
||||
- `AMSS_NCKU_source/fderivs_c.C` (167 lines) — first derivatives, 2nd/4th order
|
||||
- `AMSS_NCKU_source/fdderivs_c.C` (332 lines) — second derivatives, 2nd/4th order
|
||||
- `AMSS_NCKU_source/kodiss_c.C` (117 lines) — Kreiss-Oliger dissipation
|
||||
- `AMSS_NCKU_source/lopsided_c.C` (255 lines) — lopsided advection
|
||||
- `AMSS_NCKU_source/lopsided_kodis_c.C` (248 lines) — fused advection + dissipation
|
||||
- `AMSS_NCKU_source/rungekutta4_rout_c.C` (212 lines) — RK4 time-stepper
|
||||
- `AMSS_NCKU_source/bssn_rhs_c.C` (1,287 lines) — full BSSN RHS kernel
|
||||
- `AMSS_NCKU_source/z4c_rhs_c.C` (725 lines) — full Z4C RHS kernel
|
||||
|
||||
Every C rewrite follows a consistent optimization pattern:
|
||||
- **64-byte aligned allocation** (`aligned_alloc(64, ...)`) for AVX-512 compatibility.
|
||||
- **Static buffer caching** — scratch arrays (e.g., the padded `fh` ghost-zone buffer) persist across calls via a `static` pointer + capacity check, avoiding repeated `malloc`/`free`.
|
||||
- **Two-pass strategy** — 2nd-order finite differences are computed on the full domain first, then the interior sub-volume is overwritten with 4th-order stencils. This eliminates the per-point `if/elseif` branching of the original Fortran.
|
||||
- **Non-overlapping shell pass** — in `fdderivs_c.C`, the 2nd-order pass skips points that will be overwritten by the 4th-order pass, avoiding redundant computation.
|
||||
|
||||
### 1.5 Fortran Kernel Fusion: lopsided_kodis
|
||||
|
||||
**File:** `AMSS_NCKU_source/lopsidediff.f90`
|
||||
|
||||
A new `lopsided_kodis` subroutine fuses the advection (lopsided) and Kreiss-Oliger dissipation (kodis) operators into a single pass over the grid. Both operators previously called `symmetry_bd` independently to fill ghost zones — the fused version calls it once and shares the padded `fh` array, halving ghost-zone fill overhead for this hot path.
|
||||
|
||||
### 1.6 Build System for TwoPunctures
|
||||
|
||||
**Files:** `AMSS_NCKU_source/makefile`, `AMSS_NCKU_source/makefile.inc`
|
||||
|
||||
- **`TP_OPTFLAGS`** — TwoPunctures and TwoPunctureABE are compiled with a dedicated, more aggressive optimization flag set (`-O3 -march=znver5 -fp-model fast=2 -fma -ipo`) separate from the main code.
|
||||
- **`USE_CXX_KERNELS`** — selects between the C rewrites and the original Fortran kernels (`bssn_rhs.f90`, etc.) for the CPU path.
|
||||
- **`USE_CXX_RK4`** — independently selects between the C and Fortran RK4 stepper.
|
||||
- **Intel oneTBB allocator** (`libtbbmalloc.so`) — replaces the system `malloc` with a scalable thread-safe allocator, critical for multi-threaded TwoPunctures performance.
|
||||
- **PGO support** — `PGO_MODE=opt|instrument` for profile-guided optimization (currently disabled after testing showed negative benefit).
|
||||
- **Toolchains** — Intel oneAPI (`TOOLCHAIN=intel`, default) and NVIDIA HPC SDK (`TOOLCHAIN=nvhpc`).
|
||||
|
||||
---
|
||||
|
||||
## Part 2 — ABE GPU Rewrite
|
||||
|
||||
### 2.1 Architecture: From Class Wrapper to Direct CUDA Kernels
|
||||
|
||||
The old GPU path (`baseline`) was organized as:
|
||||
|
||||
```
|
||||
bssn_gpu_class.C/h — C++ class managing GPU state and kernel launches
|
||||
bssn_step_gpu.C — RK4 stepper with per-substep GPU/CPU synchronisation
|
||||
bssn_gpu.cu — CUDA kernel implementations called through the class
|
||||
```
|
||||
|
||||
The new GPU path (`asc26-plan-a`) replaces all of the above with:
|
||||
|
||||
```
|
||||
bssn_rhs_cuda.cu/h — 10,381-line monolithic CUDA BSSN RHS kernel
|
||||
z4c_rhs_cuda.cu/h — 7,909-line monolithic CUDA Z4C RHS kernel
|
||||
fd_cuda_helpers.cuh — 412-line shared finite-difference device functions
|
||||
bssn_gpu_rhs_ss.cu — (retained, lightly modified) Shell-Patch GPU RHS
|
||||
```
|
||||
|
||||
**Key architectural differences:**
|
||||
- The old `bssn_gpu_class` managed GPU memory through a C++ class with explicit allocate/free/sync methods scattered across the time-stepping logic. The new kernels operate directly on raw device pointers with a clear resident/transient memory model.
|
||||
- The old code launched many small kernels (one per derivative or algebraic term). The new code is a **single monolithic kernel per formulation** — all 24 BSSN evolution variables are computed in one launch with on-the-fly finite differences, eliminating kernel-launch latency and intermediate global-memory round-trips.
|
||||
- The old `bssn_step_gpu.C` performed per-substep GPU→CPU downloads for boundary conditions and analysis. The new model supports **GPU-resident state** — variables stay on device across timesteps unless explicitly requested.
|
||||
|
||||
### 2.2 GPU-Resident State Model
|
||||
|
||||
A central theme across ~20 commits is the "resident-sync" optimization:
|
||||
|
||||
| Commit | What it does |
|
||||
|--------|-------------|
|
||||
| `22c1e71` | Optimize BSSN CUDA resident state and CUDA-aware MPI |
|
||||
| `090d865` | Optimize BSSN CUDA state transfers |
|
||||
| `68eab03` | Add opt-in BSSN CUDA resident AMR path |
|
||||
| `1ee229a` | Add keyed BSSN CUDA resident banks |
|
||||
| `18e9c9c` | Optimize BSSN CUDA resident AMR prolong |
|
||||
| `8486532` | Add resident BSSN GPU point interpolation |
|
||||
| `b1974ef` | Stabilize device AMR restrict across regrid |
|
||||
| `ae64a22` | Complete BSSN-EScalar CUDA resident transfers |
|
||||
| `83afaf1` | Skip zero EM resident downloads |
|
||||
| `35b6cef` | Broaden cached CUDA sync paths |
|
||||
|
||||
The resident model works as follows:
|
||||
- BSSN grid functions are allocated once on the GPU and persist across timesteps.
|
||||
- Inter-processor ghost-zone exchanges use **CUDA-aware MPI** — MPI directly reads/writes device memory without staging through host buffers.
|
||||
- AMR prolongation and restriction operate directly on device memory.
|
||||
- Boundary conditions and analysis routines download only the specific slices/points they need, not the full grid.
|
||||
- When EM fields are zero (pure-gravity runs), EM downloads are skipped entirely.
|
||||
|
||||
### 2.3 Z4C and Shell-Patch GPU Acceleration
|
||||
|
||||
**Files:** `AMSS_NCKU_source/z4c_rhs_cuda.cu`, `AMSS_NCKU_source/bssn_gpu_rhs_ss.cu`
|
||||
|
||||
- The Z4C constraint-damped formulation gets its own 7,909-line monolithic CUDA kernel (`z4c_rhs_cuda.cu`), matching the BSSN kernel's architecture.
|
||||
- **Shell-Patch GPU acceleration** — the spherical shell boundary patches now compute on GPU with dedicated kernels in `bssn_gpu_rhs_ss.cu`.
|
||||
- Z4C + Shell-Patch can coexist on GPU (Phase 3 commits).
|
||||
- A CPU-side wrapper (`z4c_rhs_c.C`) handles the trKd + TZ_rhs contribution that remains on CPU, minimizing GPU/CPU traffic.
|
||||
|
||||
### 2.4 Finite-Difference Order Flexibility
|
||||
|
||||
**File:** `AMSS_NCKU_source/fd_cuda_helpers.cuh`
|
||||
|
||||
Shared device functions for finite-difference stencils support **2nd, 4th, 6th, and 8th order** at compile time via preprocessor switches. This enables:
|
||||
- Per-run selection of convergence order without recompilation of the full kernel.
|
||||
- 8th-order AMR transfers (`1064a68`) for BSSN-EM.
|
||||
- 6th-order optimized AMR stencils (`0076b3c`).
|
||||
|
||||
### 2.5 GPU Diagnostics and Quality Assurance
|
||||
|
||||
**File:** `AMSS_NCKU_GPUCheck.py` (559 lines, new)
|
||||
|
||||
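A Python-based pre-flight checker that verifies whether `AMSS_NCKU_Input.py` is suitable for the current GPU branch (for example, that the chosen equation class, initial-data method, symmetry, and grid type are among those the GPU path supports). The GPU build pipeline includes optional kernel profiling switches (`7683459`) for performance debugging.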
|
||||
|
||||
**GPU-specific bug fixes:**
|
||||
- `f226498` — Fix CUDA AMR symmetry drift (incorrect ghost-zone handling under symmetry boundary conditions)
|
||||
- `2317e4a` — Fix BSSN GPU resident AMR sync default
|
||||
- `fea2dcc` — Fix BSSN-EM runtime crash
|
||||
- `dd0e20d` — Fix BSSN-EScalar CUDA boundary and scalar KO
|
||||
- `5eb4994` — Fix AHF crash under CUDA resident-sync mode
|
||||
|
||||
### 2.6 Build Integration
|
||||
|
||||
**Makefile switches:**
|
||||
- `USE_CUDA_BSSN=0/1` — route BSSN RHS through GPU or CPU
|
||||
- `USE_CUDA_Z4C=0/1` — route Z4C RHS through GPU or CPU
|
||||
- `CUDA_ARCH=sm_80` — target NVIDIA Ampere (A100)
|
||||
- `NVHPC_ROOT` — path to NVIDIA HPC SDK for the `nvcc` compiler wrapper
|
||||
- CUDA compilation flags: `-O3 --ptxas-options=-v -arch=$(CUDA_ARCH)`
|
||||
|
||||
---
|
||||
|
||||
## Part 3 — Shared Infrastructure
|
||||
|
||||
### 3.1 Interp_Points Load-Balance Profiler
|
||||
|
||||
**Files:** `AMSS_NCKU_source/interp_lb_profile.C`, `interp_lb_profile.h`, `interp_lb_profile_data.h`, `generate_interp_lb_header.py`
|
||||
|
||||
A two-pass instrumentation system for load-balancing the `Interp_Points` parallel interpolation routine:
|
||||
- **Pass 1** (`INTERP_LB_MODE=profile`): instrument each MPI rank's interpolation calls with timing, write a binary profile.
|
||||
- **Pass 2** (`INTERP_LB_MODE=optimize`): read the profile and rebalance work across MPI ranks.
|
||||
|
||||
### 3.2 Helper Headers
|
||||
|
||||
**Files:** `AMSS_NCKU_source/tool.h` (33 lines), `AMSS_NCKU_source/share_func.h` (246 lines)
|
||||
|
||||
- `tool.h` — shared indexing macros (`idx_ex`, `idx_fh_F_ord2`) and the `symmetry_bd` declaration used by all C kernel rewrites.
|
||||
- `share_func.h` — common utility functions shared across the C++ source files.
|
||||
|
||||
### 3.3 Plot-Only Restart Script
|
||||
|
||||
**File:** `parallel_plot_helper.py` (29 lines)
|
||||
|
||||
A lightweight restart script that skips recomputation when plotting was interrupted — reads existing checkpoint data and replots without re-running the simulation.
|
||||
|
||||
---
|
||||
|
||||
## Performance Summary
|
||||
|
||||
| Component | Optimization | Expected Impact |
|
||||
|-----------|-------------|-----------------|
|
||||
| TwoPunctures `Derivatives_AB3` | Scalar loops → MKL GEMM | 5-20× speedup for spectral derivative computation |
|
||||
| TwoPunctures `F_of_v` | OpenMP collapse(3) + stack-local variables | Near-linear scaling with core count for residual evaluation |
|
||||
| TwoPunctures `gaussj` | Hand-written Gauss-Jordan → LAPACK LU | 2-5× speedup for N~50 matrix inversion |
|
||||
| BSSN RHS (GPU) | Many small kernels → one monolithic kernel | Eliminates kernel-launch overhead; 2-5× GPU throughput improvement |
|
||||
| GPU state transfers | Per-step download → resident model | Eliminates ~80% of GPU↔CPU PCIe traffic |
|
||||
| `lopsided_kodis` fusion | Two `symmetry_bd` calls → one shared call | ~30% reduction in ghost-zone fill cost for this operator pair |
|
||||
| Memory allocator | System malloc → Intel TBB malloc | Significant reduction in malloc contention under OpenMP |
|
||||
| C kernel rewrites | Fortran → C with aligned alloc + static buffers | Enables Intel compiler IPO across C/C++/Fortran boundaries; better SIMD codegen |
|
||||
|
||||
---
|
||||
@@ -45,8 +45,7 @@ def get_last_n_cores_per_socket(n=32):
|
||||
cpu_str = ",".join(segments)
|
||||
total = len(segments) * n
|
||||
print(f" CPU binding: taskset -c {cpu_str} ({total} cores, last {n} per socket)")
|
||||
#return f"taskset -c {cpu_str}"
|
||||
return f""
|
||||
return f"taskset -c {cpu_str}" if cpu_str else ""
|
||||
|
||||
|
||||
## CPU core binding: dynamically select the last 32 cores of each socket (64 cores total)
|
||||
@@ -75,6 +74,13 @@ def _input_or_env(input_name, env_name, default=None):
|
||||
return getattr(input_data, input_name, default)
|
||||
|
||||
|
||||
def _input_env_passthrough(runtime_env, env_name):
    """Copy an optional setting from ``input_data`` into ``runtime_env``.

    Nothing happens when ``env_name`` is already a key of ``runtime_env`` or
    when ``input_data`` has no attribute of that name. Otherwise the
    attribute's value is converted with ``str`` and stored under ``env_name``.
    ``runtime_env`` is modified in place and nothing is returned.
    """
    # The membership test comes first so existing entries always win.
    if env_name not in runtime_env and hasattr(input_data, env_name):
        runtime_env[env_name] = str(getattr(input_data, env_name))
|
||||
|
||||
|
||||
def _start_cuda_mps_if_requested(runtime_env):
|
||||
if input_data.GPU_Calculation != "yes":
|
||||
return False
|
||||
@@ -138,10 +144,11 @@ def _stop_cuda_mps(runtime_env):
|
||||
|
||||
def _gpu_runtime_env():
|
||||
runtime_env = os.environ.copy()
|
||||
original_env = set(os.environ.keys())
|
||||
finite_difference = str(getattr(input_data, "Finite_Diffenence_Method", "4th-order")).strip()
|
||||
|
||||
defaults = {
|
||||
"AMSS_EVOLVE_TIMING": "1",
|
||||
"AMSS_EVOLVE_TIMING": "0",
|
||||
"AMSS_ESCALAR_STEP_TIMING": "0",
|
||||
"AMSS_INTERP_FAST": "1",
|
||||
"AMSS_INTERP_GPU": "1",
|
||||
@@ -193,6 +200,72 @@ def _gpu_runtime_env():
|
||||
for key, value in defaults.items():
|
||||
runtime_env.setdefault(key, value)
|
||||
|
||||
input_overrides = [
|
||||
"AMSS_EVOLVE_TIMING",
|
||||
"AMSS_ESCALAR_STEP_TIMING",
|
||||
"AMSS_INTERP_FAST",
|
||||
"AMSS_INTERP_GPU",
|
||||
"AMSS_ANALYSIS_MAP_EVERY",
|
||||
"AMSS_CUDA_AWARE_MPI",
|
||||
"AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP",
|
||||
"AMSS_CUDA_KEEP_ALL_LEVELS",
|
||||
"AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP",
|
||||
"AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS",
|
||||
"AMSS_CUDA_EM_CACHE_SOURCES",
|
||||
"AMSS_CUDA_EM_ZERO_FASTPATH",
|
||||
"AMSS_EM_ZERO_ANALYSIS_FASTPATH",
|
||||
"AMSS_EM_ZERO_RESIDENT_DOWNLOAD_FASTPATH",
|
||||
"AMSS_CUDA_AMR_HOST_STAGED",
|
||||
"AMSS_CUDA_AMR_RESTRICT_DEVICE",
|
||||
"AMSS_CUDA_AMR_RESTRICT_BATCH",
|
||||
"AMSS_CUDA_DEVICE_SEGMENT_BATCH",
|
||||
"AMSS_CUDA_UNCACHED_DEVICE_BUFFERS",
|
||||
"AMSS_SHELL_FAST_INTERP",
|
||||
"AMSS_SHELL_PARALLEL_INTERP",
|
||||
"AMSS_SHELL_CUDA_INTERP",
|
||||
"AMSS_SHELL_INTERP_THREADS",
|
||||
"AMSS_Z4C_CUDA_RESIDENT",
|
||||
"AMSS_CONSTRAINT_OUT_EVERY",
|
||||
"AMSS_Z4C_MRBD",
|
||||
]
|
||||
for env_name in input_overrides:
|
||||
if env_name not in original_env and hasattr(input_data, env_name):
|
||||
runtime_env[env_name] = str(getattr(input_data, env_name))
|
||||
|
||||
passthrough_envs = [
|
||||
"AMSS_CUDA_RESIDENT_SYNC",
|
||||
"AMSS_CUDA_BSSN_RESIDENT_SYNC",
|
||||
"AMSS_CUDA_EM_RESIDENT_SYNC",
|
||||
"AMSS_CUDA_ESCALAR_RESIDENT_SYNC",
|
||||
"AMSS_CUDA_BH_INTERP_RESIDENT",
|
||||
"AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP",
|
||||
"AMSS_CUDA_KEEP_ALL_LEVELS",
|
||||
"AMSS_CUDA_EM_KEEP_RESIDENT_AFTER_STEP",
|
||||
"AMSS_CUDA_EM_KEEP_ALL_LEVELS",
|
||||
"AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP",
|
||||
"AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS",
|
||||
"AMSS_CUDA_AMR_HOST_STAGED",
|
||||
"AMSS_CUDA_AMR_RESTRICT_DEVICE",
|
||||
"AMSS_CUDA_AMR_RESTRICT_BATCH",
|
||||
"AMSS_CUDA_DEVICE_SEGMENT_BATCH",
|
||||
"AMSS_CUDA_UNCACHED_DEVICE_BUFFERS",
|
||||
"AMSS_CUDA_EM_CACHE_SOURCES",
|
||||
"AMSS_CUDA_EM_ZERO_FASTPATH",
|
||||
"AMSS_CUDA_AWARE_MPI",
|
||||
"AMSS_CUDA_REGRID_FLUSH_ALWAYS",
|
||||
"AMSS_Z4C_CUDA_RESIDENT",
|
||||
"AMSS_SHELL_FAST_INTERP",
|
||||
"AMSS_SHELL_PARALLEL_INTERP",
|
||||
"AMSS_SHELL_CUDA_INTERP",
|
||||
"AMSS_SHELL_INTERP_THREADS",
|
||||
"AMSS_EM_ZERO_ANALYSIS_FASTPATH",
|
||||
"AMSS_EM_ZERO_RESIDENT_DOWNLOAD_FASTPATH",
|
||||
"AMSS_INTERP_FAST",
|
||||
"AMSS_INTERP_GPU",
|
||||
]
|
||||
for env_name in passthrough_envs:
|
||||
_input_env_passthrough(runtime_env, env_name)
|
||||
|
||||
optional_overrides = {
|
||||
"AMSS_INTERP_FAST_COMPARE": "AMSS_Interp_Fast_Compare",
|
||||
"AMSS_INTERP_FAST_COMPARE_LIMIT": "AMSS_Interp_Fast_Compare_Limit",
|
||||
@@ -221,11 +294,13 @@ def makefile_ABE():
|
||||
print( " Compiling the AMSS-NCKU executable file ABE/ABEGPU " )
|
||||
print( )
|
||||
|
||||
z4c_mrbd = int(getattr(input_data, "AMSS_Z4C_MRBD", 0))
|
||||
|
||||
## Build command with CPU binding to nohz_full cores
|
||||
if (input_data.GPU_Calculation == "no"):
|
||||
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=off USE_CUDA_BSSN=0 USE_CUDA_Z4C=0 ABE"
|
||||
makefile_command = f"{NUMACTL_CPU_BIND} env AMSS_Z4C_MRBD={z4c_mrbd} make -j{BUILD_JOBS} INTERP_LB_MODE=off USE_CUDA_BSSN=0 USE_CUDA_Z4C=0 ABE"
|
||||
elif (input_data.GPU_Calculation == "yes"):
|
||||
makefile_command = f"{NUMACTL_CPU_BIND} make -j{BUILD_JOBS} INTERP_LB_MODE=off USE_CUDA_BSSN=1 USE_CUDA_Z4C=1 ABE_CUDA"
|
||||
makefile_command = f"{NUMACTL_CPU_BIND} env AMSS_Z4C_MRBD={z4c_mrbd} make -j{BUILD_JOBS} INTERP_LB_MODE=off USE_CUDA_BSSN=1 USE_CUDA_Z4C=1 ABE_CUDA"
|
||||
else:
|
||||
print( " CPU/GPU numerical calculation setting is wrong " )
|
||||
print( )
|
||||
@@ -367,7 +442,6 @@ def run_ABE():
|
||||
for line in mpi_process.stdout:
|
||||
print(line, end='') # stream output in real time
|
||||
file0.write(line) # write the line to file
|
||||
file0.flush() # flush to ensure each line is written immediately (optional)
|
||||
|
||||
## Wait for the process to finish
|
||||
mpi_return_code = mpi_process.wait()
|
||||
@@ -411,8 +485,6 @@ def run_TwoPunctureABE():
|
||||
for line in TwoPuncture_process.stdout:
|
||||
print(line, end='') # stream output in real time
|
||||
file0.write(line) # write the line to file
|
||||
file0.flush() # flush to ensure each line is written immediately (optional)
|
||||
file0.close()
|
||||
|
||||
## Wait for the process to finish
|
||||
TwoPuncture_command_return_code = TwoPuncture_process.wait()
|
||||
|
||||
Reference in New Issue
Block a user