Enable optimized GPU runs from Python launcher

This commit is contained in:
2026-04-30 18:31:31 +08:00
parent da4d56ccf7
commit e0d0673c8e

View File

@@ -9,6 +9,8 @@
import AMSS_NCKU_Input as input_data
import os
import shutil
import subprocess
import time
@@ -56,6 +58,111 @@ BUILD_JOBS = 64
##################################################################
def _truthy(value, default=False):
if value is None:
return default
if isinstance(value, bool):
return value
text = str(value).strip().lower()
if text == "":
return default
return text in ("1", "yes", "y", "true", "on", "enable", "enabled")
def _input_or_env(input_name, env_name, default=None):
if env_name in os.environ:
return os.environ[env_name]
return getattr(input_data, input_name, default)
def _start_cuda_mps_if_requested(runtime_env):
if input_data.GPU_Calculation != "yes":
return False
default_auto_mps = int(getattr(input_data, "MPI_processes", 1)) > 1
auto_mps = _truthy(
_input_or_env("CUDA_Auto_MPS", "AMSS_CUDA_AUTO_MPS", default_auto_mps),
default=default_auto_mps,
)
if not auto_mps:
return False
mps_control = shutil.which("nvidia-cuda-mps-control")
if not mps_control:
print(" CUDA MPS control command was not found; running without MPS.")
return False
uid = os.getuid()
pipe_dir = str(_input_or_env("CUDA_MPS_PIPE_DIRECTORY", "CUDA_MPS_PIPE_DIRECTORY",
f"/tmp/amss-ncku-mps-{uid}"))
log_dir = str(_input_or_env("CUDA_MPS_LOG_DIRECTORY", "CUDA_MPS_LOG_DIRECTORY",
f"/tmp/amss-ncku-mps-log-{uid}"))
os.makedirs(pipe_dir, exist_ok=True)
os.makedirs(log_dir, exist_ok=True)
mps_env = runtime_env.copy()
mps_env["CUDA_MPS_PIPE_DIRECTORY"] = pipe_dir
mps_env["CUDA_MPS_LOG_DIRECTORY"] = log_dir
if os.path.exists(os.path.join(pipe_dir, "control")):
runtime_env.update({
"CUDA_MPS_PIPE_DIRECTORY": pipe_dir,
"CUDA_MPS_LOG_DIRECTORY": log_dir,
})
print(f" Reusing CUDA MPS daemon: {pipe_dir}")
return False
print(f" Starting CUDA MPS daemon for this run: {pipe_dir}")
result = subprocess.run([mps_control, "-d"], env=mps_env, text=True,
stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
if result.returncode != 0:
print(" CUDA MPS daemon did not start; running without MPS.")
if result.stdout:
print(result.stdout, end="")
return False
runtime_env.update({
"CUDA_MPS_PIPE_DIRECTORY": pipe_dir,
"CUDA_MPS_LOG_DIRECTORY": log_dir,
})
return True
def _stop_cuda_mps(runtime_env):
mps_control = shutil.which("nvidia-cuda-mps-control")
if not mps_control:
return
subprocess.run([mps_control], input="quit\n", env=runtime_env, text=True,
stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
def _gpu_runtime_env():
runtime_env = os.environ.copy()
defaults = {
"AMSS_INTERP_FAST": "1",
"AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP": "1",
"AMSS_CUDA_KEEP_ALL_LEVELS": "1",
}
for key, value in defaults.items():
runtime_env.setdefault(key, value)
optional_overrides = {
"AMSS_INTERP_FAST_COMPARE": "AMSS_Interp_Fast_Compare",
"AMSS_INTERP_FAST_COMPARE_LIMIT": "AMSS_Interp_Fast_Compare_Limit",
"AMSS_INTERP_FAST_COMPARE_TOL": "AMSS_Interp_Fast_Compare_Tol",
"AMSS_GPU_STAGE_TIMING": "AMSS_GPU_Stage_Timing",
"AMSS_GPU_STAGE_TIMING_EVERY": "AMSS_GPU_Stage_Timing_Every",
}
for env_name, input_name in optional_overrides.items():
if env_name not in runtime_env and hasattr(input_data, input_name):
runtime_env[env_name] = str(getattr(input_data, input_name))
return runtime_env
##################################################################
##################################################################
@@ -145,6 +252,8 @@ def run_ABE():
print( )
## Define the command to run; cast other values to strings as needed
mpi_env = None
started_mps = False
if (input_data.GPU_Calculation == "no"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
@@ -153,21 +262,35 @@ def run_ABE():
elif (input_data.GPU_Calculation == "yes"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE_CUDA"
mpi_command_outfile = "ABEGPU_out.log"
mpi_env = _gpu_runtime_env()
started_mps = _start_cuda_mps_if_requested(mpi_env)
print(" GPU optimized runtime switches:")
print(f" AMSS_INTERP_FAST={mpi_env.get('AMSS_INTERP_FAST', '')}")
print(f" AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP={mpi_env.get('AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP', '')}")
print(f" AMSS_CUDA_KEEP_ALL_LEVELS={mpi_env.get('AMSS_CUDA_KEEP_ALL_LEVELS', '')}")
if "CUDA_MPS_PIPE_DIRECTORY" in mpi_env:
print(f" CUDA_MPS_PIPE_DIRECTORY={mpi_env['CUDA_MPS_PIPE_DIRECTORY']}")
## Execute the MPI command and stream output
mpi_process = subprocess.Popen(mpi_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
try:
## Execute the MPI command and stream output
mpi_process = subprocess.Popen(mpi_command, shell=True, stdout=subprocess.PIPE,
stderr=subprocess.STDOUT, text=True, env=mpi_env)
## Write ABE run output to file while printing to stdout
with open(mpi_command_outfile, 'w') as file0:
## Read and print output lines; also write each line to file
for line in mpi_process.stdout:
print(line, end='') # stream output in real time
file0.write(line) # write the line to file
file0.flush() # flush to ensure each line is written immediately (optional)
file0.close()
## Write ABE run output to file while printing to stdout
with open(mpi_command_outfile, 'w') as file0:
## Read and print output lines; also write each line to file
for line in mpi_process.stdout:
print(line, end='') # stream output in real time
file0.write(line) # write the line to file
file0.flush() # flush to ensure each line is written immediately (optional)
## Wait for the process to finish
mpi_return_code = mpi_process.wait()
## Wait for the process to finish
mpi_return_code = mpi_process.wait()
if mpi_return_code != 0:
raise subprocess.CalledProcessError(mpi_return_code, mpi_command)
finally:
if started_mps:
_stop_cuda_mps(mpi_env)
print( )
print( " The ABE/ABEGPU simulation is finished " )