def _truthy(value, default=False):
    """Interpret a configuration value as an on/off switch.

    Args:
        value: The raw setting. May be ``None``, a ``bool``, or anything
            that can be passed to ``str``.
        default: Result used when ``value`` is ``None`` or blank text.

    Returns:
        ``default`` for ``None`` or blank text, ``value`` itself for a real
        ``bool``, otherwise ``True`` only when the stripped, lower-cased text
        is one of the accepted "on" spellings. Unrecognized text is ``False``.
    """
    if value is None:
        return default
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if not text:
        return default
    return text in ("1", "yes", "y", "true", "on", "enable", "enabled")


def _input_or_env(input_name, env_name, default=None):
    """Look up a setting, letting the process environment win.

    Args:
        input_name: Attribute name to read from ``input_data``.
        env_name: Environment variable checked first.
        default: Value returned when neither source defines the setting.

    Returns:
        The environment value (always a string) when ``env_name`` is set,
        otherwise the ``input_data`` attribute, otherwise ``default``.
    """
    if env_name in os.environ:
        return os.environ[env_name]
    return getattr(input_data, input_name, default)


def _mps_directories_ready(*directories):
    """Create the given directories, reporting instead of raising on failure.

    Returns:
        ``True`` when every directory exists afterwards, ``False`` when any
        of them could not be created.
    """
    try:
        for directory in directories:
            os.makedirs(directory, exist_ok=True)
    except OSError as exc:
        # MPS is optional, so a bad directory must not abort the run.
        print(" CUDA MPS directories could not be created; running without MPS.")
        print(f" {exc}")
        return False
    return True


def _start_cuda_mps_if_requested(runtime_env):
    """Start a CUDA MPS control daemon for this run when it is requested.

    The daemon is only considered when ``input_data.GPU_Calculation`` is
    ``"yes"``. Whether to use it comes from ``AMSS_CUDA_AUTO_MPS`` (environment)
    or ``CUDA_Auto_MPS`` (input file); when neither is given it defaults to on
    if ``input_data.MPI_processes`` is greater than one.

    Args:
        runtime_env: Environment mapping for the simulation. It is updated in
            place with ``CUDA_MPS_PIPE_DIRECTORY`` and
            ``CUDA_MPS_LOG_DIRECTORY`` when a daemon is started or an existing
            control pipe is reused.

    Returns:
        ``True`` only when this call started a daemon (so the caller should
        stop it afterwards). ``False`` when MPS is not used, when an existing
        daemon is reused, or when anything in the start-up failed.
    """
    if input_data.GPU_Calculation != "yes":
        return False

    default_auto_mps = int(getattr(input_data, "MPI_processes", 1)) > 1
    auto_mps = _truthy(
        _input_or_env("CUDA_Auto_MPS", "AMSS_CUDA_AUTO_MPS", default_auto_mps),
        default=default_auto_mps,
    )
    if not auto_mps:
        return False

    mps_control = shutil.which("nvidia-cuda-mps-control")
    if not mps_control:
        print(" CUDA MPS control command was not found; running without MPS.")
        return False

    uid = os.getuid()
    pipe_dir = str(_input_or_env("CUDA_MPS_PIPE_DIRECTORY", "CUDA_MPS_PIPE_DIRECTORY",
                                 f"/tmp/amss-ncku-mps-{uid}"))
    log_dir = str(_input_or_env("CUDA_MPS_LOG_DIRECTORY", "CUDA_MPS_LOG_DIRECTORY",
                                f"/tmp/amss-ncku-mps-log-{uid}"))
    if not _mps_directories_ready(pipe_dir, log_dir):
        return False

    mps_settings = {
        "CUDA_MPS_PIPE_DIRECTORY": pipe_dir,
        "CUDA_MPS_LOG_DIRECTORY": log_dir,
    }

    # A "control" entry in the pipe directory is taken as a sign that a daemon
    # is already serving it; point the run at it but do not claim ownership.
    if os.path.exists(os.path.join(pipe_dir, "control")):
        runtime_env.update(mps_settings)
        print(f" Reusing CUDA MPS daemon: {pipe_dir}")
        return False

    print(f" Starting CUDA MPS daemon for this run: {pipe_dir}")
    mps_env = runtime_env.copy()
    mps_env.update(mps_settings)
    try:
        result = subprocess.run([mps_control, "-d"], env=mps_env, text=True,
                                stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError as exc:
        # Same best-effort fallback as a non-zero exit status below.
        print(" CUDA MPS daemon did not start; running without MPS.")
        print(f" {exc}")
        return False
    if result.returncode != 0:
        print(" CUDA MPS daemon did not start; running without MPS.")
        if result.stdout:
            print(result.stdout, end="")
        return False

    runtime_env.update(mps_settings)
    return True


def _stop_cuda_mps(runtime_env):
    """Ask the CUDA MPS control daemon to quit.

    Sends ``quit`` on standard input to ``nvidia-cuda-mps-control`` using
    ``runtime_env``. Does nothing when the control command is not on ``PATH``.
    Failures are printed rather than raised, because this runs as cleanup and
    must not replace an error coming from the simulation itself.

    Args:
        runtime_env: Environment mapping passed to the control command.
    """
    mps_control = shutil.which("nvidia-cuda-mps-control")
    if not mps_control:
        return
    try:
        result = subprocess.run([mps_control], input="quit\n", env=runtime_env, text=True,
                                stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError as exc:
        print(f" CUDA MPS daemon could not be stopped: {exc}")
        return
    if result.returncode != 0:
        print(" CUDA MPS daemon did not shut down cleanly.")
        if result.stdout:
            print(result.stdout, end="")


def _gpu_runtime_env():
    """Build the environment mapping used to launch the GPU executable.

    Starts from a copy of ``os.environ``, fills in default values for the
    optimized runtime switches without overriding anything already set, and
    then copies optional diagnostic settings from ``input_data`` for
    variables that the environment does not already define.

    Returns:
        A new ``dict`` of environment variables; ``os.environ`` is untouched.
    """
    runtime_env = os.environ.copy()

    defaults = {
        "AMSS_INTERP_FAST": "1",
        "AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP": "1",
        "AMSS_CUDA_KEEP_ALL_LEVELS": "1",
    }
    for key, value in defaults.items():
        runtime_env.setdefault(key, value)

    # Environment variable name -> attribute name in the input file.
    optional_overrides = {
        "AMSS_INTERP_FAST_COMPARE": "AMSS_Interp_Fast_Compare",
        "AMSS_INTERP_FAST_COMPARE_LIMIT": "AMSS_Interp_Fast_Compare_Limit",
        "AMSS_INTERP_FAST_COMPARE_TOL": "AMSS_Interp_Fast_Compare_Tol",
        "AMSS_GPU_STAGE_TIMING": "AMSS_GPU_Stage_Timing",
        "AMSS_GPU_STAGE_TIMING_EVERY": "AMSS_GPU_Stage_Timing_Every",
    }
    for env_name, input_name in optional_overrides.items():
        if env_name not in runtime_env and hasattr(input_data, input_name):
            runtime_env[env_name] = str(getattr(input_data, input_name))

    return runtime_env
(input_data.GPU_Calculation == "yes"): mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE_CUDA" mpi_command_outfile = "ABEGPU_out.log" + mpi_env = _gpu_runtime_env() + started_mps = _start_cuda_mps_if_requested(mpi_env) + print(" GPU optimized runtime switches:") + print(f" AMSS_INTERP_FAST={mpi_env.get('AMSS_INTERP_FAST', '')}") + print(f" AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP={mpi_env.get('AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP', '')}") + print(f" AMSS_CUDA_KEEP_ALL_LEVELS={mpi_env.get('AMSS_CUDA_KEEP_ALL_LEVELS', '')}") + if "CUDA_MPS_PIPE_DIRECTORY" in mpi_env: + print(f" CUDA_MPS_PIPE_DIRECTORY={mpi_env['CUDA_MPS_PIPE_DIRECTORY']}") - ## Execute the MPI command and stream output - mpi_process = subprocess.Popen(mpi_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) + try: + ## Execute the MPI command and stream output + mpi_process = subprocess.Popen(mpi_command, shell=True, stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, text=True, env=mpi_env) - ## Write ABE run output to file while printing to stdout - with open(mpi_command_outfile, 'w') as file0: - ## Read and print output lines; also write each line to file - for line in mpi_process.stdout: - print(line, end='') # stream output in real time - file0.write(line) # write the line to file - file0.flush() # flush to ensure each line is written immediately (optional) - file0.close() + ## Write ABE run output to file while printing to stdout + with open(mpi_command_outfile, 'w') as file0: + ## Read and print output lines; also write each line to file + for line in mpi_process.stdout: + print(line, end='') # stream output in real time + file0.write(line) # write the line to file + file0.flush() # flush to ensure each line is written immediately (optional) - ## Wait for the process to finish - mpi_return_code = mpi_process.wait() + ## Wait for the process to finish + mpi_return_code = mpi_process.wait() + if mpi_return_code != 0: + raise 
subprocess.CalledProcessError(mpi_return_code, mpi_command) + finally: + if started_mps: + _stop_cuda_mps(mpi_env) print( ) print( " The ABE/ABEGPU simulation is finished " )