#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Sample launcher for the xhpcg binaries.
#
# Sets up PATH / LD_LIBRARY_PATH from the locations below and then runs four
# sample configurations through mpirun, one after another:
#   - a single rank on a Hopper GPU (x86),
#   - Grace Hopper x4: GPU only, Grace only, and heterogeneous (GPU + Grace).
#
# Each location can be preset in the environment; a value that is already set
# and non-empty is kept, otherwise the placeholder below is exported:
#   MPI_PATH, MATHLIBS_PATH, NCCL_PATH, CUDA_PATH, NVPL_SPARSE
# CUDA_BUILD_VERSION (default 12.2) selects the math-libs sub-directory.
#
# The sample runs are independent; a failing run does not stop the next one.

export CXX_PATH=/usr
export PATH="${CXX_PATH}/bin:${PATH}"

if [[ -z "${MPI_PATH}" ]]; then
  export MPI_PATH=/path/to/mpi #Change this to correct MPI path
fi

# The guard must test MATHLIBS_PATH itself: testing CUDA_PATH here would leave
# MATHLIBS_PATH empty whenever CUDA_PATH is preset, and would overwrite a
# preset MATHLIBS_PATH whenever CUDA_PATH is not.
if [[ -z "${MATHLIBS_PATH}" ]]; then
  export MATHLIBS_PATH=/path/to/mathlibs #Change this to correct CUDA mathlibs
fi

if [[ -z "${NCCL_PATH}" ]]; then
  export NCCL_PATH=/path/to/nccl #Change to correct NCCL path
fi

if [[ -z "${CUDA_PATH}" ]]; then
  export CUDA_PATH=/path/to/cuda #Change this to correct CUDA path
fi

if [[ -z "${NVPL_SPARSE}" ]]; then
  export NVPL_SPARSE=/path/to/nvpllibs #Change this to correct NVPL mathlibs
fi

#Please fix, if needed
export CUDA_BLAS_VERSION="${CUDA_BUILD_VERSION:-12.2}"

# Prepend each library directory. The ${VAR:+:${VAR}} form adds the separator
# only when LD_LIBRARY_PATH is non-empty, so no empty entry (which the dynamic
# loader treats as the current directory) is left at the end of the list.
export LD_LIBRARY_PATH="${MATHLIBS_PATH}/${CUDA_BLAS_VERSION}/lib64/${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
export PATH="${CUDA_PATH}/bin:${PATH}"
export LD_LIBRARY_PATH="${CUDA_PATH}/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
export LD_LIBRARY_PATH="${NCCL_PATH}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
export LD_LIBRARY_PATH="${NVPL_SPARSE}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

# Extra mpirun options shared by every sample. Kept as an array so each option
# reaches mpirun as its own word without relying on word-splitting.
ext=(
  --mca pml ^ucx
  --mca btl ^openib,smcuda
  -mca coll_hcoll_enable 0
  -x coll_hcoll_np=0
  --bind-to none
)

#Directory to xhpcg binary
dir="bin/"

#Sample on a Hopper GPU x86
###########################
#Local problem size
nx=512 #Large problem size x
ny=512 #Large problem size y
nz=288 #Large problem size z

mpirun --oversubscribe "${ext[@]}" -np 1 "${dir}/hpcg.sh" --exec-name "${dir}/xhpcg" \
  --nx "$nx" --ny "$ny" --nz "$nz" --rt 10 --b 0

########################################################################################

#Sample on Grace Hopper x4
###########################
#Local problem size
nx=256  #Large problem size x, assumed for the GPU
ny=1024 #Large problem size y, assumed for the GPU
nz=288  #Large problem size z, assumed for the GPU

#1 GPUOnly
#---------#
np=4 #Total number of ranks
mpirun --oversubscribe "${ext[@]}" -np "$np" "${dir}/hpcg-aarch64.sh" --exec-name "${dir}/xhpcg" \
  --nx "$nx" --ny "$ny" --nz "$nz" --rt 10 --b 0 --exm 0 --p2p 0 \
  --mem-affinity 0:1:2:3 --cpu-affinity 0-71:72-143:144-215:216-287

#2 GraceOnly
#-----------#
np=4 #Total number of ranks
mpirun --oversubscribe "${ext[@]}" -np "$np" "${dir}/hpcg-aarch64.sh" --exec-name "${dir}/xhpcg-cpu" \
  --nx "$nx" --ny "$ny" --nz "$nz" --rt 10 --b 0 --exm 0 --p2p 0 \
  --mem-affinity 0:1:2:3 --cpu-affinity 0-71:72-143:144-215:216-287

#3 Heterogeneous (GPU + Grace)
#-----------------------------#
np=8       #Total number of ranks (4GPU + 4Grace)
exm=2      #Execution mode GPU+Grace
diff_dim=2 #Different dim between GPU and Grace is Y
lpm=1      #Local problem mode (nx/ny/nz are local to GPU, g2c is the Grace different dimension)
g2c=64     #Based on diff_dim=2 and lpm=1 --> Grace rank local problem size is $nx x $g2c x $nz

#3D grid size 4x2x1 (must be equal to np)
npx=4 #number of ranks in the x direction
npy=2 #number of ranks in the y direction
npz=1 #number of ranks in the z direction

mpirun --oversubscribe "${ext[@]}" -np "$np" "${dir}/hpcg-aarch64.sh" --exec-name "${dir}/xhpcg" \
  --nx "$nx" --ny "$ny" --nz "$nz" --rt 10 --b 0 --p2p 0 --exm "$exm" --lpm "$lpm" --g2c "$g2c" --ddm "$diff_dim" --npx "$npx" --npy "$npy" --npz "$npz" \
  --mem-affinity 0:0:1:1:2:2:3:3 --cpu-affinity 0-7:8-71:72-79:80-143:144-151:152-215:216-223:224-287