Compare commits
16 Commits
aocc-legacy
...
gcc-legacy
| Author | SHA1 | Date | |
|---|---|---|---|
| 12bf08a2a1 | |||
| 9b4f98e237 | |||
| 2bbde059db | |||
| 3b8774c1b1 | |||
| 23b52e30d6 | |||
| e8f590a742 | |||
| 632173ea10 | |||
| eed2ff2be8 | |||
| b904f6cf56 | |||
| c4b9bd3788 | |||
| 276b36ea25 | |||
| baf248c3bc | |||
| 70b6496ed3 | |||
| 6ca9fece2e | |||
| 516cdea502 | |||
| 9687d9a3dd |
@@ -58,10 +58,14 @@ POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
|
|||||||
TRANSFER_CACHE_FLAG = -DBSSN_USE_TRANSFER_CACHE=$(EFFECTIVE_USE_TRANSFER_CACHE)
|
TRANSFER_CACHE_FLAG = -DBSSN_USE_TRANSFER_CACHE=$(EFFECTIVE_USE_TRANSFER_CACHE)
|
||||||
ESCALAR_KERNEL_FLAG = -DBSSN_USE_ESCALAR_C_KERNEL=$(EFFECTIVE_USE_CXX_ESCALAR_KERNEL)
|
ESCALAR_KERNEL_FLAG = -DBSSN_USE_ESCALAR_C_KERNEL=$(EFFECTIVE_USE_CXX_ESCALAR_KERNEL)
|
||||||
|
|
||||||
## AMD AOCC build flags optimized for EPYC Zen 4 (-march=znver4)
|
|
||||||
## make -> opt (PGO-guided, maximum performance)
|
|
||||||
## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data)
|
|
||||||
|
|
||||||
|
## GCC build flags (optimized for x86-64-v4)
|
||||||
|
## PGO disabled (it caused a performance regression on Intel; not tested on GCC)
|
||||||
|
CXXAPPFLAGS = -O3 -march=x86-64-v4 -ffast-math -mfma -flto \
|
||||||
|
-Dfortran3 -Dnewc $(INTERP_LB_FLAGS) \
|
||||||
|
$(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) $(EM_KERNEL_FLAG)
|
||||||
|
f90appflags = -O3 -march=x86-64-v4 -ffast-math -mfma -flto \
|
||||||
|
-cpp $(POLINT6_FLAG)
|
||||||
|
|
||||||
.SUFFIXES: .o .f90 .C .for .cu
|
.SUFFIXES: .o .f90 .C .for .cu
|
||||||
|
|
||||||
@@ -69,11 +73,11 @@ ESCALAR_KERNEL_FLAG = -DBSSN_USE_ESCALAR_C_KERNEL=$(EFFECTIVE_USE_CXX_ESCALAR_KE
|
|||||||
$(f90) $(f90appflags) -c $< -o $@
|
$(f90) $(f90appflags) -c $< -o $@
|
||||||
|
|
||||||
.C.o:
|
.C.o:
|
||||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
|
|
||||||
# ShellPatch.C uses OpenMP for setupintintstuff search loops
|
# ShellPatch.C uses OpenMP for setupintintstuff search loops
|
||||||
ShellPatch.o: ShellPatch.C
|
ShellPatch.o: ShellPatch.C
|
||||||
$(CXX) $(CXXAPPFLAGS) $(OMP_FLAG) -c $< $(filein) -o $@
|
${CXX} $(CXXAPPFLAGS) $(OMP_FLAG) -c $< $(filein) -o $@
|
||||||
|
|
||||||
.for.o:
|
.for.o:
|
||||||
$(f77) -c $< -o $@
|
$(f77) -c $< -o $@
|
||||||
@@ -83,59 +87,59 @@ ShellPatch.o: ShellPatch.C
|
|||||||
|
|
||||||
# C rewrite of BSSN RHS kernel and helpers
|
# C rewrite of BSSN RHS kernel and helpers
|
||||||
bssn_rhs_c.o: bssn_rhs_c.C
|
bssn_rhs_c.o: bssn_rhs_c.C
|
||||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
|
|
||||||
fderivs_c.o: fderivs_c.C
|
fderivs_c.o: fderivs_c.C
|
||||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
|
|
||||||
fdderivs_c.o: fdderivs_c.C
|
fdderivs_c.o: fdderivs_c.C
|
||||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
|
|
||||||
kodiss_c.o: kodiss_c.C
|
kodiss_c.o: kodiss_c.C
|
||||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
|
|
||||||
lopsided_c.o: lopsided_c.C
|
lopsided_c.o: lopsided_c.C
|
||||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
|
|
||||||
lopsided_kodis_c.o: lopsided_kodis_c.C
|
lopsided_kodis_c.o: lopsided_kodis_c.C
|
||||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
|
|
||||||
# C rewrite of shell-patch derivative kernels
|
# C rewrite of shell-patch derivative kernels
|
||||||
fderivs_sh_c.o: fderivs_sh_c.C
|
fderivs_sh_c.o: fderivs_sh_c.C
|
||||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
|
|
||||||
fdderivs_sh_c.o: fdderivs_sh_c.C
|
fdderivs_sh_c.o: fdderivs_sh_c.C
|
||||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
|
|
||||||
fderivs_shc_c.o: fderivs_shc_c.C
|
fderivs_shc_c.o: fderivs_shc_c.C
|
||||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
|
|
||||||
fdderivs_shc_c.o: fdderivs_shc_c.C
|
fdderivs_shc_c.o: fdderivs_shc_c.C
|
||||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
|
|
||||||
kodiss_sh_c.o: kodiss_sh_c.C
|
kodiss_sh_c.o: kodiss_sh_c.C
|
||||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
|
|
||||||
|
|
||||||
bssn_em_rhs_c.o: bssn_em_rhs_c.C
|
bssn_em_rhs_c.o: bssn_em_rhs_c.C
|
||||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
z4c_rhs_c.o: z4c_rhs_c.C
|
z4c_rhs_c.o: z4c_rhs_c.C
|
||||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
|
|
||||||
#interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
|
#interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
|
||||||
# $(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
# ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||||
|
|
||||||
## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
|
## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
|
||||||
TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
|
TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
|
||||||
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||||
-fprofile-instr-use=$(TP_PROFDATA) \
|
-fprofile-instr-use=$(TP_PROFDATA) \
|
||||||
-Dfortran3 -Dnewc -I$(AOCL_ROOT)/include
|
-Dfortran3 -Dnewc $(filein_real)
|
||||||
|
|
||||||
TwoPunctures.o: TwoPunctures.C
|
TwoPunctures.o: TwoPunctures.C
|
||||||
$(CXX) $(TP_OPTFLAGS) -qopenmp -c $< -o $@
|
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
|
||||||
|
|
||||||
TwoPunctureABE.o: TwoPunctureABE.C
|
TwoPunctureABE.o: TwoPunctureABE.C
|
||||||
$(CXX) $(TP_OPTFLAGS) -qopenmp -c $< -o $@
|
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
|
||||||
|
|
||||||
# Input files
|
# Input files
|
||||||
|
|
||||||
|
|||||||
@@ -1,21 +1,26 @@
|
|||||||
## AMD AOCC version with AOCL (Optimized for AMD EPYC Zen 4)
|
## GCC version with OpenMPI and OpenBLAS
|
||||||
|
OMPI_ROOT = /usr/mpi/gcc/openmpi-4.1.9a1
|
||||||
|
|
||||||
## AOCL root path for includes and libraries
|
## Ensure mpicxx and the final executables find the OpenMPI libs at both build time and run time
|
||||||
AOCL_ROOT ?= /home/aocc/aocl/5.2.0/aocc
|
export LD_LIBRARY_PATH := $(OMPI_ROOT)/lib64:$(LD_LIBRARY_PATH)
|
||||||
|
|
||||||
## AOCC-built OpenMPI prefix
|
filein = -I/usr/include/ -I$(OMPI_ROOT)/include
|
||||||
OMPI_PREFIX ?= /home/aocc/openmpi-5.0.10
|
|
||||||
|
|
||||||
filein = -I/usr/include/ -I$(AOCL_ROOT)/include
|
## OpenBLAS (OpenMP variant) + gfortran runtime
|
||||||
|
## -Wl,-rpath ensures ABE / TwoPunctureABE find libmpi at runtime without LD_LIBRARY_PATH
|
||||||
## Using AOCL BLIS + libFLAME for BLAS/LAPACK
|
LDLIBS = -Wl,-rpath,$(OMPI_ROOT)/lib64 -lopenblaso -lgfortran -lpthread -lm -ldl -lgomp
|
||||||
## AOCC Fortran runtime: -lflang (includes FortranRuntime)
|
|
||||||
## AOCC OpenMP runtime: -lomp (LLVM OpenMP)
|
|
||||||
LDLIBS = -L$(AOCL_ROOT)/lib -lblis -lflame -lamdlibm -lflang -lpgmath -lpthread -lm -ldl -lomp
|
|
||||||
|
|
||||||
# OpenMP flag for selective compilation
|
# OpenMP flag for selective compilation
|
||||||
OMP_FLAG = -fopenmp
|
OMP_FLAG = -fopenmp
|
||||||
|
|
||||||
|
## Memory allocator switch
|
||||||
|
## 0 (default) : use system default allocator (ptmalloc)
|
||||||
|
## 1 : use jemalloc (install jemalloc-devel first)
|
||||||
|
USE_JEMALLOC ?= 0
|
||||||
|
ifeq ($(USE_JEMALLOC),1)
|
||||||
|
LDLIBS := -ljemalloc $(LDLIBS)
|
||||||
|
endif
|
||||||
|
|
||||||
## Interp_Points load balance profiling mode
|
## Interp_Points load balance profiling mode
|
||||||
## off : (default) no load balance instrumentation
|
## off : (default) no load balance instrumentation
|
||||||
## profile : Pass 1 — instrument Interp_Points to collect timing profile
|
## profile : Pass 1 — instrument Interp_Points to collect timing profile
|
||||||
@@ -63,11 +68,11 @@ USE_TRANSFER_CACHE ?= auto
|
|||||||
## 0 : use original Fortran rungekutta4_rout.o
|
## 0 : use original Fortran rungekutta4_rout.o
|
||||||
USE_CXX_RK4 ?= 1
|
USE_CXX_RK4 ?= 1
|
||||||
|
|
||||||
f90 = flang
|
f90 = gfortran
|
||||||
f77 = flang
|
f77 = gfortran
|
||||||
CXX = clang++
|
CXX = g++
|
||||||
CC = clang
|
CC = gcc
|
||||||
CLINKER = $(OMPI_PREFIX)/bin/mpicxx
|
CLINKER = mpicxx
|
||||||
|
|
||||||
Cu = nvcc
|
Cu = nvcc
|
||||||
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
|
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
|
||||||
|
|||||||
Reference in New Issue
Block a user