Compare commits
16 Commits
aocc-legac
...
gcc-legacy
| Author | SHA1 | Date | |
|---|---|---|---|
| 12bf08a2a1 | |||
| 9b4f98e237 | |||
| 2bbde059db | |||
| 3b8774c1b1 | |||
| 23b52e30d6 | |||
| e8f590a742 | |||
| 632173ea10 | |||
| eed2ff2be8 | |||
| b904f6cf56 | |||
| c4b9bd3788 | |||
| 276b36ea25 | |||
| baf248c3bc | |||
| 70b6496ed3 | |||
| 6ca9fece2e | |||
| 516cdea502 | |||
| 9687d9a3dd |
@@ -58,10 +58,14 @@ POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
|
||||
TRANSFER_CACHE_FLAG = -DBSSN_USE_TRANSFER_CACHE=$(EFFECTIVE_USE_TRANSFER_CACHE)
|
||||
ESCALAR_KERNEL_FLAG = -DBSSN_USE_ESCALAR_C_KERNEL=$(EFFECTIVE_USE_CXX_ESCALAR_KERNEL)
|
||||
|
||||
## AMD AOCC build flags optimized for EPYC Zen 4 (-march=znver4)
|
||||
## make -> opt (PGO-guided, maximum performance)
|
||||
## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data)
|
||||
|
||||
## GCC build flags (optimized for x86-64-v4)
|
||||
## PGO disabled (used negative optimization on Intel; not tested on GCC)
|
||||
CXXAPPFLAGS = -O3 -march=x86-64-v4 -ffast-math -mfma -flto \
|
||||
-Dfortran3 -Dnewc $(INTERP_LB_FLAGS) \
|
||||
$(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) $(EM_KERNEL_FLAG)
|
||||
f90appflags = -O3 -march=x86-64-v4 -ffast-math -mfma -flto \
|
||||
-cpp $(POLINT6_FLAG)
|
||||
|
||||
.SUFFIXES: .o .f90 .C .for .cu
|
||||
|
||||
@@ -69,11 +73,11 @@ ESCALAR_KERNEL_FLAG = -DBSSN_USE_ESCALAR_C_KERNEL=$(EFFECTIVE_USE_CXX_ESCALAR_KE
|
||||
$(f90) $(f90appflags) -c $< -o $@
|
||||
|
||||
.C.o:
|
||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
# ShellPatch.C uses OpenMP for setupintintstuff search loops
|
||||
ShellPatch.o: ShellPatch.C
|
||||
$(CXX) $(CXXAPPFLAGS) $(OMP_FLAG) -c $< $(filein) -o $@
|
||||
${CXX} $(CXXAPPFLAGS) $(OMP_FLAG) -c $< $(filein) -o $@
|
||||
|
||||
.for.o:
|
||||
$(f77) -c $< -o $@
|
||||
@@ -83,59 +87,59 @@ ShellPatch.o: ShellPatch.C
|
||||
|
||||
# C rewrite of BSSN RHS kernel and helpers
|
||||
bssn_rhs_c.o: bssn_rhs_c.C
|
||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
fderivs_c.o: fderivs_c.C
|
||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
fdderivs_c.o: fdderivs_c.C
|
||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
kodiss_c.o: kodiss_c.C
|
||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
lopsided_c.o: lopsided_c.C
|
||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
lopsided_kodis_c.o: lopsided_kodis_c.C
|
||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
# C rewrite of shell-patch derivative kernels
|
||||
fderivs_sh_c.o: fderivs_sh_c.C
|
||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
fdderivs_sh_c.o: fdderivs_sh_c.C
|
||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
fderivs_shc_c.o: fderivs_shc_c.C
|
||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
fdderivs_shc_c.o: fdderivs_shc_c.C
|
||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
kodiss_sh_c.o: kodiss_sh_c.C
|
||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
|
||||
bssn_em_rhs_c.o: bssn_em_rhs_c.C
|
||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
z4c_rhs_c.o: z4c_rhs_c.C
|
||||
$(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
#interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h
|
||||
# $(CXX) $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
# ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
|
||||
|
||||
## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS
|
||||
TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata
|
||||
TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
|
||||
-fprofile-instr-use=$(TP_PROFDATA) \
|
||||
-Dfortran3 -Dnewc -I$(AOCL_ROOT)/include
|
||||
-Dfortran3 -Dnewc $(filein_real)
|
||||
|
||||
TwoPunctures.o: TwoPunctures.C
|
||||
$(CXX) $(TP_OPTFLAGS) -qopenmp -c $< -o $@
|
||||
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
|
||||
|
||||
TwoPunctureABE.o: TwoPunctureABE.C
|
||||
$(CXX) $(TP_OPTFLAGS) -qopenmp -c $< -o $@
|
||||
${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@
|
||||
|
||||
# Input files
|
||||
|
||||
|
||||
@@ -1,21 +1,26 @@
|
||||
## AMD AOCC version with AOCL (Optimized for AMD EPYC Zen 4)
|
||||
## GCC version with OpenMPI and OpenBLAS
|
||||
OMPI_ROOT = /usr/mpi/gcc/openmpi-4.1.9a1
|
||||
|
||||
## AOCL root path for includes and libraries
|
||||
AOCL_ROOT ?= /home/aocc/aocl/5.2.0/aocc
|
||||
## Ensure mpicxx and final executables find OpenMPI libs at build- and runtime
|
||||
export LD_LIBRARY_PATH := $(OMPI_ROOT)/lib64:$(LD_LIBRARY_PATH)
|
||||
|
||||
## AOCC-built OpenMPI prefix
|
||||
OMPI_PREFIX ?= /home/aocc/openmpi-5.0.10
|
||||
filein = -I/usr/include/ -I$(OMPI_ROOT)/include
|
||||
|
||||
filein = -I/usr/include/ -I$(AOCL_ROOT)/include
|
||||
|
||||
## Using AOCL BLIS + libFLAME for BLAS/LAPACK
|
||||
## AOCC Fortran runtime: -lflang (includes FortranRuntime)
|
||||
## AOCC OpenMP runtime: -lomp (LLVM OpenMP)
|
||||
LDLIBS = -L$(AOCL_ROOT)/lib -lblis -lflame -lamdlibm -lflang -lpgmath -lpthread -lm -ldl -lomp
|
||||
## OpenBLAS (OpenMP variant) + gfortran runtime
|
||||
## -Wl,-rpath ensures ABE / TwoPunctureABE find libmpi at runtime without LD_LIBRARY_PATH
|
||||
LDLIBS = -Wl,-rpath,$(OMPI_ROOT)/lib64 -lopenblaso -lgfortran -lpthread -lm -ldl -lgomp
|
||||
|
||||
# OpenMP flag for selective compilation
|
||||
OMP_FLAG = -fopenmp
|
||||
|
||||
## Memory allocator switch
|
||||
## 0 (default) : use system default allocator (ptmalloc)
|
||||
## 1 : use jemalloc (install jemalloc-devel first)
|
||||
USE_JEMALLOC ?= 0
|
||||
ifeq ($(USE_JEMALLOC),1)
|
||||
LDLIBS := -ljemalloc $(LDLIBS)
|
||||
endif
|
||||
|
||||
## Interp_Points load balance profiling mode
|
||||
## off : (default) no load balance instrumentation
|
||||
## profile : Pass 1 — instrument Interp_Points to collect timing profile
|
||||
@@ -63,11 +68,11 @@ USE_TRANSFER_CACHE ?= auto
|
||||
## 0 : use original Fortran rungekutta4_rout.o
|
||||
USE_CXX_RK4 ?= 1
|
||||
|
||||
f90 = flang
|
||||
f77 = flang
|
||||
CXX = clang++
|
||||
CC = clang
|
||||
CLINKER = $(OMPI_PREFIX)/bin/mpicxx
|
||||
f90 = gfortran
|
||||
f77 = gfortran
|
||||
CXX = g++
|
||||
CC = gcc
|
||||
CLINKER = mpicxx
|
||||
|
||||
Cu = nvcc
|
||||
CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
|
||||
|
||||
Reference in New Issue
Block a user