From 0992e2219a39dbb1fa69f3e6743948261a9c6068 Mon Sep 17 00:00:00 2001 From: CGH0S7 <776459475@qq.com> Date: Tue, 28 Apr 2026 02:19:00 +0800 Subject: [PATCH] Migrate build system from Intel oneAPI to AMD AOCC/AOCL/OpenMPI Replace Intel compilers (ifx/icpx/icx) with AOCC (flang/clang++/clang), Intel MPI (mpiicpx) with AOCC-built OpenMPI (mpicxx), and Intel MKL with AOCL BLIS/libFLAME. Replace -xHost with -march=znver4, -ipo with -flto, -fp-model fast=2 with -ffast-math, -qopenmp with -fopenmp. Remove PGO, TBB allocator, and Intel-specific runtime libraries. Fix MKL-specific includes in TwoPunctures.C and gaussj.C to use standard CBLAS/LAPACKE headers from AOCL. Co-Authored-By: Claude Opus 4.7 --- AMSS_NCKU_source/TwoPunctures.C | 2 +- AMSS_NCKU_source/gaussj.C | 2 +- AMSS_NCKU_source/makefile | 276 +++++++++++++++----------------- AMSS_NCKU_source/makefile.inc | 46 ++---- 4 files changed, 145 insertions(+), 181 deletions(-) diff --git a/AMSS_NCKU_source/TwoPunctures.C b/AMSS_NCKU_source/TwoPunctures.C index 1b6e590..9101183 100644 --- a/AMSS_NCKU_source/TwoPunctures.C +++ b/AMSS_NCKU_source/TwoPunctures.C @@ -27,7 +27,7 @@ using namespace std; #endif #include "TwoPunctures.h" -#include <mkl_cblas.h> +#include <cblas.h> TwoPunctures::TwoPunctures(double mp, double mm, double b, double P_plusx, double P_plusy, double P_plusz, diff --git a/AMSS_NCKU_source/gaussj.C b/AMSS_NCKU_source/gaussj.C index 86c7777..cfc78b4 100644 --- a/AMSS_NCKU_source/gaussj.C +++ b/AMSS_NCKU_source/gaussj.C @@ -18,7 +18,7 @@ using namespace std; #endif // Intel oneMKL LAPACK interface -#include <mkl_lapacke.h> +#include <lapacke.h> /* Linear equation solution using Intel oneMKL LAPACK. a[0..n-1][0..n-1] is the input matrix. b[0..n-1] is input containing the right-hand side vectors. 
On output a is diff --git a/AMSS_NCKU_source/makefile b/AMSS_NCKU_source/makefile index 8c82faa..8f92ece 100644 --- a/AMSS_NCKU_source/makefile +++ b/AMSS_NCKU_source/makefile @@ -1,5 +1,5 @@ - - + + include makefile.inc -include AMSS_NCKU_build.mk @@ -40,58 +40,40 @@ POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY) TRANSFER_CACHE_FLAG = -DBSSN_USE_TRANSFER_CACHE=$(EFFECTIVE_USE_TRANSFER_CACHE) ESCALAR_KERNEL_FLAG = -DBSSN_USE_ESCALAR_C_KERNEL=$(EFFECTIVE_USE_CXX_ESCALAR_KERNEL) -## ABE build flags selected by PGO_MODE (set in makefile.inc, default: opt) -## make -> opt (PGO-guided, maximum performance) -## make PGO_MODE=instrument -> instrument (Phase 1: collect fresh profile data) -PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata - -ifeq ($(PGO_MODE),instrument) -## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability -CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \ - -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) \ +## AMD AOCC build flags optimized for EPYC Zen 4 (-march=znver4) +CXXAPPFLAGS = -O3 -march=znver4 -ffast-math -flto \ + -Dfortran3 -Dnewc -I$(AOCL_ROOT)/include $(INTERP_LB_FLAGS) \ $(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) -f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \ - -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG) -else -## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \ -## PGO has been turned off, now tested and found to be negative optimization -## INTERP_LB_FLAGS has been turned off too, now tested and found to be negative optimization +f90appflags = -O3 -march=znver4 -ffast-math -flto \ + -cpp -I$(AOCL_ROOT)/include $(POLINT6_FLAG) +.SUFFIXES: .o .f90 .C .for .cu -CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ - -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) \ - $(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) -f90appflags = -O3 -xHost -fp-model 
fast=2 -fma -ipo \ - -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG) -endif - -.SUFFIXES: .o .f90 .C .for .cu - -.f90.o: - $(f90) $(f90appflags) -c $< -o $@ - -.C.o: - ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ - -.for.o: - $(f77) -c $< -o $@ - -.cu.o: - $(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH) - -# C rewrite of BSSN RHS kernel and helpers +.f90.o: + $(f90) $(f90appflags) -c $< -o $@ + +.C.o: + ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ + +.for.o: + $(f77) -c $< -o $@ + +.cu.o: + $(Cu) $(CUDA_APP_FLAGS) -c $< -o $@ $(CUDA_LIB_PATH) + +# C rewrite of BSSN RHS kernel and helpers bssn_rhs_c.o: bssn_rhs_c.C ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ fderivs_c.o: fderivs_c.C ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ - -fdderivs_c.o: fdderivs_c.C - ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ - -kodiss_c.o: kodiss_c.C - ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ - + +fdderivs_c.o: fdderivs_c.C + ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ + +kodiss_c.o: kodiss_c.C + ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ + lopsided_c.o: lopsided_c.C ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ @@ -100,22 +82,20 @@ lopsided_kodis_c.o: lopsided_kodis_c.C #interp_lb_profile.o: interp_lb_profile.C interp_lb_profile.h # ${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@ - -## TwoPunctureABE uses fixed optimal flags with its own PGO profile, independent of CXXAPPFLAGS -TP_PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/TwoPunctureABE.profdata -TP_OPTFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ - -fprofile-instr-use=$(TP_PROFDATA) \ - -Dfortran3 -Dnewc -I${MKLROOT}/include - -TwoPunctures.o: TwoPunctures.C - ${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@ - -TwoPunctureABE.o: TwoPunctureABE.C - ${CXX} $(TP_OPTFLAGS) -qopenmp -c $< -o $@ - -# Input files - -## Kernel implementation switch (set USE_CXX_KERNELS=0 to fall back to Fortran) + +## TwoPunctureABE uses fixed optimal flags (AMD AOCC, no PGO) +TP_OPTFLAGS = -O3 -march=znver4 -ffast-math -flto \ + -Dfortran3 
-Dnewc -I$(AOCL_ROOT)/include + +TwoPunctures.o: TwoPunctures.C + ${CXX} $(TP_OPTFLAGS) -fopenmp -c $< -o $@ + +TwoPunctureABE.o: TwoPunctureABE.C + ${CXX} $(TP_OPTFLAGS) -fopenmp -c $< -o $@ + +# Input files + +## Kernel implementation switch (set USE_CXX_KERNELS=0 to fall back to Fortran) ifeq ($(USE_CXX_KERNELS),0) # Fortran mode: no C rewrite files; bssn_rhs.o is included via F90FILES below CFILES = @@ -134,95 +114,95 @@ RK4_F90_OBJ = else RK4_F90_OBJ = rungekutta4_rout.o endif - -C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ - cgh.o bssn_class.o surface_integral.o ShellPatch.o\ - bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\ - bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\ - Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\ - NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o - -C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ - cgh.o surface_integral.o ShellPatch.o\ - bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\ - bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\ - Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\ - NullShellPatch2_Evo.o \ - bssn_gpu_class.o bssn_step_gpu.o bssn_macro.o writefile_f.o - + +C++FILES = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ + cgh.o bssn_class.o surface_integral.o ShellPatch.o\ + bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\ + bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\ + Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\ + NullShellPatch2_Evo.o writefile_f.o interp_lb_profile.o + +C++FILES_GPU = ABE.o Ansorg.o Block.o misc.o monitor.o Parallel.o MPatch.o var.o\ + cgh.o surface_integral.o ShellPatch.o\ + bssnEScalar_class.o perf.o Z4c_class.o NullShellPatch.o\ + bssnEM_class.o cpbc_util.o z4c_rhs_point.o checkpoint.o\ + Parallel_bam.o scalar_class.o transpbh.o NullShellPatch2.o\ + NullShellPatch2_Evo.o \ + bssn_gpu_class.o bssn_step_gpu.o 
bssn_macro.o writefile_f.o + F90FILES_BASE = enforce_algebra.o fmisc.o initial_puncture.o prolongrestrict.o\ prolongrestrict_cell.o prolongrestrict_vertex.o\ $(RK4_F90_OBJ) diff_new.o kodiss.o kodiss_sh.o\ lopsidediff.o sommerfeld_rout.o getnp4.o diff_new_sh.o\ shellfunctions.o bssn_rhs_ss.o Set_Rho_ADM.o\ getnp4EScalar.o bssnEScalar_rhs.o bssn_constraint.o ricci_gamma.o\ - fadmquantites_bssn.o Z4c_rhs.o Z4c_rhs_ss.o point_diff_new_sh.o\ - cpbc.o getnp4old.o NullEvol.o initial_null.o initial_maxwell.o\ - getnpem2.o empart.o NullNews.o fourdcurvature.o\ - bssn2adm.o adm_constraint.o adm_ricci_gamma.o\ - scalar_rhs.o initial_scalar.o NullEvol2.o initial_null2.o\ - NullNews2.o tool_f.o - -ifeq ($(USE_CXX_KERNELS),0) -# Fortran mode: include original bssn_rhs.o -F90FILES = $(F90FILES_BASE) bssn_rhs.o -else -# C++ mode (default): bssn_rhs.o replaced by C++ kernel -F90FILES = $(F90FILES_BASE) -endif - -F77FILES = zbesh.o - -AHFDOBJS = expansion.o expansion_Jacobian.o patch.o coords.o patch_info.o patch_interp.o patch_system.o \ -tgrid.o fd_grid.o ghost_zone.o array.o round.o norm.o fuzzy.o error_exit.o miscfp.o \ -linear_map.o cpm_map.o BH_diagnostics.o setup.o horizon_sequence.o find_horizons.o \ -initial_guess.o Newton.o Jacobian.o ilucg.o IntPnts0.o IntPnts.o - -TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o - -CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o - -# file dependences -$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh - -$(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\ - misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\ - rungekutta4_rout.h var.h bssn_class.h bssn_rhs.h sommerfeld_rout.h\ - cgh.h surface_integral.h ShellPatch.h shellfunctions.h perf.h\ - fadmquantites_bssn.h cpbc.h getnp4.h initial_null.h NullEvol.h\ - NullShellPatch.h initial_maxwell.h bssnEM_class.h getnpem2.h\ - empart.h NullNews.h kodiss.h Parallel_bam.h ricci_gamma.h\ - initial_null2.h NullShellPatch2.h - 
-$(C++FILES_GPU): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\ - misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\ - rungekutta4_rout.h var.h bssn_rhs.h sommerfeld_rout.h\ - cgh.h surface_integral.h ShellPatch.h shellfunctions.h perf.h\ - fadmquantites_bssn.h cpbc.h getnp4.h initial_null.h NullEvol.h\ - NullShellPatch.h initial_maxwell.h bssnEM_class.h getnpem2.h\ - empart.h NullNews.h kodiss.h Parallel_bam.h ricci_gamma.h\ - initial_null2.h NullShellPatch2.h \ - bssn_gpu_class.h bssn_macro.h - -$(AHFDOBJS): cctk.h cctk_Config.h cctk_Types.h cctk_Constants.h myglobal.h - -$(C++FILES) $(C++FILES_GPU) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.h - -TwoPunctureFILES: TwoPunctures.h - -$(CUDAFILES): bssn_gpu.h gpu_mem.h gpu_rhsSS_mem.h - -misc.o : zbesh.o - -# projects -ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) - $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS) - -ABEGPU: $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) - $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS) - -TwoPunctureABE: $(TwoPunctureFILES) - $(CLINKER) $(TP_OPTFLAGS) -qopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS) - -clean: - rm *.o ABE ABEGPU TwoPunctureABE make.log -f + fadmquantites_bssn.o Z4c_rhs.o Z4c_rhs_ss.o point_diff_new_sh.o\ + cpbc.o getnp4old.o NullEvol.o initial_null.o initial_maxwell.o\ + getnpem2.o empart.o NullNews.o fourdcurvature.o\ + bssn2adm.o adm_constraint.o adm_ricci_gamma.o\ + scalar_rhs.o initial_scalar.o NullEvol2.o initial_null2.o\ + NullNews2.o tool_f.o + +ifeq ($(USE_CXX_KERNELS),0) +# Fortran mode: include original bssn_rhs.o +F90FILES = $(F90FILES_BASE) bssn_rhs.o +else +# C++ mode (default): bssn_rhs.o replaced by C++ kernel +F90FILES = $(F90FILES_BASE) +endif + +F77FILES = zbesh.o + +AHFDOBJS = expansion.o expansion_Jacobian.o patch.o coords.o patch_info.o patch_interp.o 
patch_system.o \ +tgrid.o fd_grid.o ghost_zone.o array.o round.o norm.o fuzzy.o error_exit.o miscfp.o \ +linear_map.o cpm_map.o BH_diagnostics.o setup.o horizon_sequence.o find_horizons.o \ +initial_guess.o Newton.o Jacobian.o ilucg.o IntPnts0.o IntPnts.o + +TwoPunctureFILES = TwoPunctureABE.o TwoPunctures.o + +CUDAFILES = bssn_gpu.o bssn_gpu_rhs_ss.o + +# file dependences +$(C++FILES) $(C++FILES_GPU) $(F90FILES) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.fh + +$(C++FILES): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\ + misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\ + rungekutta4_rout.h var.h bssn_class.h bssn_rhs.h sommerfeld_rout.h\ + cgh.h surface_integral.h ShellPatch.h shellfunctions.h perf.h\ + fadmquantites_bssn.h cpbc.h getnp4.h initial_null.h NullEvol.h\ + NullShellPatch.h initial_maxwell.h bssnEM_class.h getnpem2.h\ + empart.h NullNews.h kodiss.h Parallel_bam.h ricci_gamma.h\ + initial_null2.h NullShellPatch2.h + +$(C++FILES_GPU): Block.h enforce_algebra.h fmisc.h initial_puncture.h macrodef.h\ + misc.h monitor.h MyList.h Parallel.h MPatch.h prolongrestrict.h\ + rungekutta4_rout.h var.h bssn_rhs.h sommerfeld_rout.h\ + cgh.h surface_integral.h ShellPatch.h shellfunctions.h perf.h\ + fadmquantites_bssn.h cpbc.h getnp4.h initial_null.h NullEvol.h\ + NullShellPatch.h initial_maxwell.h bssnEM_class.h getnpem2.h\ + empart.h NullNews.h kodiss.h Parallel_bam.h ricci_gamma.h\ + initial_null2.h NullShellPatch2.h \ + bssn_gpu_class.h bssn_macro.h + +$(AHFDOBJS): cctk.h cctk_Config.h cctk_Types.h cctk_Constants.h myglobal.h + +$(C++FILES) $(C++FILES_GPU) $(CFILES) $(AHFDOBJS) $(CUDAFILES): macrodef.h + +TwoPunctureFILES: TwoPunctures.h + +$(CUDAFILES): bssn_gpu.h gpu_mem.h gpu_rhsSS_mem.h + +misc.o : zbesh.o + +# projects +ABE: $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) + $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(LDLIBS) + +ABEGPU: $(C++FILES_GPU) $(CFILES) 
$(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) + $(CLINKER) $(CXXAPPFLAGS) -o $@ $(C++FILES_GPU) $(CFILES) $(F90FILES) $(F77FILES) $(AHFDOBJS) $(CUDAFILES) $(LDLIBS) + +TwoPunctureABE: $(TwoPunctureFILES) + $(CLINKER) $(TP_OPTFLAGS) -fopenmp -o $@ $(TwoPunctureFILES) $(LDLIBS) + +clean: + rm *.o ABE ABEGPU TwoPunctureABE make.log -f diff --git a/AMSS_NCKU_source/makefile.inc b/AMSS_NCKU_source/makefile.inc index 353c1bf..aa13827 100755 --- a/AMSS_NCKU_source/makefile.inc +++ b/AMSS_NCKU_source/makefile.inc @@ -1,33 +1,17 @@ -## GCC version (commented out) -## filein = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/ -## filein = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/ -## LDLIBS = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran +## AMD AOCC version with AOCL (Optimized for AMD EPYC Zen 4) -## Intel oneAPI version with oneMKL (Optimized for performance) -filein = -I/usr/include/ -I${MKLROOT}/include +## AOCL root path for includes and libraries +AOCL_ROOT ?= /home/gh0s7/AOCC/aocl/5.2.0/aocc -## Using sequential MKL (OpenMP disabled for better single-threaded performance) -## Added -lifcore for Intel Fortran runtime and -limf for Intel math library -LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5 +## AOCC-built OpenMPI prefix +OMPI_PREFIX ?= /home/gh0s7/AOCC/aocc-openmpi -## Memory allocator switch -## 1 (default) : link Intel oneTBB allocator (libtbbmalloc) -## 0 : use system default allocator (ptmalloc) -USE_TBBMALLOC ?= 1 -TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so -ifneq ($(wildcard $(TBBMALLOC_SO)),) -TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed 
-else -TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed -endif -ifeq ($(USE_TBBMALLOC),1) -LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS) -endif +filein = -I/usr/include/ -I$(AOCL_ROOT)/include -## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags) -## opt : (default) maximum performance with PGO profile-guided optimization -## instrument : PGO Phase 1 instrumentation to collect fresh profile data -PGO_MODE ?= opt +## Using AOCL BLIS + libFLAME for BLAS/LAPACK +## AOCC Fortran runtime: -lflang (includes FortranRuntime) +## AOCC OpenMP runtime: -lomp (LLVM OpenMP) +LDLIBS = -L$(AOCL_ROOT)/lib -lblis -lflame -lamdlibm -lflang -lpgmath -lpthread -lm -ldl -lomp ## Interp_Points load balance profiling mode ## off : (default) no load balance instrumentation @@ -65,11 +49,11 @@ USE_TRANSFER_CACHE ?= auto ## 0 : use original Fortran rungekutta4_rout.o USE_CXX_RK4 ?= 1 -f90 = ifx -f77 = ifx -CXX = icpx -CC = icx -CLINKER = mpiicpx +f90 = flang +f77 = flang +CXX = clang++ +CC = clang +CLINKER = $(OMPI_PREFIX)/bin/mpicxx Cu = nvcc CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include