Add conservative fmisc safe mode

This commit is contained in:
2026-05-17 00:35:17 +08:00
parent 3806e82c47
commit 6f62e75245
3 changed files with 140 additions and 49 deletions

View File

@@ -312,8 +312,8 @@
end function decide3d end function decide3d
!--------------------------------------------------------------------------------------- !---------------------------------------------------------------------------------------
subroutine symmetry_bd(ord,extc,func,funcc,SoA) subroutine symmetry_bd(ord,extc,func,funcc,SoA)
implicit none implicit none
!~~~~~~> input arguments !~~~~~~> input arguments
integer,intent(in) :: ord integer,intent(in) :: ord
@@ -322,9 +322,12 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
real*8, dimension(-ord+1:extc(1),-ord+1:extc(2),-ord+1:extc(3)),intent(out):: funcc real*8, dimension(-ord+1:extc(1),-ord+1:extc(2),-ord+1:extc(3)),intent(out):: funcc
real*8, dimension(1:3), intent(in) :: SoA real*8, dimension(1:3), intent(in) :: SoA
integer::i integer::i
funcc(1:extc(1),1:extc(2),1:extc(3)) = func #if USE_FMISC_SAFE_MODE
funcc = 0.d0
#endif
funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1 do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1) funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
enddo enddo
@@ -337,8 +340,8 @@ subroutine symmetry_bd(ord,extc,func,funcc,SoA)
end subroutine symmetry_bd end subroutine symmetry_bd
subroutine symmetry_tbd(ord,extc,func,funcc,SoA) subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
implicit none implicit none
!~~~~~~> input arguments !~~~~~~> input arguments
integer,intent(in) :: ord integer,intent(in) :: ord
@@ -347,9 +350,12 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
real*8, dimension(-ord+1:extc(1)+ord,-ord+1:extc(2)+ord,-ord+1:extc(3)+ord),intent(out):: funcc real*8, dimension(-ord+1:extc(1)+ord,-ord+1:extc(2)+ord,-ord+1:extc(3)+ord),intent(out):: funcc
real*8, dimension(1:3), intent(in) :: SoA real*8, dimension(1:3), intent(in) :: SoA
integer::i integer::i
funcc(1:extc(1),1:extc(2),1:extc(3)) = func #if USE_FMISC_SAFE_MODE
funcc = 0.d0
#endif
funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1 do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1) funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
funcc(extc(1)+1+i,1:extc(2),1:extc(3)) = funcc(extc(1)-1-i,1:extc(2),1:extc(3))*SoA(1) funcc(extc(1)+1+i,1:extc(2),1:extc(3)) = funcc(extc(1)-1-i,1:extc(2),1:extc(3))*SoA(1)
@@ -365,8 +371,8 @@ subroutine symmetry_tbd(ord,extc,func,funcc,SoA)
end subroutine symmetry_tbd end subroutine symmetry_tbd
subroutine symmetry_stbd(ord,extc,func,funcc,SoA) subroutine symmetry_stbd(ord,extc,func,funcc,SoA)
implicit none implicit none
!~~~~~~> input arguments !~~~~~~> input arguments
integer,intent(in) :: ord integer,intent(in) :: ord
@@ -375,9 +381,12 @@ subroutine symmetry_stbd(ord,extc,func,funcc,SoA)
real*8, dimension(-ord+1:extc(1)+ord,-ord+1:extc(2)+ord,extc(3)),intent(out):: funcc real*8, dimension(-ord+1:extc(1)+ord,-ord+1:extc(2)+ord,extc(3)),intent(out):: funcc
real*8, dimension(2), intent(in) :: SoA real*8, dimension(2), intent(in) :: SoA
integer::i integer::i
funcc(1:extc(1),1:extc(2),1:extc(3)) = func #if USE_FMISC_SAFE_MODE
funcc = 0.d0
#endif
funcc(1:extc(1),1:extc(2),1:extc(3)) = func
do i=0,ord-1 do i=0,ord-1
funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1) funcc(-i,1:extc(2),1:extc(3)) = funcc(i+2,1:extc(2),1:extc(3))*SoA(1)
funcc(extc(1)+1+i,1:extc(2),1:extc(3)) = funcc(extc(1)-1-i,1:extc(2),1:extc(3))*SoA(1) funcc(extc(1)+1+i,1:extc(2),1:extc(3)) = funcc(extc(1)-1-i,1:extc(2),1:extc(3))*SoA(1)
@@ -1119,6 +1128,10 @@ end subroutine d2dump
#define POLINT6_USE_BARYCENTRIC 1 #define POLINT6_USE_BARYCENTRIC 1
#endif #endif
#ifndef USE_FMISC_SAFE_MODE
#define USE_FMISC_SAFE_MODE 0
#endif
!DIR$ ATTRIBUTES FORCEINLINE :: polint6_neville !DIR$ ATTRIBUTES FORCEINLINE :: polint6_neville
subroutine polint6_neville(xa, ya, x, y, dy) subroutine polint6_neville(xa, ya, x, y, dy)
implicit none implicit none
@@ -1271,7 +1284,9 @@ end subroutine d2dump
real*8 :: dif, dift, hp, h, den_val real*8 :: dif, dift, hp, h, den_val
if (ordn == 6) then if (ordn == 6) then
#if POLINT6_USE_BARYCENTRIC #if USE_FMISC_SAFE_MODE
call polint6_neville(xa, ya, x, y, dy)
#elif POLINT6_USE_BARYCENTRIC
call polint6_barycentric(xa, ya, x, y, dy) call polint6_barycentric(xa, ya, x, y, dy)
#else #else
call polint6_neville(xa, ya, x, y, dy) call polint6_neville(xa, ya, x, y, dy)
@@ -1376,10 +1391,10 @@ end subroutine d2dump
real*8, intent(in) :: x1,x2 real*8, intent(in) :: x1,x2
real*8, intent(out) :: y,dy real*8, intent(out) :: y,dy
#ifdef POLINT_LEGACY_ORDER #if USE_FMISC_SAFE_MODE || defined(POLINT_LEGACY_ORDER)
integer :: i,m integer :: i,m
real*8, dimension(ordn) :: ymtmp real*8, dimension(ordn) :: ymtmp
real*8, dimension(ordn) :: yntmp real*8, dimension(ordn) :: yntmp
m=size(x1a) m=size(x1a)
do i=1,m do i=1,m
@@ -1414,7 +1429,7 @@ end subroutine d2dump
real*8, intent(in) :: x1,x2,x3 real*8, intent(in) :: x1,x2,x3
real*8, intent(out) :: y,dy real*8, intent(out) :: y,dy
#ifdef POLINT_LEGACY_ORDER #if USE_FMISC_SAFE_MODE || defined(POLINT_LEGACY_ORDER)
integer :: i,j,m,n integer :: i,j,m,n
real*8, dimension(ordn,ordn) :: yatmp real*8, dimension(ordn,ordn) :: yatmp
real*8, dimension(ordn) :: ymtmp real*8, dimension(ordn) :: ymtmp
@@ -1502,12 +1517,23 @@ if(dabs(X(1)-xmin) < dX) imin = 1
if(dabs(Y(1)-ymin) < dY) jmin = 1 if(dabs(Y(1)-ymin) < dY) jmin = 1
if(dabs(Z(1)-zmin) < dZ) kmin = 1 if(dabs(Z(1)-zmin) < dZ) kmin = 1
! Optimized with oneMKL BLAS DDOT for dot product #if USE_FMISC_SAFE_MODE
n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1) f_out = 0.d0
allocate(f_flat(n_elements)) do k = kmin, kmax
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements]) do j = jmin, jmax
f_out = DDOT(n_elements, f_flat, 1, f_flat, 1) do i = imin, imax
deallocate(f_flat) f_out = f_out + f(i,j,k)*f(i,j,k)
end do
end do
end do
#else
! Optimized with oneMKL BLAS DDOT for dot product
n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
allocate(f_flat(n_elements))
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
deallocate(f_flat)
#endif
f_out = f_out*dX*dY*dZ f_out = f_out*dX*dY*dZ
@@ -1565,7 +1591,9 @@ if(dabs(Z(1)-zmin) < dZ) kmin = 1
do k=kmin,kmax do k=kmin,kmax
do j=jmin,jmax do j=jmin,jmax
#if !USE_FMISC_SAFE_MODE
!DIR$ SIMD REDUCTION(+:s1,s2,s3,s4,s5,s6,s7) !DIR$ SIMD REDUCTION(+:s1,s2,s3,s4,s5,s6,s7)
#endif
do i=imin,imax do i=imin,imax
s1 = s1 + f1(i,j,k)*f1(i,j,k) s1 = s1 + f1(i,j,k)*f1(i,j,k)
s2 = s2 + f2(i,j,k)*f2(i,j,k) s2 = s2 + f2(i,j,k)*f2(i,j,k)
@@ -1672,12 +1700,23 @@ if(Symmetry==2)then
if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1 if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
endif endif
! Optimized with oneMKL BLAS DDOT for dot product #if USE_FMISC_SAFE_MODE
n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1) f_out = 0.d0
allocate(f_flat(n_elements)) do k = kmin, kmax
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements]) do j = jmin, jmax
f_out = DDOT(n_elements, f_flat, 1, f_flat, 1) do i = imin, imax
deallocate(f_flat) f_out = f_out + f(i,j,k)*f(i,j,k)
end do
end do
end do
#else
! Optimized with oneMKL BLAS DDOT for dot product
n_elements = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
allocate(f_flat(n_elements))
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [n_elements])
f_out = DDOT(n_elements, f_flat, 1, f_flat, 1)
deallocate(f_flat)
#endif
f_out = f_out*dX*dY*dZ f_out = f_out*dX*dY*dZ
@@ -1769,12 +1808,23 @@ if(Symmetry==2)then
if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1 if(dabs(ymin+gw*dY)<dY.and.Y(1)<0.d0) jmin = gw+1
endif endif
! Optimized with oneMKL BLAS DDOT for dot product #if USE_FMISC_SAFE_MODE
Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1) f_out = 0.d0
allocate(f_flat(Nout)) do k = kmin, kmax
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [Nout]) do j = jmin, jmax
f_out = DDOT(Nout, f_flat, 1, f_flat, 1) do i = imin, imax
deallocate(f_flat) f_out = f_out + f(i,j,k)*f(i,j,k)
end do
end do
end do
#else
! Optimized with oneMKL BLAS DDOT for dot product
Nout = (imax-imin+1)*(jmax-jmin+1)*(kmax-kmin+1)
allocate(f_flat(Nout))
f_flat = reshape(f(imin:imax,jmin:jmax,kmin:kmax), [Nout])
f_out = DDOT(Nout, f_flat, 1, f_flat, 1)
deallocate(f_flat)
#endif
return return
@@ -1878,13 +1928,23 @@ deallocate(f_flat)
real*8,parameter::C1=3.d0/8.d0,C2=3.d0/4.d0,C3=-1.d0/8.d0 real*8,parameter::C1=3.d0/8.d0,C2=3.d0/4.d0,C3=-1.d0/8.d0
integer :: i,j,k integer :: i,j,k
#if USE_FMISC_SAFE_MODE
do k=1,ext(3)
do j=1,ext(2)
do i=1,ext(1)
fout(i,j,k) = C1*f1(i,j,k)+C2*f2(i,j,k)+C3*f3(i,j,k)
enddo
enddo
enddo
#else
do concurrent (k=1:ext(3), j=1:ext(2), i=1:ext(1)) do concurrent (k=1:ext(3), j=1:ext(2), i=1:ext(1))
fout(i,j,k) = C1*f1(i,j,k)+C2*f2(i,j,k)+C3*f3(i,j,k) fout(i,j,k) = C1*f1(i,j,k)+C2*f2(i,j,k)+C3*f3(i,j,k)
end do end do
#endif
return return
end subroutine average2 end subroutine average2
!----------------------------------------------------------------------------- !-----------------------------------------------------------------------------
subroutine average2p(ext,f1,f2,f3,fout) subroutine average2p(ext,f1,f2,f3,fout)
implicit none implicit none
@@ -2024,8 +2084,15 @@ deallocate(f_flat)
tmp1 = tmp1 + coef(ORDN+m)*tmp2(:,m) tmp1 = tmp1 + coef(ORDN+m)*tmp2(:,m)
enddo enddo
! Third dimension: x-direction weighted sum using BLAS DDOT #if USE_FMISC_SAFE_MODE
f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1) f_int = 0.d0
do m = 1, ORDN
f_int = f_int + coef(m) * tmp1(m)
end do
#else
! Third dimension: x-direction weighted sum using BLAS DDOT
f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
#endif
return return
@@ -2091,8 +2158,15 @@ deallocate(f_flat)
tmp1 = tmp1 + coef(ORDN+m)*ya(:,m) tmp1 = tmp1 + coef(ORDN+m)*ya(:,m)
enddo enddo
! Use BLAS DDOT for final weighted sum #if USE_FMISC_SAFE_MODE
f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1) f_int = 0.d0
do m = 1, ORDN
f_int = f_int + coef(m) * tmp1(m)
end do
#else
! Use BLAS DDOT for final weighted sum
f_int = DDOT(ORDN, coef(1:ORDN), 1, tmp1, 1)
#endif
return return
@@ -2184,8 +2258,15 @@ deallocate(f_flat)
write(*,*)"error in global_interpind1d, not recognized dumyd = ",dumyd write(*,*)"error in global_interpind1d, not recognized dumyd = ",dumyd
endif endif
! Optimized with BLAS DDOT for weighted sum #if USE_FMISC_SAFE_MODE
f_int = DDOT(ORDN, coef, 1, ya, 1) f_int = 0.d0
do m = 1, ORDN
f_int = f_int + coef(m) * ya(m)
end do
#else
! Optimized with BLAS DDOT for weighted sum
f_int = DDOT(ORDN, coef, 1, ya, 1)
#endif
return return

View File

@@ -55,6 +55,7 @@ EM_KERNEL_FLAG = -DBSSN_USE_EM_C_KERNEL=$(EFFECTIVE_USE_CXX_EM_KERNEL)
## 0 : fallback to Neville path ## 0 : fallback to Neville path
POLINT6_USE_BARY ?= 1 POLINT6_USE_BARY ?= 1
POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY) POLINT6_FLAG = -DPOLINT6_USE_BARYCENTRIC=$(POLINT6_USE_BARY)
FMISC_SAFE_FLAG = -DUSE_FMISC_SAFE_MODE=$(USE_FMISC_SAFE_MODE)
TRANSFER_CACHE_FLAG = -DBSSN_USE_TRANSFER_CACHE=$(EFFECTIVE_USE_TRANSFER_CACHE) TRANSFER_CACHE_FLAG = -DBSSN_USE_TRANSFER_CACHE=$(EFFECTIVE_USE_TRANSFER_CACHE)
ESCALAR_KERNEL_FLAG = -DBSSN_USE_ESCALAR_C_KERNEL=$(EFFECTIVE_USE_CXX_ESCALAR_KERNEL) ESCALAR_KERNEL_FLAG = -DBSSN_USE_ESCALAR_C_KERNEL=$(EFFECTIVE_USE_CXX_ESCALAR_KERNEL)
@@ -67,9 +68,11 @@ ifeq ($(PGO_MODE),instrument)
## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability ## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
CXXAPPFLAGS = -O3 -march=x86-64-v4 -fma -fprofile-instr-generate -ipo \ CXXAPPFLAGS = -O3 -march=x86-64-v4 -fma -fprofile-instr-generate -ipo \
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) \ -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) \
$(FMISC_SAFE_FLAG) \
$(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) $(EM_KERNEL_FLAG) $(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) $(EM_KERNEL_FLAG)
f90appflags = -O3 -march=x86-64-v4 -fma -fprofile-instr-generate -ipo \ f90appflags = -O3 -march=x86-64-v4 -fma -fprofile-instr-generate -ipo \
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG) -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG) \
$(FMISC_SAFE_FLAG)
else else
## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \ ## opt (default): maximum performance with PGO profile data -fprofile-instr-use=$(PROFDATA) \
## PGO has been turned off, now tested and found to be negative optimization ## PGO has been turned off, now tested and found to be negative optimization
@@ -78,9 +81,11 @@ else
CXXAPPFLAGS = -O3 -march=x86-64-v4 -fp-model fast=2 -fma -ipo \ CXXAPPFLAGS = -O3 -march=x86-64-v4 -fp-model fast=2 -fma -ipo \
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) \ -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) \
$(FMISC_SAFE_FLAG) \
$(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) $(EM_KERNEL_FLAG) $(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) $(EM_KERNEL_FLAG)
f90appflags = -O3 -march=x86-64-v4 -fp-model fast=2 -fma -ipo \ f90appflags = -O3 -march=x86-64-v4 -fp-model fast=2 -fma -ipo \
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG) -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG) \
$(FMISC_SAFE_FLAG)
endif endif
.SUFFIXES: .o .f90 .C .for .cu .SUFFIXES: .o .f90 .C .for .cu

View File

@@ -76,6 +76,11 @@ USE_TRANSFER_CACHE ?= auto
## 0 (default): use original Fortran rungekutta4_rout.o ## 0 (default): use original Fortran rungekutta4_rout.o
USE_CXX_RK4 ?= 0 USE_CXX_RK4 ?= 0
## fmisc conservative mode switch
## 1 : restore lower-optimization / legacy fmisc numerics
## 0 (default): keep the optimized fmisc paths
USE_FMISC_SAFE_MODE ?= 0
f90 = ifx f90 = ifx
f77 = ifx f77 = ifx
CXX = icpx CXX = icpx