From f345b0e520402cc5e97a0a102560573353e29fb3 Mon Sep 17 00:00:00 2001 From: ianchb Date: Sat, 7 Feb 2026 14:46:46 +0800 Subject: [PATCH] Performance optimization for the TwoPunctures module * Re-enabled OpenMP. 1. Batch spectral derivatives (Chebyshev & Fourier) via precomputed matrices: Chebyshev/Fourier transforms and derivatives are precomputed as explicit physical-space operator matrices. Batch DGEMM now applies to entire tensor fields, mathematically identical to original per-line transforms but vastly faster. 2. Gauss-Seidel relaxation & tridiagonal solver workspace reuse: Per-thread reusable workspaces replace per-call heap new/delete in all tridiagonal and relaxation routines. 3. Efficient OpenMP multithreading throughout relaxation/deriv: relax_omp and friends parallelize over grouped lines/planes, maximizing threading efficiency and memory independence. Co-authored-by: copilot-swe-agent[bot] <198982749+copilot@users.noreply.github.com> --- AMSS_NCKU_source/TwoPunctures.C | 1074 +++++++++++++++++++++++++------ AMSS_NCKU_source/TwoPunctures.h | 53 +- AMSS_NCKU_source/makefile.inc | 5 +- 3 files changed, 917 insertions(+), 215 deletions(-) diff --git a/AMSS_NCKU_source/TwoPunctures.C b/AMSS_NCKU_source/TwoPunctures.C index a5b0c85..ea84474 100644 --- a/AMSS_NCKU_source/TwoPunctures.C +++ b/AMSS_NCKU_source/TwoPunctures.C @@ -60,6 +60,10 @@ TwoPunctures::TwoPunctures(double mp, double mm, double b, F = dvector(0, ntotal - 1); allocate_derivs(&u, ntotal); allocate_derivs(&v, ntotal); + D1_A = NULL; D2_A = NULL; D1_B = NULL; D2_B = NULL; + DF1_phi = NULL; DF2_phi = NULL; + precompute_derivative_matrices(); + allocate_workspace(); } TwoPunctures::~TwoPunctures() @@ -67,6 +71,13 @@ TwoPunctures::~TwoPunctures() free_dvector(F, 0, ntotal - 1); free_derivs(&u, ntotal); free_derivs(&v, ntotal); + free_workspace(); + if (D1_A) delete[] D1_A; + if (D2_A) delete[] D2_A; + if (D1_B) delete[] D1_B; + if (D2_B) delete[] D2_B; + if (DF1_phi) delete[] DF1_phi; + if (DF2_phi) delete[] DF2_phi; } void TwoPunctures::Solve() @@ -303,7 +314,7 @@ void TwoPunctures::set_initial_guess(derivs v) v.d0[indx] = 0; // set initial guess 0 v.d0[indx] /= (-cos(Pih * (2 * i + 1) / n1) - 1.0); // PRD 70, 064011 (2004) Eq.(5), from u to U } - Derivatives_AB3(nvar, n1, n2, n3, v); + Derivatives_AB3_MatMul(nvar, n1, n2, n3, v); if (0) { debug_file = fopen("initial.dat", "w"); @@ -1284,9 +1295,6 @@ void TwoPunctures::F_of_v(int nvar, int n1, int n2, int n3, derivs v, double *F, derivs U; double *sources; - values = dvector(0, nvar - 1); - allocate_derivs(&U, nvar); - sources = (double *)calloc(n1 * n2 * n3, sizeof(double)); if (0) { @@ -1343,7 +1351,7 @@ void TwoPunctures::F_of_v(int nvar, int n1, int n2, int n3, derivs v, double *F, for (k = 0; k < n3; k++) sources[Index(0, i, j, k, 1, n1, n2, n3)] = 0.0; - Derivatives_AB3(nvar, n1, n2, n3, v); + Derivatives_AB3_MatMul(nvar, n1, n2, n3, v); double psi, psi2, psi4, psi7, r_plus, r_minus; FILE *debugfile = NULL; if (0) @@ -1351,12 +1359,22 @@ void TwoPunctures::F_of_v(int nvar, int n1, int n2, int n3, derivs v, double *F, debugfile = fopen("res.dat", "w"); assert(debugfile); } + #pragma omp parallel for collapse(3) schedule(static) \ + private(i, j, k, ivar, indx, al, be, A, B, X, R, x, r, phi, y, z, Am1, \ + psi, psi2, psi4, psi7, r_plus, r_minus) for (i = 0; i < n1; i++) { for (j = 0; j < n2; j++) { for (k = 0; k < n3; k++) { + double l_values[1]; // nvar=1, stack-allocated + derivs l_U; + double l_U_d0[1], l_U_d1[1], l_U_d2[1], l_U_d3[1]; + double l_U_d11[1], l_U_d12[1], l_U_d13[1], l_U_d22[1], l_U_d23[1], l_U_d33[1]; + l_U.d0 = l_U_d0; l_U.d1 = l_U_d1; l_U.d2 = l_U_d2; l_U.d3 = l_U_d3; + l_U.d11 = l_U_d11; l_U.d12 = l_U_d12; l_U.d13 = l_U_d13; + l_U.d22 = l_U_d22; l_U.d23 = l_U_d23; l_U.d33 = l_U_d33; al = Pih * (2 * i + 1) / n1; A = -cos(al); @@ -1368,72 +1386,36 @@ void TwoPunctures::F_of_v(int nvar, int n1, int n2, int n3, derivs v, double *F, for (ivar = 0; ivar < nvar; ivar++) { indx = Index(ivar, i, j, k, nvar, n1, n2, n3); - U.d0[ivar] = Am1 * v.d0[indx]; /* U*/ - U.d1[ivar] = v.d0[indx] + Am1 * v.d1[indx]; /* U_A*/ - U.d2[ivar] = Am1 * v.d2[indx]; /* U_B*/ - U.d3[ivar] = Am1 * v.d3[indx]; /* U_3*/ - U.d11[ivar] = 2 * v.d1[indx] + Am1 * v.d11[indx]; /* U_AA*/ - U.d12[ivar] = v.d2[indx] + Am1 * v.d12[indx]; /* U_AB*/ - U.d13[ivar] = v.d3[indx] + Am1 * v.d13[indx]; /* U_AB*/ - U.d22[ivar] = Am1 * v.d22[indx]; /* U_BB*/ - U.d23[ivar] = Am1 * v.d23[indx]; /* U_B3*/ - U.d33[ivar] = Am1 * v.d33[indx]; /* U_33*/ + l_U.d0[ivar] = Am1 * v.d0[indx]; + l_U.d1[ivar] = v.d0[indx] + Am1 * v.d1[indx]; + l_U.d2[ivar] = Am1 * v.d2[indx]; + l_U.d3[ivar] = Am1 * v.d3[indx]; + l_U.d11[ivar] = 2 * v.d1[indx] + Am1 * v.d11[indx]; + l_U.d12[ivar] = v.d2[indx] + Am1 * v.d12[indx]; + l_U.d13[ivar] = v.d3[indx] + Am1 * v.d13[indx]; + l_U.d22[ivar] = Am1 * v.d22[indx]; + l_U.d23[ivar] = Am1 * v.d23[indx]; + l_U.d33[ivar] = Am1 * v.d33[indx]; } - /* Calculation of (X,R) and*/ - /* (U_X, U_R, U_3, U_XX, U_XR, U_X3, U_RR, U_R3, U_33)*/ - AB_To_XR(nvar, A, B, &X, &R, U); - /* Calculation of (x,r) and*/ - /* (U, U_x, U_r, U_3, U_xx, U_xr, U_x3, U_rr, U_r3, U_33)*/ - C_To_c(nvar, X, R, &x, &r, U); - /* Calculation of (y,z) and*/ - /* (U, U_x, U_y, U_z, U_xx, U_xy, U_xz, U_yy, U_yz, U_zz)*/ - rx3_To_xyz(nvar, x, r, phi, &y, &z, U); + AB_To_XR(nvar, A, B, &X, &R, l_U); + C_To_c(nvar, X, R, &x, &r, l_U); + rx3_To_xyz(nvar, x, r, phi, &y, &z, l_U); NonLinEquations(sources[Index(0, i, j, k, 1, n1, n2, n3)], - A, B, X, R, x, r, phi, y, z, U, values); + A, B, X, R, x, r, phi, y, z, l_U, l_values); for (ivar = 0; ivar < nvar; ivar++) { indx = Index(ivar, i, j, k, nvar, n1, n2, n3); - F[indx] = values[ivar] * FAC; - /* if ((i<5) && ((j<5) || (j>n2-5)))*/ - /* F[indx] = 0.0;*/ - u.d0[indx] = U.d0[ivar]; /* U*/ - u.d1[indx] = U.d1[ivar]; /* U_x*/ - u.d2[indx] = U.d2[ivar]; /* U_y*/ - u.d3[indx] = U.d3[ivar]; /* U_z*/ - u.d11[indx] = U.d11[ivar]; /* U_xx*/ - u.d12[indx] = U.d12[ivar]; /* U_xy*/ - u.d13[indx] = U.d13[ivar]; /* U_xz*/ - u.d22[indx] = U.d22[ivar]; /* U_yy*/ - u.d23[indx] = U.d23[ivar]; /* U_yz*/ - u.d33[indx] = U.d33[ivar]; /* U_zz*/ - } - if (debugfile && (k == 0)) - { - r_plus = sqrt((x - par_b) * (x - par_b) + y * y + z * z); - r_minus = sqrt((x + par_b) * (x + par_b) + y * y + z * z); - psi = 1. + - 0.5 * par_m_plus / r_plus + - 0.5 * par_m_minus / r_minus + - U.d0[0]; - psi2 = psi * psi; - psi4 = psi2 * psi2; - psi7 = psi * psi2 * psi4; - fprintf(debugfile, - "%.16g %.16g %.16g %.16g %.16g %.16g %.16g %.16g\n", - (double)x, (double)y, (double)A, (double)B, - (double)(U.d11[0] + - U.d22[0] + - U.d33[0] + - /* 0.125 * BY_KKofxyz (x, y, z) / psi7 +*/ - (2.0 * Pi / psi2 / psi * sources[indx]) * FAC), - (double)((U.d11[0] + - U.d22[0] + - U.d33[0]) * - FAC), - (double)(-(2.0 * Pi / psi2 / psi * sources[indx]) * FAC), - (double)sources[indx] - /*(double)F[indx]*/ - ); + F[indx] = l_values[ivar] * sin(al) * sin(be) * sin(al) * sin(be) * sin(al) * sin(be); + u.d0[indx] = l_U.d0[ivar]; + u.d1[indx] = l_U.d1[ivar]; + u.d2[indx] = l_U.d2[ivar]; + u.d3[indx] = l_U.d3[ivar]; + u.d11[indx] = l_U.d11[ivar]; + u.d12[indx] = l_U.d12[ivar]; + u.d13[indx] = l_U.d13[ivar]; + u.d22[indx] = l_U.d22[ivar]; + u.d23[indx] = l_U.d23[ivar]; + u.d33[indx] = l_U.d33[ivar]; } } } @@ -1443,8 +1425,6 @@ void TwoPunctures::F_of_v(int nvar, int n1, int n2, int n3, derivs v, double *F, fclose(debugfile); } free(sources); - free_dvector(values, 0, nvar - 1); - free_derivs(&U, nvar); } /* --------------------------------------------------------------------------*/ double TwoPunctures::norm_inf(double const *F, int const ntotal) @@ -1544,7 +1524,7 @@ int TwoPunctures::bicgstab(int const nvar, int const n1, int const n2, int const for (int j = 0; j < ntotal; j++) ph.d0[j] = 0; for (int j = 0; j < NRELAX; j++) /* solves JFD*ph = p by relaxation*/ - relax(ph.d0, nvar, n1, n2, n3, p, ncols, cols, JFD); + relax_omp(ph.d0, nvar, n1, n2, n3, p, ncols, cols, JFD); J_times_dv(nvar, n1, n2, n3, ph, vv, u); /* vv=J*ph*/ alpha = rho / scalarproduct(rt, vv, ntotal); @@ -1570,7 +1550,7 @@ int TwoPunctures::bicgstab(int const nvar, int const n1, int const n2, int const for (int j = 0; j < ntotal; j++) sh.d0[j] = 0; for (int j = 0; j < NRELAX; j++) /* solves JFD*sh = s by relaxation*/ - relax(sh.d0, nvar, n1, n2, n3, s, ncols, cols, JFD); + relax_omp(sh.d0, nvar, n1, n2, n3, s, ncols, cols, JFD); J_times_dv(nvar, n1, n2, n3, sh, t, u); /* t=J*sh*/ omega = scalarproduct(t, s, ntotal) / scalarproduct(t, t, ntotal); @@ -1845,21 +1825,32 @@ void TwoPunctures::J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, doubl /* (u.d1[], u.d2[], u.d3[], u.d11[], u.d12[], u.d13[], u.d22[], u.d23[], u.d33[])*/ /* at interior points and at the boundaries "+/-"*/ int i, j, k, ivar, indx; - double al, be, A, B, X, R, x, r, phi, y, z, Am1, *values; - derivs dU, U; + double al, be, A, B, X, R, x, r, phi, y, z, Am1; - Derivatives_AB3(nvar, n1, n2, n3, dv); + Derivatives_AB3_MatMul(nvar, n1, n2, n3, dv); + #pragma omp parallel for schedule(static) \ + private(j, k, ivar, indx, al, be, A, B, X, R, x, r, phi, y, z, Am1) for (i = 0; i < n1; i++) { - values = dvector(0, nvar - 1); - allocate_derivs(&dU, nvar); - allocate_derivs(&U, nvar); + // Thread-local derivs on stack (nvar=1) + double l_val[1]; + double l_dU_d0[1], l_dU_d1[1], l_dU_d2[1], l_dU_d3[1]; + double l_dU_d11[1], l_dU_d12[1], l_dU_d13[1], l_dU_d22[1], l_dU_d23[1], l_dU_d33[1]; + double l_U_d0[1], l_U_d1[1], l_U_d2[1], l_U_d3[1]; + double l_U_d11[1], l_U_d12[1], l_U_d13[1], l_U_d22[1], l_U_d23[1], l_U_d33[1]; + derivs l_dU, l_U; + l_dU.d0=l_dU_d0; l_dU.d1=l_dU_d1; l_dU.d2=l_dU_d2; l_dU.d3=l_dU_d3; + l_dU.d11=l_dU_d11; l_dU.d12=l_dU_d12; l_dU.d13=l_dU_d13; + l_dU.d22=l_dU_d22; l_dU.d23=l_dU_d23; l_dU.d33=l_dU_d33; + l_U.d0=l_U_d0; l_U.d1=l_U_d1; l_U.d2=l_U_d2; l_U.d3=l_U_d3; + l_U.d11=l_U_d11; l_U.d12=l_U_d12; l_U.d13=l_U_d13; + l_U.d22=l_U_d22; l_U.d23=l_U_d23; l_U.d33=l_U_d33; + for (j = 0; j < n2; j++) { for (k = 0; k < n3; k++) { - al = Pih * (2 * i + 1) / n1; A = -cos(al); be = Pih * (2 * j + 1) / n2; @@ -1870,104 +1861,193 @@ void TwoPunctures::J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, doubl for (ivar = 0; ivar < nvar; ivar++) { indx = Index(ivar, i, j, k, nvar, n1, n2, n3); - dU.d0[ivar] = Am1 * dv.d0[indx]; /* dU*/ - dU.d1[ivar] = dv.d0[indx] + Am1 * dv.d1[indx]; /* dU_A*/ - dU.d2[ivar] = Am1 * dv.d2[indx]; /* dU_B*/ - dU.d3[ivar] = Am1 * dv.d3[indx]; /* dU_3*/ - dU.d11[ivar] = 2 * dv.d1[indx] + Am1 * dv.d11[indx]; /* dU_AA*/ - dU.d12[ivar] = dv.d2[indx] + Am1 * dv.d12[indx]; /* dU_AB*/ - dU.d13[ivar] = dv.d3[indx] + Am1 * dv.d13[indx]; /* dU_AB*/ - dU.d22[ivar] = Am1 * dv.d22[indx]; /* dU_BB*/ - dU.d23[ivar] = Am1 * dv.d23[indx]; /* dU_B3*/ - dU.d33[ivar] = Am1 * dv.d33[indx]; /* dU_33*/ - U.d0[ivar] = u.d0[indx]; /* U */ - U.d1[ivar] = u.d1[indx]; /* U_x*/ - U.d2[ivar] = u.d2[indx]; /* U_y*/ - U.d3[ivar] = u.d3[indx]; /* U_z*/ - U.d11[ivar] = u.d11[indx]; /* U_xx*/ - U.d12[ivar] = u.d12[indx]; /* U_xy*/ - U.d13[ivar] = u.d13[indx]; /* U_xz*/ - U.d22[ivar] = u.d22[indx]; /* U_yy*/ - U.d23[ivar] = u.d23[indx]; /* U_yz*/ - U.d33[ivar] = u.d33[indx]; /* U_zz*/ + l_dU.d0[ivar] = Am1 * dv.d0[indx]; + l_dU.d1[ivar] = dv.d0[indx] + Am1 * dv.d1[indx]; + l_dU.d2[ivar] = Am1 * dv.d2[indx]; + l_dU.d3[ivar] = Am1 * dv.d3[indx]; + l_dU.d11[ivar] = 2 * dv.d1[indx] + Am1 * dv.d11[indx]; + l_dU.d12[ivar] = dv.d2[indx] + Am1 * dv.d12[indx]; + l_dU.d13[ivar] = dv.d3[indx] + Am1 * dv.d13[indx]; + l_dU.d22[ivar] = Am1 * dv.d22[indx]; + l_dU.d23[ivar] = Am1 * dv.d23[indx]; + l_dU.d33[ivar] = Am1 * dv.d33[indx]; + l_U.d0[ivar] = u.d0[indx]; + l_U.d1[ivar] = u.d1[indx]; + l_U.d2[ivar] = u.d2[indx]; + l_U.d3[ivar] = u.d3[indx]; + l_U.d11[ivar] = u.d11[indx]; + l_U.d12[ivar] = u.d12[indx]; + l_U.d13[ivar] = u.d13[indx]; + l_U.d22[ivar] = u.d22[indx]; + l_U.d23[ivar] = u.d23[indx]; + l_U.d33[ivar] = u.d33[indx]; } - /* Calculation of (X,R) and*/ - /* (dU_X, dU_R, dU_3, dU_XX, dU_XR, dU_X3, dU_RR, dU_R3, dU_33)*/ - AB_To_XR(nvar, A, B, &X, &R, dU); - /* Calculation of (x,r) and*/ - /* (dU, dU_x, dU_r, dU_3, dU_xx, dU_xr, dU_x3, dU_rr, dU_r3, dU_33)*/ - C_To_c(nvar, X, R, &x, &r, dU); - /* Calculation of (y,z) and*/ - /* (dU, dU_x, dU_y, dU_z, dU_xx, dU_xy, dU_xz, dU_yy, dU_yz, dU_zz)*/ - rx3_To_xyz(nvar, x, r, phi, &y, &z, dU); - LinEquations(A, B, X, R, x, r, phi, y, z, dU, U, values); + AB_To_XR(nvar, A, B, &X, &R, l_dU); + C_To_c(nvar, X, R, &x, &r, l_dU); + rx3_To_xyz(nvar, x, r, phi, &y, &z, l_dU); + LinEquations(A, B, X, R, x, r, phi, y, z, l_dU, l_U, l_val); for (ivar = 0; ivar < nvar; ivar++) { indx = Index(ivar, i, j, k, nvar, n1, n2, n3); - Jdv[indx] = values[ivar] * FAC; + Jdv[indx] = l_val[ivar] * sin(al) * sin(be) * sin(al) * sin(be) * sin(al) * sin(be); } } } - free_dvector(values, 0, nvar - 1); - free_derivs(&dU, nvar); - free_derivs(&U, nvar); } } /* --------------------------------------------------------------------------*/ -void TwoPunctures::relax(double *dv, int const nvar, int const n1, int const n2, int const n3, +/* -------------------------------------------------------------------------- + * relax_omp: OpenMP-parallelized replacement for relax() + * + * Parallelism analysis: + * - The red-black ordering within each phi-plane means that + * same-parity lines in the i-direction are INDEPENDENT of each other + * (they only couple through the j-direction which is solved internally). + * - Similarly, same-parity lines in the j-direction are independent. + * - Different phi-planes (k) with same parity are independent. + * + * Strategy: + * - Parallelize the i-loop within each (k, parity) group for LineRelax_be + * - Parallelize the j-loop within each (k, parity) group for LineRelax_al + * - Each thread uses its own pre-allocated workspace (tid-indexed) + * --------------------------------------------------------------------------*/ +void TwoPunctures::relax_omp(double *dv, int const nvar, int const n1, int const n2, int const n3, double const *rhs, int const *ncols, int **cols, double **JFD) { - int i, j, k, n; + int n; - for (k = 0; k < n3; k = k + 2) - { + // 偶数k平面 for (n = 0; n < N_PlaneRelax; n++) { - for (i = 2; i < n1; i = i + 2) - LineRelax_be(dv, i, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD); - for (i = 1; i < n1; i = i + 2) - LineRelax_be(dv, i, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD); - for (j = 1; j < n2; j = j + 2) - LineRelax_al(dv, j, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD); - for (j = 0; j < n2; j = j + 2) - LineRelax_al(dv, j, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD); + // 偶数i线,所有偶数k —— 不同k平面完全独立 + int n_even_k = (n3 + 1) / 2; // 偶数k的数量 + int n_even_i = (n1 - 2 + 1) / 2; // i=2,4,...的数量 + int total_tasks = n_even_k * n_even_i; + + #pragma omp parallel for schedule(static) + for (int task = 0; task < total_tasks; task++) { + int tid = omp_get_thread_num(); + int ki = task / n_even_i; + int ii = task % n_even_i; + int k = ki * 2; + int i = 2 + ii * 2; + LineRelax_be_omp(dv, i, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid); + } + + // 奇数i线,所有偶数k + int n_odd_i = n1 / 2; // i=1,3,...的数量 + total_tasks = n_even_k * n_odd_i; + + #pragma omp parallel for schedule(static) + for (int task = 0; task < total_tasks; task++) { + int tid = omp_get_thread_num(); + int ki = task / n_odd_i; + int ii = task % n_odd_i; + int k = ki * 2; + int i = 1 + ii * 2; + LineRelax_be_omp(dv, i, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid); + } + + // 奇数j线,所有偶数k + int n_odd_j = (n2 - 1 + 1) / 2; + total_tasks = n_even_k * n_odd_j; + + #pragma omp parallel for schedule(static) + for (int task = 0; task < total_tasks; task++) { + int tid = omp_get_thread_num(); + int ki = task / n_odd_j; + int ji = task % n_odd_j; + int k = ki * 2; + int j = 1 + ji * 2; + LineRelax_al_omp(dv, j, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid); + } + + // 偶数j线,所有偶数k + int n_even_j = (n2 + 1) / 2; + total_tasks = n_even_k * n_even_j; + + #pragma omp parallel for schedule(static) + for (int task = 0; task < total_tasks; task++) { + int tid = omp_get_thread_num(); + int ki = task / n_even_j; + int ji = task % n_even_j; + int k = ki * 2; + int j = ji * 2; + LineRelax_al_omp(dv, j, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid); + } + + // 奇数k平面 — 同样的四步 + int n_odd_k = n3 / 2; + + // 偶数i线,所有奇数k + n_even_i = (n1 + 1) / 2; // i=0,2,... + total_tasks = n_odd_k * n_even_i; + + #pragma omp parallel for schedule(static) + for (int task = 0; task < total_tasks; task++) { + int tid = omp_get_thread_num(); + int ki = task / n_even_i; + int ii = task % n_even_i; + int k = 1 + ki * 2; + int i = ii * 2; + LineRelax_be_omp(dv, i, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid); + } + + // 奇数i线,所有奇数k + total_tasks = n_odd_k * n_odd_i; + + #pragma omp parallel for schedule(static) + for (int task = 0; task < total_tasks; task++) { + int tid = omp_get_thread_num(); + int ki = task / n_odd_i; + int ii = task % n_odd_i; + int k = 1 + ki * 2; + int i = 1 + ii * 2; + LineRelax_be_omp(dv, i, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid); + } + + // 奇数j线,所有奇数k + total_tasks = n_odd_k * n_odd_j; + + #pragma omp parallel for schedule(static) + for (int task = 0; task < total_tasks; task++) { + int tid = omp_get_thread_num(); + int ki = task / n_odd_j; + int ji = task % n_odd_j; + int k = 1 + ki * 2; + int j = 1 + ji * 2; + LineRelax_al_omp(dv, j, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid); + } + + // 偶数j线,所有奇数k + total_tasks = n_odd_k * n_even_j; + + #pragma omp parallel for schedule(static) + for (int task = 0; task < total_tasks; task++) { + int tid = omp_get_thread_num(); + int ki = task / n_even_j; + int ji = task % n_even_j; + int k = 1 + ki * 2; + int j = ji * 2; + LineRelax_al_omp(dv, j, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid); + } } - } - for (k = 1; k < n3; k = k + 2) - { - for (n = 0; n < N_PlaneRelax; n++) - { - for (i = 0; i < n1; i = i + 2) - LineRelax_be(dv, i, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD); - for (i = 1; i < n1; i = i + 2) - LineRelax_be(dv, i, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD); - for (j = 1; j < n2; j = j + 2) - LineRelax_al(dv, j, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD); - for (j = 0; j < n2; j = j + 2) - LineRelax_al(dv, j, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD); - } - } } /* --------------------------------------------------------------------------*/ -void TwoPunctures::LineRelax_be(double *dv, +void TwoPunctures::LineRelax_be_omp(double *dv, int const i, int const k, int const nvar, int const n1, int const n2, int const n3, double const *rhs, int const *ncols, int **cols, - double **JFD) + double **JFD, int tid) { int j, m, Ic, Ip, Im, col, ivar; - double *diag = new double[n2]; - double *e = new double[n2 - 1]; /* above diagonal */ - double *f = new double[n2 - 1]; /* below diagonal */ - double *b = new double[n2]; /* rhs */ - double *x = new double[n2]; /* solution vector */ - - // gsl_vector *diag = gsl_vector_alloc(n2); - // gsl_vector *e = gsl_vector_alloc(n2-1); /* above diagonal */ - // gsl_vector *f = gsl_vector_alloc(n2-1); /* below diagonal */ - // gsl_vector *b = gsl_vector_alloc(n2); /* rhs */ - // gsl_vector *x = gsl_vector_alloc(n2); /* solution vector */ + // Use pre-allocated per-thread workspace instead of new/delete + double *diag = ws_diag_be[tid]; + double *e = ws_e_be[tid]; + double *f = ws_f_be[tid]; + double *b = ws_b_be[tid]; + double *x = ws_x_be[tid]; for (ivar = 0; ivar < nvar; ivar++) { @@ -2007,14 +2087,8 @@ void TwoPunctures::LineRelax_be(double *dv, } } } - // A x = b - // A = ( d_0 e_0 0 0 ) - // ( f_0 d_1 e_1 0 ) - // ( 0 f_1 d_2 e_2 ) - // ( 0 0 f_2 d_3 ) - // - ThomasAlgorithm(n2, f, diag, e, x, b); - // gsl_linalg_solve_tridiag(diag, e, f, b, x); + ThomasAlgorithm_ws(n2, f, diag, e, x, b, + ws_l_be[tid], ws_u_be[tid], ws_d_be[tid], ws_y_be[tid]); for (j = 0; j < n2; j++) { Ic = Index(ivar, i, j, k, nvar, n1, n2, n3); @@ -2022,17 +2096,7 @@ void TwoPunctures::LineRelax_be(double *dv, // dv[Ic] = gsl_vector_get(x, j); } } - - delete[] diag; - delete[] e; - delete[] f; - delete[] b; - delete[] x; - // gsl_vector_free(diag); - // gsl_vector_free(e); - // gsl_vector_free(f); - // gsl_vector_free(b); - // gsl_vector_free(x); + // No delete — workspace is persistent } /* --------------------------------------------------------------------------*/ void TwoPunctures::JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2, @@ -2194,25 +2258,19 @@ void TwoPunctures::LinEquations(double A, double B, double X, double R, values[0] = dU.d11[0] + dU.d22[0] + dU.d33[0] - 0.875 * BY_KKofxyz(x, y, z) / psi8 * dU.d0[0]; } /* -------------------------------------------------------------------------*/ -void TwoPunctures::LineRelax_al(double *dv, +void TwoPunctures::LineRelax_al_omp(double *dv, int const j, int const k, int const nvar, int const n1, int const n2, int const n3, double const *rhs, int const *ncols, - int **cols, double **JFD) + int **cols, double **JFD, int tid) { int i, m, Ic, Ip, Im, col, ivar; - double *diag = new double[n1]; - double *e = new double[n1 - 1]; /* above diagonal */ - double *f = new double[n1 - 1]; /* below diagonal */ - double *b = new double[n1]; /* rhs */ - double *x = new double[n1]; /* solution vector */ - - // gsl_vector *diag = gsl_vector_alloc(n1); - // gsl_vector *e = gsl_vector_alloc(n1-1); /* above diagonal */ - // gsl_vector *f = gsl_vector_alloc(n1-1); /* below diagonal */ - // gsl_vector *b = gsl_vector_alloc(n1); /* rhs */ - // gsl_vector *x = gsl_vector_alloc(n1); /* solution vector */ + double *diag = ws_diag_al[tid]; + double *e = ws_e_al[tid]; + double *f = ws_f_al[tid]; + double *b = ws_b_al[tid]; + double *x = ws_x_al[tid]; for (ivar = 0; ivar < nvar; ivar++) { @@ -2252,8 +2310,8 @@ void TwoPunctures::LineRelax_al(double *dv, } } } - ThomasAlgorithm(n1, f, diag, e, x, b); - // gsl_linalg_solve_tridiag(diag, e, f, b, x); + ThomasAlgorithm_ws(n1, f, diag, e, x, b, + ws_l_al[tid], ws_u_al[tid], ws_d_al[tid], ws_y_al[tid]); for (i = 0; i < n1; i++) { Ic = Index(ivar, i, j, k, nvar, n1, n2, n3); @@ -2261,18 +2319,6 @@ void TwoPunctures::LineRelax_al(double *dv, // dv[Ic] = gsl_vector_get(x, i); } } - - delete[] diag; - delete[] e; - delete[] f; - delete[] b; - delete[] x; - - // gsl_vector_free(diag); - // gsl_vector_free(e); - // gsl_vector_free(f); - // gsl_vector_free(b); - // gsl_vector_free(x); } /* -------------------------------------------------------------------------*/ // a[N], b[N-1], c[N-1], x[N], q[N] @@ -2323,6 +2369,37 @@ void TwoPunctures::ThomasAlgorithm(int N, double *b, double *a, double *c, doubl return; } + +// ThomasAlgorithm with pre-allocated workspace (no new/delete) +// l[N-1], u_ws[N-1], d[N], y[N] are caller-provided workspace +void TwoPunctures::ThomasAlgorithm_ws(int N, double *b, double *a, double *c, + double *x, double *q, + double *l, double *u_ws, double *d, double *y) +{ + /* LU Decomposition */ + d[0] = a[0]; + u_ws[0] = c[0]; + + for (int i = 0; i < N - 2; i++) { + l[i] = b[i] / d[i]; + d[i + 1] = a[i + 1] - l[i] * u_ws[i]; + u_ws[i + 1] = c[i + 1]; + } + + l[N - 2] = b[N - 2] / d[N - 2]; + d[N - 1] = a[N - 1] - l[N - 2] * u_ws[N - 2]; + + /* Forward Substitution [L][y] = [q] */ + y[0] = q[0]; + for (int i = 1; i < N; i++) + y[i] = q[i] - l[i - 1] * y[i - 1]; + + /* Backward Substitution [U][x] = [y] */ + x[N - 1] = y[N - 1] / d[N - 1]; + for (int i = N - 2; i >= 0; i--) + x[i] = (y[i] - u_ws[i] * x[i + 1]) / d[i]; +} + // --------------------------------------------------------------------------*/ // Calculates the value of v at an arbitrary position (x,y,z) if the spectral coefficients are know*/*/ /* --------------------------------------------------------------------------*/ @@ -2512,3 +2589,606 @@ void TwoPunctures::SpecCoef(parameters par, int ivar, double *v, double *cf) free_d3tensor(values3, 0, n1, 0, n2, 0, n3); free_d3tensor(values4, 0, n1, 0, n2, 0, n3); } + +void TwoPunctures::allocate_workspace() +{ + int n1 = npoints_A, n2 = npoints_B, n3 = npoints_phi; + max_threads = omp_get_max_threads(); + printf("Allocating workspace for %d threads\n", max_threads); + + // LineRelax_be workspace: arrays of size n2, per thread + ws_diag_be = new double*[max_threads]; + ws_e_be = new double*[max_threads]; + ws_f_be = new double*[max_threads]; + ws_b_be = new double*[max_threads]; + ws_x_be = new double*[max_threads]; + ws_l_be = new double*[max_threads]; + ws_u_be = new double*[max_threads]; + ws_d_be = new double*[max_threads]; + ws_y_be = new double*[max_threads]; + + // LineRelax_al workspace: arrays of size n1, per thread + ws_diag_al = new double*[max_threads]; + ws_e_al = new double*[max_threads]; + ws_f_al = new double*[max_threads]; + ws_b_al = new double*[max_threads]; + ws_x_al = new double*[max_threads]; + ws_l_al = new double*[max_threads]; + ws_u_al = new double*[max_threads]; + ws_d_al = new double*[max_threads]; + ws_y_al = new double*[max_threads]; + + int N = (n1 > n2) ? n1 : n2; // max of n1, n2 + + for (int t = 0; t < max_threads; t++) { + ws_diag_be[t] = new double[n2]; + ws_e_be[t] = new double[n2]; + ws_f_be[t] = new double[n2]; + ws_b_be[t] = new double[n2]; + ws_x_be[t] = new double[n2]; + ws_l_be[t] = new double[n2]; + ws_u_be[t] = new double[n2]; + ws_d_be[t] = new double[n2]; + ws_y_be[t] = new double[n2]; + + ws_diag_al[t] = new double[n1]; + ws_e_al[t] = new double[n1]; + ws_f_al[t] = new double[n1]; + ws_b_al[t] = new double[n1]; + ws_x_al[t] = new double[n1]; + ws_l_al[t] = new double[n1]; + ws_u_al[t] = new double[n1]; + ws_d_al[t] = new double[n1]; + ws_y_al[t] = new double[n1]; + } +} + +void TwoPunctures::free_workspace() +{ + for (int t = 0; t < max_threads; t++) { + delete[] ws_diag_be[t]; delete[] ws_e_be[t]; delete[] ws_f_be[t]; + delete[] ws_b_be[t]; delete[] ws_x_be[t]; + delete[] ws_l_be[t]; delete[] ws_u_be[t]; + delete[] ws_d_be[t]; delete[] ws_y_be[t]; + + delete[] ws_diag_al[t]; delete[] ws_e_al[t]; delete[] ws_f_al[t]; + delete[] ws_b_al[t]; delete[] ws_x_al[t]; + delete[] ws_l_al[t]; delete[] ws_u_al[t]; + delete[] ws_d_al[t]; delete[] ws_y_al[t]; + } + delete[] ws_diag_be; delete[] ws_e_be; delete[] ws_f_be; + delete[] ws_b_be; delete[] ws_x_be; + delete[] ws_l_be; delete[] ws_u_be; + delete[] ws_d_be; delete[] ws_y_be; + + delete[] ws_diag_al; delete[] ws_e_al; delete[] ws_f_al; + delete[] ws_b_al; delete[] ws_x_al; + delete[] ws_l_al; delete[] ws_u_al; + delete[] ws_d_al; delete[] ws_y_al; +} + +/*========================================================================== + * Precomputed Spectral Derivative Matrices + * + * Mathematical equivalence proof: + * + * Original algorithm (per-line): + * 1. Forward Chebyshev transform: c = T * f (where T is the DCT matrix) + * 2. Spectral derivative: c' = Dhat * c (recurrence relation) + * 3. Inverse transform: f' = T^{-1} * c' + * Combined: f' = T^{-1} * Dhat * T * f = D * f + * + * The matrix D = T^{-1} * Dhat * T is precomputed once. + * Similarly D2 = T^{-1} * Dhat^2 * T for second derivatives. + * + * For Fourier: same idea with DFT matrices and frequency-domain derivatives. + * + * This converts n2*n3 separate O(n1^2) transforms into a single + * (n1 x n1) * (n1 x n2*n3) DGEMM call, which is BLAS Level-3 + * and thus optimally parallelized by MKL. + *=========================================================================*/ + +void TwoPunctures::build_cheb_deriv_matrices(int n, double *D1, double *D2) +{ + /* Build the physical-space derivative matrices for Chebyshev Zeros grid. + * + * Grid points: x_i = -cos(pi*(2i+1)/(2n)), i=0,...,n-1 + * + * Method: Construct T (forward transform), Dhat (spectral derivative), + * T^{-1} (inverse transform), then D1 = T^{-1} * Dhat * T, + * D2 = T^{-1} * Dhat^2 * T. + * + * All matrices are n x n, stored in row-major order: M[i*n+j] + */ + + double *T_fwd = new double[n * n]; // Forward transform matrix + double *T_inv = new double[n * n]; // Inverse transform matrix + double *Dhat = new double[n * n]; // Spectral derivative operator + double *Dhat2 = new double[n * n]; // Spectral second derivative operator + double *tmp1 = new double[n * n]; // Temporary + double *tmp2 = new double[n * n]; // Temporary + + double Pion = Pi / n; + + // Build forward Chebyshev transform matrix T + // c_j = (2/n) * (-1)^j * sum_k f_k * cos(pi*j*(k+0.5)/n) + // So T[j][k] = (2/n) * (-1)^j * cos(pi*j*(k+0.5)/n) + for (int j = 0; j < n; j++) { + double fac = (2.0 / n) * ((j % 2 == 0) ? 1.0 : -1.0); + for (int k = 0; k < n; k++) { + T_fwd[j * n + k] = fac * cos(Pion * j * (k + 0.5)); + } + } + + // Build inverse Chebyshev transform matrix T^{-1} + // f_j = sum_k c_k * cos(pi*(j+0.5)*k/n) * (-1)^k - 0.5*c_0 + // But the -0.5*c_0 term is part of the sum when we write it as: + // f_j = -0.5*c_0 + sum_{k=0}^{n-1} c_k * cos(pi*(j+0.5)*k) * (-1)^k + // T_inv[j][k] = cos(pi*(j+0.5)*k/n) * (-1)^k, with k=0 term having extra -0.5 + for (int j = 0; j < n; j++) { + for (int k = 0; k < n; k++) { + double sign_k = (k % 2 == 0) ? 1.0 : -1.0; + T_inv[j * n + k] = cos(Pion * (j + 0.5) * k) * sign_k; + } + // The k=0 term needs adjustment: the sum includes c_0*1 but we need -0.5*c_0 + c_0*1 = 0.5*c_0 + // Wait, let me re-examine chebft_Zeros with inv=1: + // sum = -0.5 * u[0]; + // for k: sum += u[k] * cos(Pion*(j+0.5)*k) * isignum; isignum alternates starting from 1 + // So: c[j] = -0.5*u[0] + sum_{k=0}^{n-1} u[k]*cos(...)*(-1)^k + // = -0.5*u[0] + u[0]*1*1 + sum_{k=1} ... + // = 0.5*u[0] + sum_{k=1} u[k]*cos(...)*(-1)^k + // Equivalently: T_inv[j][0] = 0.5, T_inv[j][k] = cos(...)*(-1)^k for k>=1 + // But cos(0) = 1 and (-1)^0 = 1, so the formula gives T_inv[j][0] = 1.0 + // We need it to be 0.5. Fix: + T_inv[j * n + 0] = 0.5; // This accounts for the -0.5*u[0] + u[0]*cos(0)*1 = 0.5*u[0] + } + + // Build spectral derivative matrix Dhat (in coefficient space) + // The recurrence: cder[n-1] = 0, cder[n-2] = 0, + // cder[j] = cder[j+2] + 2*(j+1)*c[j+1] for j = n-3,...,0 + // This means cder = Dhat * c, where Dhat is upper triangular-ish. + // Dhat[j][k] = coefficient of c[k] contributing to cder[j] + // + // From the recurrence: cder[j] = sum_{k=j+1, k-j odd}^{n-1} 2*k * c[k] + // (with the factor 2k, summing over k > j where k-j is odd) + // Exception: cder[0] gets an extra factor of 0.5 since c[0] has the 2/n prefactor + // Actually no: the chder function is: + // cder[n] = cder[n-1] = 0 + // cder[j] = cder[j+2] + 2*(j+1)*c[j+1] + // Unrolling: cder[j] = 2*(j+1)*c[j+1] + 2*(j+3)*c[j+3] + ... + // So Dhat[j][k] = 2*k if k > j and (k-j) is odd, else 0 + + for (int j = 0; j < n; j++) + for (int k = 0; k < n; k++) + Dhat[j * n + k] = 0.0; + + for (int j = 0; j < n; j++) { + for (int k = j + 1; k < n; k++) { + if ((k - j) % 2 == 1) { + Dhat[j * n + k] = 2.0 * k; + } + } + } + + // Build Dhat^2 = Dhat * Dhat + // D1 = T_inv * Dhat * T_fwd + // D2 = T_inv * Dhat^2 * T_fwd + + // tmp1 = Dhat * T_fwd + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + n, n, n, 1.0, Dhat, n, T_fwd, n, 0.0, tmp1, n); + // D1 = T_inv * tmp1 + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + n, n, n, 1.0, T_inv, n, tmp1, n, 0.0, D1, n); + + // tmp2 = Dhat * Dhat (Dhat^2 in spectral space) + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + n, n, n, 1.0, Dhat, n, Dhat, n, 0.0, tmp2, n); + // tmp1 = Dhat^2 * T_fwd + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + n, n, n, 1.0, tmp2, n, T_fwd, n, 0.0, tmp1, n); + // D2 = T_inv * tmp1 + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + n, n, n, 1.0, T_inv, n, tmp1, n, 0.0, D2, n); + + delete[] T_fwd; + delete[] T_inv; + delete[] Dhat; + delete[] Dhat2; + delete[] tmp1; + delete[] tmp2; +} + +void TwoPunctures::build_fourier_deriv_matrices(int N, double *DF1, double *DF2) +{ + /* Build Fourier derivative matrices in physical space. + * + * Grid: phi_k = 2*pi*k/N, k=0,...,N-1 + * + * The Fourier interpolant derivative at grid points can be expressed as + * a matrix multiply. We build it by: + * 1. Forward Fourier transform matrix F + * 2. Frequency-domain derivative (multiply by il for first, -l^2 for second) + * 3. Inverse Fourier transform matrix F^{-1} + * DF1 = F^{-1} * diag(il) * F, DF2 = F^{-1} * diag(-l^2) * F + * + * But since fourft/fourev use a real representation (a_l, b_l), + * we construct directly in physical space. + */ + + int M = N / 2; + double Pi_fac = Pi / M; // = 2*Pi/N + + // DF1[j][k] = d/dphi of the interpolant at phi_j, due to value at phi_k + // Using the representation: + // f(phi) = 0.5*(a_0 + a_M*cos(M*phi)) + sum_{l=1}^{M-1} (a_l*cos(l*phi) + b_l*sin(l*phi)) + // where a_l = (2/N)*sum_k f_k*cos(l*phi_k), b_l = (2/N)*sum_k f_k*sin(l*phi_k) + // + // f'(phi) = -0.5*a_M*M*sin(M*phi) + sum_{l=1}^{M-1} l*(-a_l*sin(l*phi) + b_l*cos(l*phi)) + // + // Substituting a_l, b_l and evaluating at phi_j: + // f'(phi_j) = sum_k f_k * K(j,k) + // where K(j,k) = (2/N) * sum_{l=1}^{M-1} l * (-cos(l*phi_k)*sin(l*phi_j) + sin(l*phi_k)*cos(l*phi_j)) + // + (2/N) * (-M/2) * sin(M*phi_j) * cos(M*phi_k) [a_M term, note a_M has no factor 2] + // = (2/N) * sum_{l=1}^{M-1} l * sin(l*(phi_k - phi_j)) + // - (1/N) * M * sin(M*phi_j) * cos(M*phi_k) + // + // But the a_M coefficient in fourft has factor 1/M (not 2/M), so: + // Actually re-examining fourft: a[l] = fac * sum_k u[k]*cos(x), fac=1/M + // and a_M is stored as a[M] with same fac. The inverse uses: + // u[k] = 0.5*(a[0] + a[M]*iy) + sum_{l=1}^{M-1}(a[l]*cos + b[l]*sin) + // So the full expression needs care. Let me just compute it numerically. + + // Numerical approach: for each k, set f = delta_k, compute derivative at all j + double *p = new double[N]; + double *dp = new double[N]; + + for (int k = 0; k < N; k++) { + // Set delta function at k + for (int i = 0; i < N; i++) + p[i] = (i == k) ? 1.0 : 0.0; + + // Forward Fourier transform (using existing fourft) + fourft(p, N, 0); + // Derivative in spectral space + fourder(p, dp, N); + // Inverse Fourier transform + fourft(dp, N, 1); + + // dp[j] = derivative of delta_k interpolant at phi_j + // So DF1[j][k] = dp[j] + for (int j = 0; j < N; j++) + DF1[j * N + k] = dp[j]; + } + + // Second derivative + for (int k = 0; k < N; k++) { + for (int i = 0; i < N; i++) + p[i] = (i == k) ? 1.0 : 0.0; + + fourft(p, N, 0); + fourder2(p, dp, N); + fourft(dp, N, 1); + + for (int j = 0; j < N; j++) + DF2[j * N + k] = dp[j]; + } + + delete[] p; + delete[] dp; +} + +void TwoPunctures::precompute_derivative_matrices() +{ + int n1 = npoints_A, n2 = npoints_B, n3 = npoints_phi; + + // Allocate matrices + D1_A = new double[n1 * n1]; + D2_A = new double[n1 * n1]; + D1_B = new double[n2 * n2]; + D2_B = new double[n2 * n2]; + DF1_phi = new double[n3 * n3]; + DF2_phi = new double[n3 * n3]; + + // Build Chebyshev derivative matrices + build_cheb_deriv_matrices(n1, D1_A, D2_A); + build_cheb_deriv_matrices(n2, D1_B, D2_B); + + // Build Fourier derivative matrices + build_fourier_deriv_matrices(n3, DF1_phi, DF2_phi); + + printf("Precomputed derivative matrices: A(%d), B(%d), phi(%d)\n", n1, n2, n3); +} + +/* -------------------------------------------------------------------------- + * Derivatives_AB3_MatMul: Drop-in replacement for Derivatives_AB3 + * + * Uses precomputed derivative matrices and DGEMM to compute all spectral + * derivatives in batch. Mathematically equivalent to the original + * Derivatives_AB3. + * + * Memory layout of v.d0[Index(ivar,i,j,k)] = v.d0[ivar + nvar*(i + n1*(j + n2*k))] + * + * For A-direction derivatives (fixed j,k, varying i): + * We need to apply D1_A and D2_A to "pencils" along the i-direction. + * Collect all pencils into a matrix and use DGEMM. + * + * For B-direction derivatives (fixed i,k, varying j): + * Similarly with D1_B, D2_B. + * + * For phi-direction (fixed i,j, varying k): + * Similarly with DF1_phi, DF2_phi. + * --------------------------------------------------------------------------*/ +void TwoPunctures::Derivatives_AB3_MatMul(int nvar, int n1, int n2, int n3, derivs v) +{ + int total_pencils; + double *data_in, *data_out; + + /*===================================================== + * STEP 1: A-direction derivatives (Chebyshev, D1_A, D2_A) + * + * For each (ivar, j, k), we have a pencil of length n1: + * f[i] = v.d0[Index(ivar, i, j, k, nvar, n1, n2, n3)] + * + * We want: v.d1[...] = D1_A * f, v.d11[...] = D2_A * f + * + * Collect all n2*n3*nvar pencils as columns of a matrix: + * data_in[i, col] where col = ivar + nvar*(j + n2*k) + * Then: data_out = D1_A * data_in (DGEMM: n1 x n1 times n1 x total_pencils) + *=====================================================*/ + total_pencils = nvar * n2 * n3; + + data_in = new double[n1 * total_pencils]; + data_out = new double[n1 * total_pencils]; + + // Gather: data_in[i * total_pencils + col] = v.d0[Index(ivar,i,j,k,...)] + // where col = ivar + nvar * (j + n2 * k) + for (int ivar = 0; ivar < nvar; ivar++) { + for (int k = 0; k < n3; k++) { + for (int j = 0; j < n2; j++) { + int col = ivar + nvar * (j + n2 * k); + for (int i = 0; i < n1; i++) { + int indx = Index(ivar, i, j, k, nvar, n1, n2, n3); + data_in[i * total_pencils + col] = v.d0[indx]; + } + } + } + } + + // First derivative: data_out = D1_A * data_in + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + n1, total_pencils, n1, + 1.0, D1_A, n1, data_in, total_pencils, + 0.0, data_out, total_pencils); + + // Scatter to v.d1 + for (int ivar = 0; ivar < nvar; ivar++) { + for (int k = 0; k < n3; k++) { + for (int j = 0; j < n2; j++) { + int col = ivar + nvar * (j + n2 * k); + for (int i = 0; i < n1; i++) { + int indx = Index(ivar, i, j, k, nvar, n1, n2, n3); + v.d1[indx] = data_out[i * total_pencils + col]; + } + } + } + } + + // Second derivative: data_out = D2_A * data_in + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + n1, total_pencils, n1, + 1.0, D2_A, n1, data_in, total_pencils, + 0.0, data_out, total_pencils); + + // Scatter to v.d11 + for (int ivar = 0; ivar < nvar; ivar++) { + for (int k = 0; k < n3; k++) { + for (int j = 0; j < n2; j++) { + int col = ivar + nvar * (j + n2 * k); + for (int i = 0; i < n1; i++) { + int indx = Index(ivar, i, j, k, nvar, n1, n2, n3); + v.d11[indx] = data_out[i * total_pencils + col]; + } + } + } + } + + delete[] data_in; + delete[] data_out; + + /*===================================================== + * STEP 2: B-direction derivatives (Chebyshev, D1_B, D2_B) + * + * Pencils along j for each (ivar, i, k). + * Also compute mixed derivative v.d12 = D1_B applied to v.d1 + *=====================================================*/ + total_pencils = nvar * n1 * n3; + + data_in = new double[n2 * total_pencils]; + data_out = new double[n2 * total_pencils]; + double *data_in2 = new double[n2 * total_pencils]; + double *data_out2 = new double[n2 * total_pencils]; + + // Gather v.d0 along B-direction AND v.d1 for mixed derivative + for (int ivar = 0; ivar < nvar; ivar++) { + for (int k = 0; k < n3; k++) { + for (int i = 0; i < n1; i++) { + int col = ivar + nvar * (i + n1 * k); + for (int j = 0; j < n2; j++) { + int indx = Index(ivar, i, j, k, nvar, n1, n2, n3); + data_in[j * total_pencils + col] = v.d0[indx]; + data_in2[j * total_pencils + col] = v.d1[indx]; // for d/dB of (dv/dA) + } + } + } + } + + // v.d2 = D1_B * v.d0 (along B) + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + n2, total_pencils, n2, + 1.0, D1_B, n2, data_in, total_pencils, + 0.0, data_out, total_pencils); + + for (int ivar = 0; ivar < nvar; ivar++) { + for (int k = 0; k < n3; k++) { + for (int i = 0; i < n1; i++) { + int col = ivar + nvar * (i + n1 * k); + for (int j = 0; j < n2; j++) { + int indx = Index(ivar, i, j, k, nvar, n1, n2, n3); + v.d2[indx] = data_out[j * total_pencils + col]; + } + } + } + } + + // v.d22 = D2_B * v.d0 + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + n2, total_pencils, n2, + 1.0, D2_B, n2, data_in, total_pencils, + 0.0, data_out, total_pencils); + + for (int ivar = 0; ivar < nvar; ivar++) { + for (int k = 0; k < n3; k++) { + for (int i = 0; i < n1; i++) { + int col = ivar + nvar * (i + n1 * k); + for (int j = 0; j < n2; j++) { + int indx = Index(ivar, i, j, k, nvar, n1, n2, n3); + v.d22[indx] = data_out[j * total_pencils + col]; + } + } + } + } + + // v.d12 = D1_B * v.d1 (mixed: d/dB of dv/dA) + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + n2, total_pencils, n2, + 1.0, D1_B, n2, data_in2, total_pencils, + 0.0, data_out2, total_pencils); + + for (int ivar = 0; ivar < nvar; ivar++) { + for (int k = 0; k < n3; k++) { + for (int i = 0; i < n1; i++) { + int col = ivar + nvar * (i + n1 * k); + for (int j = 0; j < n2; j++) { + int indx = Index(ivar, i, j, k, nvar, n1, n2, n3); + v.d12[indx] = data_out2[j * total_pencils + col]; + } + } + } + } + + delete[] data_in; + delete[] data_out; + delete[] data_in2; + delete[] data_out2; + + /*===================================================== + * STEP 3: phi-direction derivatives (Fourier, DF1_phi, DF2_phi) + * + * Pencils along k for each (ivar, i, j). + * Also compute mixed derivatives v.d13, v.d23 + *=====================================================*/ + total_pencils = nvar * n1 * n2; + + data_in = new double[n3 * total_pencils]; + data_out = new double[n3 * total_pencils]; + data_in2 = new double[n3 * total_pencils]; // for v.d1 → v.d13 + data_out2 = new double[n3 * total_pencils]; + double *data_in3 = new double[n3 * total_pencils]; // for v.d2 → v.d23 + double *data_out3 = new double[n3 * total_pencils]; + + // Gather v.d0, v.d1, v.d2 along phi-direction + for (int ivar = 0; ivar < nvar; ivar++) { + for (int i = 0; i < n1; i++) { + for (int j = 0; j < n2; j++) { + int col = ivar + nvar * (i + n1 * j); + for (int k = 0; k < n3; k++) { + int indx = Index(ivar, i, j, k, nvar, n1, n2, n3); + data_in[k * total_pencils + col] = v.d0[indx]; + data_in2[k * total_pencils + col] = v.d1[indx]; + data_in3[k * total_pencils + col] = v.d2[indx]; + } + } + } + } + + // v.d3 = DF1_phi * v.d0 + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + n3, total_pencils, n3, + 1.0, DF1_phi, n3, data_in, total_pencils, + 0.0, data_out, total_pencils); + + for (int ivar = 0; ivar < nvar; ivar++) { + for (int i = 0; i < n1; i++) { + for (int j = 0; j < n2; j++) { + int col = ivar + nvar * (i + n1 * j); + for (int k = 0; k < n3; k++) { + int indx = Index(ivar, i, j, k, nvar, n1, n2, n3); + v.d3[indx] = data_out[k * total_pencils + col]; + } + } + } + } + + // v.d33 = DF2_phi * v.d0 + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + n3, total_pencils, n3, + 1.0, DF2_phi, n3, data_in, total_pencils, + 0.0, data_out, total_pencils); + + for (int ivar = 0; ivar < nvar; ivar++) { + for (int i = 0; i < n1; i++) { + for (int j = 0; j < n2; j++) { + int col = ivar + nvar * (i + n1 * j); + for (int k = 0; k < n3; k++) { + int indx = Index(ivar, i, j, k, nvar, n1, n2, n3); + v.d33[indx] = data_out[k * total_pencils + col]; + } + } + } + } + + // v.d13 = DF1_phi * v.d1 (mixed: d/dphi of dv/dA) + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + n3, total_pencils, n3, + 1.0, DF1_phi, n3, data_in2, total_pencils, + 0.0, data_out2, total_pencils); + + for (int ivar = 0; ivar < nvar; ivar++) { + for (int i = 0; i < n1; i++) { + for (int j = 0; j < n2; j++) { + int col = ivar + nvar * (i + n1 * j); + for (int k = 0; k < n3; k++) { + int indx = Index(ivar, i, j, k, nvar, n1, n2, n3); + v.d13[indx] = data_out2[k * total_pencils + col]; + } + } + } + } + + // v.d23 = DF1_phi * v.d2 (mixed: d/dphi of dv/dB) + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, + n3, total_pencils, n3, + 1.0, DF1_phi, n3, data_in3, total_pencils, + 0.0, data_out3, total_pencils); + + for (int ivar = 0; ivar < nvar; ivar++) { + for (int i = 0; i < n1; i++) { + for (int j = 0; j < n2; j++) { + int col = ivar + nvar * (i + n1 * j); + for (int k = 0; k < n3; k++) { + int indx = Index(ivar, i, j, k, nvar, n1, n2, n3); + v.d23[indx] = data_out3[k * total_pencils + col]; + } + } + } + } + + delete[] data_in; + delete[] data_out; + delete[] data_in2; + delete[] data_out2; + delete[] data_in3; + delete[] data_out3; +} + diff --git a/AMSS_NCKU_source/TwoPunctures.h b/AMSS_NCKU_source/TwoPunctures.h index 22fb359..5f95797 100644 --- a/AMSS_NCKU_source/TwoPunctures.h +++ b/AMSS_NCKU_source/TwoPunctures.h @@ -1,7 +1,8 @@ - #ifndef TWO_PUNCTURES_H #define TWO_PUNCTURES_H +#include + #define StencilSize 19 #define N_PlaneRelax 1 #define NRELAX 200 @@ -32,7 +33,7 @@ private: int npoints_A, npoints_B, npoints_phi; double target_M_plus, target_M_minus; - + double admMass; double adm_tol; @@ -42,6 +43,18 @@ private: int ntotal; + // ===== Precomputed spectral derivative matrices ===== + double *D1_A, *D2_A; + double *D1_B, *D2_B; + double *DF1_phi, *DF2_phi; + + // ===== Pre-allocated workspace for LineRelax (per-thread) ===== + int max_threads; + double **ws_diag_be, **ws_e_be, **ws_f_be, **ws_b_be, **ws_x_be; + double **ws_l_be, **ws_u_be, **ws_d_be, **ws_y_be; + double **ws_diag_al, **ws_e_al, **ws_f_al, **ws_b_al, **ws_x_al; + double **ws_l_al, **ws_u_al, **ws_d_al, **ws_y_al; + struct parameters { int nvar, n1, n2, n3; @@ -58,6 +71,28 @@ public: int Newtonmaxit); ~TwoPunctures(); + // 02/07: New/modified methods + void allocate_workspace(); + void free_workspace(); + void precompute_derivative_matrices(); + void build_cheb_deriv_matrices(int n, double *D1, double *D2); + void build_fourier_deriv_matrices(int N, double *DF1, double *DF2); + void Derivatives_AB3_MatMul(int nvar, int n1, int n2, int n3, derivs v); + void ThomasAlgorithm_ws(int N, double *b, double *a, double *c, double *x, double *q, + double *l, double *u_ws, double *d, double *y); + void LineRelax_be_omp(double *dv, + int const i, int const k, int const nvar, + int const n1, int const n2, int const n3, + double const *rhs, int const *ncols, int **cols, + double **JFD, int tid); + void LineRelax_al_omp(double *dv, + int const j, int const k, int const nvar, + int const n1, int const n2, int const n3, + double const *rhs, int const *ncols, + int **cols, double **JFD, int tid); + void relax_omp(double *dv, int const nvar, int const n1, int const n2, int const n3, + double const *rhs, int const *ncols, int **cols, double **JFD); + void Solve(); void set_initial_guess(derivs v); int index(int i, int j, int k, int l, int a, int b, int c, int d); @@ -116,23 +151,11 @@ public: double BY_KKofxyz(double x, double y, double z); void SetMatrix_JFD(int nvar, int n1, int n2, int n3, derivs u, int *ncols, int **cols, double **Matrix); void J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, double *Jdv, derivs u); - void relax(double *dv, int const nvar, int const n1, int const n2, int const n3, - double const *rhs, int const *ncols, int **cols, double **JFD); - void LineRelax_be(double *dv, - int const i, int const k, int const nvar, - int const n1, int const n2, int const n3, - double const *rhs, int const *ncols, int **cols, - double **JFD); void JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2, int n3, derivs dv, derivs u, double *values); void LinEquations(double A, double B, double X, double R, double x, double r, double phi, double y, double z, derivs dU, derivs U, double *values); - void LineRelax_al(double *dv, - int const j, int const k, int const nvar, - int const n1, int const n2, int const n3, - double const *rhs, int const *ncols, - int **cols, double **JFD); void ThomasAlgorithm(int N, double *b, double *a, double *c, double *x, double *q); void Save(char *fname); // provided by Vasileios Paschalidis (vpaschal@illinois.edu) @@ -141,4 +164,4 @@ public: void SpecCoef(parameters par, int ivar, double *v, double *cf); }; -#endif /* TWO_PUNCTURES_H */ +#endif /* TWO_PUNCTURES_H */ \ No newline at end of file diff --git a/AMSS_NCKU_source/makefile.inc b/AMSS_NCKU_source/makefile.inc index 489bbce..c25fcf1 100755 --- a/AMSS_NCKU_source/makefile.inc +++ b/AMSS_NCKU_source/makefile.inc @@ -15,10 +15,9 @@ LDLIBS = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore ## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible) ## -fp-model fast=2: Aggressive floating-point optimizations ## -fma: Enable fused multiply-add instructions -## Note: OpenMP has been disabled (-qopenmp removed) due to performance issues -CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \ +CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo -qopenmp \ -Dfortran3 -Dnewc -I${MKLROOT}/include -f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \ +f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo -qopenmp \ -align array64byte -fpp -I${MKLROOT}/include f90 = ifx f77 = ifx