From f345b0e520402cc5e97a0a102560573353e29fb3 Mon Sep 17 00:00:00 2001
From: ianchb <i@4t.pw>
Date: Sat, 7 Feb 2026 14:46:46 +0800
Subject: [PATCH] Performance optimization for the TwoPunctures module *
 Re-enabled OpenMP.

1. Batch spectral derivatives (Chebyshev & Fourier) via precomputed matrices:
Chebyshev/Fourier transforms and derivatives are precomputed as explicit physical-space operator matrices.
Batch DGEMM now applies to entire tensor fields, mathematically identical to original per-line transforms but vastly faster.

2. Gauss-Seidel relaxation & tridiagonal solver workspace reuse:
Per-thread reusable workspaces replace per-call heap new/delete in all tridiagonal and relaxation routines.

3. Efficient OpenMP multithreading throughout relaxation/deriv:
relax_omp and friends parallelize over grouped lines/planes, maximizing threading efficiency and memory independence.

Co-authored-by: copilot-swe-agent[bot] <198982749+copilot@users.noreply.github.com>
---
 AMSS_NCKU_source/TwoPunctures.C | 1074 +++++++++++++++++++++++++------
 AMSS_NCKU_source/TwoPunctures.h |   53 +-
 AMSS_NCKU_source/makefile.inc   |    5 +-
 3 files changed, 917 insertions(+), 215 deletions(-)

diff --git a/AMSS_NCKU_source/TwoPunctures.C b/AMSS_NCKU_source/TwoPunctures.C
index a5b0c85..ea84474 100644
--- a/AMSS_NCKU_source/TwoPunctures.C
+++ b/AMSS_NCKU_source/TwoPunctures.C
@@ -60,6 +60,10 @@ TwoPunctures::TwoPunctures(double mp, double mm, double b,
   F = dvector(0, ntotal - 1);
   allocate_derivs(&u, ntotal);
   allocate_derivs(&v, ntotal);
+  D1_A = NULL; D2_A = NULL; D1_B = NULL; D2_B = NULL;
+  DF1_phi = NULL; DF2_phi = NULL;
+  precompute_derivative_matrices();
+  allocate_workspace();
 }
 
 TwoPunctures::~TwoPunctures()
@@ -67,6 +71,13 @@ TwoPunctures::~TwoPunctures()
   free_dvector(F, 0, ntotal - 1);
   free_derivs(&u, ntotal);
   free_derivs(&v, ntotal);
+  free_workspace();
+  if (D1_A)    delete[] D1_A;
+  if (D2_A)    delete[] D2_A;
+  if (D1_B)    delete[] D1_B;
+  if (D2_B)    delete[] D2_B;
+  if (DF1_phi) delete[] DF1_phi;
+  if (DF2_phi) delete[] DF2_phi;
 }
 
 void TwoPunctures::Solve()
@@ -303,7 +314,7 @@ void TwoPunctures::set_initial_guess(derivs v)
           v.d0[indx] = 0;                                     // set initial guess 0
           v.d0[indx] /= (-cos(Pih * (2 * i + 1) / n1) - 1.0); // PRD 70, 064011 (2004) Eq.(5), from u to U
         }
-  Derivatives_AB3(nvar, n1, n2, n3, v);
+  Derivatives_AB3_MatMul(nvar, n1, n2, n3, v);
   if (0)
   {
     debug_file = fopen("initial.dat", "w");
@@ -1284,9 +1295,6 @@ void TwoPunctures::F_of_v(int nvar, int n1, int n2, int n3, derivs v, double *F,
   derivs U;
   double *sources;
 
-  values = dvector(0, nvar - 1);
-  allocate_derivs(&U, nvar);
-
   sources = (double *)calloc(n1 * n2 * n3, sizeof(double));
   if (0)
   {
@@ -1343,7 +1351,7 @@ void TwoPunctures::F_of_v(int nvar, int n1, int n2, int n3, derivs v, double *F,
         for (k = 0; k < n3; k++)
           sources[Index(0, i, j, k, 1, n1, n2, n3)] = 0.0;
 
-  Derivatives_AB3(nvar, n1, n2, n3, v);
+  Derivatives_AB3_MatMul(nvar, n1, n2, n3, v);
   double psi, psi2, psi4, psi7, r_plus, r_minus;
   FILE *debugfile = NULL;
   if (0)
@@ -1351,12 +1359,22 @@ void TwoPunctures::F_of_v(int nvar, int n1, int n2, int n3, derivs v, double *F,
     debugfile = fopen("res.dat", "w");
     assert(debugfile);
   }
+  #pragma omp parallel for collapse(3) schedule(static) \
+    private(i, j, k, ivar, indx, al, be, A, B, X, R, x, r, phi, y, z, Am1, \
+            psi, psi2, psi4, psi7, r_plus, r_minus)
   for (i = 0; i < n1; i++)
   {
     for (j = 0; j < n2; j++)
     {
       for (k = 0; k < n3; k++)
       {
+        double l_values[1];  // nvar=1, stack-allocated
+        derivs l_U;
+        double l_U_d0[1], l_U_d1[1], l_U_d2[1], l_U_d3[1];
+        double l_U_d11[1], l_U_d12[1], l_U_d13[1], l_U_d22[1], l_U_d23[1], l_U_d33[1];
+        l_U.d0 = l_U_d0; l_U.d1 = l_U_d1; l_U.d2 = l_U_d2; l_U.d3 = l_U_d3;
+        l_U.d11 = l_U_d11; l_U.d12 = l_U_d12; l_U.d13 = l_U_d13;
+        l_U.d22 = l_U_d22; l_U.d23 = l_U_d23; l_U.d33 = l_U_d33;
 
         al = Pih * (2 * i + 1) / n1;
         A = -cos(al);
@@ -1368,72 +1386,36 @@ void TwoPunctures::F_of_v(int nvar, int n1, int n2, int n3, derivs v, double *F,
         for (ivar = 0; ivar < nvar; ivar++)
         {
           indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
-          U.d0[ivar] = Am1 * v.d0[indx];                    /* U*/
-          U.d1[ivar] = v.d0[indx] + Am1 * v.d1[indx];       /* U_A*/
-          U.d2[ivar] = Am1 * v.d2[indx];                    /* U_B*/
-          U.d3[ivar] = Am1 * v.d3[indx];                    /* U_3*/
-          U.d11[ivar] = 2 * v.d1[indx] + Am1 * v.d11[indx]; /* U_AA*/
-          U.d12[ivar] = v.d2[indx] + Am1 * v.d12[indx];     /* U_AB*/
-          U.d13[ivar] = v.d3[indx] + Am1 * v.d13[indx];     /* U_AB*/
-          U.d22[ivar] = Am1 * v.d22[indx];                  /* U_BB*/
-          U.d23[ivar] = Am1 * v.d23[indx];                  /* U_B3*/
-          U.d33[ivar] = Am1 * v.d33[indx];                  /* U_33*/
+          l_U.d0[ivar] = Am1 * v.d0[indx];
+          l_U.d1[ivar] = v.d0[indx] + Am1 * v.d1[indx];
+          l_U.d2[ivar] = Am1 * v.d2[indx];
+          l_U.d3[ivar] = Am1 * v.d3[indx];
+          l_U.d11[ivar] = 2 * v.d1[indx] + Am1 * v.d11[indx];
+          l_U.d12[ivar] = v.d2[indx] + Am1 * v.d12[indx];
+          l_U.d13[ivar] = v.d3[indx] + Am1 * v.d13[indx];
+          l_U.d22[ivar] = Am1 * v.d22[indx];
+          l_U.d23[ivar] = Am1 * v.d23[indx];
+          l_U.d33[ivar] = Am1 * v.d33[indx];
         }
-        /* Calculation of (X,R) and*/
-        /* (U_X, U_R, U_3, U_XX, U_XR, U_X3, U_RR, U_R3, U_33)*/
-        AB_To_XR(nvar, A, B, &X, &R, U);
-        /* Calculation of (x,r) and*/
-        /* (U, U_x, U_r, U_3, U_xx, U_xr, U_x3, U_rr, U_r3, U_33)*/
-        C_To_c(nvar, X, R, &x, &r, U);
-        /* Calculation of (y,z) and*/
-        /* (U, U_x, U_y, U_z, U_xx, U_xy, U_xz, U_yy, U_yz, U_zz)*/
-        rx3_To_xyz(nvar, x, r, phi, &y, &z, U);
+        AB_To_XR(nvar, A, B, &X, &R, l_U);
+        C_To_c(nvar, X, R, &x, &r, l_U);
+        rx3_To_xyz(nvar, x, r, phi, &y, &z, l_U);
         NonLinEquations(sources[Index(0, i, j, k, 1, n1, n2, n3)],
-                        A, B, X, R, x, r, phi, y, z, U, values);
+                        A, B, X, R, x, r, phi, y, z, l_U, l_values);
         for (ivar = 0; ivar < nvar; ivar++)
         {
           indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
-          F[indx] = values[ivar] * FAC;
-          /* if ((i<5) && ((j<5) || (j>n2-5)))*/
-          /*     F[indx] = 0.0;*/
-          u.d0[indx] = U.d0[ivar];   /*  U*/
-          u.d1[indx] = U.d1[ivar];   /*      U_x*/
-          u.d2[indx] = U.d2[ivar];   /*      U_y*/
-          u.d3[indx] = U.d3[ivar];   /*      U_z*/
-          u.d11[indx] = U.d11[ivar]; /*      U_xx*/
-          u.d12[indx] = U.d12[ivar]; /*      U_xy*/
-          u.d13[indx] = U.d13[ivar]; /*      U_xz*/
-          u.d22[indx] = U.d22[ivar]; /*      U_yy*/
-          u.d23[indx] = U.d23[ivar]; /*      U_yz*/
-          u.d33[indx] = U.d33[ivar]; /*      U_zz*/
-        }
-        if (debugfile && (k == 0))
-        {
-          r_plus = sqrt((x - par_b) * (x - par_b) + y * y + z * z);
-          r_minus = sqrt((x + par_b) * (x + par_b) + y * y + z * z);
-          psi = 1. +
-                0.5 * par_m_plus / r_plus +
-                0.5 * par_m_minus / r_minus +
-                U.d0[0];
-          psi2 = psi * psi;
-          psi4 = psi2 * psi2;
-          psi7 = psi * psi2 * psi4;
-          fprintf(debugfile,
-                  "%.16g %.16g %.16g %.16g %.16g %.16g %.16g %.16g\n",
-                  (double)x, (double)y, (double)A, (double)B,
-                  (double)(U.d11[0] +
-                           U.d22[0] +
-                           U.d33[0] +
-                           /*                      0.125 * BY_KKofxyz (x, y, z) / psi7 +*/
-                           (2.0 * Pi / psi2 / psi * sources[indx]) * FAC),
-                  (double)((U.d11[0] +
-                            U.d22[0] +
-                            U.d33[0]) *
-                           FAC),
-                  (double)(-(2.0 * Pi / psi2 / psi * sources[indx]) * FAC),
-                  (double)sources[indx]
-                  /*(double)F[indx]*/
-          );
+          F[indx] = l_values[ivar] * sin(al) * sin(be) * sin(al) * sin(be) * sin(al) * sin(be);
+          u.d0[indx] = l_U.d0[ivar];
+          u.d1[indx] = l_U.d1[ivar];
+          u.d2[indx] = l_U.d2[ivar];
+          u.d3[indx] = l_U.d3[ivar];
+          u.d11[indx] = l_U.d11[ivar];
+          u.d12[indx] = l_U.d12[ivar];
+          u.d13[indx] = l_U.d13[ivar];
+          u.d22[indx] = l_U.d22[ivar];
+          u.d23[indx] = l_U.d23[ivar];
+          u.d33[indx] = l_U.d33[ivar];
         }
       }
     }
@@ -1443,8 +1425,6 @@ void TwoPunctures::F_of_v(int nvar, int n1, int n2, int n3, derivs v, double *F,
     fclose(debugfile);
   }
   free(sources);
-  free_dvector(values, 0, nvar - 1);
-  free_derivs(&U, nvar);
 }
 /* --------------------------------------------------------------------------*/
 double TwoPunctures::norm_inf(double const *F, int const ntotal)
@@ -1544,7 +1524,7 @@ int TwoPunctures::bicgstab(int const nvar, int const n1, int const n2, int const
     for (int j = 0; j < ntotal; j++)
       ph.d0[j] = 0;
     for (int j = 0; j < NRELAX; j++) /* solves JFD*ph = p by relaxation*/
-      relax(ph.d0, nvar, n1, n2, n3, p, ncols, cols, JFD);
+      relax_omp(ph.d0, nvar, n1, n2, n3, p, ncols, cols, JFD);
 
     J_times_dv(nvar, n1, n2, n3, ph, vv, u); /* vv=J*ph*/
     alpha = rho / scalarproduct(rt, vv, ntotal);
@@ -1570,7 +1550,7 @@ int TwoPunctures::bicgstab(int const nvar, int const n1, int const n2, int const
     for (int j = 0; j < ntotal; j++)
       sh.d0[j] = 0;
     for (int j = 0; j < NRELAX; j++) /* solves JFD*sh = s by relaxation*/
-      relax(sh.d0, nvar, n1, n2, n3, s, ncols, cols, JFD);
+      relax_omp(sh.d0, nvar, n1, n2, n3, s, ncols, cols, JFD);
 
     J_times_dv(nvar, n1, n2, n3, sh, t, u); /* t=J*sh*/
     omega = scalarproduct(t, s, ntotal) / scalarproduct(t, t, ntotal);
@@ -1845,21 +1825,32 @@ void TwoPunctures::J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, doubl
   /*      (u.d1[], u.d2[], u.d3[], u.d11[], u.d12[], u.d13[], u.d22[], u.d23[], u.d33[])*/
   /*      at interior points and at the boundaries "+/-"*/
   int i, j, k, ivar, indx;
-  double al, be, A, B, X, R, x, r, phi, y, z, Am1, *values;
-  derivs dU, U;
+  double al, be, A, B, X, R, x, r, phi, y, z, Am1;
 
-  Derivatives_AB3(nvar, n1, n2, n3, dv);
+  Derivatives_AB3_MatMul(nvar, n1, n2, n3, dv);
 
+  #pragma omp parallel for schedule(static) \
+    private(j, k, ivar, indx, al, be, A, B, X, R, x, r, phi, y, z, Am1)
   for (i = 0; i < n1; i++)
   {
-    values = dvector(0, nvar - 1);
-    allocate_derivs(&dU, nvar);
-    allocate_derivs(&U, nvar);
+    // Thread-local derivs on stack (nvar=1)
+    double l_val[1];
+    double l_dU_d0[1], l_dU_d1[1], l_dU_d2[1], l_dU_d3[1];
+    double l_dU_d11[1], l_dU_d12[1], l_dU_d13[1], l_dU_d22[1], l_dU_d23[1], l_dU_d33[1];
+    double l_U_d0[1], l_U_d1[1], l_U_d2[1], l_U_d3[1];
+    double l_U_d11[1], l_U_d12[1], l_U_d13[1], l_U_d22[1], l_U_d23[1], l_U_d33[1];
+    derivs l_dU, l_U;
+    l_dU.d0=l_dU_d0; l_dU.d1=l_dU_d1; l_dU.d2=l_dU_d2; l_dU.d3=l_dU_d3;
+    l_dU.d11=l_dU_d11; l_dU.d12=l_dU_d12; l_dU.d13=l_dU_d13;
+    l_dU.d22=l_dU_d22; l_dU.d23=l_dU_d23; l_dU.d33=l_dU_d33;
+    l_U.d0=l_U_d0; l_U.d1=l_U_d1; l_U.d2=l_U_d2; l_U.d3=l_U_d3;
+    l_U.d11=l_U_d11; l_U.d12=l_U_d12; l_U.d13=l_U_d13;
+    l_U.d22=l_U_d22; l_U.d23=l_U_d23; l_U.d33=l_U_d33;
+
     for (j = 0; j < n2; j++)
     {
       for (k = 0; k < n3; k++)
       {
-
         al = Pih * (2 * i + 1) / n1;
         A = -cos(al);
         be = Pih * (2 * j + 1) / n2;
@@ -1870,104 +1861,193 @@ void TwoPunctures::J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, doubl
         for (ivar = 0; ivar < nvar; ivar++)
         {
           indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
-          dU.d0[ivar] = Am1 * dv.d0[indx];                     /* dU*/
-          dU.d1[ivar] = dv.d0[indx] + Am1 * dv.d1[indx];       /* dU_A*/
-          dU.d2[ivar] = Am1 * dv.d2[indx];                     /* dU_B*/
-          dU.d3[ivar] = Am1 * dv.d3[indx];                     /* dU_3*/
-          dU.d11[ivar] = 2 * dv.d1[indx] + Am1 * dv.d11[indx]; /* dU_AA*/
-          dU.d12[ivar] = dv.d2[indx] + Am1 * dv.d12[indx];     /* dU_AB*/
-          dU.d13[ivar] = dv.d3[indx] + Am1 * dv.d13[indx];     /* dU_AB*/
-          dU.d22[ivar] = Am1 * dv.d22[indx];                   /* dU_BB*/
-          dU.d23[ivar] = Am1 * dv.d23[indx];                   /* dU_B3*/
-          dU.d33[ivar] = Am1 * dv.d33[indx];                   /* dU_33*/
-          U.d0[ivar] = u.d0[indx];                             /* U   */
-          U.d1[ivar] = u.d1[indx];                             /* U_x*/
-          U.d2[ivar] = u.d2[indx];                             /* U_y*/
-          U.d3[ivar] = u.d3[indx];                             /* U_z*/
-          U.d11[ivar] = u.d11[indx];                           /* U_xx*/
-          U.d12[ivar] = u.d12[indx];                           /* U_xy*/
-          U.d13[ivar] = u.d13[indx];                           /* U_xz*/
-          U.d22[ivar] = u.d22[indx];                           /* U_yy*/
-          U.d23[ivar] = u.d23[indx];                           /* U_yz*/
-          U.d33[ivar] = u.d33[indx];                           /* U_zz*/
+          l_dU.d0[ivar] = Am1 * dv.d0[indx];
+          l_dU.d1[ivar] = dv.d0[indx] + Am1 * dv.d1[indx];
+          l_dU.d2[ivar] = Am1 * dv.d2[indx];
+          l_dU.d3[ivar] = Am1 * dv.d3[indx];
+          l_dU.d11[ivar] = 2 * dv.d1[indx] + Am1 * dv.d11[indx];
+          l_dU.d12[ivar] = dv.d2[indx] + Am1 * dv.d12[indx];
+          l_dU.d13[ivar] = dv.d3[indx] + Am1 * dv.d13[indx];
+          l_dU.d22[ivar] = Am1 * dv.d22[indx];
+          l_dU.d23[ivar] = Am1 * dv.d23[indx];
+          l_dU.d33[ivar] = Am1 * dv.d33[indx];
+          l_U.d0[ivar] = u.d0[indx];
+          l_U.d1[ivar] = u.d1[indx];
+          l_U.d2[ivar] = u.d2[indx];
+          l_U.d3[ivar] = u.d3[indx];
+          l_U.d11[ivar] = u.d11[indx];
+          l_U.d12[ivar] = u.d12[indx];
+          l_U.d13[ivar] = u.d13[indx];
+          l_U.d22[ivar] = u.d22[indx];
+          l_U.d23[ivar] = u.d23[indx];
+          l_U.d33[ivar] = u.d33[indx];
         }
-        /* Calculation of (X,R) and*/
-        /* (dU_X, dU_R, dU_3, dU_XX, dU_XR, dU_X3, dU_RR, dU_R3, dU_33)*/
-        AB_To_XR(nvar, A, B, &X, &R, dU);
-        /* Calculation of (x,r) and*/
-        /* (dU, dU_x, dU_r, dU_3, dU_xx, dU_xr, dU_x3, dU_rr, dU_r3, dU_33)*/
-        C_To_c(nvar, X, R, &x, &r, dU);
-        /* Calculation of (y,z) and*/
-        /* (dU, dU_x, dU_y, dU_z, dU_xx, dU_xy, dU_xz, dU_yy, dU_yz, dU_zz)*/
-        rx3_To_xyz(nvar, x, r, phi, &y, &z, dU);
-        LinEquations(A, B, X, R, x, r, phi, y, z, dU, U, values);
+        AB_To_XR(nvar, A, B, &X, &R, l_dU);
+        C_To_c(nvar, X, R, &x, &r, l_dU);
+        rx3_To_xyz(nvar, x, r, phi, &y, &z, l_dU);
+        LinEquations(A, B, X, R, x, r, phi, y, z, l_dU, l_U, l_val);
         for (ivar = 0; ivar < nvar; ivar++)
         {
           indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
-          Jdv[indx] = values[ivar] * FAC;
+          Jdv[indx] = l_val[ivar] * sin(al) * sin(be) * sin(al) * sin(be) * sin(al) * sin(be);
         }
       }
     }
-    free_dvector(values, 0, nvar - 1);
-    free_derivs(&dU, nvar);
-    free_derivs(&U, nvar);
   }
 }
 /* --------------------------------------------------------------------------*/
-void TwoPunctures::relax(double *dv, int const nvar, int const n1, int const n2, int const n3,
+/* --------------------------------------------------------------------------
+ * relax_omp: OpenMP-parallelized replacement for relax()
+ *
+ * Parallelism analysis:
+ *   - The red-black ordering within each phi-plane means that
+ *     same-parity lines in the i-direction are INDEPENDENT of each other
+ *     (they only couple through the j-direction which is solved internally).
+ *   - Similarly, same-parity lines in the j-direction are independent.
+ *   - Different phi-planes (k) with same parity are independent.
+ *
+ * Strategy:
+ *   - Parallelize the i-loop within each (k, parity) group for LineRelax_be
+ *   - Parallelize the j-loop within each (k, parity) group for LineRelax_al
+ *   - Each thread uses its own pre-allocated workspace (tid-indexed)
+ * --------------------------------------------------------------------------*/
+void TwoPunctures::relax_omp(double *dv, int const nvar, int const n1, int const n2, int const n3,
                          double const *rhs, int const *ncols, int **cols, double **JFD)
 {
-  int i, j, k, n;
+    int n;
 
-  for (k = 0; k < n3; k = k + 2)
-  {
+    // 偶数k平面
     for (n = 0; n < N_PlaneRelax; n++)
     {
-      for (i = 2; i < n1; i = i + 2)
-        LineRelax_be(dv, i, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD);
-      for (i = 1; i < n1; i = i + 2)
-        LineRelax_be(dv, i, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD);
-      for (j = 1; j < n2; j = j + 2)
-        LineRelax_al(dv, j, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD);
-      for (j = 0; j < n2; j = j + 2)
-        LineRelax_al(dv, j, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD);
+        // 偶数i线，所有偶数k —— 不同k平面完全独立
+        int n_even_k = (n3 + 1) / 2;  // 偶数k的数量
+        int n_even_i = (n1 - 2 + 1) / 2;  // i=2,4,...的数量
+        int total_tasks = n_even_k * n_even_i;
+
+        #pragma omp parallel for schedule(static)
+        for (int task = 0; task < total_tasks; task++) {
+            int tid = omp_get_thread_num();
+            int ki = task / n_even_i;
+            int ii = task % n_even_i;
+            int k = ki * 2;
+            int i = 2 + ii * 2;
+            LineRelax_be_omp(dv, i, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid);
+        }
+
+        // 奇数i线，所有偶数k
+        int n_odd_i = n1 / 2;  // i=1,3,...的数量
+        total_tasks = n_even_k * n_odd_i;
+
+        #pragma omp parallel for schedule(static)
+        for (int task = 0; task < total_tasks; task++) {
+            int tid = omp_get_thread_num();
+            int ki = task / n_odd_i;
+            int ii = task % n_odd_i;
+            int k = ki * 2;
+            int i = 1 + ii * 2;
+            LineRelax_be_omp(dv, i, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid);
+        }
+
+        // 奇数j线，所有偶数k
+        int n_odd_j = (n2 - 1 + 1) / 2;
+        total_tasks = n_even_k * n_odd_j;
+
+        #pragma omp parallel for schedule(static)
+        for (int task = 0; task < total_tasks; task++) {
+            int tid = omp_get_thread_num();
+            int ki = task / n_odd_j;
+            int ji = task % n_odd_j;
+            int k = ki * 2;
+            int j = 1 + ji * 2;
+            LineRelax_al_omp(dv, j, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid);
+        }
+
+        // 偶数j线，所有偶数k
+        int n_even_j = (n2 + 1) / 2;
+        total_tasks = n_even_k * n_even_j;
+
+        #pragma omp parallel for schedule(static)
+        for (int task = 0; task < total_tasks; task++) {
+            int tid = omp_get_thread_num();
+            int ki = task / n_even_j;
+            int ji = task % n_even_j;
+            int k = ki * 2;
+            int j = ji * 2;
+            LineRelax_al_omp(dv, j, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid);
+        }
+
+        // 奇数k平面 — 同样的四步
+        int n_odd_k = n3 / 2;
+
+        // 偶数i线，所有奇数k
+        n_even_i = (n1 + 1) / 2;  // i=0,2,...
+        total_tasks = n_odd_k * n_even_i;
+
+        #pragma omp parallel for schedule(static)
+        for (int task = 0; task < total_tasks; task++) {
+            int tid = omp_get_thread_num();
+            int ki = task / n_even_i;
+            int ii = task % n_even_i;
+            int k = 1 + ki * 2;
+            int i = ii * 2;
+            LineRelax_be_omp(dv, i, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid);
+        }
+
+        // 奇数i线，所有奇数k
+        total_tasks = n_odd_k * n_odd_i;
+
+        #pragma omp parallel for schedule(static)
+        for (int task = 0; task < total_tasks; task++) {
+            int tid = omp_get_thread_num();
+            int ki = task / n_odd_i;
+            int ii = task % n_odd_i;
+            int k = 1 + ki * 2;
+            int i = 1 + ii * 2;
+            LineRelax_be_omp(dv, i, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid);
+        }
+
+        // 奇数j线，所有奇数k
+        total_tasks = n_odd_k * n_odd_j;
+
+        #pragma omp parallel for schedule(static)
+        for (int task = 0; task < total_tasks; task++) {
+            int tid = omp_get_thread_num();
+            int ki = task / n_odd_j;
+            int ji = task % n_odd_j;
+            int k = 1 + ki * 2;
+            int j = 1 + ji * 2;
+            LineRelax_al_omp(dv, j, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid);
+        }
+
+        // 偶数j线，所有奇数k
+        total_tasks = n_odd_k * n_even_j;
+
+        #pragma omp parallel for schedule(static)
+        for (int task = 0; task < total_tasks; task++) {
+            int tid = omp_get_thread_num();
+            int ki = task / n_even_j;
+            int ji = task % n_even_j;
+            int k = 1 + ki * 2;
+            int j = ji * 2;
+            LineRelax_al_omp(dv, j, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD, tid);
+        }
     }
-  }
-  for (k = 1; k < n3; k = k + 2)
-  {
-    for (n = 0; n < N_PlaneRelax; n++)
-    {
-      for (i = 0; i < n1; i = i + 2)
-        LineRelax_be(dv, i, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD);
-      for (i = 1; i < n1; i = i + 2)
-        LineRelax_be(dv, i, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD);
-      for (j = 1; j < n2; j = j + 2)
-        LineRelax_al(dv, j, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD);
-      for (j = 0; j < n2; j = j + 2)
-        LineRelax_al(dv, j, k, nvar, n1, n2, n3, rhs, ncols, cols, JFD);
-    }
-  }
 }
 /* --------------------------------------------------------------------------*/
-void TwoPunctures::LineRelax_be(double *dv,
+void TwoPunctures::LineRelax_be_omp(double *dv,
                                 int const i, int const k, int const nvar,
                                 int const n1, int const n2, int const n3,
                                 double const *rhs, int const *ncols, int **cols,
-                                double **JFD)
+                                double **JFD, int tid)
 {
   int j, m, Ic, Ip, Im, col, ivar;
 
-  double *diag = new double[n2];
-  double *e = new double[n2 - 1]; /* above diagonal */
-  double *f = new double[n2 - 1]; /* below diagonal */
-  double *b = new double[n2];     /* rhs */
-  double *x = new double[n2];     /* solution vector */
-
-  //  gsl_vector *diag = gsl_vector_alloc(n2);
-  //  gsl_vector *e = gsl_vector_alloc(n2-1); /* above diagonal */
-  //  gsl_vector *f = gsl_vector_alloc(n2-1); /* below diagonal */
-  //  gsl_vector *b = gsl_vector_alloc(n2);   /* rhs */
-  //  gsl_vector *x = gsl_vector_alloc(n2);   /* solution vector */
+  // Use pre-allocated per-thread workspace instead of new/delete
+  double *diag = ws_diag_be[tid];
+  double *e    = ws_e_be[tid];
+  double *f    = ws_f_be[tid];
+  double *b    = ws_b_be[tid];
+  double *x    = ws_x_be[tid];
 
   for (ivar = 0; ivar < nvar; ivar++)
   {
@@ -2007,14 +2087,8 @@ void TwoPunctures::LineRelax_be(double *dv,
         }
       }
     }
-    //          A x = b
-    //          A = ( d_0 e_0  0   0  )
-    //              ( f_0 d_1 e_1  0  )
-    //              (  0  f_1 d_2 e_2 )
-    //              (  0   0  f_2 d_3 )
-    //
-    ThomasAlgorithm(n2, f, diag, e, x, b);
-    //    gsl_linalg_solve_tridiag(diag, e, f, b, x);
+    ThomasAlgorithm_ws(n2, f, diag, e, x, b,
+                       ws_l_be[tid], ws_u_be[tid], ws_d_be[tid], ws_y_be[tid]);
     for (j = 0; j < n2; j++)
     {
       Ic = Index(ivar, i, j, k, nvar, n1, n2, n3);
@@ -2022,17 +2096,7 @@ void TwoPunctures::LineRelax_be(double *dv,
       //      dv[Ic] = gsl_vector_get(x, j);
     }
   }
-
-  delete[] diag;
-  delete[] e;
-  delete[] f;
-  delete[] b;
-  delete[] x;
-  //  gsl_vector_free(diag);
-  //  gsl_vector_free(e);
-  //  gsl_vector_free(f);
-  //  gsl_vector_free(b);
-  //  gsl_vector_free(x);
+  // No delete — workspace is persistent
 }
 /* --------------------------------------------------------------------------*/
 void TwoPunctures::JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2,
@@ -2194,25 +2258,19 @@ void TwoPunctures::LinEquations(double A, double B, double X, double R,
   values[0] = dU.d11[0] + dU.d22[0] + dU.d33[0] - 0.875 * BY_KKofxyz(x, y, z) / psi8 * dU.d0[0];
 }
 /* -------------------------------------------------------------------------*/
-void TwoPunctures::LineRelax_al(double *dv,
+void TwoPunctures::LineRelax_al_omp(double *dv,
                                 int const j, int const k, int const nvar,
                                 int const n1, int const n2, int const n3,
                                 double const *rhs, int const *ncols,
-                                int **cols, double **JFD)
+                                int **cols, double **JFD, int tid)
 {
   int i, m, Ic, Ip, Im, col, ivar;
 
-  double *diag = new double[n1];
-  double *e = new double[n1 - 1]; /* above diagonal */
-  double *f = new double[n1 - 1]; /* below diagonal */
-  double *b = new double[n1];     /* rhs */
-  double *x = new double[n1];     /* solution vector */
-
-  //  gsl_vector *diag = gsl_vector_alloc(n1);
-  //  gsl_vector *e = gsl_vector_alloc(n1-1); /* above diagonal */
-  //  gsl_vector *f = gsl_vector_alloc(n1-1); /* below diagonal */
-  //  gsl_vector *b = gsl_vector_alloc(n1);   /* rhs */
-  //  gsl_vector *x = gsl_vector_alloc(n1);   /* solution vector */
+  double *diag = ws_diag_al[tid];
+  double *e    = ws_e_al[tid];
+  double *f    = ws_f_al[tid];
+  double *b    = ws_b_al[tid];
+  double *x    = ws_x_al[tid];
 
   for (ivar = 0; ivar < nvar; ivar++)
   {
@@ -2252,8 +2310,8 @@ void TwoPunctures::LineRelax_al(double *dv,
         }
       }
     }
-    ThomasAlgorithm(n1, f, diag, e, x, b);
-    //    gsl_linalg_solve_tridiag(diag, e, f, b, x);
+    ThomasAlgorithm_ws(n1, f, diag, e, x, b,
+                       ws_l_al[tid], ws_u_al[tid], ws_d_al[tid], ws_y_al[tid]);
     for (i = 0; i < n1; i++)
     {
       Ic = Index(ivar, i, j, k, nvar, n1, n2, n3);
@@ -2261,18 +2319,6 @@ void TwoPunctures::LineRelax_al(double *dv,
       //      dv[Ic] = gsl_vector_get(x, i);
     }
   }
-
-  delete[] diag;
-  delete[] e;
-  delete[] f;
-  delete[] b;
-  delete[] x;
-
-  //  gsl_vector_free(diag);
-  //  gsl_vector_free(e);
-  //  gsl_vector_free(f);
-  //  gsl_vector_free(b);
-  //  gsl_vector_free(x);
 }
 /* -------------------------------------------------------------------------*/
 // a[N], b[N-1], c[N-1], x[N], q[N]
@@ -2323,6 +2369,37 @@ void TwoPunctures::ThomasAlgorithm(int N, double *b, double *a, double *c, doubl
 
   return;
 }
+
+// ThomasAlgorithm with pre-allocated workspace (no new/delete)
+// l[N-1], u_ws[N-1], d[N], y[N] are caller-provided workspace
+void TwoPunctures::ThomasAlgorithm_ws(int N, double *b, double *a, double *c,
+                                       double *x, double *q,
+                                       double *l, double *u_ws, double *d, double *y)
+{
+  /* LU Decomposition */
+  d[0] = a[0];
+  u_ws[0] = c[0];
+
+  for (int i = 0; i < N - 2; i++) {
+    l[i] = b[i] / d[i];
+    d[i + 1] = a[i + 1] - l[i] * u_ws[i];
+    u_ws[i + 1] = c[i + 1];
+  }
+
+  l[N - 2] = b[N - 2] / d[N - 2];
+  d[N - 1] = a[N - 1] - l[N - 2] * u_ws[N - 2];
+
+  /* Forward Substitution [L][y] = [q] */
+  y[0] = q[0];
+  for (int i = 1; i < N; i++)
+    y[i] = q[i] - l[i - 1] * y[i - 1];
+
+  /* Backward Substitution [U][x] = [y] */
+  x[N - 1] = y[N - 1] / d[N - 1];
+  for (int i = N - 2; i >= 0; i--)
+    x[i] = (y[i] - u_ws[i] * x[i + 1]) / d[i];
+}
+
 // --------------------------------------------------------------------------*/
 // Calculates the value of v at an arbitrary position (x,y,z) if the spectral coefficients are know*/*/
 /* --------------------------------------------------------------------------*/
@@ -2512,3 +2589,606 @@ void TwoPunctures::SpecCoef(parameters par, int ivar, double *v, double *cf)
   free_d3tensor(values3, 0, n1, 0, n2, 0, n3);
   free_d3tensor(values4, 0, n1, 0, n2, 0, n3);
 }
+
+void TwoPunctures::allocate_workspace()
+{
+    int n1 = npoints_A, n2 = npoints_B, n3 = npoints_phi;
+    max_threads = omp_get_max_threads();
+    printf("Allocating workspace for %d threads\n", max_threads);
+
+    // LineRelax_be workspace: arrays of size n2, per thread
+    ws_diag_be = new double*[max_threads];
+    ws_e_be    = new double*[max_threads];
+    ws_f_be    = new double*[max_threads];
+    ws_b_be    = new double*[max_threads];
+    ws_x_be    = new double*[max_threads];
+    ws_l_be    = new double*[max_threads];
+    ws_u_be    = new double*[max_threads];
+    ws_d_be    = new double*[max_threads];
+    ws_y_be    = new double*[max_threads];
+
+    // LineRelax_al workspace: arrays of size n1, per thread
+    ws_diag_al = new double*[max_threads];
+    ws_e_al    = new double*[max_threads];
+    ws_f_al    = new double*[max_threads];
+    ws_b_al    = new double*[max_threads];
+    ws_x_al    = new double*[max_threads];
+    ws_l_al    = new double*[max_threads];
+    ws_u_al    = new double*[max_threads];
+    ws_d_al    = new double*[max_threads];
+    ws_y_al    = new double*[max_threads];
+
+    int N = (n1 > n2) ? n1 : n2;  // max of n1, n2
+
+    for (int t = 0; t < max_threads; t++) {
+        ws_diag_be[t] = new double[n2];
+        ws_e_be[t]    = new double[n2];
+        ws_f_be[t]    = new double[n2];
+        ws_b_be[t]    = new double[n2];
+        ws_x_be[t]    = new double[n2];
+        ws_l_be[t]    = new double[n2];
+        ws_u_be[t]    = new double[n2];
+        ws_d_be[t]    = new double[n2];
+        ws_y_be[t]    = new double[n2];
+
+        ws_diag_al[t] = new double[n1];
+        ws_e_al[t]    = new double[n1];
+        ws_f_al[t]    = new double[n1];
+        ws_b_al[t]    = new double[n1];
+        ws_x_al[t]    = new double[n1];
+        ws_l_al[t]    = new double[n1];
+        ws_u_al[t]    = new double[n1];
+        ws_d_al[t]    = new double[n1];
+        ws_y_al[t]    = new double[n1];
+    }
+}
+
+void TwoPunctures::free_workspace()
+{
+    for (int t = 0; t < max_threads; t++) {
+        delete[] ws_diag_be[t]; delete[] ws_e_be[t]; delete[] ws_f_be[t];
+        delete[] ws_b_be[t];    delete[] ws_x_be[t];
+        delete[] ws_l_be[t];    delete[] ws_u_be[t];
+        delete[] ws_d_be[t];    delete[] ws_y_be[t];
+
+        delete[] ws_diag_al[t]; delete[] ws_e_al[t]; delete[] ws_f_al[t];
+        delete[] ws_b_al[t];    delete[] ws_x_al[t];
+        delete[] ws_l_al[t];    delete[] ws_u_al[t];
+        delete[] ws_d_al[t];    delete[] ws_y_al[t];
+    }
+    delete[] ws_diag_be; delete[] ws_e_be; delete[] ws_f_be;
+    delete[] ws_b_be;    delete[] ws_x_be;
+    delete[] ws_l_be;    delete[] ws_u_be;
+    delete[] ws_d_be;    delete[] ws_y_be;
+
+    delete[] ws_diag_al; delete[] ws_e_al; delete[] ws_f_al;
+    delete[] ws_b_al;    delete[] ws_x_al;
+    delete[] ws_l_al;    delete[] ws_u_al;
+    delete[] ws_d_al;    delete[] ws_y_al;
+}
+
+/*==========================================================================
+ * Precomputed Spectral Derivative Matrices
+ *
+ * Mathematical equivalence proof:
+ *
+ * Original algorithm (per-line):
+ *   1. Forward Chebyshev transform:  c = T * f   (where T is the DCT matrix)
+ *   2. Spectral derivative:          c' = Dhat * c  (recurrence relation)
+ *   3. Inverse transform:            f' = T^{-1} * c'
+ *   Combined: f' = T^{-1} * Dhat * T * f = D * f
+ *
+ * The matrix D = T^{-1} * Dhat * T is precomputed once.
+ * Similarly D2 = T^{-1} * Dhat^2 * T for second derivatives.
+ *
+ * For Fourier: same idea with DFT matrices and frequency-domain derivatives.
+ *
+ * This converts n2*n3 separate O(n1^2) transforms into a single
+ * (n1 x n1) * (n1 x n2*n3) DGEMM call, which is BLAS Level-3
+ * and thus optimally parallelized by MKL.
+ *=========================================================================*/
+
+void TwoPunctures::build_cheb_deriv_matrices(int n, double *D1, double *D2)
+{
+    /* Build the physical-space derivative matrices for Chebyshev Zeros grid.
+     *
+     * Grid points: x_i = -cos(pi*(2i+1)/(2n)), i=0,...,n-1
+     *
+     * Method: Construct T (forward transform), Dhat (spectral derivative),
+     * T^{-1} (inverse transform), then D1 = T^{-1} * Dhat * T,
+     * D2 = T^{-1} * Dhat^2 * T.
+     *
+     * All matrices are n x n, stored in row-major order: M[i*n+j]
+     */
+
+    double *T_fwd  = new double[n * n];  // Forward transform matrix
+    double *T_inv  = new double[n * n];  // Inverse transform matrix
+    double *Dhat   = new double[n * n];  // Spectral derivative operator
+    double *Dhat2  = new double[n * n];  // Spectral second derivative operator
+    double *tmp1   = new double[n * n];  // Temporary
+    double *tmp2   = new double[n * n];  // Temporary
+
+    double Pion = Pi / n;
+
+    // Build forward Chebyshev transform matrix T
+    // c_j = (2/n) * (-1)^j * sum_k f_k * cos(pi*j*(k+0.5)/n)
+    // So T[j][k] = (2/n) * (-1)^j * cos(pi*j*(k+0.5)/n)
+    for (int j = 0; j < n; j++) {
+        double fac = (2.0 / n) * ((j % 2 == 0) ? 1.0 : -1.0);
+        for (int k = 0; k < n; k++) {
+            T_fwd[j * n + k] = fac * cos(Pion * j * (k + 0.5));
+        }
+    }
+
+    // Build inverse Chebyshev transform matrix T^{-1}
+    // f_j = sum_k c_k * cos(pi*(j+0.5)*k/n) * (-1)^k  -  0.5*c_0
+    // But the -0.5*c_0 term is part of the sum when we write it as:
+    // f_j = -0.5*c_0 + sum_{k=0}^{n-1} c_k * cos(pi*(j+0.5)*k) * (-1)^k
+    // T_inv[j][k] = cos(pi*(j+0.5)*k/n) * (-1)^k, with k=0 term having extra -0.5
+    for (int j = 0; j < n; j++) {
+        for (int k = 0; k < n; k++) {
+            double sign_k = (k % 2 == 0) ? 1.0 : -1.0;
+            T_inv[j * n + k] = cos(Pion * (j + 0.5) * k) * sign_k;
+        }
+        // The k=0 term needs adjustment: the sum includes c_0*1 but we need -0.5*c_0 + c_0*1 = 0.5*c_0
+        // Wait, let me re-examine chebft_Zeros with inv=1:
+        //   sum = -0.5 * u[0];
+        //   for k: sum += u[k] * cos(Pion*(j+0.5)*k) * isignum; isignum alternates starting from 1
+        // So: c[j] = -0.5*u[0] + sum_{k=0}^{n-1} u[k]*cos(...)*(-1)^k
+        //          = -0.5*u[0] + u[0]*1*1 + sum_{k=1} ...
+        //          = 0.5*u[0] + sum_{k=1} u[k]*cos(...)*(-1)^k
+        // Equivalently: T_inv[j][0] = 0.5, T_inv[j][k] = cos(...)*(-1)^k for k>=1
+        // But cos(0) = 1 and (-1)^0 = 1, so the formula gives T_inv[j][0] = 1.0
+        // We need it to be 0.5. Fix:
+        T_inv[j * n + 0] = 0.5;  // This accounts for the -0.5*u[0] + u[0]*cos(0)*1 = 0.5*u[0]
+    }
+
+    // Build spectral derivative matrix Dhat (in coefficient space)
+    // The recurrence: cder[n-1] = 0, cder[n-2] = 0,
+    //                 cder[j] = cder[j+2] + 2*(j+1)*c[j+1]  for j = n-3,...,0
+    // This means cder = Dhat * c, where Dhat is upper triangular-ish.
+    // Dhat[j][k] = coefficient of c[k] contributing to cder[j]
+    //
+    // From the recurrence: cder[j] = sum_{k=j+1, k-j odd}^{n-1} 2*k * c[k]
+    // (with the factor 2k, summing over k > j where k-j is odd)
+    // Exception: cder[0] gets an extra factor of 0.5 since c[0] has the 2/n prefactor
+    // Actually no: the chder function is:
+    //   cder[n] = cder[n-1] = 0
+    //   cder[j] = cder[j+2] + 2*(j+1)*c[j+1]
+    // Unrolling: cder[j] = 2*(j+1)*c[j+1] + 2*(j+3)*c[j+3] + ...
+    // So Dhat[j][k] = 2*k if k > j and (k-j) is odd, else 0
+
+    for (int j = 0; j < n; j++)
+        for (int k = 0; k < n; k++)
+            Dhat[j * n + k] = 0.0;
+
+    for (int j = 0; j < n; j++) {
+        for (int k = j + 1; k < n; k++) {
+            if ((k - j) % 2 == 1) {
+                Dhat[j * n + k] = 2.0 * k;
+            }
+        }
+    }
+
+    // Build Dhat^2 = Dhat * Dhat
+    // D1 = T_inv * Dhat * T_fwd
+    // D2 = T_inv * Dhat^2 * T_fwd
+
+    // tmp1 = Dhat * T_fwd
+    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                n, n, n, 1.0, Dhat, n, T_fwd, n, 0.0, tmp1, n);
+    // D1 = T_inv * tmp1
+    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                n, n, n, 1.0, T_inv, n, tmp1, n, 0.0, D1, n);
+
+    // tmp2 = Dhat * Dhat  (Dhat^2 in spectral space)
+    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                n, n, n, 1.0, Dhat, n, Dhat, n, 0.0, tmp2, n);
+    // tmp1 = Dhat^2 * T_fwd
+    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                n, n, n, 1.0, tmp2, n, T_fwd, n, 0.0, tmp1, n);
+    // D2 = T_inv * tmp1
+    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                n, n, n, 1.0, T_inv, n, tmp1, n, 0.0, D2, n);
+
+    delete[] T_fwd;
+    delete[] T_inv;
+    delete[] Dhat;
+    delete[] Dhat2;
+    delete[] tmp1;
+    delete[] tmp2;
+}
+
+void TwoPunctures::build_fourier_deriv_matrices(int N, double *DF1, double *DF2)
+{
+    /* Build Fourier derivative matrices in physical space.
+     *
+     * Grid: phi_k = 2*pi*k/N, k=0,...,N-1
+     *
+     * The Fourier interpolant derivative at grid points can be expressed as
+     * a matrix multiply. We build it by:
+     *   1. Forward Fourier transform matrix F
+     *   2. Frequency-domain derivative (multiply by il for first, -l^2 for second)
+     *   3. Inverse Fourier transform matrix F^{-1}
+     *   DF1 = F^{-1} * diag(il) * F, DF2 = F^{-1} * diag(-l^2) * F
+     *
+     * But since fourft/fourev use a real representation (a_l, b_l),
+     * we construct directly in physical space.
+     */
+
+    int M = N / 2;
+    double Pi_fac = Pi / M;  // = 2*Pi/N
+
+    // DF1[j][k] = d/dphi of the interpolant at phi_j, due to value at phi_k
+    // Using the representation:
+    //   f(phi) = 0.5*(a_0 + a_M*cos(M*phi)) + sum_{l=1}^{M-1} (a_l*cos(l*phi) + b_l*sin(l*phi))
+    // where a_l = (2/N)*sum_k f_k*cos(l*phi_k), b_l = (2/N)*sum_k f_k*sin(l*phi_k)
+    //
+    // f'(phi) = -0.5*a_M*M*sin(M*phi) + sum_{l=1}^{M-1} l*(-a_l*sin(l*phi) + b_l*cos(l*phi))
+    //
+    // Substituting a_l, b_l and evaluating at phi_j:
+    // f'(phi_j) = sum_k f_k * K(j,k)
+    // where K(j,k) = (2/N) * sum_{l=1}^{M-1} l * (-cos(l*phi_k)*sin(l*phi_j) + sin(l*phi_k)*cos(l*phi_j))
+    //             + (2/N) * (-M/2) * sin(M*phi_j) * cos(M*phi_k)    [a_M term, note a_M has no factor 2]
+    //             = (2/N) * sum_{l=1}^{M-1} l * sin(l*(phi_k - phi_j))
+    //             - (1/N) * M * sin(M*phi_j) * cos(M*phi_k)
+    //
+    // But the a_M coefficient in fourft has factor 1/M (not 2/M), so:
+    // Actually re-examining fourft: a[l] = fac * sum_k u[k]*cos(x), fac=1/M
+    // and a_M is stored as a[M] with same fac. The inverse uses:
+    //   u[k] = 0.5*(a[0] + a[M]*iy) + sum_{l=1}^{M-1}(a[l]*cos + b[l]*sin)
+    // So the full expression needs care. Let me just compute it numerically.
+
+    // Numerical approach: for each k, set f = delta_k, compute derivative at all j
+    double *p  = new double[N];
+    double *dp = new double[N];
+
+    for (int k = 0; k < N; k++) {
+        // Set delta function at k
+        for (int i = 0; i < N; i++)
+            p[i] = (i == k) ? 1.0 : 0.0;
+
+        // Forward Fourier transform (using existing fourft)
+        fourft(p, N, 0);
+        // Derivative in spectral space
+        fourder(p, dp, N);
+        // Inverse Fourier transform
+        fourft(dp, N, 1);
+
+        // dp[j] = derivative of delta_k interpolant at phi_j
+        // So DF1[j][k] = dp[j]
+        for (int j = 0; j < N; j++)
+            DF1[j * N + k] = dp[j];
+    }
+
+    // Second derivative
+    for (int k = 0; k < N; k++) {
+        for (int i = 0; i < N; i++)
+            p[i] = (i == k) ? 1.0 : 0.0;
+
+        fourft(p, N, 0);
+        fourder2(p, dp, N);
+        fourft(dp, N, 1);
+
+        for (int j = 0; j < N; j++)
+            DF2[j * N + k] = dp[j];
+    }
+
+    delete[] p;
+    delete[] dp;
+}
+
+void TwoPunctures::precompute_derivative_matrices()
+{
+    int n1 = npoints_A, n2 = npoints_B, n3 = npoints_phi;
+
+    // Allocate matrices
+    D1_A    = new double[n1 * n1];
+    D2_A    = new double[n1 * n1];
+    D1_B    = new double[n2 * n2];
+    D2_B    = new double[n2 * n2];
+    DF1_phi = new double[n3 * n3];
+    DF2_phi = new double[n3 * n3];
+
+    // Build Chebyshev derivative matrices
+    build_cheb_deriv_matrices(n1, D1_A, D2_A);
+    build_cheb_deriv_matrices(n2, D1_B, D2_B);
+
+    // Build Fourier derivative matrices
+    build_fourier_deriv_matrices(n3, DF1_phi, DF2_phi);
+
+    printf("Precomputed derivative matrices: A(%d), B(%d), phi(%d)\n", n1, n2, n3);
+}
+
+/* --------------------------------------------------------------------------
+ * Derivatives_AB3_MatMul: Drop-in replacement for Derivatives_AB3
+ *
+ * Uses precomputed derivative matrices and DGEMM to compute all spectral
+ * derivatives in batch. Mathematically equivalent to the original
+ * Derivatives_AB3.
+ *
+ * Memory layout of v.d0[Index(ivar,i,j,k)] = v.d0[ivar + nvar*(i + n1*(j + n2*k))]
+ *
+ * For A-direction derivatives (fixed j,k, varying i):
+ *   We need to apply D1_A and D2_A to "pencils" along the i-direction.
+ *   Collect all pencils into a matrix and use DGEMM.
+ *
+ * For B-direction derivatives (fixed i,k, varying j):
+ *   Similarly with D1_B, D2_B.
+ *
+ * For phi-direction (fixed i,j, varying k):
+ *   Similarly with DF1_phi, DF2_phi.
+ * --------------------------------------------------------------------------*/
+void TwoPunctures::Derivatives_AB3_MatMul(int nvar, int n1, int n2, int n3, derivs v)
+{
+    int total_pencils;
+    double *data_in, *data_out;
+
+    /*=====================================================
+     * STEP 1: A-direction derivatives (Chebyshev, D1_A, D2_A)
+     *
+     * For each (ivar, j, k), we have a pencil of length n1:
+     *   f[i] = v.d0[Index(ivar, i, j, k, nvar, n1, n2, n3)]
+     *
+     * We want: v.d1[...] = D1_A * f, v.d11[...] = D2_A * f
+     *
+     * Collect all n2*n3*nvar pencils as columns of a matrix:
+     *   data_in[i, col] where col = ivar + nvar*(j + n2*k)
+     * Then: data_out = D1_A * data_in  (DGEMM: n1 x n1  times  n1 x total_pencils)
+     *=====================================================*/
+    total_pencils = nvar * n2 * n3;
+
+    data_in  = new double[n1 * total_pencils];
+    data_out = new double[n1 * total_pencils];
+
+    // Gather: data_in[i * total_pencils + col] = v.d0[Index(ivar,i,j,k,...)]
+    // where col = ivar + nvar * (j + n2 * k)
+    for (int ivar = 0; ivar < nvar; ivar++) {
+        for (int k = 0; k < n3; k++) {
+            for (int j = 0; j < n2; j++) {
+                int col = ivar + nvar * (j + n2 * k);
+                for (int i = 0; i < n1; i++) {
+                    int indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
+                    data_in[i * total_pencils + col] = v.d0[indx];
+                }
+            }
+        }
+    }
+
+    // First derivative: data_out = D1_A * data_in
+    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                n1, total_pencils, n1,
+                1.0, D1_A, n1, data_in, total_pencils,
+                0.0, data_out, total_pencils);
+
+    // Scatter to v.d1
+    for (int ivar = 0; ivar < nvar; ivar++) {
+        for (int k = 0; k < n3; k++) {
+            for (int j = 0; j < n2; j++) {
+                int col = ivar + nvar * (j + n2 * k);
+                for (int i = 0; i < n1; i++) {
+                    int indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
+                    v.d1[indx] = data_out[i * total_pencils + col];
+                }
+            }
+        }
+    }
+
+    // Second derivative: data_out = D2_A * data_in
+    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                n1, total_pencils, n1,
+                1.0, D2_A, n1, data_in, total_pencils,
+                0.0, data_out, total_pencils);
+
+    // Scatter to v.d11
+    for (int ivar = 0; ivar < nvar; ivar++) {
+        for (int k = 0; k < n3; k++) {
+            for (int j = 0; j < n2; j++) {
+                int col = ivar + nvar * (j + n2 * k);
+                for (int i = 0; i < n1; i++) {
+                    int indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
+                    v.d11[indx] = data_out[i * total_pencils + col];
+                }
+            }
+        }
+    }
+
+    delete[] data_in;
+    delete[] data_out;
+
+    /*=====================================================
+     * STEP 2: B-direction derivatives (Chebyshev, D1_B, D2_B)
+     *
+     * Pencils along j for each (ivar, i, k).
+     * Also compute mixed derivative v.d12 = D1_B applied to v.d1
+     *=====================================================*/
+    total_pencils = nvar * n1 * n3;
+
+    data_in  = new double[n2 * total_pencils];
+    data_out = new double[n2 * total_pencils];
+    double *data_in2 = new double[n2 * total_pencils];
+    double *data_out2 = new double[n2 * total_pencils];
+
+    // Gather v.d0 along B-direction AND v.d1 for mixed derivative
+    for (int ivar = 0; ivar < nvar; ivar++) {
+        for (int k = 0; k < n3; k++) {
+            for (int i = 0; i < n1; i++) {
+                int col = ivar + nvar * (i + n1 * k);
+                for (int j = 0; j < n2; j++) {
+                    int indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
+                    data_in[j * total_pencils + col]  = v.d0[indx];
+                    data_in2[j * total_pencils + col] = v.d1[indx]; // for d/dB of (dv/dA)
+                }
+            }
+        }
+    }
+
+    // v.d2 = D1_B * v.d0 (along B)
+    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                n2, total_pencils, n2,
+                1.0, D1_B, n2, data_in, total_pencils,
+                0.0, data_out, total_pencils);
+
+    for (int ivar = 0; ivar < nvar; ivar++) {
+        for (int k = 0; k < n3; k++) {
+            for (int i = 0; i < n1; i++) {
+                int col = ivar + nvar * (i + n1 * k);
+                for (int j = 0; j < n2; j++) {
+                    int indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
+                    v.d2[indx] = data_out[j * total_pencils + col];
+                }
+            }
+        }
+    }
+
+    // v.d22 = D2_B * v.d0
+    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                n2, total_pencils, n2,
+                1.0, D2_B, n2, data_in, total_pencils,
+                0.0, data_out, total_pencils);
+
+    for (int ivar = 0; ivar < nvar; ivar++) {
+        for (int k = 0; k < n3; k++) {
+            for (int i = 0; i < n1; i++) {
+                int col = ivar + nvar * (i + n1 * k);
+                for (int j = 0; j < n2; j++) {
+                    int indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
+                    v.d22[indx] = data_out[j * total_pencils + col];
+                }
+            }
+        }
+    }
+
+    // v.d12 = D1_B * v.d1 (mixed: d/dB of dv/dA)
+    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                n2, total_pencils, n2,
+                1.0, D1_B, n2, data_in2, total_pencils,
+                0.0, data_out2, total_pencils);
+
+    for (int ivar = 0; ivar < nvar; ivar++) {
+        for (int k = 0; k < n3; k++) {
+            for (int i = 0; i < n1; i++) {
+                int col = ivar + nvar * (i + n1 * k);
+                for (int j = 0; j < n2; j++) {
+                    int indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
+                    v.d12[indx] = data_out2[j * total_pencils + col];
+                }
+            }
+        }
+    }
+
+    delete[] data_in;
+    delete[] data_out;
+    delete[] data_in2;
+    delete[] data_out2;
+
+    /*=====================================================
+     * STEP 3: phi-direction derivatives (Fourier, DF1_phi, DF2_phi)
+     *
+     * Pencils along k for each (ivar, i, j).
+     * Also compute mixed derivatives v.d13, v.d23
+     *=====================================================*/
+    total_pencils = nvar * n1 * n2;
+
+    data_in    = new double[n3 * total_pencils];
+    data_out   = new double[n3 * total_pencils];
+    data_in2   = new double[n3 * total_pencils]; // for v.d1 → v.d13
+    data_out2  = new double[n3 * total_pencils];
+    double *data_in3  = new double[n3 * total_pencils]; // for v.d2 → v.d23
+    double *data_out3 = new double[n3 * total_pencils];
+
+    // Gather v.d0, v.d1, v.d2 along phi-direction
+    for (int ivar = 0; ivar < nvar; ivar++) {
+        for (int i = 0; i < n1; i++) {
+            for (int j = 0; j < n2; j++) {
+                int col = ivar + nvar * (i + n1 * j);
+                for (int k = 0; k < n3; k++) {
+                    int indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
+                    data_in[k * total_pencils + col]  = v.d0[indx];
+                    data_in2[k * total_pencils + col] = v.d1[indx];
+                    data_in3[k * total_pencils + col] = v.d2[indx];
+                }
+            }
+        }
+    }
+
+    // v.d3 = DF1_phi * v.d0
+    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                n3, total_pencils, n3,
+                1.0, DF1_phi, n3, data_in, total_pencils,
+                0.0, data_out, total_pencils);
+
+    for (int ivar = 0; ivar < nvar; ivar++) {
+        for (int i = 0; i < n1; i++) {
+            for (int j = 0; j < n2; j++) {
+                int col = ivar + nvar * (i + n1 * j);
+                for (int k = 0; k < n3; k++) {
+                    int indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
+                    v.d3[indx] = data_out[k * total_pencils + col];
+                }
+            }
+        }
+    }
+
+    // v.d33 = DF2_phi * v.d0
+    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                n3, total_pencils, n3,
+                1.0, DF2_phi, n3, data_in, total_pencils,
+                0.0, data_out, total_pencils);
+
+    for (int ivar = 0; ivar < nvar; ivar++) {
+        for (int i = 0; i < n1; i++) {
+            for (int j = 0; j < n2; j++) {
+                int col = ivar + nvar * (i + n1 * j);
+                for (int k = 0; k < n3; k++) {
+                    int indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
+                    v.d33[indx] = data_out[k * total_pencils + col];
+                }
+            }
+        }
+    }
+
+    // v.d13 = DF1_phi * v.d1 (mixed: d/dphi of dv/dA)
+    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                n3, total_pencils, n3,
+                1.0, DF1_phi, n3, data_in2, total_pencils,
+                0.0, data_out2, total_pencils);
+
+    for (int ivar = 0; ivar < nvar; ivar++) {
+        for (int i = 0; i < n1; i++) {
+            for (int j = 0; j < n2; j++) {
+                int col = ivar + nvar * (i + n1 * j);
+                for (int k = 0; k < n3; k++) {
+                    int indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
+                    v.d13[indx] = data_out2[k * total_pencils + col];
+                }
+            }
+        }
+    }
+
+    // v.d23 = DF1_phi * v.d2 (mixed: d/dphi of dv/dB)
+    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                n3, total_pencils, n3,
+                1.0, DF1_phi, n3, data_in3, total_pencils,
+                0.0, data_out3, total_pencils);
+
+    for (int ivar = 0; ivar < nvar; ivar++) {
+        for (int i = 0; i < n1; i++) {
+            for (int j = 0; j < n2; j++) {
+                int col = ivar + nvar * (i + n1 * j);
+                for (int k = 0; k < n3; k++) {
+                    int indx = Index(ivar, i, j, k, nvar, n1, n2, n3);
+                    v.d23[indx] = data_out3[k * total_pencils + col];
+                }
+            }
+        }
+    }
+
+    delete[] data_in;
+    delete[] data_out;
+    delete[] data_in2;
+    delete[] data_out2;
+    delete[] data_in3;
+    delete[] data_out3;
+}
+
diff --git a/AMSS_NCKU_source/TwoPunctures.h b/AMSS_NCKU_source/TwoPunctures.h
index 22fb359..5f95797 100644
--- a/AMSS_NCKU_source/TwoPunctures.h
+++ b/AMSS_NCKU_source/TwoPunctures.h
@@ -1,7 +1,8 @@
-
 #ifndef TWO_PUNCTURES_H
 #define TWO_PUNCTURES_H
 
+#include <omp.h>
+
 #define StencilSize 19
 #define N_PlaneRelax 1
 #define NRELAX 200
@@ -32,7 +33,7 @@ private:
        int npoints_A, npoints_B, npoints_phi;
 
        double target_M_plus, target_M_minus;
-       
+
        double admMass;
 
        double adm_tol;
@@ -42,6 +43,18 @@ private:
 
        int ntotal;
 
+       // ===== Precomputed spectral derivative matrices =====
+       double *D1_A, *D2_A;
+       double *D1_B, *D2_B;
+       double *DF1_phi, *DF2_phi;
+
+       // ===== Pre-allocated workspace for LineRelax (per-thread) =====
+       int max_threads;
+       double **ws_diag_be, **ws_e_be, **ws_f_be, **ws_b_be, **ws_x_be;
+       double **ws_l_be, **ws_u_be, **ws_d_be, **ws_y_be;
+       double **ws_diag_al, **ws_e_al, **ws_f_al, **ws_b_al, **ws_x_al;
+       double **ws_l_al, **ws_u_al, **ws_d_al, **ws_y_al;
+
        struct parameters
        {
               int nvar, n1, n2, n3;
@@ -58,6 +71,28 @@ public:
                     int Newtonmaxit);
        ~TwoPunctures();
 
+       // 02/07: New/modified methods
+       void allocate_workspace();
+       void free_workspace();
+       void precompute_derivative_matrices();
+       void build_cheb_deriv_matrices(int n, double *D1, double *D2);
+       void build_fourier_deriv_matrices(int N, double *DF1, double *DF2);
+       void Derivatives_AB3_MatMul(int nvar, int n1, int n2, int n3, derivs v);
+       void ThomasAlgorithm_ws(int N, double *b, double *a, double *c, double *x, double *q,
+                                double *l, double *u_ws, double *d, double *y);
+       void LineRelax_be_omp(double *dv,
+                         int const i, int const k, int const nvar,
+                         int const n1, int const n2, int const n3,
+                         double const *rhs, int const *ncols, int **cols,
+                         double **JFD, int tid);
+       void LineRelax_al_omp(double *dv,
+                         int const j, int const k, int const nvar,
+                         int const n1, int const n2, int const n3,
+                         double const *rhs, int const *ncols,
+                         int **cols, double **JFD, int tid);
+       void relax_omp(double *dv, int const nvar, int const n1, int const n2, int const n3,
+                  double const *rhs, int const *ncols, int **cols, double **JFD);
+
        void Solve();
        void set_initial_guess(derivs v);
        int index(int i, int j, int k, int l, int a, int b, int c, int d);
@@ -116,23 +151,11 @@ public:
        double BY_KKofxyz(double x, double y, double z);
        void SetMatrix_JFD(int nvar, int n1, int n2, int n3, derivs u, int *ncols, int **cols, double **Matrix);
        void J_times_dv(int nvar, int n1, int n2, int n3, derivs dv, double *Jdv, derivs u);
-       void relax(double *dv, int const nvar, int const n1, int const n2, int const n3,
-                  double const *rhs, int const *ncols, int **cols, double **JFD);
-       void LineRelax_be(double *dv,
-                         int const i, int const k, int const nvar,
-                         int const n1, int const n2, int const n3,
-                         double const *rhs, int const *ncols, int **cols,
-                         double **JFD);
        void JFD_times_dv(int i, int j, int k, int nvar, int n1, int n2,
                          int n3, derivs dv, derivs u, double *values);
        void LinEquations(double A, double B, double X, double R,
                          double x, double r, double phi,
                          double y, double z, derivs dU, derivs U, double *values);
-       void LineRelax_al(double *dv,
-                         int const j, int const k, int const nvar,
-                         int const n1, int const n2, int const n3,
-                         double const *rhs, int const *ncols,
-                         int **cols, double **JFD);
        void ThomasAlgorithm(int N, double *b, double *a, double *c, double *x, double *q);
        void Save(char *fname);
        // provided by Vasileios Paschalidis (vpaschal@illinois.edu)
@@ -141,4 +164,4 @@ public:
        void SpecCoef(parameters par, int ivar, double *v, double *cf);
 };
 
-#endif /* TWO_PUNCTURES_H */
+#endif /* TWO_PUNCTURES_H */
\ No newline at end of file
diff --git a/AMSS_NCKU_source/makefile.inc b/AMSS_NCKU_source/makefile.inc
index 489bbce..c25fcf1 100755
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -15,10 +15,9 @@ LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore
 ## -xHost: Optimize for the host CPU architecture (Intel/AMD compatible)
 ## -fp-model fast=2: Aggressive floating-point optimizations
 ## -fma: Enable fused multiply-add instructions
-## Note: OpenMP has been disabled (-qopenmp removed) due to performance issues
-CXXAPPFLAGS  = -O3 -xHost -fp-model fast=2 -fma -ipo \
+CXXAPPFLAGS  = -O3 -xHost -fp-model fast=2 -fma -ipo -qopenmp \
                -Dfortran3 -Dnewc -I${MKLROOT}/include
-f90appflags  = -O3 -xHost -fp-model fast=2 -fma -ipo \
+f90appflags  = -O3 -xHost -fp-model fast=2 -fma -ipo -qopenmp \
                -align array64byte -fpp -I${MKLROOT}/include
 f90          = ifx
 f77          = ifx