Compare commits

...

4 Commits

12 changed files with 227 additions and 119 deletions

View File

@@ -13,14 +13,17 @@ import numpy
## Setting MPI processes and the output file directory
File_directory = "GW150914" ## output file directory
Output_directory = "binary_output" ## binary data file directory
## The file directory name should not be too long
MPI_processes = 64 ## number of mpi processes used in the simulation
GPU_Calculation = "no" ## Use GPU or not
## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
CPU_Part = 1.0
File_directory = "GW150914" ## output file directory
Output_directory = "binary_output" ## binary data file directory
## The file directory name should not be too long
MPI_processes = 64 ## number of mpi processes used in the simulation
OMP_Threads = 3 ## number of OpenMP threads used by each MPI process
MPI_hosts = ["localhost", "192.168.20.102"] ## MPI hosts for multi-node runs
MPI_processes_per_node = 32 ## MPI ranks launched on each node in MPI_hosts
GPU_Calculation = "no" ## Use GPU or not
## (prefer "no" in the current version, because the GPU part may have bugs when integrated in this Python interface)
CPU_Part = 1.0
GPU_Part = 0.0
#################################################
@@ -50,7 +53,7 @@ Check_Time = 100.0
Dump_Time = 100.0 ## time interval dT for dumping binary data
D2_Dump_Time = 100.0 ## dump the ascii data for 2d surface after dT'
Analysis_Time = 0.1 ## dump the puncture position and GW psi4 after dT"
Evolution_Step_Number = 10000000 ## stop the calculation after the maximal step number
Evolution_Step_Number = 10000000 ## stop the calculation after the maximal step number
Courant_Factor = 0.5 ## Courant Factor
Dissipation = 0.15 ## Kreiss-Oliger Dissipation Strength

View File

@@ -2034,9 +2034,9 @@ void bssn_class::Read_Ansorg()
//================================================================================================
void bssn_class::Evolve(int Steps)
{
clock_t prev_clock, curr_clock;
void bssn_class::Evolve(int Steps)
{
double prev_clock = 0.0, curr_clock = 0.0;
double LastDump = 0.0, LastCheck = 0.0, Last2dDump = 0.0;
LastAnas = 0;
#if 0
@@ -2145,14 +2145,14 @@ void bssn_class::Evolve(int Steps)
GH->settrfls(trfls);
for (int ncount = 1; ncount < Steps + 1; ncount++)
{
for (int ncount = 1; ncount < Steps + 1; ncount++)
{
// special for large mass ratio consideration
// if(fabs(Porg0[0][0]-Porg0[1][0])+fabs(Porg0[0][1]-Porg0[1][1])+fabs(Porg0[0][2]-Porg0[1][2])<1e-6)
// { GH->levels=GH->movls; }
if (myrank == 0)
curr_clock = clock();
if (myrank == 0)
curr_clock = MPI_Wtime();
#if (PSTR == 0)
RecursiveStep(0);
#elif (PSTR == 1 || PSTR == 2 || PSTR == 3)
@@ -2205,16 +2205,16 @@ void bssn_class::Evolve(int Steps)
}
}
if (myrank == 0)
{
prev_clock = curr_clock;
curr_clock = clock();
cout << endl;
cout << " Timestep # " << ncount << ": integrating to time: " << PhysTime << " "
<< " Computer used " << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC)
<< " seconds! " << endl;
// cout << endl;
}
if (myrank == 0)
{
prev_clock = curr_clock;
curr_clock = MPI_Wtime();
cout << endl;
cout << " Timestep # " << ncount << ": integrating to time: " << PhysTime << " "
<< " Computer used " << (curr_clock - prev_clock)
<< " seconds! " << endl;
// cout << endl;
}
if (PhysTime >= TotalTime)
break;

View File

@@ -1891,7 +1891,7 @@ void bssn_class::Read_Ansorg()
void bssn_class::Evolve(int Steps)
{
clock_t prev_clock, curr_clock;
double prev_clock = 0.0, curr_clock = 0.0;
double LastDump = 0.0, LastCheck = 0.0, Last2dDump = 0.0;
LastAnas = 0;
#if 0
@@ -2035,10 +2035,12 @@ void bssn_class::Evolve(int Steps)
GH->settrfls(trfls);
for (int ncount = 1; ncount < Steps + 1; ncount++)
{
cout << "Before Step: " << ncount << " My Rank: " << myrank
<< " takes " << MPI_Wtime() - beg_time << " seconds!" << endl;
for (int ncount = 1; ncount < Steps + 1; ncount++)
{
if (myrank == 0)
curr_clock = MPI_Wtime();
cout << "Before Step: " << ncount << " My Rank: " << myrank
<< " takes " << MPI_Wtime() - beg_time << " seconds!" << endl;
beg_time = MPI_Wtime();
#if (PSTR == 0)
RecursiveStep(0);
@@ -2095,10 +2097,10 @@ void bssn_class::Evolve(int Steps)
if (myrank == 0)
{
prev_clock = curr_clock;
curr_clock = clock();
cout << "Timestep # " << ncount << ": integrating to time: " << PhysTime << endl;
cout << "used " << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) << " seconds!" << endl;
prev_clock = curr_clock;
curr_clock = MPI_Wtime();
cout << "Timestep # " << ncount << ": integrating to time: " << PhysTime << endl;
cout << "used " << (curr_clock - prev_clock) << " seconds!" << endl;
}
if (PhysTime >= TotalTime)

View File

@@ -2,6 +2,24 @@
#include "bssn_rhs.h"
#include "share_func.h"
#include "tool.h"
#ifdef _OPENMP
#define BSSN_OMP_TASK_GROUP_BEGIN \
_Pragma("omp parallel") \
{ \
_Pragma("omp single nowait") \
{
#define BSSN_OMP_TASK_CALL(...) \
_Pragma("omp task") { __VA_ARGS__; }
#define BSSN_OMP_TASK_GROUP_END \
_Pragma("omp taskwait") \
} \
}
#else
#define BSSN_OMP_TASK_GROUP_BEGIN {
#define BSSN_OMP_TASK_CALL(...) { __VA_ARGS__; }
#define BSSN_OMP_TASK_GROUP_END }
#endif
// 0-based i,j,k
// #define IDX_F(i,j,k,nx,ny) ((i) + (j)*(nx) + (k)*(nx)*(ny))
// ex(1)=nx, ex(2)=ny, ex(3)=nz
@@ -108,18 +126,20 @@ int f_compute_rhs_bssn(int *ex, double &T,
chin1[i] = chi[i] + 1.0;
}
// 9ms //
fderivs(ex,betax,betaxx,betaxy,betaxz,X,Y,Z,ANTI, SYM, SYM,Symmetry,Lev);
fderivs(ex,betay,betayx,betayy,betayz,X,Y,Z, SYM,ANTI, SYM,Symmetry,Lev);
fderivs(ex,betaz,betazx,betazy,betazz,X,Y,Z, SYM, SYM,ANTI,Symmetry,Lev);
fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev);
fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev);
fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev);
fderivs(ex,dyy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev);
fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev);
fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev);
fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
fderivs(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev);
BSSN_OMP_TASK_GROUP_BEGIN
BSSN_OMP_TASK_CALL(fderivs(ex,betax,betaxx,betaxy,betaxz,X,Y,Z,ANTI, SYM, SYM,Symmetry,Lev))
BSSN_OMP_TASK_CALL(fderivs(ex,betay,betayx,betayy,betayz,X,Y,Z, SYM,ANTI, SYM,Symmetry,Lev))
BSSN_OMP_TASK_CALL(fderivs(ex,betaz,betazx,betazy,betazz,X,Y,Z, SYM, SYM,ANTI,Symmetry,Lev))
BSSN_OMP_TASK_CALL(fderivs(ex,chi,chix,chiy,chiz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev))
BSSN_OMP_TASK_CALL(fderivs(ex,dxx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev))
BSSN_OMP_TASK_CALL(fderivs(ex,gxy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,Lev))
BSSN_OMP_TASK_CALL(fderivs(ex,gxz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,Lev))
BSSN_OMP_TASK_CALL(fderivs(ex,dyy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev))
BSSN_OMP_TASK_CALL(fderivs(ex,gyz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,Lev))
BSSN_OMP_TASK_CALL(fderivs(ex,dzz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,Lev))
BSSN_OMP_TASK_CALL(fderivs(ex,Lap,Lapx,Lapy,Lapz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev))
BSSN_OMP_TASK_CALL(fderivs(ex,trK,Kx,Ky,Kz,X,Y,Z,SYM,SYM,SYM,Symmetry,Lev))
BSSN_OMP_TASK_GROUP_END
// 3ms //
for(int i=0;i<all;i+=1){
@@ -316,15 +336,14 @@ int f_compute_rhs_bssn(int *ex, double &T,
);
}
// 22.3ms //
fdderivs(ex,betax,gxxx,gxyx,gxzx,gyyx,gyzx,gzzx,
X,Y,Z,ANTI,SYM, SYM ,Symmetry,Lev);
fdderivs(ex,betay,gxxy,gxyy,gxzy,gyyy,gyzy,gzzy,
X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev);
fdderivs(ex,betaz,gxxz,gxyz,gxzz,gyyz,gyzz,gzzz,
X,Y,Z,SYM ,SYM, ANTI,Symmetry,Lev);
fderivs(ex,Gamx,Gamxx,Gamxy,Gamxz,X,Y,Z,ANTI,SYM ,SYM ,Symmetry,Lev);
fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev);
fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev);
BSSN_OMP_TASK_GROUP_BEGIN
BSSN_OMP_TASK_CALL(fdderivs(ex,betax,gxxx,gxyx,gxzx,gyyx,gyzx,gzzx,X,Y,Z,ANTI,SYM, SYM ,Symmetry,Lev))
BSSN_OMP_TASK_CALL(fdderivs(ex,betay,gxxy,gxyy,gxzy,gyyy,gyzy,gzzy,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev))
BSSN_OMP_TASK_CALL(fdderivs(ex,betaz,gxxz,gxyz,gxzz,gyyz,gyzz,gzzz,X,Y,Z,SYM ,SYM, ANTI,Symmetry,Lev))
BSSN_OMP_TASK_CALL(fderivs(ex,Gamx,Gamxx,Gamxy,Gamxz,X,Y,Z,ANTI,SYM ,SYM ,Symmetry,Lev))
BSSN_OMP_TASK_CALL(fderivs(ex,Gamy,Gamyx,Gamyy,Gamyz,X,Y,Z,SYM ,ANTI,SYM ,Symmetry,Lev))
BSSN_OMP_TASK_CALL(fderivs(ex,Gamz,Gamzx,Gamzy,Gamzz,X,Y,Z,SYM ,SYM ,ANTI,Symmetry,Lev))
BSSN_OMP_TASK_GROUP_END
// Fused: fxx/Gamxa + Gamma_rhs part 2 (2 loops -> 1)
for(int i=0;i<all;i+=1){
@@ -1063,30 +1082,32 @@ int f_compute_rhs_bssn(int *ex, double &T,
#endif
}
// advection + KO dissipation with shared symmetry buffer
lopsided_kodis(ex,X,Y,Z,dxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps);
lopsided_kodis(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA,eps);
lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps);
lopsided_kodis(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS,eps);
lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps);
lopsided_kodis(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS,eps);
lopsided_kodis(ex,X,Y,Z,dyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps);
lopsided_kodis(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS,eps);
lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps);
lopsided_kodis(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA,eps);
lopsided_kodis(ex,X,Y,Z,dzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps);
lopsided_kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS,eps);
lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps);
lopsided_kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS,eps);
lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps);
lopsided_kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA,eps);
lopsided_kodis(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA,eps);
lopsided_kodis(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS,eps);
lopsided_kodis(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA,eps);
lopsided_kodis(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS,eps);
lopsided_kodis(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS,eps);
lopsided_kodis(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS,eps);
lopsided_kodis(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS,eps);
lopsided_kodis(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS,eps);
BSSN_OMP_TASK_GROUP_BEGIN
BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,dxx,gxx_rhs,betax,betay,betaz,Symmetry,SSS,eps))
BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,Gamz,Gamz_rhs,betax,betay,betaz,Symmetry,SSA,eps))
BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,gxy,gxy_rhs,betax,betay,betaz,Symmetry,AAS,eps))
BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,Lap,Lap_rhs,betax,betay,betaz,Symmetry,SSS,eps))
BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,gxz,gxz_rhs,betax,betay,betaz,Symmetry,ASA,eps))
BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,betax,betax_rhs,betax,betay,betaz,Symmetry,ASS,eps))
BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,dyy,gyy_rhs,betax,betay,betaz,Symmetry,SSS,eps))
BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,betay,betay_rhs,betax,betay,betaz,Symmetry,SAS,eps))
BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,gyz,gyz_rhs,betax,betay,betaz,Symmetry,SAA,eps))
BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,betaz,betaz_rhs,betax,betay,betaz,Symmetry,SSA,eps))
BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,dzz,gzz_rhs,betax,betay,betaz,Symmetry,SSS,eps))
BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,dtSfx,dtSfx_rhs,betax,betay,betaz,Symmetry,ASS,eps))
BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,Axx,Axx_rhs,betax,betay,betaz,Symmetry,SSS,eps))
BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,dtSfy,dtSfy_rhs,betax,betay,betaz,Symmetry,SAS,eps))
BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,Axy,Axy_rhs,betax,betay,betaz,Symmetry,AAS,eps))
BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,dtSfz,dtSfz_rhs,betax,betay,betaz,Symmetry,SSA,eps))
BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,Axz,Axz_rhs,betax,betay,betaz,Symmetry,ASA,eps))
BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,Ayy,Ayy_rhs,betax,betay,betaz,Symmetry,SSS,eps))
BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,Ayz,Ayz_rhs,betax,betay,betaz,Symmetry,SAA,eps))
BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,Azz,Azz_rhs,betax,betay,betaz,Symmetry,SSS,eps))
BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,chi,chi_rhs,betax,betay,betaz,Symmetry,SSS,eps))
BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,trK,trK_rhs,betax,betay,betaz,Symmetry,SSS,eps))
BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,Gamx,Gamx_rhs,betax,betay,betaz,Symmetry,ASS,eps))
BSSN_OMP_TASK_CALL(lopsided_kodis(ex,X,Y,Z,Gamy,Gamy_rhs,betax,betay,betaz,Symmetry,SAS,eps))
BSSN_OMP_TASK_GROUP_END
// 2ms //
if(co==0){
for (int i=0;i<all;i+=1) {
@@ -1133,12 +1154,14 @@ int f_compute_rhs_bssn(int *ex, double &T,
}
// 1ms //
fderivs(ex,Axx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0);
fderivs(ex,Axy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,0);
fderivs(ex,Axz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,0);
fderivs(ex,Ayy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0);
fderivs(ex,Ayz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,0);
fderivs(ex,Azz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0);
BSSN_OMP_TASK_GROUP_BEGIN
BSSN_OMP_TASK_CALL(fderivs(ex,Axx,gxxx,gxxy,gxxz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0))
BSSN_OMP_TASK_CALL(fderivs(ex,Axy,gxyx,gxyy,gxyz,X,Y,Z,ANTI,ANTI,SYM ,Symmetry,0))
BSSN_OMP_TASK_CALL(fderivs(ex,Axz,gxzx,gxzy,gxzz,X,Y,Z,ANTI,SYM ,ANTI,Symmetry,0))
BSSN_OMP_TASK_CALL(fderivs(ex,Ayy,gyyx,gyyy,gyyz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0))
BSSN_OMP_TASK_CALL(fderivs(ex,Ayz,gyzx,gyzy,gyzz,X,Y,Z,SYM ,ANTI,ANTI,Symmetry,0))
BSSN_OMP_TASK_CALL(fderivs(ex,Azz,gzzx,gzzy,gzzz,X,Y,Z,SYM ,SYM ,SYM ,Symmetry,0))
BSSN_OMP_TASK_GROUP_END
// 7ms //
for (int i=0;i<all;i+=1) {
gxxx[i] = gxxx[i] - ( Gamxxx[i] * Axx[i] + Gamyxx[i] * Axy[i] + Gamzxx[i] * Axz[i]

View File

@@ -41,8 +41,8 @@ void fdderivs(const int ex[3],
const size_t nz = (size_t)ex3 + 2;
const size_t fh_size = nx * ny * nz;
static double *fh = NULL;
static size_t cap = 0;
static thread_local double *fh = NULL;
static thread_local size_t cap = 0;
if (fh_size > cap) {
free(fh);

View File

@@ -50,8 +50,8 @@ void fderivs(const int ex[3],
const size_t ny = (size_t)ex2 + 2;
const size_t nz = (size_t)ex3 + 2;
const size_t fh_size = nx * ny * nz;
static double *fh = NULL;
static size_t cap = 0;
static thread_local double *fh = NULL;
static thread_local size_t cap = 0;
if (fh_size > cap) {
free(fh);

View File

@@ -43,7 +43,13 @@ void lopsided_kodis(const int ex[3],
const size_t nz = (size_t)ex3 + 3;
const size_t fh_size = nx * ny * nz;
double *fh = (double*)malloc(fh_size * sizeof(double));
static thread_local double *fh = NULL;
static thread_local size_t cap = 0;
if (fh_size > cap) {
free(fh);
fh = (double*)aligned_alloc(64, fh_size * sizeof(double));
cap = fh_size;
}
if (!fh) return;
symmetry_bd(3, ex, f, fh, SoA);
@@ -243,6 +249,4 @@ void lopsided_kodis(const int ex[3],
}
}
}
free(fh);
}

View File

@@ -16,7 +16,7 @@ PROFDATA = /home/$(shell whoami)/AMSS-NCKU/pgo_profile/default.profdata
ifeq ($(PGO_MODE),instrument)
## Phase 1: instrumentation — omit -fp-model fast=2 for numerical stability during profiling
CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) $(OPENMP_FLAGS)
f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
else
@@ -26,7 +26,7 @@ else
CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS)
-Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) $(OPENMP_FLAGS)
f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
-align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
endif

View File

@@ -29,6 +29,9 @@ endif
## instrument : PGO Phase 1 instrumentation to collect fresh profile data
PGO_MODE ?= opt
## OpenMP switch for C/C++ kernels
OPENMP_FLAGS ?= -qopenmp
## Interp_Points load balance profiling mode
## off : (default) no load balance instrumentation
## profile : Pass 1 — instrument Interp_Points to collect timing profile

View File

@@ -317,9 +317,9 @@ void scalar_class::Setup_Initial_Data()
#endif
}
}
void scalar_class::Evolve(int Steps)
{
clock_t prev_clock, curr_clock;
void scalar_class::Evolve(int Steps)
{
double prev_clock = 0.0, curr_clock = 0.0;
double LastDump = 0.0, LastCheck = 0.0;
LastAnas = 0;
@@ -327,8 +327,8 @@ void scalar_class::Evolve(int Steps)
for (int ncount = 1; ncount < Steps + 1; ncount++)
{
if (myrank == 0)
curr_clock = clock();
if (myrank == 0)
curr_clock = MPI_Wtime();
RecursiveStep(0);
LastDump += dT_mon;
@@ -343,13 +343,13 @@ void scalar_class::Evolve(int Steps)
#endif
LastDump = 0;
}
if (myrank == 0)
{
prev_clock = curr_clock;
curr_clock = clock();
cout << " Timestep # " << ncount << ": integrating to time: " << PhysTime
<< " Computer used " << (double)(curr_clock - prev_clock) / ((double)CLOCKS_PER_SEC) << " seconds! " << endl;
}
if (myrank == 0)
{
prev_clock = curr_clock;
curr_clock = MPI_Wtime();
cout << " Timestep # " << ncount << ": integrating to time: " << PhysTime
<< " Computer used " << (curr_clock - prev_clock) << " seconds! " << endl;
}
if (PhysTime >= TotalTime)
break;
}

View File

@@ -9,6 +9,7 @@
import AMSS_NCKU_Input as input_data
import os
import subprocess
import time
@@ -54,6 +55,44 @@ NUMACTL_CPU_BIND = get_last_n_cores_per_socket(n=32)
BUILD_JOBS = 64
def build_abe_runtime_env():
"""Inject OpenMP runtime settings only for the main ABE evolution run."""
runtime_env = os.environ.copy()
omp_threads = max(1, int(getattr(input_data, "OMP_Threads", 1)))
runtime_env["OMP_NUM_THREADS"] = str(omp_threads)
return runtime_env
def build_twopuncture_runtime_env():
"""Let TwoPunctureABE use the runtime default instead of the ABE OMP override."""
runtime_env = os.environ.copy()
runtime_env.pop("OMP_NUM_THREADS", None)
runtime_env.pop("OMP_THREAD_LIMIT", None)
return runtime_env
def build_mpi_launch_args():
"""Build optional host-distribution arguments for mpirun."""
hosts = list(getattr(input_data, "MPI_hosts", []))
ppn = int(getattr(input_data, "MPI_processes_per_node", 0))
if not hosts:
return ""
if ppn > 0:
expected = len(hosts) * ppn
if int(input_data.MPI_processes) != expected:
raise ValueError(
f"MPI_processes={input_data.MPI_processes} does not match "
f"len(MPI_hosts) * MPI_processes_per_node = {expected}"
)
launch_args = f"-hosts {','.join(hosts)}"
if ppn > 0:
launch_args += f" -ppn {ppn}"
return launch_args
##################################################################
@@ -143,19 +182,38 @@ def run_ABE():
print( )
print( " Running the AMSS-NCKU executable file ABE/ABEGPU " )
print( )
print( f" MPI processes = {input_data.MPI_processes}, OMP threads per process = {max(1, int(getattr(input_data, 'OMP_Threads', 1)))}" )
if getattr(input_data, "MPI_hosts", []):
print( f" MPI hosts = {getattr(input_data, 'MPI_hosts', [])}, MPI ranks per node = {int(getattr(input_data, 'MPI_processes_per_node', 0))}" )
print( " Multi-node runs require the working directory to be visible on all MPI hosts. " )
print( )
## Define the command to run; cast other values to strings as needed
mpi_launch_args = build_mpi_launch_args()
if (input_data.GPU_Calculation == "no"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
mpi_command = NUMACTL_CPU_BIND + " mpirun "
if mpi_launch_args:
mpi_command += mpi_launch_args + " "
mpi_command += "-np " + str(input_data.MPI_processes) + " ./ABE"
#mpi_command = " mpirun -np " + str(input_data.MPI_processes) + " ./ABE"
mpi_command_outfile = "ABE_out.log"
elif (input_data.GPU_Calculation == "yes"):
mpi_command = NUMACTL_CPU_BIND + " mpirun -np " + str(input_data.MPI_processes) + " ./ABEGPU"
mpi_command = NUMACTL_CPU_BIND + " mpirun "
if mpi_launch_args:
mpi_command += mpi_launch_args + " "
mpi_command += "-np " + str(input_data.MPI_processes) + " ./ABEGPU"
mpi_command_outfile = "ABEGPU_out.log"
## Execute the MPI command and stream output
mpi_process = subprocess.Popen(mpi_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
mpi_process = subprocess.Popen(
mpi_command,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
env=build_abe_runtime_env(),
)
## Write ABE run output to file while printing to stdout
with open(mpi_command_outfile, 'w') as file0:
@@ -195,7 +253,14 @@ def run_TwoPunctureABE():
TwoPuncture_command_outfile = "TwoPunctureABE_out.log"
## Execute the command with subprocess.Popen and stream output
TwoPuncture_process = subprocess.Popen(TwoPuncture_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
TwoPuncture_process = subprocess.Popen(
TwoPuncture_command,
shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
text=True,
env=build_twopuncture_runtime_env(),
)
## Write TwoPunctureABE run output to file while printing to stdout
with open(TwoPuncture_command_outfile, 'w') as file0:

View File

@@ -65,10 +65,14 @@ def print_input_data( File_directory ):
print( "------------------------------------------------------------------------------------------" )
print( )
print( " Printing the basic parameter and setting in the AMSS-NCKU simulation " )
print( )
print( " The number of MPI processes in the AMSS-NCKU simulation = ", input_data.MPI_processes )
print( )
print( " Printing the basic parameter and setting in the AMSS-NCKU simulation " )
print( )
print( " The number of MPI processes in the AMSS-NCKU simulation = ", input_data.MPI_processes )
print( " The number of OMP threads per MPI process = ", input_data.OMP_Threads )
if getattr(input_data, "MPI_hosts", []):
print( " The MPI host list in the AMSS-NCKU simulation = ", input_data.MPI_hosts )
print( " The number of MPI ranks launched per host = ", input_data.MPI_processes_per_node )
print( )
print( " The form of computational equation = ", input_data.Equation_Class )
print( " The initial data in this simulation = ", input_data.Initial_Data_Method )
print( )
@@ -140,10 +144,14 @@ def print_input_data( File_directory ):
file0 = open(filepath, 'w')
print( file=file0 )
print( " Printing the basic parameter and setting in the AMSS-NCKU simulation ", file=file0 )
print( file=file0 )
print( " The number of MPI processes in the AMSS-NCKU simulation = ", input_data.MPI_processes, file=file0 )
print( file=file0 )
print( " Printing the basic parameter and setting in the AMSS-NCKU simulation ", file=file0 )
print( file=file0 )
print( " The number of MPI processes in the AMSS-NCKU simulation = ", input_data.MPI_processes, file=file0 )
print( " The number of OMP threads per MPI process = ", input_data.OMP_Threads, file=file0 )
if getattr(input_data, "MPI_hosts", []):
print( " The MPI host list in the AMSS-NCKU simulation = ", input_data.MPI_hosts, file=file0 )
print( " The number of MPI ranks launched per host = ", input_data.MPI_processes_per_node, file=file0 )
print( file=file0 )
print( " The form of computational equation = ", input_data.Equation_Class, file=file0 )
print( " The initial data in this simulation = ", input_data.Initial_Data_Method, file=file0 )
print( file=file0 )