From cb911dec06b6a68101d8f66c6c26cc72ad58aabb Mon Sep 17 00:00:00 2001
From: CGH0S7 <776459475@qq.com>
Date: Thu, 7 May 2026 12:18:56 +0800
Subject: [PATCH] Add EM GPU fast paths and defaults

---
 AMSS_NCKU_source/Parallel.C       |  21 +-
 AMSS_NCKU_source/bssnEM_class.C   | 906 ++++++++++++++++++++++++-----
 AMSS_NCKU_source/bssn_class.C     |  13 +-
 AMSS_NCKU_source/bssn_rhs_cuda.cu | 931 +++++++++++++++++++++++++++++-
 AMSS_NCKU_source/bssn_rhs_cuda.h  |  24 +
 makefile_and_run.py               |   8 +-
 6 files changed, 1720 insertions(+), 183 deletions(-)
diff --git a/AMSS_NCKU_source/Parallel.C b/AMSS_NCKU_source/Parallel.C
index bc37930..b7f2a51 100644
--- a/AMSS_NCKU_source/Parallel.C
+++ b/AMSS_NCKU_source/Parallel.C
@@ -18,7 +18,7 @@
 #endif
 #if USE_CUDA_BSSN
 #include "bssn_rhs_cuda.h"
-#define AMSS_BSSN_CUDA_MAX_STATE_COUNT BSSN_ESCALAR_CUDA_STATE_COUNT
+#define AMSS_BSSN_CUDA_MAX_STATE_COUNT BSSN_EM_CUDA_STATE_COUNT
 #endif
 #if USE_CUDA_Z4C
 #include "z4c_rhs_cuda.h"
@@ -181,8 +181,7 @@ bool cuda_build_bssn_host_views(Block *block,
                                 double **views)
 {
   if (!block || !vars || !views ||
-      (state_count != BSSN_CUDA_STATE_COUNT &&
-       state_count != BSSN_ESCALAR_CUDA_STATE_COUNT))
+      state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT)
     return false;
   MyList<var> *v = vars;
   for (int i = 0; i < state_count; ++i)
@@ -200,8 +199,7 @@ bool cuda_build_bssn_soa(MyList<var> *vars,
                          double *soa_flat)
 {
   if (!vars || !soa_flat ||
-      (state_count != BSSN_CUDA_STATE_COUNT &&
-       state_count != BSSN_ESCALAR_CUDA_STATE_COUNT))
+      state_count <= 0 || state_count > AMSS_BSSN_CUDA_MAX_STATE_COUNT)
     return false;
   MyList<var> *v = vars;
   for (int i = 0; i < state_count; ++i)
@@ -322,7 +320,7 @@ bool cuda_state_count_direct_supported(int state_count)
 #if USE_CUDA_Z4C && (ABEtype == 2)
   return state_count == Z4C_CUDA_STATE_COUNT;
 #elif USE_CUDA_BSSN
-  return state_count > 0 && state_count <= BSSN_ESCALAR_CUDA_STATE_COUNT;
+  return state_count > 0 && state_count <= AMSS_BSSN_CUDA_MAX_STATE_COUNT;
 #else
   (void)state_count;
   return false;
@@ -550,7 +548,8 @@ bool cuda_uncached_device_buffers_enabled(int state_count)
   }
   if (!enabled)
     return false;
-  if (state_count != BSSN_ESCALAR_CUDA_STATE_COUNT)
+  if (state_count != BSSN_ESCALAR_CUDA_STATE_COUNT &&
+      state_count != BSSN_EM_CUDA_STATE_COUNT)
     return false;
   return cuda_aware_mpi_enabled();
 #else
@@ -6136,6 +6135,7 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
                                MyList<var> *VarList1, MyList<var> *VarList2,
                                int Symmetry, SyncCache &cache)
 {
+  const double t_transfer = sync_profile_enabled() ? MPI_Wtime() : 0.0;
   int myrank;
   MPI_Comm_size(MPI_COMM_WORLD, &cache.cpusize);
   MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
@@ -6324,6 +6324,13 @@ void Parallel::transfer_cached(MyList<Parallel::gridseg> **src, MyList<Parallel:
     else
       data_packer(cache.recv_bufs[myrank], src[myrank], dst[myrank], myrank, UNPACK, VarList1, VarList2, Symmetry);
   }
+  if (sync_profile_enabled())
+  {
+    SyncProfileStats &stats = sync_profile_stats();
+    stats.finish_calls++;
+    stats.finish_sec += MPI_Wtime() - t_transfer;
+    sync_profile_maybe_log();
+  }
 }
 void Parallel::Sync_ensure_cache(MyList<Patch> *PatL, int Symmetry, SyncCache &cache)
 {
diff --git a/AMSS_NCKU_source/bssnEM_class.C b/AMSS_NCKU_source/bssnEM_class.C
index 0a9e199..763cf0f 100644
--- a/AMSS_NCKU_source/bssnEM_class.C
+++ b/AMSS_NCKU_source/bssnEM_class.C
@@ -1,25 +1,30 @@
 
 #ifdef newc
-#include <sstream>
-#include <cstdio>
-#include <map>
+#include <sstream>
+#include <cstdio>
+#include <cstdlib>
+#include <map>
 using namespace std;
 #else
 #include <stdio.h>
 #include <map.h>
 #endif
 
-#include <time.h>
+#include <time.h>
+#include <cstring>
 
 #include "macrodef.h"
 #include "misc.h"
 #include "Ansorg.h"
 #include "fmisc.h"
 #include "Parallel.h"
-#include "bssnEM_class.h"
-#include "bssn_rhs.h"
-#include "empart.h"
-#include "initial_puncture.h"
+#include "bssnEM_class.h"
+#include "bssn_rhs.h"
+#include "empart.h"
+#if USE_CUDA_BSSN
+#include "bssn_rhs_cuda.h"
+#endif
+#include "initial_puncture.h"
 #include "initial_maxwell.h"
 #include "enforce_algebra.h"
 #include "rungekutta4_rout.h"
@@ -34,11 +39,368 @@ using namespace std;
 #include "myglobal.h"
 #endif
 
-//================================================================================================
-
-// Define bssnEM_class
-
-// It inherits some members and methods from the parent class bssn_class and modifies others.
+//================================================================================================
+
+namespace
+{
+MyList<var> *advance_var_list(MyList<var> *vars, int count)
+{
+  while (vars && count > 0)
+  {
+    vars = vars->next;
+    --count;
+  }
+  return vars;
+}
+
+bool bssn_em_step_timing_enabled()
+{
+  static int enabled = -1;
+  if (enabled < 0)
+  {
+    const char *env = getenv("AMSS_EM_STEP_TIMING");
+    enabled = (env && atoi(env) != 0) ? 1 : 0;
+  }
+  return enabled != 0;
+}
+
+bool bssn_em_step_timing_all_levels_enabled()
+{
+  static int enabled = -1;
+  if (enabled < 0)
+  {
+    const char *env = getenv("AMSS_EM_STEP_TIMING_ALL_LEVELS");
+    enabled = (env && atoi(env) != 0) ? 1 : 0;
+  }
+  return enabled != 0;
+}
+
+#if USE_CUDA_BSSN
+bool bssn_em_zero_analysis_fastpath_enabled()
+{
+  static int enabled = -1;
+  if (enabled < 0)
+  {
+    const char *env = getenv("AMSS_EM_ZERO_ANALYSIS_FASTPATH");
+    enabled = (!env || atoi(env) != 0) ? 1 : 0;
+  }
+  return enabled != 0;
+}
+
+bool bssn_em_analysis_zero_fastpath_ready(MyList<Patch> *PatL,
+#ifdef WithShell
+                                          ShellPatch *shell,
+#else
+                                          ShellPatch * /*shell*/,
+#endif
+                                          int rank)
+{
+  if (!bssn_em_zero_analysis_fastpath_enabled())
+    return false;
+  int local_ok = 1;
+  int local_seen = 0;
+  MyList<Patch> *Pp = PatL;
+  while (Pp)
+  {
+    MyList<Block> *BP = Pp->data->blb;
+    while (BP)
+    {
+      Block *cg = BP->data;
+      if (rank == cg->rank)
+      {
+        local_seen = 1;
+        if (!bssn_em_cuda_resident_zero_fast_state(cg))
+          local_ok = 0;
+      }
+      if (BP == Pp->data->ble)
+        break;
+      BP = BP->next;
+    }
+    Pp = Pp->next;
+  }
+#ifdef WithShell
+  if (shell && shell->PatL)
+  {
+    MyList<ss_patch> *SP = shell->PatL;
+    while (SP)
+    {
+      MyList<Block> *BP = SP->data->blb;
+      while (BP)
+      {
+        Block *cg = BP->data;
+        if (rank == cg->rank)
+        {
+          local_seen = 1;
+          if (!bssn_em_cuda_resident_zero_fast_state(cg))
+            local_ok = 0;
+        }
+        if (BP == SP->data->ble)
+          break;
+        BP = BP->next;
+      }
+      SP = SP->next;
+    }
+  }
+#endif
+  int global_ok = 0;
+  int global_seen = 0;
+  MPI_Allreduce(&local_ok, &global_ok, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
+  MPI_Allreduce(&local_seen, &global_seen, 1, MPI_INT, MPI_MAX, MPI_COMM_WORLD);
+  return global_seen && global_ok;
+}
+
+void zero_em_analysis_outputs(MyList<Patch> *PatL,
+#ifdef WithShell
+                              ShellPatch *shell,
+#else
+                              ShellPatch * /*shell*/,
+#endif
+                              int rank,
+                              var *Rphi2_var, var *Iphi2_var,
+                              var *Rphi1_var, var *Iphi1_var)
+{
+  MyList<Patch> *Pp = PatL;
+  while (Pp)
+  {
+    MyList<Block> *BP = Pp->data->blb;
+    while (BP)
+    {
+      Block *cg = BP->data;
+      if (rank == cg->rank)
+      {
+        const size_t all = (size_t)cg->shape[0] * cg->shape[1] * cg->shape[2];
+        std::memset(cg->fgfs[Rphi2_var->sgfn], 0, all * sizeof(double));
+        std::memset(cg->fgfs[Iphi2_var->sgfn], 0, all * sizeof(double));
+        std::memset(cg->fgfs[Rphi1_var->sgfn], 0, all * sizeof(double));
+        std::memset(cg->fgfs[Iphi1_var->sgfn], 0, all * sizeof(double));
+      }
+      if (BP == Pp->data->ble)
+        break;
+      BP = BP->next;
+    }
+    Pp = Pp->next;
+  }
+#ifdef WithShell
+  if (shell && shell->PatL)
+  {
+    MyList<ss_patch> *SP = shell->PatL;
+    while (SP)
+    {
+      MyList<Block> *BP = SP->data->blb;
+      while (BP)
+      {
+        Block *cg = BP->data;
+        if (rank == cg->rank)
+        {
+          const size_t all = (size_t)cg->shape[0] * cg->shape[1] * cg->shape[2];
+          std::memset(cg->fgfs[Rphi2_var->sgfn], 0, all * sizeof(double));
+          std::memset(cg->fgfs[Iphi2_var->sgfn], 0, all * sizeof(double));
+          std::memset(cg->fgfs[Rphi1_var->sgfn], 0, all * sizeof(double));
+          std::memset(cg->fgfs[Iphi1_var->sgfn], 0, all * sizeof(double));
+        }
+        if (BP == SP->data->ble)
+          break;
+        BP = BP->next;
+      }
+      SP = SP->next;
+    }
+  }
+#endif
+}
+#endif
+
+int bssn_em_step_timing_every()
+{
+  static int every = -1;
+  if (every < 0)
+  {
+    const char *env = getenv("AMSS_EM_STEP_TIMING_EVERY");
+    every = (env && atoi(env) > 0) ? atoi(env) : 1;
+  }
+  return every;
+}
+
+#if USE_CUDA_BSSN
+bool fill_bssn_em_bssn_cuda_views(Block *cg, MyList<var> *vars,
+                                  double **host_views,
+                                  double *propspeeds = 0,
+                                  double *soa_flat = 0)
+{
+  int idx = 0;
+  while (vars && idx < BSSN_CUDA_STATE_COUNT)
+  {
+    host_views[idx] = cg->fgfs[vars->data->sgfn];
+    if (propspeeds)
+      propspeeds[idx] = vars->data->propspeed;
+    if (soa_flat)
+    {
+      soa_flat[3 * idx + 0] = vars->data->SoA[0];
+      soa_flat[3 * idx + 1] = vars->data->SoA[1];
+      soa_flat[3 * idx + 2] = vars->data->SoA[2];
+    }
+    vars = vars->next;
+    ++idx;
+  }
+  return idx == BSSN_CUDA_STATE_COUNT;
+}
+
+bool fill_bssn_em_cuda_views(Block *cg, MyList<var> *vars,
+                             double **host_views,
+                             double *propspeeds = 0,
+                             double *soa_flat = 0)
+{
+  int idx = 0;
+  while (vars && idx < BSSN_EM_CUDA_STATE_COUNT)
+  {
+    host_views[idx] = cg->fgfs[vars->data->sgfn];
+    if (propspeeds)
+      propspeeds[idx] = vars->data->propspeed;
+    if (soa_flat)
+    {
+      soa_flat[3 * idx + 0] = vars->data->SoA[0];
+      soa_flat[3 * idx + 1] = vars->data->SoA[1];
+      soa_flat[3 * idx + 2] = vars->data->SoA[2];
+    }
+    vars = vars->next;
+    ++idx;
+  }
+  return idx == BSSN_EM_CUDA_STATE_COUNT && vars == 0;
+}
+
+void fill_bssn_em_fixed_source_cuda_views(Block *cg, double **sources,
+                                          var *Jx, var *Jy, var *Jz, var *qchar)
+{
+  sources[0] = cg->fgfs[Jx->sgfn];
+  sources[1] = cg->fgfs[Jy->sgfn];
+  sources[2] = cg->fgfs[Jz->sgfn];
+  sources[3] = cg->fgfs[qchar->sgfn];
+}
+
+void fill_bssn_em_matter_cuda_views(Block *cg, double **matter,
+                                    var *rho, var *Sx, var *Sy, var *Sz,
+                                    var *Sxx, var *Sxy, var *Sxz,
+                                    var *Syy, var *Syz, var *Szz)
+{
+  matter[0] = cg->fgfs[rho->sgfn];
+  matter[1] = cg->fgfs[Sx->sgfn];
+  matter[2] = cg->fgfs[Sy->sgfn];
+  matter[3] = cg->fgfs[Sz->sgfn];
+  matter[4] = cg->fgfs[Sxx->sgfn];
+  matter[5] = cg->fgfs[Sxy->sgfn];
+  matter[6] = cg->fgfs[Sxz->sgfn];
+  matter[7] = cg->fgfs[Syy->sgfn];
+  matter[8] = cg->fgfs[Syz->sgfn];
+  matter[9] = cg->fgfs[Szz->sgfn];
+}
+
+bool bssn_em_cuda_use_resident_sync(int lev)
+{
+#ifdef WithShell
+  (void)lev;
+  return false;
+#else
+  return true;
+#endif
+}
+
+bool bssn_em_cuda_keep_resident_after_step(int lev, int trfls_in, int analysis_lev)
+{
+  static int keep_all_levels = -1;
+  if (keep_all_levels < 0)
+  {
+    const char *env = getenv("AMSS_CUDA_KEEP_ALL_LEVELS");
+    keep_all_levels = (env && atoi(env) != 0) ? 1 : 0;
+  }
+  static int enabled = -1;
+  if (enabled < 0)
+  {
+    const char *env = getenv("AMSS_CUDA_KEEP_RESIDENT_AFTER_STEP");
+    enabled = (env && atoi(env) != 0) ? 1 : 0;
+  }
+  if (!enabled)
+    return false;
+  if (lev == analysis_lev)
+    return false;
+  if (keep_all_levels)
+    return true;
+  return lev < trfls_in;
+}
+
+void bssn_em_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars,
+                                       int myrank, bool release_ctx)
+{
+  MyList<Patch> *Pp = PatL;
+  while (Pp)
+  {
+    MyList<Block> *BP = Pp->data->blb;
+    while (BP)
+    {
+      Block *cg = BP->data;
+      if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
+      {
+        double *state_out[BSSN_EM_CUDA_STATE_COUNT];
+        if (!fill_bssn_em_cuda_views(cg, vars, state_out))
+        {
+          cout << "CUDA BSSN-EM resident state list mismatch during download" << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        if (bssn_cuda_download_resident_state_count_if_present(cg, cg->shape,
+                                                                state_out,
+                                                                BSSN_EM_CUDA_STATE_COUNT))
+        {
+          cout << "CUDA BSSN-EM resident state download failed" << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        if (release_ctx)
+          bssn_cuda_release_step_ctx(cg);
+      }
+      if (BP == Pp->data->ble)
+        break;
+      BP = BP->next;
+    }
+    Pp = Pp->next;
+  }
+}
+
+void bssn_em_cuda_keep_only_level_state(MyList<Patch> *PatL, MyList<var> *vars,
+                                        int myrank)
+{
+  MyList<Patch> *Pp = PatL;
+  while (Pp)
+  {
+    MyList<Block> *BP = Pp->data->blb;
+    while (BP)
+    {
+      Block *cg = BP->data;
+      if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
+      {
+        double *state_key[BSSN_EM_CUDA_STATE_COUNT];
+        if (!fill_bssn_em_cuda_views(cg, vars, state_key))
+        {
+          cout << "CUDA BSSN-EM resident state list mismatch during prune" << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+        if (bssn_cuda_keep_only_resident_state_count(cg, cg->shape,
+                                                     state_key,
+                                                     BSSN_EM_CUDA_STATE_COUNT))
+        {
+          cout << "CUDA BSSN-EM keep-only resident state failed" << endl;
+          MPI_Abort(MPI_COMM_WORLD, 1);
+        }
+      }
+      if (BP == Pp->data->ble)
+        break;
+      BP = BP->next;
+    }
+    Pp = Pp->next;
+  }
+}
+#endif
+}
+
+// Define bssnEM_class
+
+// It inherits some members and methods from the parent class bssn_class and modifies others.
 // The modified members and methods are defined below (and in the header bssnEM_class.h).
 // The remaining members are inherited from the parent class bssn_class (declared in bssn_class.h).
 
@@ -253,12 +615,19 @@ void bssnEM_class::Initialize()
   {
     CheckPoint->read_Black_Hole_position(BH_num_input, BH_num, Porg0, Pmom, Spin, Mass, Porgbr, Porg, Porg1, Porg_rhs);
   }
-  else
-  {
-    PhysTime = StartTime;
-    Setup_Black_Hole_position();
-  }
-}
+  else
+  {
+    PhysTime = StartTime;
+    Setup_Black_Hole_position();
+  }
+
+  sync_cache_pre = new Parallel::SyncCache[GH->levels];
+  sync_cache_cor = new Parallel::SyncCache[GH->levels];
+  sync_cache_rp_coarse = new Parallel::SyncCache[GH->levels];
+  sync_cache_rp_fine = new Parallel::SyncCache[GH->levels];
+  sync_cache_restrict = new Parallel::SyncCache[GH->levels];
+  sync_cache_outbd = new Parallel::SyncCache[GH->levels];
+}
 
 //================================================================================================
 
@@ -829,14 +1198,30 @@ void bssnEM_class::Step(int lev, int YN)
   double ndeps = numepss;
   if (lev < GH->movls)
     ndeps = numepsb;
-  double TRK4 = PhysTime;
-  int iter_count = 0; // count RK4 substeps
-  int pre = 0, cor = 1;
-  int ERROR = 0;
-
-  MyList<ss_patch> *sPp;
-  // Predictor
-  MyList<Patch> *Pp = GH->PatL[lev];
+  double TRK4 = PhysTime;
+  int iter_count = 0; // count RK4 substeps
+  int pre = 0, cor = 1;
+  int ERROR = 0;
+#if USE_CUDA_BSSN
+  const bool use_cuda_resident_sync = bssn_em_cuda_use_resident_sync(lev);
+#endif
+  const bool em_step_timing = bssn_em_step_timing_enabled();
+  const double em_step_t0 = em_step_timing ? MPI_Wtime() : 0.0;
+  double em_t0 = 0.0;
+  double em_t_predictor = 0.0;
+  double em_t_predictor_sync = 0.0;
+  double em_t_corrector = 0.0;
+  double em_t_corrector_sync = 0.0;
+  double em_t_analysis = 0.0;
+  double em_t_bh = 0.0;
+  double em_t_swap = 0.0;
+  double em_t_resident = 0.0;
+  double em_t_rp = 0.0;
+
+  MyList<ss_patch> *sPp;
+  // Predictor
+  em_t0 = em_step_timing ? MPI_Wtime() : 0.0;
+  MyList<Patch> *Pp = GH->PatL[lev];
   while (Pp)
   {
     MyList<Block> *BP = Pp->data->blb;
@@ -845,16 +1230,21 @@ void bssnEM_class::Step(int lev, int YN)
       Block *cg = BP->data;
       if (myrank == cg->rank)
       {
-#if (AGM == 0)
-        f_enforce_ga(cg->shape,
-                     cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
-                     cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
-                     cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
-                     cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
-#endif
+#if !USE_CUDA_BSSN
+#if (AGM == 0)
+        f_enforce_ga(cg->shape,
+                     cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
+                     cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
+                     cg->fgfs[Axx0->sgfn], cg->fgfs[Axy0->sgfn], cg->fgfs[Axz0->sgfn], 
+                     cg->fgfs[Ayy0->sgfn], cg->fgfs[Ayz0->sgfn], cg->fgfs[Azz0->sgfn]);
+#endif
+#endif
 
-        if (
-            f_compute_rhs_empart(cg->shape, cg->X[0], cg->X[1], cg->X[2],
+        int em_rhs_error = 0;
+        bool used_gpu_substep = false;
+#if !USE_CUDA_BSSN
+        em_rhs_error =
+            f_compute_rhs_empart(cg->shape, cg->X[0], cg->X[1], cg->X[2],
                                  cg->fgfs[phi0->sgfn],
                                  cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                                  cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
@@ -871,10 +1261,54 @@ void bssnEM_class::Step(int lev, int YN)
                                  cg->fgfs[Kpsi_rhs->sgfn], cg->fgfs[Kphi_rhs->sgfn],
                                  cg->fgfs[rho->sgfn], 
                                  cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn],
-                                 cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn], 
-                                 cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn],
-                                 Symmetry, lev, ndeps) ||
-            f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+                                 cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn], 
+                                 cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn],
+                                 Symmetry, lev, ndeps);
+#endif
+
+#if USE_CUDA_BSSN
+        if (!em_rhs_error)
+        {
+          double *state_in[BSSN_EM_CUDA_STATE_COUNT];
+          double *state_out[BSSN_EM_CUDA_STATE_COUNT];
+          double *sources[BSSN_EM_CUDA_SOURCE_COUNT];
+          double propspeed[BSSN_EM_CUDA_STATE_COUNT];
+          double soa_flat[3 * BSSN_EM_CUDA_STATE_COUNT];
+          if (!fill_bssn_em_cuda_views(cg, StateList, state_in, propspeed, soa_flat) ||
+              !fill_bssn_em_cuda_views(cg, SynchList_pre, state_out))
+          {
+            cout << "CUDA BSSN-EM state list mismatch on predictor step" << endl;
+            MPI_Abort(MPI_COMM_WORLD, 1);
+          }
+          fill_bssn_em_fixed_source_cuda_views(cg, sources, Jx, Jy, Jz, qchar);
+          int apply_bam_bc = 0;
+#if (SommerType == 0)
+#ifndef WithShell
+          apply_bam_bc = (lev == 0) ? 1 : 0;
+#endif
+#endif
+          int apply_enforce_ga = 0;
+#if (AGM == 0)
+          apply_enforce_ga = 1;
+#endif
+          int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
+          if (bssn_em_cuda_rk4_substep(cg,
+                                       cg->shape, cg->X[0], cg->X[1], cg->X[2],
+                                       state_in, state_out, sources,
+                                       propspeed, soa_flat, Pp->data->bbox,
+                                       dT_lev, TRK4, iter_count, apply_bam_bc,
+                                       Symmetry, lev, ndeps, pre,
+                                       keep_resident_state, apply_enforce_ga, chitiny))
+          {
+            ERROR = 1;
+          }
+          used_gpu_substep = true;
+        }
+#endif
+
+        if (em_rhs_error ||
+            (!used_gpu_substep &&
+             f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                                cg->fgfs[phi0->sgfn], cg->fgfs[trK0->sgfn],
                                cg->fgfs[gxx0->sgfn], cg->fgfs[gxy0->sgfn], cg->fgfs[gxz0->sgfn], 
                                cg->fgfs[gyy0->sgfn], cg->fgfs[gyz0->sgfn], cg->fgfs[gzz0->sgfn],
@@ -907,7 +1341,7 @@ void bssnEM_class::Step(int lev, int YN)
                                cg->fgfs[Cons_Ham->sgfn],
                                cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
                                cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
-                               Symmetry, lev, ndeps, pre))
+                               Symmetry, lev, ndeps, pre)))
         {
           cout << "find NaN in domain: (" 
                << cg->bbox[0] << ":" << cg->bbox[3] << "," 
@@ -916,9 +1350,11 @@ void bssnEM_class::Step(int lev, int YN)
           ERROR = 1;
         }
 
-        // rk4 substep and boundary
-        {
-          MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varlrhs = RHSList; 
+        if (!used_gpu_substep)
+        {
+        // rk4 substep and boundary
+        {
+          MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varlrhs = RHSList; 
           // we do not check the correspondence here
           
           while (varl0)
@@ -955,17 +1391,20 @@ void bssnEM_class::Step(int lev, int YN)
             varl = varl->next;
             varlrhs = varlrhs->next;
           }
-        }
-        f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny);
-      }
+        }
+        f_lowerboundset(cg->shape, cg->fgfs[phi->sgfn], chitiny);
+        }
+      }
       if (BP == Pp->data->ble)
         break;
       BP = BP->next;
-    }
-    Pp = Pp->next;
-  }
-  // check error information
-  {
+    }
+    Pp = Pp->next;
+  }
+  if (em_step_timing)
+    em_t_predictor += MPI_Wtime() - em_t0;
+  // check error information
+  {
     int erh = ERROR;
     MPI_Allreduce(&erh, &ERROR, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
   }
@@ -1221,7 +1660,11 @@ void bssnEM_class::Step(int lev, int YN)
   }
 #endif
 
-  Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
+    if (em_step_timing)
+      em_t0 = MPI_Wtime();
+    Parallel::Sync_cached(GH->PatL[lev], SynchList_pre, Symmetry, sync_cache_pre[lev]);
+    if (em_step_timing)
+      em_t_predictor_sync += MPI_Wtime() - em_t0;
 
 #ifdef WithShell
   if (lev == 0)
@@ -1241,12 +1684,14 @@ void bssnEM_class::Step(int lev, int YN)
   }
 #endif
 
-  // for black hole position
-  if (BH_num > 0 && lev == GH->levels - 1)
-  {
-    compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
-    for (int ithBH = 0; ithBH < BH_num; ithBH++)
-    {
+  // for black hole position
+  if (BH_num > 0 && lev == GH->levels - 1)
+  {
+    if (em_step_timing)
+      em_t0 = MPI_Wtime();
+    compute_Porg_rhs(Porg0, Porg_rhs, Sfx0, Sfy0, Sfz0, lev);
+    for (int ithBH = 0; ithBH < BH_num; ithBH++)
+    {
       f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg[ithBH][0], Porg_rhs[ithBH][0], iter_count);
       f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg[ithBH][1], Porg_rhs[ithBH][1], iter_count);
       f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg[ithBH][2], Porg_rhs[ithBH][2], iter_count);
@@ -1269,20 +1714,28 @@ void bssnEM_class::Step(int lev, int YN)
         DG_List->insert(Sfy0);
         DG_List->insert(Sfz0);
         Parallel::Dump_Data(GH->PatL[lev], DG_List, 0, PhysTime, dT_lev);
-        DG_List->clearList();
-      }
-    }
-  }
-  // data analysis part
-  // Warning NOTE: the variables1 are used as temp storege room
-  if (lev == a_lev)
-  {
-    AnalysisStuff_EM(lev, dT_lev);
-  }
-  // corrector
-  for (iter_count = 1; iter_count < 4; iter_count++)
-  {
-    // for RK4: t0, t0+dt/2, t0+dt/2, t0+dt;
+        DG_List->clearList();
+      }
+    }
+    if (em_step_timing)
+      em_t_bh += MPI_Wtime() - em_t0;
+  }
+  // data analysis part
+  // Warning NOTE: the variables1 are used as temp storege room
+  if (lev == a_lev)
+  {
+    if (em_step_timing)
+      em_t0 = MPI_Wtime();
+    AnalysisStuff_EM(lev, dT_lev);
+    if (em_step_timing)
+      em_t_analysis += MPI_Wtime() - em_t0;
+  }
+  // corrector
+  for (iter_count = 1; iter_count < 4; iter_count++)
+  {
+    if (em_step_timing)
+      em_t0 = MPI_Wtime();
+    // for RK4: t0, t0+dt/2, t0+dt/2, t0+dt;
     if (iter_count == 1 || iter_count == 3)
       TRK4 += dT_lev / 2;
     Pp = GH->PatL[lev];
@@ -1294,10 +1747,11 @@ void bssnEM_class::Step(int lev, int YN)
         Block *cg = BP->data;
         if (myrank == cg->rank)
         {
-#if (AGM == 0)
-          f_enforce_ga(cg->shape,
-                       cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
-                       cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
+#if !USE_CUDA_BSSN
+#if (AGM == 0)
+          f_enforce_ga(cg->shape,
+                       cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
+                       cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
                        cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], 
                        cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
 #elif (AGM == 1)
@@ -1305,12 +1759,16 @@ void bssnEM_class::Step(int lev, int YN)
             f_enforce_ga(cg->shape,
                          cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
                          cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
-                         cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], 
-                         cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
-#endif
+                         cg->fgfs[Axx->sgfn], cg->fgfs[Axy->sgfn], cg->fgfs[Axz->sgfn], 
+                         cg->fgfs[Ayy->sgfn], cg->fgfs[Ayz->sgfn], cg->fgfs[Azz->sgfn]);
+#endif
+#endif
 
-          if (
-              f_compute_rhs_empart(cg->shape, cg->X[0], cg->X[1], cg->X[2],
+          int em_rhs_error = 0;
+          bool used_gpu_substep = false;
+#if !USE_CUDA_BSSN
+          em_rhs_error =
+              f_compute_rhs_empart(cg->shape, cg->X[0], cg->X[1], cg->X[2],
                                    cg->fgfs[phi->sgfn],
                                    cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
                                    cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
@@ -1327,10 +1785,57 @@ void bssnEM_class::Step(int lev, int YN)
                                    cg->fgfs[Kpsi1->sgfn], cg->fgfs[Kphi1->sgfn],
                                    cg->fgfs[rho->sgfn], 
                                    cg->fgfs[Sx->sgfn], cg->fgfs[Sy->sgfn], cg->fgfs[Sz->sgfn],
-                                   cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn], 
-                                   cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn],
-                                   Symmetry, lev, ndeps) ||
-              f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
+                                   cg->fgfs[Sxx->sgfn], cg->fgfs[Sxy->sgfn], cg->fgfs[Sxz->sgfn], 
+                                   cg->fgfs[Syy->sgfn], cg->fgfs[Syz->sgfn], cg->fgfs[Szz->sgfn],
+                                   Symmetry, lev, ndeps);
+#endif
+
+#if USE_CUDA_BSSN
+          if (!em_rhs_error)
+          {
+            double *state_in[BSSN_EM_CUDA_STATE_COUNT];
+            double *state_out[BSSN_EM_CUDA_STATE_COUNT];
+            double *sources[BSSN_EM_CUDA_SOURCE_COUNT];
+            double propspeed[BSSN_EM_CUDA_STATE_COUNT];
+            double soa_flat[3 * BSSN_EM_CUDA_STATE_COUNT];
+            if (!fill_bssn_em_cuda_views(cg, SynchList_pre, state_in, propspeed, soa_flat) ||
+                !fill_bssn_em_cuda_views(cg, SynchList_cor, state_out))
+            {
+              cout << "CUDA BSSN-EM state list mismatch on corrector step" << endl;
+              MPI_Abort(MPI_COMM_WORLD, 1);
+            }
+            fill_bssn_em_fixed_source_cuda_views(cg, sources, Jx, Jy, Jz, qchar);
+            int apply_bam_bc = 0;
+#if (SommerType == 0)
+#ifndef WithShell
+            apply_bam_bc = (lev == 0) ? 1 : 0;
+#endif
+#endif
+            int apply_enforce_ga = 0;
+#if (AGM == 0)
+            apply_enforce_ga = 1;
+#elif (AGM == 1)
+            if (iter_count == 3)
+              apply_enforce_ga = 1;
+#endif
+            int keep_resident_state = use_cuda_resident_sync ? 1 : 0;
+            if (bssn_em_cuda_rk4_substep(cg,
+                                         cg->shape, cg->X[0], cg->X[1], cg->X[2],
+                                         state_in, state_out, sources,
+                                         propspeed, soa_flat, Pp->data->bbox,
+                                         dT_lev, TRK4, iter_count, apply_bam_bc,
+                                         Symmetry, lev, ndeps, cor,
+                                         keep_resident_state, apply_enforce_ga, chitiny))
+            {
+              ERROR = 1;
+            }
+            used_gpu_substep = true;
+          }
+#endif
+
+          if (em_rhs_error ||
+              (!used_gpu_substep &&
+               f_compute_rhs_bssn(cg->shape, TRK4, cg->X[0], cg->X[1], cg->X[2],
                                  cg->fgfs[phi->sgfn], cg->fgfs[trK->sgfn],
                                  cg->fgfs[gxx->sgfn], cg->fgfs[gxy->sgfn], cg->fgfs[gxz->sgfn], 
                                  cg->fgfs[gyy->sgfn], cg->fgfs[gyz->sgfn], cg->fgfs[gzz->sgfn],
@@ -1362,7 +1867,7 @@ void bssnEM_class::Step(int lev, int YN)
                                  cg->fgfs[Cons_Ham->sgfn],
                                  cg->fgfs[Cons_Px->sgfn], cg->fgfs[Cons_Py->sgfn], cg->fgfs[Cons_Pz->sgfn],
                                  cg->fgfs[Cons_Gx->sgfn], cg->fgfs[Cons_Gy->sgfn], cg->fgfs[Cons_Gz->sgfn],
-                                 Symmetry, lev, ndeps, cor))
+                                 Symmetry, lev, ndeps, cor)))
           {
             cout << "find NaN in domain: (" 
                  << cg->bbox[0] << ":" << cg->bbox[3] << "," 
@@ -1370,9 +1875,11 @@ void bssnEM_class::Step(int lev, int YN)
                  << cg->bbox[2] << ":" << cg->bbox[5] << ")" << endl;
             ERROR = 1;
           }
-          // rk4 substep and boundary
-          {
-            MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varl1 = SynchList_cor, *varlrhs = RHSList; 
+          if (!used_gpu_substep)
+          {
+          // rk4 substep and boundary
+          {
+            MyList<var> *varl0 = StateList, *varl = SynchList_pre, *varl1 = SynchList_cor, *varlrhs = RHSList; 
             // we do not check the correspondence here
             
             while (varl0)
@@ -1410,9 +1917,10 @@ void bssnEM_class::Step(int lev, int YN)
               varl1 = varl1->next;
               varlrhs = varlrhs->next;
             }
-          }
-          f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny);
-        }
+          }
+          f_lowerboundset(cg->shape, cg->fgfs[phi1->sgfn], chitiny);
+          }
+        }
         if (BP == Pp->data->ble)
           break;
         BP = BP->next;
@@ -1683,7 +2191,13 @@ void bssnEM_class::Step(int lev, int YN)
     }
 #endif
 
+    if (em_step_timing)
+      em_t_corrector += MPI_Wtime() - em_t0;
+    if (em_step_timing)
+      em_t0 = MPI_Wtime();
     Parallel::Sync_cached(GH->PatL[lev], SynchList_cor, Symmetry, sync_cache_cor[lev]);
+    if (em_step_timing)
+      em_t_corrector_sync += MPI_Wtime() - em_t0;
 
 #ifdef WithShell
     if (lev == 0)
@@ -1702,12 +2216,14 @@ void bssnEM_class::Step(int lev, int YN)
       }
     }
 #endif
-    // for black hole position
-    if (BH_num > 0 && lev == GH->levels - 1)
-    {
-      compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev);
-      for (int ithBH = 0; ithBH < BH_num; ithBH++)
-      {
+    // for black hole position
+    if (BH_num > 0 && lev == GH->levels - 1)
+    {
+      if (em_step_timing)
+        em_t0 = MPI_Wtime();
+      compute_Porg_rhs(Porg, Porg1, Sfx, Sfy, Sfz, lev);
+      for (int ithBH = 0; ithBH < BH_num; ithBH++)
+      {
         f_rungekutta4_scalar(dT_lev, Porg0[ithBH][0], Porg1[ithBH][0], Porg_rhs[ithBH][0], iter_count);
         f_rungekutta4_scalar(dT_lev, Porg0[ithBH][1], Porg1[ithBH][1], Porg_rhs[ithBH][1], iter_count);
         f_rungekutta4_scalar(dT_lev, Porg0[ithBH][2], Porg1[ithBH][2], Porg_rhs[ithBH][2], iter_count);
@@ -1730,16 +2246,20 @@ void bssnEM_class::Step(int lev, int YN)
           DG_List->insert(Sfy0);
           DG_List->insert(Sfz0);
           Parallel::Dump_Data(GH->PatL[lev], DG_List, 0, PhysTime, dT_lev);
-          DG_List->clearList();
-        }
-      }
-    }
-    // swap time level
-    if (iter_count < 3)
-    {
-      Pp = GH->PatL[lev];
-      while (Pp)
-      {
+          DG_List->clearList();
+        }
+      }
+      if (em_step_timing)
+        em_t_bh += MPI_Wtime() - em_t0;
+    }
+    // swap time level
+    if (iter_count < 3)
+    {
+      if (em_step_timing)
+        em_t0 = MPI_Wtime();
+      Pp = GH->PatL[lev];
+      while (Pp)
+      {
         MyList<Block> *BP = Pp->data->blb;
         while (BP)
         {
@@ -1777,15 +2297,33 @@ void bssnEM_class::Step(int lev, int YN)
         {
           Porg[ithBH][0] = Porg1[ithBH][0];
           Porg[ithBH][1] = Porg1[ithBH][1];
-          Porg[ithBH][2] = Porg1[ithBH][2];
-        }
-      }
-    }
-  }
-
-#if (RPS == 0)
-  // mesh refinement boundary part
-  RestrictProlong(lev, YN, BB);
+          Porg[ithBH][2] = Porg1[ithBH][2];
+        }
+      }
+      if (em_step_timing)
+        em_t_swap += MPI_Wtime() - em_t0;
+    }
+  }
+
+#if USE_CUDA_BSSN
+  if (use_cuda_resident_sync)
+  {
+    if (em_step_timing)
+      em_t0 = MPI_Wtime();
+    if (!bssn_em_cuda_keep_resident_after_step(lev, trfls, a_lev))
+      bssn_em_cuda_download_level_state(GH->PatL[lev], SynchList_cor, myrank, true);
+    if (em_step_timing)
+      em_t_resident += MPI_Wtime() - em_t0;
+  }
+#endif
+
+#if (RPS == 0)
+  // mesh refinement boundary part
+  if (em_step_timing)
+    em_t0 = MPI_Wtime();
+  RestrictProlong(lev, YN, BB);
+  if (em_step_timing)
+    em_t_rp += MPI_Wtime() - em_t0;
 
 #ifdef WithShell
   if (lev == 0)
@@ -1807,15 +2345,17 @@ void bssnEM_class::Step(int lev, int YN)
 
 #endif
   // note the data structure before update
-  // SynchList_cor 1   -----------
-  //
-  // StateList     0   -----------
-  //
-  // OldStateList  old -----------
-  // update
-  Pp = GH->PatL[lev];
-  while (Pp)
-  {
+  // SynchList_cor 1   -----------
+  //
+  // StateList     0   -----------
+  //
+  // OldStateList  old -----------
+  // update
+  if (em_step_timing)
+    em_t0 = MPI_Wtime();
+  Pp = GH->PatL[lev];
+  while (Pp)
+  {
     MyList<Block> *BP = Pp->data->blb;
     while (BP)
     {
@@ -1849,16 +2389,36 @@ void bssnEM_class::Step(int lev, int YN)
   }
 #endif
   // for black hole position
-  if (BH_num > 0 && lev == GH->levels - 1)
-  {
-    for (int ithBH = 0; ithBH < BH_num; ithBH++)
-    {
-      Porg0[ithBH][0] = Porg1[ithBH][0];
-      Porg0[ithBH][1] = Porg1[ithBH][1];
-      Porg0[ithBH][2] = Porg1[ithBH][2];
-    }
-  }
-}
+  if (BH_num > 0 && lev == GH->levels - 1)
+  {
+    for (int ithBH = 0; ithBH < BH_num; ithBH++)
+    {
+      Porg0[ithBH][0] = Porg1[ithBH][0];
+      Porg0[ithBH][1] = Porg1[ithBH][1];
+      Porg0[ithBH][2] = Porg1[ithBH][2];
+    }
+  }
+  if (em_step_timing)
+  {
+    em_t_swap += MPI_Wtime() - em_t0;
+    static int em_step_report_count = 0;
+    const int em_timing_every = bssn_em_step_timing_every();
+    const bool report_all_levels = bssn_em_step_timing_all_levels_enabled();
+    if (lev == GH->levels - 1)
+      ++em_step_report_count;
+    if ((report_all_levels || lev == GH->levels - 1) &&
+        (em_timing_every <= 1 || em_step_report_count % em_timing_every == 0))
+    {
+      fprintf(stderr,
+              "[AMSS-EM-STEP-TIMING] lev=%d wall=%.6f predictor=%.6f pre_sync=%.6f "
+              "analysis=%.6f corrector=%.6f cor_sync=%.6f bh=%.6f swap=%.6f resident=%.6f rp=%.6f\n",
+              lev, MPI_Wtime() - em_step_t0,
+              em_t_predictor, em_t_predictor_sync,
+              em_t_analysis, em_t_corrector, em_t_corrector_sync,
+              em_t_bh, em_t_swap, em_t_resident, em_t_rp);
+    }
+  }
+}
 
 //================================================================================================
 
@@ -2034,11 +2594,64 @@ void bssnEM_class::AnalysisStuff_EM(int lev, double dT_lev)
 {
   LastAnas += dT_lev;
   
-  if (LastAnas >= AnasTime)
-  {
-    Compute_Phi2(lev);
-    double *RP, *IP;
-    int NN = 0;
+  if (LastAnas >= AnasTime)
+  {
+#if USE_CUDA_BSSN
+    const bool zero_em_analysis =
+        bssn_em_analysis_zero_fastpath_ready(GH->PatL[lev],
+#ifdef WithShell
+                                             SH
+#else
+                                             0
+#endif
+                                             , myrank
+        );
+#else
+    const bool zero_em_analysis = false;
+#endif
+    if (zero_em_analysis)
+    {
+#if USE_CUDA_BSSN
+      zero_em_analysis_outputs(GH->PatL[lev],
+#ifdef WithShell
+                               SH,
+#else
+                               0,
+#endif
+                               myrank,
+                               Rphi2, Iphi2, Rphi1, Iphi1);
+#endif
+      int NN = 0;
+      for (int pl = 1; pl < maxl + 1; pl++)
+        for (int pm = -pl; pm < pl + 1; pm++)
+          NN++;
+      double *RP = new double[NN];
+      double *IP = new double[NN];
+      std::memset(RP, 0, NN * sizeof(double));
+      std::memset(IP, 0, NN * sizeof(double));
+      for (int i = 0; i < decn; i++)
+        Phi2Monitor->writefile(PhysTime, NN, RP, IP);
+      delete[] RP;
+      delete[] IP;
+
+      NN = 0;
+      for (int pl = 0; pl < maxl + 1; pl++)
+        for (int pm = -pl; pm < pl + 1; pm++)
+          NN++;
+      RP = new double[NN];
+      IP = new double[NN];
+      std::memset(RP, 0, NN * sizeof(double));
+      std::memset(IP, 0, NN * sizeof(double));
+      for (int i = 0; i < decn; i++)
+        Phi1Monitor->writefile(PhysTime, NN, RP, IP);
+      delete[] RP;
+      delete[] IP;
+    }
+    else
+    {
+    Compute_Phi2(lev);
+    double *RP, *IP;
+    int NN = 0;
     // for phi2
     for (int pl = 1; pl < maxl + 1; pl++)
       for (int pm = -pl; pm < pl + 1; pm++)
@@ -2121,10 +2734,11 @@ void bssnEM_class::AnalysisStuff_EM(int lev, double dT_lev)
 #endif
       Phi1Monitor->writefile(PhysTime, NN, RP, IP);
       Rex = Rex - drex;
-    }
-    delete[] RP;
-    delete[] IP;
-  }
+    }
+    delete[] RP;
+    delete[] IP;
+    }
+  }
 
   AnalysisStuff(lev, dT_lev); // LastAnas need and only need control here
   
diff --git a/AMSS_NCKU_source/bssn_class.C b/AMSS_NCKU_source/bssn_class.C
index d966a74..e4d6510 100644
--- a/AMSS_NCKU_source/bssn_class.C
+++ b/AMSS_NCKU_source/bssn_class.C
@@ -438,7 +438,7 @@ int count_bssn_cuda_state_list(MyList<var> *vars)
   {
     ++count;
     vars = vars->next;
-    if (count > BSSN_ESCALAR_CUDA_STATE_COUNT)
+    if (count > BSSN_EM_CUDA_STATE_COUNT)
       return -1;
   }
   return count;
@@ -449,8 +449,7 @@ bool fill_bssn_cuda_views_count(Block *cg, MyList<var> *vars,
                                 double **host_views)
 {
   if (!cg || !host_views ||
-      (state_count != BSSN_CUDA_STATE_COUNT &&
-       state_count != BSSN_ESCALAR_CUDA_STATE_COUNT))
+      state_count <= 0 || state_count > BSSN_EM_CUDA_STATE_COUNT)
     return false;
   int idx = 0;
   while (vars && idx < state_count)
@@ -742,7 +741,7 @@ void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int
       Block *cg = BP->data;
       if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
       {
-        double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
+        double *state_out[BSSN_EM_CUDA_STATE_COUNT];
         if (!fill_bssn_cuda_views_count(cg, vars, state_count, state_out))
         {
           cout << "CUDA BSSN state list mismatch on resident state download" << endl;
@@ -750,7 +749,9 @@ void bssn_cuda_download_level_state(MyList<Patch> *PatL, MyList<var> *vars, int
         }
         const int rc = (state_count == BSSN_ESCALAR_CUDA_STATE_COUNT)
                            ? bssn_escalar_cuda_download_resident_state(cg, cg->shape, state_out)
-                           : bssn_cuda_download_resident_state(cg, cg->shape, state_out);
+                           : ((state_count == BSSN_CUDA_STATE_COUNT)
+                                  ? bssn_cuda_download_resident_state(cg, cg->shape, state_out)
+                                  : bssn_cuda_download_resident_state_count_if_present(cg, cg->shape, state_out, state_count));
         if (rc)
         {
           cout << "CUDA resident state download failed" << endl;
@@ -779,7 +780,7 @@ void bssn_cuda_download_level_state_if_present(MyList<Patch> *PatL, MyList<var>
       Block *cg = BP->data;
       if (myrank == cg->rank && bssn_cuda_has_resident_state(cg))
       {
-        double *state_out[BSSN_ESCALAR_CUDA_STATE_COUNT];
+        double *state_out[BSSN_EM_CUDA_STATE_COUNT];
         if (!fill_bssn_cuda_views_count(cg, vars, state_count, state_out))
         {
           cout << "CUDA BSSN state list mismatch on resident state conditional download" << endl;
diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.cu b/AMSS_NCKU_source/bssn_rhs_cuda.cu
index 4168f71..2e18a48 100644
--- a/AMSS_NCKU_source/bssn_rhs_cuda.cu
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.cu
@@ -413,7 +413,7 @@ __device__ __forceinline__ double fetch_sym_ord3_direct(const double *src,
  */
 
 /* Total number of "all"-sized slots */
-#define NUM_SLOTS 192
+#define NUM_SLOTS 256
 
 struct GpuBuffers {
     double *d_mem;          /* single big allocation */
@@ -486,6 +486,19 @@ enum {
     S_Sphi_x, S_Sphi_y, S_Sphi_z,
     S_Sphi_xx, S_Sphi_xy, S_Sphi_xz, S_Sphi_yy, S_Sphi_yz, S_Sphi_zz,
     S_trK_x, S_trK_y, S_trK_z,
+    S_EM_Kpsi, S_EM_Kphi,
+    S_EM_Ex, S_EM_Ey, S_EM_Ez, S_EM_Bx, S_EM_By, S_EM_Bz,
+    S_EM_Jx, S_EM_Jy, S_EM_Jz, S_EM_qchar,
+    S_EM_Kpsi_rhs, S_EM_Kphi_rhs,
+    S_EM_Ex_rhs, S_EM_Ey_rhs, S_EM_Ez_rhs, S_EM_Bx_rhs, S_EM_By_rhs, S_EM_Bz_rhs,
+    S_EM_Kpsix, S_EM_Kpsiy, S_EM_Kpsiz,
+    S_EM_Kphix, S_EM_Kphiy, S_EM_Kphiz,
+    S_EM_Exx, S_EM_Exy, S_EM_Exz,
+    S_EM_Eyx, S_EM_Eyy, S_EM_Eyz,
+    S_EM_Ezx, S_EM_Ezy, S_EM_Ezz,
+    S_EM_Bxx, S_EM_Bxy, S_EM_Bxz,
+    S_EM_Byx, S_EM_Byy, S_EM_Byz,
+    S_EM_Bzx, S_EM_Bzy, S_EM_Bzz,
     NUM_USED_SLOTS
 };
 
@@ -503,9 +516,12 @@ static constexpr int BSSN_STATE_COUNT = 24;
 static constexpr int BSSN_MATTER_COUNT = 10;
 static constexpr int BSSN_LK_FIELD_COUNT = 24;
 static constexpr int BSSN_ESCALAR_LK_FIELD_COUNT = 26;
+static constexpr int BSSN_EM_STATE_COUNT = 32;
+static constexpr int BSSN_EM_SOURCE_COUNT = 4;
+static constexpr int BSSN_EM_LK_FIELD_COUNT = 32;
 static constexpr int BSSN_RESIDENT_BANK_COUNT = 6;
 static constexpr int BSSN_ESCALAR_STATE_COUNT = 26;
-static constexpr int BSSN_RESIDENT_STATE_CAPACITY = BSSN_ESCALAR_STATE_COUNT;
+static constexpr int BSSN_RESIDENT_STATE_CAPACITY = BSSN_EM_STATE_COUNT;
 
 static const int k_state_input_slots[BSSN_STATE_COUNT] = {
     S_chi, S_trK, S_dxx, S_gxy, S_gxz, S_dyy, S_gyz, S_dzz,
@@ -543,6 +559,29 @@ static const int k_escalar_state_rhs_slots[BSSN_ESCALAR_STATE_COUNT] = {
     S_Sphi_rhs, S_Spi_rhs
 };
 
+static const int k_em_state_input_slots[BSSN_EM_STATE_COUNT] = {
+    S_chi, S_trK, S_dxx, S_gxy, S_gxz, S_dyy, S_gyz, S_dzz,
+    S_Axx, S_Axy, S_Axz, S_Ayy, S_Ayz, S_Azz,
+    S_Gamx, S_Gamy, S_Gamz,
+    S_Lap, S_betax, S_betay, S_betaz,
+    S_dtSfx, S_dtSfy, S_dtSfz,
+    S_EM_Kpsi, S_EM_Kphi,
+    S_EM_Ex, S_EM_Ey, S_EM_Ez,
+    S_EM_Bx, S_EM_By, S_EM_Bz
+};
+
+static const int k_em_state_rhs_slots[BSSN_EM_STATE_COUNT] = {
+    S_chi_rhs, S_trK_rhs,
+    S_gxx_rhs, S_gxy_rhs, S_gxz_rhs, S_gyy_rhs, S_gyz_rhs, S_gzz_rhs,
+    S_Axx_rhs, S_Axy_rhs, S_Axz_rhs, S_Ayy_rhs, S_Ayz_rhs, S_Azz_rhs,
+    S_Gamx_rhs, S_Gamy_rhs, S_Gamz_rhs,
+    S_Lap_rhs, S_betax_rhs, S_betay_rhs, S_betaz_rhs,
+    S_dtSfx_rhs, S_dtSfy_rhs, S_dtSfz_rhs,
+    S_EM_Kpsi_rhs, S_EM_Kphi_rhs,
+    S_EM_Ex_rhs, S_EM_Ey_rhs, S_EM_Ez_rhs,
+    S_EM_Bx_rhs, S_EM_By_rhs, S_EM_Bz_rhs
+};
+
 static const int k_matter_slots[BSSN_MATTER_COUNT] = {
     S_rho, S_Sx, S_Sy, S_Sz, S_Sxx, S_Sxy, S_Sxz, S_Syy, S_Syz, S_Szz
 };
@@ -603,6 +642,7 @@ struct StepContext {
     double *d_state_next_mem;
     std::array<double *, BSSN_RESIDENT_BANK_COUNT> d_resident_mem;
     double *d_matter_mem;
+    double *d_em_source_mem;
     double *d_comm_mem;
     double *h_comm_mem;
     std::array<double *, BSSN_RESIDENT_STATE_CAPACITY> d_state0;
@@ -615,11 +655,16 @@ struct StepContext {
     std::array<unsigned long long, BSSN_RESIDENT_BANK_COUNT> resident_age;
     std::array<bool, BSSN_RESIDENT_BANK_COUNT> resident_valid;
     std::array<double *, BSSN_MATTER_COUNT> d_matter;
+    std::array<double *, BSSN_EM_SOURCE_COUNT> d_em_source;
+    std::array<double *, BSSN_EM_SOURCE_COUNT> em_source_host;
     size_t cap_all;
     size_t cap_comm;
     bool h_comm_pinned;
     size_t cap_h_comm;
     bool matter_ready;
+    bool em_source_ready;
+    bool em_zero_fast_known;
+    bool em_zero_fast;
     bool state_ready;
     int current_bank;
     unsigned long long resident_clock;
@@ -628,9 +673,11 @@ struct StepContext {
         : d_state0_mem(nullptr), d_accum_mem(nullptr),
           d_state_curr_mem(nullptr), d_state_next_mem(nullptr),
           d_resident_mem{},
-          d_matter_mem(nullptr), d_comm_mem(nullptr), h_comm_mem(nullptr),
+          d_matter_mem(nullptr), d_em_source_mem(nullptr),
+          d_comm_mem(nullptr), h_comm_mem(nullptr),
           cap_all(0), cap_comm(0), h_comm_pinned(false), cap_h_comm(0),
-          matter_ready(false), state_ready(false),
+          matter_ready(false), em_source_ready(false),
+          em_zero_fast_known(false), em_zero_fast(false), state_ready(false),
           current_bank(-1), resident_clock(0)
     {
         d_resident_mem.fill(nullptr);
@@ -646,6 +693,8 @@ struct StepContext {
         resident_age.fill(0);
         resident_valid.fill(false);
         d_matter.fill(nullptr);
+        d_em_source.fill(nullptr);
+        em_source_host.fill(nullptr);
     }
 };
 
@@ -654,6 +703,7 @@ struct StepAllocation {
     double *d_accum_mem;
     std::array<double *, BSSN_RESIDENT_BANK_COUNT> d_resident_mem;
     double *d_matter_mem;
+    double *d_em_source_mem;
     double *d_comm_mem;
     double *h_comm_mem;
     size_t cap_all;
@@ -666,6 +716,7 @@ static std::unordered_map<void *, StepContext> g_step_ctx;
 static std::vector<StepAllocation> g_step_pool;
 static int *g_comm_segment_meta = nullptr;
 static size_t g_comm_segment_meta_cap = 0;
+static int *g_em_zero_flag = nullptr;
 
 static StepAllocation empty_step_allocation()
 {
@@ -674,6 +725,7 @@ static StepAllocation empty_step_allocation()
     alloc.d_accum_mem = nullptr;
     alloc.d_resident_mem.fill(nullptr);
     alloc.d_matter_mem = nullptr;
+    alloc.d_em_source_mem = nullptr;
     alloc.d_comm_mem = nullptr;
     alloc.h_comm_mem = nullptr;
     alloc.cap_all = 0;
@@ -695,6 +747,7 @@ static StepAllocation detach_step_allocation(StepContext &ctx)
     alloc.d_accum_mem = ctx.d_accum_mem;
     alloc.d_resident_mem = ctx.d_resident_mem;
     alloc.d_matter_mem = ctx.d_matter_mem;
+    alloc.d_em_source_mem = ctx.d_em_source_mem;
     alloc.d_comm_mem = ctx.d_comm_mem;
     alloc.h_comm_mem = ctx.h_comm_mem;
     alloc.cap_all = ctx.cap_all;
@@ -707,6 +760,7 @@ static StepAllocation detach_step_allocation(StepContext &ctx)
     ctx.d_state_next_mem = nullptr;
     ctx.d_resident_mem.fill(nullptr);
     ctx.d_matter_mem = nullptr;
+    ctx.d_em_source_mem = nullptr;
     ctx.d_comm_mem = nullptr;
     ctx.h_comm_mem = nullptr;
     ctx.cap_all = 0;
@@ -714,6 +768,9 @@ static StepAllocation detach_step_allocation(StepContext &ctx)
     ctx.h_comm_pinned = false;
     ctx.cap_h_comm = 0;
     ctx.matter_ready = false;
+    ctx.em_source_ready = false;
+    ctx.em_zero_fast_known = false;
+    ctx.em_zero_fast = false;
     ctx.state_ready = false;
     ctx.current_bank = -1;
     ctx.resident_clock = 0;
@@ -729,6 +786,8 @@ static StepAllocation detach_step_allocation(StepContext &ctx)
     ctx.resident_age.fill(0);
     ctx.resident_valid.fill(false);
     ctx.d_matter.fill(nullptr);
+    ctx.d_em_source.fill(nullptr);
+    ctx.em_source_host.fill(nullptr);
     return alloc;
 }
 
@@ -740,6 +799,7 @@ static void attach_step_allocation(StepContext &ctx, const StepAllocation &alloc
     ctx.d_state_curr_mem = nullptr;
     ctx.d_state_next_mem = nullptr;
     ctx.d_matter_mem = alloc.d_matter_mem;
+    ctx.d_em_source_mem = alloc.d_em_source_mem;
     ctx.d_comm_mem = alloc.d_comm_mem;
     ctx.h_comm_mem = alloc.h_comm_mem;
     ctx.cap_all = alloc.cap_all;
@@ -747,6 +807,9 @@ static void attach_step_allocation(StepContext &ctx, const StepAllocation &alloc
     ctx.h_comm_pinned = alloc.h_comm_pinned;
     ctx.cap_h_comm = alloc.cap_h_comm;
     ctx.matter_ready = false;
+    ctx.em_source_ready = false;
+    ctx.em_zero_fast_known = false;
+    ctx.em_zero_fast = false;
     ctx.state_ready = false;
     ctx.current_bank = -1;
     ctx.resident_clock = 0;
@@ -756,6 +819,7 @@ static void attach_step_allocation(StepContext &ctx, const StepAllocation &alloc
     }
     ctx.resident_age.fill(0);
     ctx.resident_valid.fill(false);
+    ctx.em_source_host.fill(nullptr);
 }
 
 static void recycle_step_allocation(StepAllocation &alloc)
@@ -849,6 +913,7 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all)
                                       BSSN_RESIDENT_STATE_CAPACITY * all * sizeof(double)));
             }
             CUDA_CHECK(cudaMalloc(&alloc.d_matter_mem, BSSN_MATTER_COUNT * all * sizeof(double)));
+            CUDA_CHECK(cudaMalloc(&alloc.d_em_source_mem, BSSN_EM_SOURCE_COUNT * all * sizeof(double)));
             alloc.cap_all = all;
         }
         attach_step_allocation(ctx, alloc);
@@ -867,6 +932,9 @@ static StepContext &ensure_step_ctx(void *block_tag, size_t all)
     for (int i = 0; i < BSSN_MATTER_COUNT; ++i) {
         ctx.d_matter[i] = ctx.d_matter_mem + (size_t)i * all;
     }
+    for (int i = 0; i < BSSN_EM_SOURCE_COUNT; ++i) {
+        ctx.d_em_source[i] = ctx.d_em_source_mem + (size_t)i * all;
+    }
     return ctx;
 }
 
@@ -1569,10 +1637,10 @@ void kern_kodis(const double * __restrict__ fh,
 /*  Host wrapper helpers                                               */
 /* ================================================================== */
 struct LopsidedKodisTables {
-    const double *adv_fields[BSSN_ESCALAR_LK_FIELD_COUNT];
-    const double *ko_fields[BSSN_ESCALAR_LK_FIELD_COUNT];
-    double *rhs_fields[BSSN_ESCALAR_LK_FIELD_COUNT];
-    int soa_signs[3 * BSSN_ESCALAR_LK_FIELD_COUNT];
+    const double *adv_fields[BSSN_EM_LK_FIELD_COUNT];
+    const double *ko_fields[BSSN_EM_LK_FIELD_COUNT];
+    double *rhs_fields[BSSN_EM_LK_FIELD_COUNT];
+    int soa_signs[3 * BSSN_EM_LK_FIELD_COUNT];
 };
 
 struct FDerivTables {
@@ -1603,19 +1671,19 @@ struct Phase10RicciTables {
 };
 
 struct Rk4FinalizeTables {
-    const double *f0_fields[BSSN_STATE_COUNT];
-    double *rhs_fields[BSSN_STATE_COUNT];
-    double *accum_fields[BSSN_STATE_COUNT];
+    const double *f0_fields[BSSN_EM_STATE_COUNT];
+    double *rhs_fields[BSSN_EM_STATE_COUNT];
+    double *accum_fields[BSSN_EM_STATE_COUNT];
 };
 
 struct PatchBoundaryTables {
-    const double *src_fields[BSSN_STATE_COUNT];
-    double *dst_fields[BSSN_STATE_COUNT];
+    const double *src_fields[BSSN_EM_STATE_COUNT];
+    double *dst_fields[BSSN_EM_STATE_COUNT];
 };
 
 struct EScalarBoundaryTables {
-    const double *f0_fields[BSSN_ESCALAR_STATE_COUNT];
-    double *out_fields[BSSN_ESCALAR_STATE_COUNT];
+    const double *f0_fields[BSSN_EM_STATE_COUNT];
+    double *out_fields[BSSN_EM_STATE_COUNT];
 };
 
 static const int BLK = 128;
@@ -2731,6 +2799,190 @@ static void gpu_escalar_sources(int all)
     #undef D
 }
 
+__global__ __launch_bounds__(128, 3)
+void kern_em_rhs_sources(
+    const double * __restrict__ chi,
+    const double * __restrict__ dxx, const double * __restrict__ dxy, const double * __restrict__ dxz,
+    const double * __restrict__ dyy, const double * __restrict__ dyz, const double * __restrict__ dzz,
+    const double * __restrict__ Lap,
+    const double * __restrict__ betax, const double * __restrict__ betay, const double * __restrict__ betaz,
+    const double * __restrict__ trK,
+    const double * __restrict__ Ex, const double * __restrict__ Ey, const double * __restrict__ Ez,
+    const double * __restrict__ Bx, const double * __restrict__ By, const double * __restrict__ Bz,
+    const double * __restrict__ Kpsi, const double * __restrict__ Kphi,
+    const double * __restrict__ Jx, const double * __restrict__ Jy, const double * __restrict__ Jz,
+    const double * __restrict__ qchar,
+    const double * __restrict__ chix, const double * __restrict__ chiy, const double * __restrict__ chiz,
+    const double * __restrict__ gxxx, const double * __restrict__ gxyx, const double * __restrict__ gxzx,
+    const double * __restrict__ gyyx, const double * __restrict__ gyzx, const double * __restrict__ gzzx,
+    const double * __restrict__ gxxy, const double * __restrict__ gxyy, const double * __restrict__ gxzy,
+    const double * __restrict__ gyyy, const double * __restrict__ gyzy, const double * __restrict__ gzzy,
+    const double * __restrict__ gxxz, const double * __restrict__ gxyz, const double * __restrict__ gxzz,
+    const double * __restrict__ gyyz, const double * __restrict__ gyzz, const double * __restrict__ gzzz,
+    const double * __restrict__ Lapx, const double * __restrict__ Lapy, const double * __restrict__ Lapz,
+    const double * __restrict__ betaxx, const double * __restrict__ betaxy, const double * __restrict__ betaxz,
+    const double * __restrict__ betayx, const double * __restrict__ betayy, const double * __restrict__ betayz,
+    const double * __restrict__ betazx, const double * __restrict__ betazy, const double * __restrict__ betazz,
+    const double * __restrict__ Kpsix, const double * __restrict__ Kpsiy, const double * __restrict__ Kpsiz,
+    const double * __restrict__ Kphix, const double * __restrict__ Kphiy, const double * __restrict__ Kphiz,
+    const double * __restrict__ Exx, const double * __restrict__ Exy, const double * __restrict__ Exz,
+    const double * __restrict__ Eyx, const double * __restrict__ Eyy, const double * __restrict__ Eyz,
+    const double * __restrict__ Ezx, const double * __restrict__ Ezy, const double * __restrict__ Ezz,
+    const double * __restrict__ Bxx, const double * __restrict__ Bxy, const double * __restrict__ Bxz,
+    const double * __restrict__ Byx, const double * __restrict__ Byy, const double * __restrict__ Byz,
+    const double * __restrict__ Bzx, const double * __restrict__ Bzy, const double * __restrict__ Bzz,
+    double * __restrict__ Kpsi_rhs, double * __restrict__ Kphi_rhs,
+    double * __restrict__ Ex_rhs, double * __restrict__ Ey_rhs, double * __restrict__ Ez_rhs,
+    double * __restrict__ Bx_rhs, double * __restrict__ By_rhs, double * __restrict__ Bz_rhs,
+    double * __restrict__ rho, double * __restrict__ Sx, double * __restrict__ Sy, double * __restrict__ Sz,
+    double * __restrict__ Sxx, double * __restrict__ Sxy, double * __restrict__ Sxz,
+    double * __restrict__ Syy, double * __restrict__ Syz, double * __restrict__ Szz)
+{
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= d_gp.all) return;
+
+    constexpr double ONE = 1.0, TWO = 2.0, FOUR = 4.0, F3o2 = 1.5, EIT = 8.0, kappa = 1.0;
+    constexpr double PI = 3.141592653589793238462643383279502884;
+    const double alpn1 = Lap[i] + ONE;
+    const double chin1 = chi[i] + ONE;
+    const double sqc = sqrt(chin1);
+    const double chi3o2 = sqc * sqc * sqc;
+    const double gxxv = (dxx[i] + ONE) / chin1;
+    const double gxyv = dxy[i] / chin1;
+    const double gxzv = dxz[i] / chin1;
+    const double gyyv = (dyy[i] + ONE) / chin1;
+    const double gyzv = dyz[i] / chin1;
+    const double gzzv = (dzz[i] + ONE) / chin1;
+    const double gxxxv = (gxxx[i] - gxxv * chix[i]) / chin1;
+    const double gxxyv = (gxxy[i] - gxxv * chiy[i]) / chin1;
+    const double gxxzv = (gxxz[i] - gxxv * chiz[i]) / chin1;
+    const double gxyxv = (gxyx[i] - gxyv * chix[i]) / chin1;
+    const double gxyyv = (gxyy[i] - gxyv * chiy[i]) / chin1;
+    const double gxyzv = (gxyz[i] - gxyv * chiz[i]) / chin1;
+    const double gxzxv = (gxzx[i] - gxzv * chix[i]) / chin1;
+    const double gxzyv = (gxzy[i] - gxzv * chiy[i]) / chin1;
+    const double gxzzv = (gxzz[i] - gxzv * chiz[i]) / chin1;
+    const double gyyxv = (gyyx[i] - gyyv * chix[i]) / chin1;
+    const double gyyyv = (gyyy[i] - gyyv * chiy[i]) / chin1;
+    const double gyyzv = (gyyz[i] - gyyv * chiz[i]) / chin1;
+    const double gyzxv = (gyzx[i] - gyzv * chix[i]) / chin1;
+    const double gyzyv = (gyzy[i] - gyzv * chiy[i]) / chin1;
+    const double gyzzv = (gyzz[i] - gyzv * chiz[i]) / chin1;
+    const double gzzxv = (gzzx[i] - gzzv * chix[i]) / chin1;
+    const double gzzyv = (gzzy[i] - gzzv * chiy[i]) / chin1;
+    const double gzzzv = (gzzz[i] - gzzv * chiz[i]) / chin1;
+    const double det = gxxv*gyyv*gzzv + gxyv*gyzv*gxzv + gxzv*gxyv*gyzv - gxzv*gyyv*gxzv - gxyv*gxyv*gzzv - gxxv*gyzv*gyzv;
+    const double gupxx = (gyyv*gzzv - gyzv*gyzv) / det;
+    const double gupxy = -(gxyv*gzzv - gyzv*gxzv) / det;
+    const double gupxz = (gxyv*gyzv - gyyv*gxzv) / det;
+    const double gupyy = (gxxv*gzzv - gxzv*gxzv) / det;
+    const double gupyz = -(gxxv*gyzv - gxyv*gxzv) / det;
+    const double gupzz = (gxxv*gyyv - gxyv*gxyv) / det;
+
+    Ex_rhs[i] = alpn1*trK[i]*Ex[i]-(Ex[i]*betaxx[i]+Ey[i]*betaxy[i]+Ez[i]*betaxz[i])-FOUR*PI*alpn1*Jx[i]-alpn1*(gupxx*Kpsix[i]+gupxy*Kpsiy[i]+gupxz*Kpsiz[i])
+        + chi3o2*(((gxzv*Bx[i]+gyzv*By[i]+gzzv*Bz[i])*Lapy[i]+alpn1*(gxzv*Bxy[i]+gyzv*Byy[i]+gzzv*Bzy[i])+alpn1*(Bx[i]*gxzyv+By[i]*gyzyv+Bz[i]*gzzyv))-((gxyv*Bx[i]+gyyv*By[i]+gyzv*Bz[i])*Lapz[i]+alpn1*(gxyv*Bxz[i]+gyyv*Byz[i]+gyzv*Bzz[i])+alpn1*(Bx[i]*gxyzv+By[i]*gyyzv+Bz[i]*gyzzv)));
+    Ey_rhs[i] = alpn1*trK[i]*Ey[i]-(Ex[i]*betayx[i]+Ey[i]*betayy[i]+Ez[i]*betayz[i])-FOUR*PI*alpn1*Jy[i]-alpn1*(gupxy*Kpsix[i]+gupyy*Kpsiy[i]+gupyz*Kpsiz[i])
+        + chi3o2*(((gxxv*Bx[i]+gxyv*By[i]+gxzv*Bz[i])*Lapz[i]+alpn1*(gxxv*Bxz[i]+gxyv*Byz[i]+gxzv*Bzz[i])+alpn1*(Bx[i]*gxxzv+By[i]*gxyzv+Bz[i]*gxzzv))-((gxzv*Bx[i]+gyzv*By[i]+gzzv*Bz[i])*Lapx[i]+alpn1*(gxzv*Bxx[i]+gyzv*Byx[i]+gzzv*Bzx[i])+alpn1*(Bx[i]*gxzxv+By[i]*gyzxv+Bz[i]*gzzxv)));
+    Ez_rhs[i] = alpn1*trK[i]*Ez[i]-(Ex[i]*betazx[i]+Ey[i]*betazy[i]+Ez[i]*betazz[i])-FOUR*PI*alpn1*Jz[i]-alpn1*(gupxz*Kpsix[i]+gupyz*Kpsiy[i]+gupzz*Kpsiz[i])
+        + chi3o2*(((gxyv*Bx[i]+gyyv*By[i]+gyzv*Bz[i])*Lapx[i]+alpn1*(gxyv*Bxx[i]+gyyv*Byx[i]+gyzv*Bzx[i])+alpn1*(Bx[i]*gxyxv+By[i]*gyyxv+Bz[i]*gyzxv))-((gxxv*Bx[i]+gxyv*By[i]+gxzv*Bz[i])*Lapy[i]+alpn1*(gxxv*Bxy[i]+gxyv*Byy[i]+gxzv*Bzy[i])+alpn1*(Bx[i]*gxxyv+By[i]*gxyyv+Bz[i]*gxzyv)));
+    Bx_rhs[i] = alpn1*trK[i]*Bx[i]-(Bx[i]*betaxx[i]+By[i]*betaxy[i]+Bz[i]*betaxz[i])-alpn1*(gupxx*Kphix[i]+gupxy*Kphiy[i]+gupxz*Kphiz[i])
+        - chi3o2*(((gxzv*Ex[i]+gyzv*Ey[i]+gzzv*Ez[i])*Lapy[i]+alpn1*(gxzv*Exy[i]+gyzv*Eyy[i]+gzzv*Ezy[i])+alpn1*(Ex[i]*gxzyv+Ey[i]*gyzyv+Ez[i]*gzzyv))-((gxyv*Ex[i]+gyyv*Ey[i]+gyzv*Ez[i])*Lapz[i]+alpn1*(gxyv*Exz[i]+gyyv*Eyz[i]+gyzv*Ezz[i])+alpn1*(Ex[i]*gxyzv+Ey[i]*gyyzv+Ez[i]*gyzzv)));
+    By_rhs[i] = alpn1*trK[i]*By[i]-(Bx[i]*betayx[i]+By[i]*betayy[i]+Bz[i]*betayz[i])-alpn1*(gupxy*Kphix[i]+gupyy*Kphiy[i]+gupyz*Kphiz[i])
+        - chi3o2*(((gxxv*Ex[i]+gxyv*Ey[i]+gxzv*Ez[i])*Lapz[i]+alpn1*(gxxv*Exz[i]+gxyv*Eyz[i]+gxzv*Ezz[i])+alpn1*(Ex[i]*gxxzv+Ey[i]*gxyzv+Ez[i]*gxzzv))-((gxzv*Ex[i]+gyzv*Ey[i]+gzzv*Ez[i])*Lapx[i]+alpn1*(gxzv*Exx[i]+gyzv*Eyx[i]+gzzv*Ezx[i])+alpn1*(Ex[i]*gxzxv+Ey[i]*gyzxv+Ez[i]*gzzxv)));
+    Bz_rhs[i] = alpn1*trK[i]*Bz[i]-(Bx[i]*betazx[i]+By[i]*betazy[i]+Bz[i]*betazz[i])-alpn1*(gupxz*Kphix[i]+gupyz*Kphiy[i]+gupzz*Kphiz[i])
+        - chi3o2*(((gxyv*Ex[i]+gyyv*Ey[i]+gyzv*Ez[i])*Lapx[i]+alpn1*(gxyv*Exx[i]+gyyv*Eyx[i]+gyzv*Ezx[i])+alpn1*(Ex[i]*gxyxv+Ey[i]*gyyxv+Ez[i]*gyzxv))-((gxxv*Ex[i]+gxyv*Ey[i]+gxzv*Ez[i])*Lapy[i]+alpn1*(gxxv*Exy[i]+gxyv*Eyy[i]+gxzv*Ezy[i])+alpn1*(Ex[i]*gxxyv+Ey[i]*gxyyv+Ez[i]*gxzyv)));
+    Kpsi_rhs[i] = FOUR*PI*alpn1*qchar[i]-alpn1*kappa*Kpsi[i]-alpn1*(Exx[i]+Eyy[i]+Ezz[i]-F3o2/chin1*(chix[i]*Ex[i]+chiy[i]*Ey[i]+chiz[i]*Ez[i]));
+    Kphi_rhs[i] = -alpn1*kappa*Kphi[i]-alpn1*(Bxx[i]+Byy[i]+Bzz[i]-F3o2/chin1*(chix[i]*Bx[i]+chiy[i]*By[i]+chiz[i]*Bz[i]));
+
+    const double lrho = (gxxv*(Ex[i]*Ex[i]+Bx[i]*Bx[i])+gyyv*(Ey[i]*Ey[i]+By[i]*By[i])+gzzv*(Ez[i]*Ez[i]+Bz[i]*Bz[i])+TWO*(gxyv*(Ex[i]*Ey[i]+Bx[i]*By[i])+gxzv*(Ex[i]*Ez[i]+Bx[i]*Bz[i])+gyzv*(Ey[i]*Ez[i]+By[i]*Bz[i])))/EIT/PI;
+    rho[i] = lrho;
+    Sx[i] = (Ey[i]*Bz[i]-Ez[i]*By[i])/FOUR/PI/chi3o2;
+    Sy[i] = (Ez[i]*Bx[i]-Ex[i]*Bz[i])/FOUR/PI/chi3o2;
+    Sz[i] = (Ex[i]*By[i]-Ey[i]*Bx[i])/FOUR/PI/chi3o2;
+    const double lEx = gxxv*Ex[i]+gxyv*Ey[i]+gxzv*Ez[i], lEy = gxyv*Ex[i]+gyyv*Ey[i]+gyzv*Ez[i], lEz = gxzv*Ex[i]+gyzv*Ey[i]+gzzv*Ez[i];
+    const double lBx = gxxv*Bx[i]+gxyv*By[i]+gxzv*Bz[i], lBy = gxyv*Bx[i]+gyyv*By[i]+gyzv*Bz[i], lBz = gxzv*Bx[i]+gyzv*By[i]+gzzv*Bz[i];
+    Sxx[i] = lrho*gxxv-(lEx*lEx+lBx*lBx)/FOUR/PI;
+    Sxy[i] = lrho*gxyv-(lEx*lEy+lBx*lBy)/FOUR/PI;
+    Sxz[i] = lrho*gxzv-(lEx*lEz+lBx*lBz)/FOUR/PI;
+    Syy[i] = lrho*gyyv-(lEy*lEy+lBy*lBy)/FOUR/PI;
+    Syz[i] = lrho*gyzv-(lEy*lEz+lBy*lBz)/FOUR/PI;
+    Szz[i] = lrho*gzzv-(lEz*lEz+lBz*lBz)/FOUR/PI;
+}
+
+static void gpu_lopsided_kodis_em_batch(double eps_val, int all)
+{
+    LopsidedKodisTables tables = {};
+    const int adv_slots[8] = {S_EM_Kpsi, S_EM_Kphi, S_EM_Ex, S_EM_Ey,
+                              S_EM_Ez, S_EM_Bx, S_EM_By, S_EM_Bz};
+    const int rhs_slots[8] = {S_EM_Kpsi_rhs, S_EM_Kphi_rhs, S_EM_Ex_rhs, S_EM_Ey_rhs,
+                              S_EM_Ez_rhs, S_EM_Bx_rhs, S_EM_By_rhs, S_EM_Bz_rhs};
+    const int signs[24] = {
+        1, 1, 1,   1, 1, 1,
+       -1, 1, 1,   1,-1, 1,   1, 1,-1,
+        1,-1,-1,  -1, 1,-1,  -1,-1, 1
+    };
+    for (int i = 0; i < 8; ++i) {
+        tables.adv_fields[i] = g_buf.slot[adv_slots[i]];
+        tables.ko_fields[i] = g_buf.slot[adv_slots[i]];
+        tables.rhs_fields[i] = g_buf.slot[rhs_slots[i]];
+        tables.soa_signs[3*i+0] = signs[3*i+0];
+        tables.soa_signs[3*i+1] = signs[3*i+1];
+        tables.soa_signs[3*i+2] = signs[3*i+2];
+    }
+    dim3 launch_grid((unsigned int)grid((size_t)all), 8u);
+    kern_lopsided_kodis_batched<<<launch_grid, BLK>>>(
+        g_buf.slot[S_betax], g_buf.slot[S_betay], g_buf.slot[S_betaz], tables, eps_val);
+}
+
+static void gpu_em_rhs_sources(int all, double eps)
+{
+#define D(s) g_buf.slot[s]
+    double *src[] = {D(S_Lap), D(S_betax), D(S_betay), D(S_betaz), D(S_chi),
+                     D(S_dxx), D(S_gxy), D(S_gxz), D(S_dyy), D(S_gyz), D(S_dzz),
+                     D(S_EM_Kpsi), D(S_EM_Kphi),
+                     D(S_EM_Ex), D(S_EM_Ey), D(S_EM_Ez), D(S_EM_Bx), D(S_EM_By), D(S_EM_Bz)};
+    double *fx[] = {D(S_Lapx), D(S_betaxx), D(S_betayx), D(S_betazx), D(S_chix),
+                    D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), D(S_gyzx), D(S_gzzx),
+                    D(S_EM_Kpsix), D(S_EM_Kphix),
+                    D(S_EM_Exx), D(S_EM_Eyx), D(S_EM_Ezx), D(S_EM_Bxx), D(S_EM_Byx), D(S_EM_Bzx)};
+    double *fy[] = {D(S_Lapy), D(S_betaxy), D(S_betayy), D(S_betazy), D(S_chiy),
+                    D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), D(S_gyzy), D(S_gzzy),
+                    D(S_EM_Kpsiy), D(S_EM_Kphiy),
+                    D(S_EM_Exy), D(S_EM_Eyy), D(S_EM_Ezy), D(S_EM_Bxy), D(S_EM_Byy), D(S_EM_Bzy)};
+    double *fz[] = {D(S_Lapz), D(S_betaxz), D(S_betayz), D(S_betazz), D(S_chiz),
+                    D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), D(S_gyzz), D(S_gzzz),
+                    D(S_EM_Kpsiz), D(S_EM_Kphiz),
+                    D(S_EM_Exz), D(S_EM_Eyz), D(S_EM_Ezz), D(S_EM_Bxz), D(S_EM_Byz), D(S_EM_Bzz)};
+    const int soa[] = {
+        1,1,1,  -1,1,1,  1,-1,1,  1,1,-1,  1,1,1,
+        1,1,1,  -1,-1,1, -1,1,-1, 1,1,1, 1,-1,-1, 1,1,1,
+        1,1,1,  1,1,1,
+        -1,1,1, 1,-1,1, 1,1,-1, 1,-1,-1, -1,1,-1, -1,-1,1
+    };
+    gpu_fderivs_batch(19, src, fx, fy, fz, soa, all);
+    kern_em_rhs_sources<<<grid((size_t)all), BLK>>>(
+        D(S_chi), D(S_dxx), D(S_gxy), D(S_gxz), D(S_dyy), D(S_gyz), D(S_dzz),
+        D(S_Lap), D(S_betax), D(S_betay), D(S_betaz), D(S_trK),
+        D(S_EM_Ex), D(S_EM_Ey), D(S_EM_Ez), D(S_EM_Bx), D(S_EM_By), D(S_EM_Bz),
+        D(S_EM_Kpsi), D(S_EM_Kphi), D(S_EM_Jx), D(S_EM_Jy), D(S_EM_Jz), D(S_EM_qchar),
+        D(S_chix), D(S_chiy), D(S_chiz),
+        D(S_gxxx), D(S_gxyx), D(S_gxzx), D(S_gyyx), D(S_gyzx), D(S_gzzx),
+        D(S_gxxy), D(S_gxyy), D(S_gxzy), D(S_gyyy), D(S_gyzy), D(S_gzzy),
+        D(S_gxxz), D(S_gxyz), D(S_gxzz), D(S_gyyz), D(S_gyzz), D(S_gzzz),
+        D(S_Lapx), D(S_Lapy), D(S_Lapz),
+        D(S_betaxx), D(S_betaxy), D(S_betaxz), D(S_betayx), D(S_betayy), D(S_betayz),
+        D(S_betazx), D(S_betazy), D(S_betazz),
+        D(S_EM_Kpsix), D(S_EM_Kpsiy), D(S_EM_Kpsiz), D(S_EM_Kphix), D(S_EM_Kphiy), D(S_EM_Kphiz),
+        D(S_EM_Exx), D(S_EM_Exy), D(S_EM_Exz), D(S_EM_Eyx), D(S_EM_Eyy), D(S_EM_Eyz),
+        D(S_EM_Ezx), D(S_EM_Ezy), D(S_EM_Ezz), D(S_EM_Bxx), D(S_EM_Bxy), D(S_EM_Bxz),
+        D(S_EM_Byx), D(S_EM_Byy), D(S_EM_Byz), D(S_EM_Bzx), D(S_EM_Bzy), D(S_EM_Bzz),
+        D(S_EM_Kpsi_rhs), D(S_EM_Kphi_rhs), D(S_EM_Ex_rhs), D(S_EM_Ey_rhs), D(S_EM_Ez_rhs),
+        D(S_EM_Bx_rhs), D(S_EM_By_rhs), D(S_EM_Bz_rhs),
+        D(S_rho), D(S_Sx), D(S_Sy), D(S_Sz), D(S_Sxx), D(S_Sxy), D(S_Sxz), D(S_Syy), D(S_Syz), D(S_Szz));
+    gpu_lopsided_kodis_em_batch(eps, all);
+#undef D
+}
+
 __global__ void kern_rk4_finalize(const double * __restrict__ f0,
                                   double * __restrict__ frhs,
                                   double * __restrict__ accum,
@@ -2836,6 +3088,23 @@ static void gpu_escalar_rk4_finalize_batch(const StepContext &ctx,
                                           ctx.d_accum[25], dT, rk4_stage);
 }
 
+static void gpu_em_rk4_finalize_batch(const StepContext &ctx,
+                                      size_t all,
+                                      double dT,
+                                      int rk4_stage,
+                                      double chitiny)
+{
+    Rk4FinalizeTables tables = {};
+    for (int i = 0; i < BSSN_EM_STATE_COUNT; ++i) {
+        tables.f0_fields[i] = ctx.d_state0[i];
+        tables.rhs_fields[i] = g_buf.slot[k_em_state_rhs_slots[i]];
+        tables.accum_fields[i] = ctx.d_accum[i];
+    }
+
+    dim3 launch_grid((unsigned int)grid(all), (unsigned int)BSSN_EM_STATE_COUNT);
+    kern_rk4_finalize_batched<<<launch_grid, BLK>>>(tables, dT, rk4_stage, chitiny);
+}
+
 __global__ __launch_bounds__(128, 4)
 void kern_copy_patch_boundary_batched(PatchBoundaryTables tables,
                                       int touch_xmin, int touch_xmax,
@@ -2936,6 +3205,27 @@ static void gpu_escalar_restore_patch_boundary_batch(const StepContext &ctx,
         touch_xmin, touch_xmax, touch_ymin, touch_ymax, touch_zmin, touch_zmax);
 }
 
+static void gpu_em_restore_patch_boundary_batch(const StepContext &ctx,
+                                                int all,
+                                                int touch_xmin, int touch_xmax,
+                                                int touch_ymin, int touch_ymax,
+                                                int touch_zmin, int touch_zmax)
+{
+    if (!(touch_xmin || touch_xmax || touch_ymax || touch_ymin || touch_zmin || touch_zmax))
+        return;
+
+    EScalarBoundaryTables tables = {};
+    for (int i = 0; i < BSSN_EM_STATE_COUNT; ++i) {
+        tables.f0_fields[i] = ctx.d_state0[i];
+        tables.out_fields[i] = g_buf.slot[k_em_state_rhs_slots[i]];
+    }
+
+    dim3 launch_grid((unsigned int)grid((size_t)all), (unsigned int)BSSN_EM_STATE_COUNT);
+    kern_escalar_restore_patch_boundary_batched<<<launch_grid, BLK>>>(
+        tables,
+        touch_xmin, touch_xmax, touch_ymin, touch_ymax, touch_zmin, touch_zmax);
+}
+
 __global__ void kern_enforce_ga_cuda(double * __restrict__ dxx,
                                      double * __restrict__ gxy,
                                      double * __restrict__ gxz,
@@ -5294,7 +5584,22 @@ static void bind_escalar_state_output_slots(const std::array<double *, BSSN_RESI
     }
 }
 
+static void bind_em_state_input_slots(const std::array<double *, BSSN_RESIDENT_STATE_CAPACITY> &state)
+{
+    for (int i = 0; i < BSSN_EM_STATE_COUNT; ++i) {
+        g_buf.slot[k_em_state_input_slots[i]] = state[i];
+    }
+}
+
+static void bind_em_state_output_slots(const std::array<double *, BSSN_RESIDENT_STATE_CAPACITY> &state)
+{
+    for (int i = 0; i < BSSN_EM_STATE_COUNT; ++i) {
+        g_buf.slot[k_em_state_rhs_slots[i]] = state[i];
+    }
+}
+
 static void upload_escalar_state_inputs(double **state_host, size_t all);
+static void upload_em_state_inputs(double **state_host, size_t all);
 
 static bool resident_key_matches_count(const StepContext &ctx, int bank, double **host_key, int state_count)
 {
@@ -5564,6 +5869,37 @@ static int choose_escalar_resident_bank_for_reuse(StepContext &ctx, int avoid_ba
     return best;
 }
 
+static int choose_em_resident_bank_for_reuse(StepContext &ctx, int avoid_bank, size_t all)
+{
+    for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
+        if (b != avoid_bank && !ctx.resident_valid[b])
+            return b;
+    }
+
+    int best = -1;
+    unsigned long long best_age = 0;
+    for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
+        if (b == avoid_bank) continue;
+        if (best < 0 || ctx.resident_age[b] < best_age) {
+            best = b;
+            best_age = ctx.resident_age[b];
+        }
+    }
+    if (best < 0) best = 0;
+    writeback_resident_bank_count(ctx, best, all, BSSN_EM_STATE_COUNT);
+    ctx.resident_valid[best] = false;
+    ctx.resident_host[best].fill(nullptr);
+    ctx.resident_host_clean[best].fill(0);
+    ctx.resident_age[best] = 0;
+    if (ctx.current_bank == best) {
+        ctx.current_bank = -1;
+        ctx.d_state_curr_mem = nullptr;
+        ctx.d_state_curr.fill(nullptr);
+    }
+    update_state_ready(ctx);
+    return best;
+}
+
 static void assign_resident_key_count(StepContext &ctx, int bank, double **host_key, int state_count)
 {
     for (int i = 0; i < state_count; ++i) {
@@ -5658,6 +5994,47 @@ static int ensure_escalar_resident_bank(StepContext &ctx,
     return bank;
 }
 
+static int ensure_em_resident_bank(StepContext &ctx,
+                                   double **host_key,
+                                   size_t all,
+                                   bool upload_if_missing,
+                                   int avoid_bank = -1)
+{
+    if (!resident_key_usable_count(host_key, BSSN_EM_STATE_COUNT)) {
+        if (ctx.current_bank >= 0)
+            return ctx.current_bank;
+        return 0;
+    }
+
+    int bank = find_resident_bank_count(ctx, host_key, BSSN_EM_STATE_COUNT);
+    if (bank >= 0) {
+        ctx.resident_age[bank] = ++ctx.resident_clock;
+        if (!ctx.resident_valid[bank] && upload_if_missing) {
+            bind_em_state_input_slots(ctx.d_resident[bank]);
+            upload_em_state_inputs(host_key, all);
+            CUDA_CHECK(cudaDeviceSynchronize());
+            ctx.resident_valid[bank] = true;
+            set_resident_host_clean(ctx, bank, true);
+        }
+        return bank;
+    }
+
+    bank = choose_em_resident_bank_for_reuse(ctx, avoid_bank, all);
+    assign_resident_key_count(ctx, bank, host_key, BSSN_EM_STATE_COUNT);
+    if (upload_if_missing) {
+        bind_em_state_input_slots(ctx.d_resident[bank]);
+        upload_em_state_inputs(host_key, all);
+        CUDA_CHECK(cudaDeviceSynchronize());
+        ctx.resident_valid[bank] = true;
+        set_resident_host_clean(ctx, bank, true);
+    } else {
+        ctx.resident_valid[bank] = false;
+        set_resident_host_clean(ctx, bank, false);
+    }
+    update_state_ready(ctx);
+    return bank;
+}
+
 static int reserve_resident_output_bank(StepContext &ctx,
                                         double **host_key,
                                         size_t all,
@@ -5698,6 +6075,26 @@ static int reserve_escalar_resident_output_bank(StepContext &ctx,
     return bank;
 }
 
+static int reserve_em_resident_output_bank(StepContext &ctx,
+                                           double **host_key,
+                                           size_t all,
+                                           int input_bank)
+{
+    if (!resident_key_usable_count(host_key, BSSN_EM_STATE_COUNT))
+        return (ctx.current_bank >= 0) ? ctx.current_bank : 0;
+    if (resident_key_matches_count(ctx, input_bank, host_key, BSSN_EM_STATE_COUNT))
+        return input_bank;
+
+    int bank = find_resident_bank_count(ctx, host_key, BSSN_EM_STATE_COUNT);
+    if (bank < 0)
+        bank = choose_em_resident_bank_for_reuse(ctx, input_bank, all);
+    assign_resident_key_count(ctx, bank, host_key, BSSN_EM_STATE_COUNT);
+    ctx.resident_valid[bank] = false;
+    ctx.resident_age[bank] = ++ctx.resident_clock;
+    update_state_ready(ctx);
+    return bank;
+}
+
 static bool bank_is_avoided(int bank, int avoid_a, int avoid_b, int avoid_c);
 
 static int choose_escalar_resident_bank_for_reuse_avoiding(StepContext &ctx,
@@ -5706,6 +6103,12 @@ static int choose_escalar_resident_bank_for_reuse_avoiding(StepContext &ctx,
                                                            int avoid_c,
                                                            size_t all);
 
+static int choose_em_resident_bank_for_reuse_avoiding(StepContext &ctx,
+                                                       int avoid_a,
+                                                       int avoid_b,
+                                                       int avoid_c,
+                                                       size_t all);
+
 static int reserve_escalar_resident_output_bank_avoiding(StepContext &ctx,
                                                          double **host_key,
                                                          size_t all,
@@ -5740,6 +6143,40 @@ static int reserve_escalar_resident_output_bank_avoiding(StepContext &ctx,
     return bank;
 }
 
+static int reserve_em_resident_output_bank_avoiding(StepContext &ctx,
+                                                    double **host_key,
+                                                    size_t all,
+                                                    int avoid_a,
+                                                    int avoid_b,
+                                                    int avoid_c)
+{
+    if (!resident_key_usable_count(host_key, BSSN_EM_STATE_COUNT))
+        return (ctx.current_bank >= 0) ? ctx.current_bank : 0;
+    if (resident_key_matches_count(ctx, avoid_a, host_key, BSSN_EM_STATE_COUNT))
+        return avoid_a;
+    if (resident_key_matches_count(ctx, avoid_b, host_key, BSSN_EM_STATE_COUNT))
+        return avoid_b;
+    if (resident_key_matches_count(ctx, avoid_c, host_key, BSSN_EM_STATE_COUNT))
+        return avoid_c;
+
+    int bank = find_resident_bank_count(ctx, host_key, BSSN_EM_STATE_COUNT);
+    if (bank < 0) {
+        for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
+            if (!bank_is_avoided(b, avoid_a, avoid_b, avoid_c) && !ctx.resident_valid[b]) {
+                bank = b;
+                break;
+            }
+        }
+    }
+    if (bank < 0)
+        bank = choose_em_resident_bank_for_reuse_avoiding(ctx, avoid_a, avoid_b, avoid_c, all);
+    assign_resident_key_count(ctx, bank, host_key, BSSN_EM_STATE_COUNT);
+    ctx.resident_valid[bank] = false;
+    ctx.resident_age[bank] = ++ctx.resident_clock;
+    update_state_ready(ctx);
+    return bank;
+}
+
 static bool bank_is_avoided(int bank, int avoid_a, int avoid_b, int avoid_c)
 {
     return bank == avoid_a || bank == avoid_b || bank == avoid_c;
@@ -5845,6 +6282,43 @@ static int choose_escalar_resident_bank_for_reuse_avoiding(StepContext &ctx,
     return best;
 }
 
+static int choose_em_resident_bank_for_reuse_avoiding(StepContext &ctx,
+                                                       int avoid_a,
+                                                       int avoid_b,
+                                                       int avoid_c,
+                                                       size_t all)
+{
+    for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
+        if (!bank_is_avoided(b, avoid_a, avoid_b, avoid_c) && !ctx.resident_valid[b])
+            return b;
+    }
+
+    int best = -1;
+    unsigned long long best_age = 0;
+    for (int b = 0; b < BSSN_RESIDENT_BANK_COUNT; ++b) {
+        if (bank_is_avoided(b, avoid_a, avoid_b, avoid_c)) continue;
+        if (best < 0 || ctx.resident_age[b] < best_age) {
+            best = b;
+            best_age = ctx.resident_age[b];
+        }
+    }
+    if (best < 0)
+        return choose_em_resident_bank_for_reuse(ctx, avoid_a, all);
+
+    writeback_resident_bank_count(ctx, best, all, BSSN_EM_STATE_COUNT);
+    ctx.resident_valid[best] = false;
+    ctx.resident_host[best].fill(nullptr);
+    ctx.resident_host_clean[best].fill(0);
+    ctx.resident_age[best] = 0;
+    if (ctx.current_bank == best) {
+        ctx.current_bank = -1;
+        ctx.d_state_curr_mem = nullptr;
+        ctx.d_state_curr.fill(nullptr);
+    }
+    update_state_ready(ctx);
+    return best;
+}
+
 static int ensure_resident_bank_avoiding(StepContext &ctx,
                                          double **host_key,
                                          size_t all,
@@ -5886,6 +6360,49 @@ static int ensure_resident_bank_avoiding(StepContext &ctx,
     return bank;
 }
 
+static int ensure_em_resident_bank_avoiding(StepContext &ctx,
+                                            double **host_key,
+                                            size_t all,
+                                            bool upload_if_missing,
+                                            int avoid_a,
+                                            int avoid_b,
+                                            int avoid_c)
+{
+    if (!resident_key_usable_count(host_key, BSSN_EM_STATE_COUNT)) {
+        if (ctx.current_bank >= 0)
+            return ctx.current_bank;
+        return 0;
+    }
+
+    int bank = find_resident_bank_count(ctx, host_key, BSSN_EM_STATE_COUNT);
+    if (bank >= 0) {
+        ctx.resident_age[bank] = ++ctx.resident_clock;
+        if (!ctx.resident_valid[bank] && upload_if_missing) {
+            bind_em_state_input_slots(ctx.d_resident[bank]);
+            upload_em_state_inputs(host_key, all);
+            CUDA_CHECK(cudaDeviceSynchronize());
+            ctx.resident_valid[bank] = true;
+            set_resident_host_clean(ctx, bank, true);
+        }
+        return bank;
+    }
+
+    bank = choose_em_resident_bank_for_reuse_avoiding(ctx, avoid_a, avoid_b, avoid_c, all);
+    assign_resident_key_count(ctx, bank, host_key, BSSN_EM_STATE_COUNT);
+    if (upload_if_missing) {
+        bind_em_state_input_slots(ctx.d_resident[bank]);
+        upload_em_state_inputs(host_key, all);
+        CUDA_CHECK(cudaDeviceSynchronize());
+        ctx.resident_valid[bank] = true;
+        set_resident_host_clean(ctx, bank, true);
+    } else {
+        ctx.resident_valid[bank] = false;
+        set_resident_host_clean(ctx, bank, false);
+    }
+    update_state_ready(ctx);
+    return bank;
+}
+
 static int ensure_escalar_resident_bank_avoiding(StepContext &ctx,
                                                  double **host_key,
                                                  size_t all,
@@ -5941,6 +6458,12 @@ static int active_or_keyed_bank(StepContext &ctx,
         mark_resident_current_bank(ctx, bank);
         return bank;
     }
+    if (state_count == BSSN_EM_STATE_COUNT &&
+        resident_key_usable_count(host_key, BSSN_EM_STATE_COUNT)) {
+        int bank = ensure_em_resident_bank(ctx, host_key, all, upload_if_missing);
+        mark_resident_current_bank(ctx, bank);
+        return bank;
+    }
     if (state_count == BSSN_STATE_COUNT && resident_key_usable(host_key)) {
         int bank = ensure_resident_bank(ctx, host_key, all, upload_if_missing);
         mark_resident_current_bank(ctx, bank);
@@ -6306,6 +6829,17 @@ static void download_escalar_state_outputs(double **state_host_out, size_t all)
     CUDA_CHECK(cudaDeviceSynchronize());
 }
 
+static void download_em_state_outputs(double **state_host_out, size_t all)
+{
+    const size_t bytes = all * sizeof(double);
+    for (int i = 0; i < BSSN_EM_STATE_COUNT; ++i) {
+        CUDA_CHECK(cudaMemcpyAsync(state_host_out[i],
+                                   g_buf.slot[k_em_state_rhs_slots[i]],
+                                   bytes, cudaMemcpyDeviceToHost));
+    }
+    CUDA_CHECK(cudaDeviceSynchronize());
+}
+
 static void upload_escalar_state_inputs(double **state_host, size_t all)
 {
     const size_t bytes = all * sizeof(double);
@@ -6315,6 +6849,146 @@ static void upload_escalar_state_inputs(double **state_host, size_t all)
     }
 }
 
+static void upload_em_state_inputs(double **state_host, size_t all)
+{
+    const size_t bytes = all * sizeof(double);
+    for (int i = 0; i < BSSN_EM_STATE_COUNT; ++i) {
+        CUDA_CHECK(cudaMemcpyAsync(g_buf.slot[k_em_state_input_slots[i]],
+                                   state_host[i], bytes, cudaMemcpyHostToDevice));
+    }
+}
+
+static bool em_source_cache_enabled()
+{
+    static int enabled = -1;
+    if (enabled < 0) {
+        const char *env = getenv("AMSS_CUDA_EM_CACHE_SOURCES");
+        enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 1;
+    }
+    return enabled != 0;
+}
+
+static void bind_em_fixed_source_slots(StepContext &ctx)
+{
+    const int slots[BSSN_EM_SOURCE_COUNT] = {
+        S_EM_Jx, S_EM_Jy, S_EM_Jz, S_EM_qchar
+    };
+    for (int i = 0; i < BSSN_EM_SOURCE_COUNT; ++i)
+        g_buf.slot[slots[i]] = ctx.d_em_source[i];
+}
+
+static bool em_fixed_source_cache_matches(const StepContext &ctx,
+                                          double **source_host)
+{
+    if (!ctx.em_source_ready || !source_host)
+        return false;
+    for (int i = 0; i < BSSN_EM_SOURCE_COUNT; ++i) {
+        if (!source_host[i] || ctx.em_source_host[i] != source_host[i])
+            return false;
+    }
+    return true;
+}
+
+static void upload_em_fixed_sources(StepContext &ctx,
+                                    double **source_host,
+                                    size_t all)
+{
+    const size_t bytes = all * sizeof(double);
+    bind_em_fixed_source_slots(ctx);
+    if (em_source_cache_enabled() &&
+        em_fixed_source_cache_matches(ctx, source_host)) {
+        return;
+    }
+    for (int i = 0; i < BSSN_EM_SOURCE_COUNT; ++i) {
+        CUDA_CHECK(cudaMemcpyAsync(ctx.d_em_source[i],
+                                   source_host[i], bytes, cudaMemcpyHostToDevice));
+    }
+    if (em_source_cache_enabled()) {
+        for (int i = 0; i < BSSN_EM_SOURCE_COUNT; ++i)
+            ctx.em_source_host[i] = source_host[i];
+        ctx.em_source_ready = true;
+    } else {
+        ctx.em_source_host.fill(nullptr);
+        ctx.em_source_ready = false;
+    }
+}
+
+__global__ void kern_check_em_zero_fields(int *flag,
+                                          const double * __restrict__ Kpsi,
+                                          const double * __restrict__ Kphi,
+                                          const double * __restrict__ Ex,
+                                          const double * __restrict__ Ey,
+                                          const double * __restrict__ Ez,
+                                          const double * __restrict__ Bx,
+                                          const double * __restrict__ By,
+                                          const double * __restrict__ Bz,
+                                          const double * __restrict__ Jx,
+                                          const double * __restrict__ Jy,
+                                          const double * __restrict__ Jz,
+                                          const double * __restrict__ qchar,
+                                          int all)
+{
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x;
+         i < all;
+         i += blockDim.x * gridDim.x)
+    {
+        if (Kpsi[i] != 0.0 || Kphi[i] != 0.0 ||
+            Ex[i] != 0.0 || Ey[i] != 0.0 || Ez[i] != 0.0 ||
+            Bx[i] != 0.0 || By[i] != 0.0 || Bz[i] != 0.0 ||
+            Jx[i] != 0.0 || Jy[i] != 0.0 || Jz[i] != 0.0 ||
+            qchar[i] != 0.0)
+        {
+            atomicExch(flag, 0);
+        }
+    }
+}
+
+static bool em_zero_fast_path_enabled()
+{
+    static int enabled = -1;
+    if (enabled < 0) {
+        const char *env = getenv("AMSS_CUDA_EM_ZERO_FASTPATH");
+        enabled = env ? ((atoi(env) != 0) ? 1 : 0) : 1;
+    }
+    return enabled != 0;
+}
+
+static bool em_detect_zero_fast_path(StepContext &ctx,
+                                     int input_bank,
+                                     size_t all)
+{
+    if (!em_zero_fast_path_enabled())
+        return false;
+    if (ctx.em_zero_fast_known)
+        return ctx.em_zero_fast;
+    if (input_bank < 0 || input_bank >= BSSN_RESIDENT_BANK_COUNT)
+        return false;
+
+    if (!g_em_zero_flag)
+        CUDA_CHECK(cudaMalloc(&g_em_zero_flag, sizeof(int)));
+
+    int h_flag = 1;
+    CUDA_CHECK(cudaMemcpy(g_em_zero_flag, &h_flag, sizeof(int), cudaMemcpyHostToDevice));
+    kern_check_em_zero_fields<<<grid(all), BLK>>>(
+        g_em_zero_flag,
+        ctx.d_resident[input_bank][24], ctx.d_resident[input_bank][25],
+        ctx.d_resident[input_bank][26], ctx.d_resident[input_bank][27],
+        ctx.d_resident[input_bank][28], ctx.d_resident[input_bank][29],
+        ctx.d_resident[input_bank][30], ctx.d_resident[input_bank][31],
+        ctx.d_em_source[0], ctx.d_em_source[1], ctx.d_em_source[2], ctx.d_em_source[3],
+        (int)all);
+    CUDA_CHECK(cudaMemcpy(&h_flag, g_em_zero_flag, sizeof(int), cudaMemcpyDeviceToHost));
+    ctx.em_zero_fast = (h_flag != 0);
+    ctx.em_zero_fast_known = true;
+    return ctx.em_zero_fast;
+}
+
+static void zero_em_output_slots_async(size_t all)
+{
+    for (int i = BSSN_STATE_COUNT; i < BSSN_EM_STATE_COUNT; ++i)
+        CUDA_CHECK(cudaMemsetAsync(g_buf.slot[k_em_state_rhs_slots[i]], 0, all * sizeof(double)));
+}
+
 static void download_constraint_outputs(double **constraint_host_out, size_t all)
 {
     const size_t bytes = all * sizeof(double);
@@ -7342,6 +8016,10 @@ static void upload_resident_state_count(void *block_tag, int *ex, double **state
         bank = ensure_escalar_resident_bank(ctx, state_host_in, all, false);
         bind_escalar_state_input_slots(ctx.d_resident[bank]);
         upload_escalar_state_inputs(state_host_in, all);
+    } else if (state_count == BSSN_EM_STATE_COUNT) {
+        bank = ensure_em_resident_bank(ctx, state_host_in, all, false);
+        bind_em_state_input_slots(ctx.d_resident[bank]);
+        upload_em_state_inputs(state_host_in, all);
     } else if (state_count == BSSN_STATE_COUNT) {
         bank = ensure_resident_bank(ctx, state_host_in, all, false);
         bind_state_input_slots(ctx.d_resident[bank]);
@@ -8006,9 +8684,10 @@ int bssn_cuda_rk4_substep(void *block_tag,
         CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, state0_src,
                               (size_t)BSSN_STATE_COUNT * bytes,
                               cudaMemcpyDeviceToDevice));
-    } else if (!ctx.matter_ready) {
-        if (use_zero_matter) zero_matter_cache(ctx, all);
-        else upload_matter_cache(ctx, matter_host, all);
+    } else if (use_zero_matter) {
+        if (!ctx.matter_ready) zero_matter_cache(ctx, all);
+    } else {
+        upload_matter_cache(ctx, matter_host, all);
     }
     bind_matter_slots(ctx);
     if (profile) {
@@ -8083,6 +8762,203 @@ int bssn_cuda_rk4_substep(void *block_tag,
     return 0;
 }
 
+extern "C"
+int bssn_em_cuda_rk4_substep(void *block_tag,
+                             int *ex, double *X, double *Y, double *Z,
+                             double **state_host_in,
+                             double **state_host_out,
+                             double **source_host,
+                             const double *propspeed,
+                             const double *soa_flat,
+                             const double *bbox,
+                             double &dT,
+                             double &T,
+                             int &RK4,
+                             int &apply_bam_bc,
+                             int &Symmetry,
+                             int &Lev,
+                             double &eps,
+                             int &co,
+                             int &keep_resident_state,
+                             int &apply_enforce_ga,
+                             double &chitiny)
+{
+    (void)T;
+    if (RK4 < 0 || RK4 > 3) return 1;
+
+    init_gpu_dispatch();
+    CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
+    const bool profile = cuda_profile_enabled();
+    const double t_total0 = profile ? cuda_profile_now_ms() : 0.0;
+    double state_ms = 0.0;
+    double em_ms = 0.0;
+    double rhs_ms = 0.0;
+    double bc_ms = 0.0;
+    double finalize_ms = 0.0;
+    double output_ms = 0.0;
+
+    const size_t all = (size_t)ex[0] * ex[1] * ex[2];
+    const size_t bytes = all * sizeof(double);
+    int touch_xmin = 0, touch_xmax = 0;
+    int touch_ymin = 0, touch_ymax = 0;
+    int touch_zmin = 0, touch_zmax = 0;
+
+    setup_grid_params(ex, X, Y, Z, Symmetry, eps, co);
+    if (Lev > 0) {
+        compute_patch_boundary_flags(ex, X, Y, Z, bbox, Symmetry,
+                                     touch_xmin, touch_xmax,
+                                     touch_ymin, touch_ymax,
+                                     touch_zmin, touch_zmax);
+    }
+
+    StepContext &ctx = ensure_step_ctx(block_tag, all);
+    double t0 = profile ? cuda_profile_now_ms() : 0.0;
+    const bool use_resident_state = (keep_resident_state != 0);
+    int input_bank = -1;
+    int output_bank = -1;
+    if (use_resident_state) {
+        input_bank = ensure_em_resident_bank(ctx, state_host_in, all, true);
+        output_bank = reserve_em_resident_output_bank(ctx, state_host_out, all, input_bank);
+        mark_resident_current_bank(ctx, input_bank);
+        mark_resident_next_bank(ctx, output_bank);
+        bind_em_state_input_slots(ctx.d_resident[input_bank]);
+        bind_em_state_output_slots(ctx.d_resident[output_bank]);
+    } else {
+        upload_em_state_inputs(state_host_in, all);
+    }
+    upload_em_fixed_sources(ctx, source_host, all);
+    const bool use_em_zero_fast =
+        use_resident_state && em_detect_zero_fast_path(ctx, input_bank, all);
+
+    if (apply_enforce_ga) {
+        kern_enforce_ga_cuda<<<grid(all), BLK>>>(g_buf.slot[S_dxx], g_buf.slot[S_gxy], g_buf.slot[S_gxz],
+                                                 g_buf.slot[S_dyy], g_buf.slot[S_gyz], g_buf.slot[S_dzz],
+                                                 g_buf.slot[S_Axx], g_buf.slot[S_Axy], g_buf.slot[S_Axz],
+                                                 g_buf.slot[S_Ayy], g_buf.slot[S_Ayz], g_buf.slot[S_Azz]);
+        if (use_resident_state && input_bank >= 0)
+            set_resident_host_clean(ctx, input_bank, false);
+    }
+    if (profile) {
+        cuda_profile_sync();
+        state_ms += cuda_profile_now_ms() - t0;
+    }
+
+    t0 = profile ? cuda_profile_now_ms() : 0.0;
+    if (RK4 == 0) {
+        if (use_resident_state) {
+            CUDA_CHECK(cudaMemcpy(ctx.d_state0_mem, ctx.d_resident_mem[input_bank],
+                                  (size_t)BSSN_EM_STATE_COUNT * bytes,
+                                  cudaMemcpyDeviceToDevice));
+        } else {
+            for (int i = 0; i < BSSN_EM_STATE_COUNT; ++i) {
+                CUDA_CHECK(cudaMemcpy(ctx.d_state0[i], g_buf.slot[k_em_state_input_slots[i]],
+                                      bytes, cudaMemcpyDeviceToDevice));
+            }
+        }
+    }
+    if (profile) {
+        cuda_profile_sync();
+        state_ms += cuda_profile_now_ms() - t0;
+    }
+
+    t0 = profile ? cuda_profile_now_ms() : 0.0;
+    if (use_em_zero_fast) {
+        zero_matter_cache(ctx, all);
+        bind_matter_slots(ctx);
+        zero_em_output_slots_async(all);
+    } else {
+        gpu_em_rhs_sources((int)all, eps);
+    }
+    if (profile) {
+        cuda_profile_sync();
+        em_ms += cuda_profile_now_ms() - t0;
+    }
+
+    t0 = profile ? cuda_profile_now_ms() : 0.0;
+    launch_rhs_pipeline((int)all, eps, co);
+    if (profile) {
+        cuda_profile_sync();
+        rhs_ms += cuda_profile_now_ms() - t0;
+    }
+
+    t0 = profile ? cuda_profile_now_ms() : 0.0;
+    if (apply_bam_bc) {
+        for (int i = 0; i < BSSN_EM_STATE_COUNT; ++i) {
+            gpu_sommerfeld_routbam(g_buf.slot[k_em_state_input_slots[i]],
+                                   g_buf.slot[k_em_state_rhs_slots[i]],
+                                   propspeed[i],
+                                   soa_flat[3 * i + 0],
+                                   soa_flat[3 * i + 1],
+                                   soa_flat[3 * i + 2],
+                                   X, Y, Z, bbox, Symmetry);
+        }
+    }
+    if (profile) {
+        cuda_profile_sync();
+        bc_ms += cuda_profile_now_ms() - t0;
+    }
+
+    t0 = profile ? cuda_profile_now_ms() : 0.0;
+    if (use_em_zero_fast)
+        gpu_rk4_finalize_batch(ctx, all, dT, RK4, chitiny);
+    else
+        gpu_em_rk4_finalize_batch(ctx, all, dT, RK4, chitiny);
+    if (Lev > 0) {
+        if (use_em_zero_fast)
+            gpu_copy_patch_boundary_batch((int)all,
+                                          touch_xmin, touch_xmax,
+                                          touch_ymin, touch_ymax,
+                                          touch_zmin, touch_zmax);
+        else
+            gpu_em_restore_patch_boundary_batch(ctx, (int)all,
+                                                touch_xmin, touch_xmax,
+                                                touch_ymin, touch_ymax,
+                                                touch_zmin, touch_zmax);
+    }
+    if (profile) {
+        cuda_profile_sync();
+        finalize_ms += cuda_profile_now_ms() - t0;
+    }
+
+    t0 = profile ? cuda_profile_now_ms() : 0.0;
+    if (use_resident_state) {
+        ctx.resident_valid[output_bank] = true;
+        ctx.resident_age[output_bank] = ++ctx.resident_clock;
+        set_resident_host_clean(ctx, output_bank, false);
+        mark_resident_current_bank(ctx, output_bank);
+        update_state_ready(ctx);
+    } else {
+        download_em_state_outputs(state_host_out, all);
+    }
+    if (profile) {
+        cuda_profile_sync();
+        output_ms += cuda_profile_now_ms() - t0;
+        CudaProfileStats &stats = cuda_profile_stats();
+        stats.calls++;
+        stats.total_ms += cuda_profile_now_ms() - t_total0;
+        stats.state_ms += state_ms;
+        stats.matter_ms += em_ms;
+        stats.rhs_ms += rhs_ms;
+        stats.bc_ms += bc_ms;
+        stats.finalize_ms += finalize_ms;
+        stats.output_ms += output_ms;
+        cuda_profile_maybe_log();
+    }
+    if (RK4 == 3)
+        ctx.matter_ready = false;
+    return 0;
+}
+
+extern "C"
+int bssn_em_cuda_resident_zero_fast_state(void *block_tag)
+{
+    auto it = g_step_ctx.find(block_tag);
+    if (it == g_step_ctx.end())
+        return 0;
+    const StepContext &ctx = it->second;
+    return (ctx.em_zero_fast_known && ctx.em_zero_fast) ? 1 : 0;
+}
+
 extern "C"
 int bssn_cuda_copy_state_region_to_host(void *block_tag,
                                         int state_index,
@@ -8143,7 +9019,7 @@ int bssn_cuda_upload_resident_state_count(void *block_tag,
 {
     init_gpu_dispatch();
     CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
-    if (state_count != BSSN_STATE_COUNT && state_count != BSSN_ESCALAR_STATE_COUNT)
+    if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY)
         return 1;
     upload_resident_state_count(block_tag, ex, state_host_in, state_count);
     return 0;
@@ -8166,7 +9042,7 @@ int bssn_cuda_keep_only_resident_state_count(void *block_tag,
 {
     init_gpu_dispatch();
     CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
-    if (state_count != BSSN_STATE_COUNT && state_count != BSSN_ESCALAR_STATE_COUNT)
+    if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY)
         return 1;
     keep_only_resident_state_count(block_tag, ex, state_host_key, state_count);
     return 0;
@@ -8189,7 +9065,7 @@ int bssn_cuda_download_resident_state_count_if_present(void *block_tag,
 {
     init_gpu_dispatch();
     CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
-    if (state_count != BSSN_STATE_COUNT && state_count != BSSN_ESCALAR_STATE_COUNT)
+    if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY)
         return 1;
     download_resident_state_count_if_present(block_tag, ex, state_host_out, state_count);
     return 0;
@@ -8392,7 +9268,7 @@ int bssn_cuda_unpack_state_region_from_host_buffer_for_host_views(void *block_ta
     init_gpu_dispatch();
     CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
     if (!state_host_key ||
-        (state_count != BSSN_STATE_COUNT && state_count != BSSN_ESCALAR_STATE_COUNT))
+        state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY)
         return 1;
     copy_state_region_packed_cuda(block_tag, state_index, host_buffer, ex,
                                   i0, j0, k0, sx, sy, sz,
@@ -8986,7 +9862,7 @@ int bssn_cuda_prepare_inter_time_level(void *block_tag,
     CUDA_CHECK(cudaSetDevice(g_dispatch.my_device));
     const bool profile = cuda_aux_profile_enabled();
     const double t0 = profile ? cuda_profile_now_ms() : 0.0;
-    if (state_count != BSSN_STATE_COUNT && state_count != BSSN_ESCALAR_STATE_COUNT)
+    if (state_count <= 0 || state_count > BSSN_RESIDENT_STATE_CAPACITY)
         return 1;
     if (source_count != 2 && source_count != 3) return 1;
     if (!resident_key_usable_count(src1_host_key, state_count) ||
@@ -9029,6 +9905,15 @@ int bssn_cuda_prepare_inter_time_level(void *block_tag,
                   : -1;
         dst_bank = reserve_escalar_resident_output_bank_avoiding(ctx, dst_host_key, all,
                                                                  src1_bank, src2_bank, src3_bank);
+    } else if (state_count == BSSN_EM_STATE_COUNT) {
+        src1_bank = ensure_em_resident_bank(ctx, src1_host_key, all, true);
+        src2_bank = ensure_em_resident_bank(ctx, src2_host_key, all, true, src1_bank);
+        src3_bank = (source_count == 3)
+                  ? ensure_em_resident_bank_avoiding(ctx, src3_host_key, all, true,
+                                                     src1_bank, src2_bank, -1)
+                  : -1;
+        dst_bank = reserve_em_resident_output_bank_avoiding(ctx, dst_host_key, all,
+                                                            src1_bank, src2_bank, src3_bank);
     } else {
         src1_bank = ensure_resident_bank(ctx, src1_host_key, all, true);
         src2_bank = ensure_resident_bank(ctx, src2_host_key, all, true, src1_bank);
diff --git a/AMSS_NCKU_source/bssn_rhs_cuda.h b/AMSS_NCKU_source/bssn_rhs_cuda.h
index 946137d..9995bb5 100644
--- a/AMSS_NCKU_source/bssn_rhs_cuda.h
+++ b/AMSS_NCKU_source/bssn_rhs_cuda.h
@@ -8,6 +8,8 @@ extern "C" {
 enum {
     BSSN_CUDA_STATE_COUNT = 24,
     BSSN_ESCALAR_CUDA_STATE_COUNT = 26,
+    BSSN_EM_CUDA_STATE_COUNT = 32,
+    BSSN_EM_CUDA_SOURCE_COUNT = 4,
     BSSN_CUDA_MATTER_COUNT = 10
 };
 
@@ -82,6 +84,28 @@ int bssn_escalar_cuda_compute_constraints(int *ex, double *X, double *Y, double
                                           int &Lev,
                                           double &eps);
 
+int bssn_em_cuda_rk4_substep(void *block_tag,
+                             int *ex, double *X, double *Y, double *Z,
+                             double **state_host_in,
+                             double **state_host_out,
+                             double **source_host,
+                             const double *propspeed,
+                             const double *soa_flat,
+                             const double *bbox,
+                             double &dT,
+                             double &T,
+                             int &RK4,
+                             int &apply_bam_bc,
+                             int &Symmetry,
+                             int &Lev,
+                             double &eps,
+                             int &co,
+                             int &keep_resident_state,
+                             int &apply_enforce_ga,
+                             double &chitiny);
+
+int bssn_em_cuda_resident_zero_fast_state(void *block_tag);
+
 int bssn_cuda_copy_state_region_to_host(void *block_tag,
                                         int state_index,
                                         double *host_state,
diff --git a/makefile_and_run.py b/makefile_and_run.py
index c3ab492..494ea56 100755
--- a/makefile_and_run.py
+++ b/makefile_and_run.py
@@ -150,8 +150,11 @@ def _gpu_runtime_env():
         "AMSS_CUDA_KEEP_ALL_LEVELS": "1",
         "AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP": "1",
         "AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS": "1",
+        "AMSS_CUDA_EM_CACHE_SOURCES": "1",
+        "AMSS_CUDA_EM_ZERO_FASTPATH": "1",
+        "AMSS_EM_ZERO_ANALYSIS_FASTPATH": "1",
         "AMSS_CUDA_AMR_HOST_STAGED": "1",
-        "AMSS_CUDA_AMR_RESTRICT_DEVICE": "1",
+        "AMSS_CUDA_AMR_RESTRICT_DEVICE": "0",
         "AMSS_CUDA_AMR_RESTRICT_BATCH": "0",
         "AMSS_CUDA_DEVICE_SEGMENT_BATCH": "0",
         "AMSS_CUDA_UNCACHED_DEVICE_BUFFERS": "1",
@@ -287,6 +290,9 @@ def run_ABE():
         print(f"   AMSS_CUDA_KEEP_ALL_LEVELS={mpi_env.get('AMSS_CUDA_KEEP_ALL_LEVELS', '')}")
         print(f"   AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP={mpi_env.get('AMSS_CUDA_ESCALAR_KEEP_RESIDENT_AFTER_STEP', '')}")
         print(f"   AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS={mpi_env.get('AMSS_CUDA_ESCALAR_KEEP_ALL_LEVELS', '')}")
+        print(f"   AMSS_CUDA_EM_CACHE_SOURCES={mpi_env.get('AMSS_CUDA_EM_CACHE_SOURCES', '')}")
+        print(f"   AMSS_CUDA_EM_ZERO_FASTPATH={mpi_env.get('AMSS_CUDA_EM_ZERO_FASTPATH', '')}")
+        print(f"   AMSS_EM_ZERO_ANALYSIS_FASTPATH={mpi_env.get('AMSS_EM_ZERO_ANALYSIS_FASTPATH', '')}")
         print(f"   AMSS_CUDA_AMR_HOST_STAGED={mpi_env.get('AMSS_CUDA_AMR_HOST_STAGED', '')}")
         print(f"   AMSS_CUDA_AMR_RESTRICT_DEVICE={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_DEVICE', '')}")
         print(f"   AMSS_CUDA_AMR_RESTRICT_BATCH={mpi_env.get('AMSS_CUDA_AMR_RESTRICT_BATCH', '')}")