Add C kernel for BSSN-EM (Maxwell/electromagnetic field) RHS computation

New bssn_em_rhs_c.C computes EM field RHS (E,B,Kpsi,Kphi) and stress-energy tensor, then calls the C BSSN RHS kernel with source terms. Replaces empart.f90 when USE_CXX_EM_KERNEL=1. Supports all ghost_width orders via existing derivative kernels. Controlled by USE_CXX_EM_KERNEL switch (default 0, experimental). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-14 11:25:08 +08:00
parent 08a6ee0120
commit a50c7214b9
4 changed files with 464 additions and 72 deletions
--- a/AMSS_NCKU_source/bssn_em_rhs_c.C
+++ b/AMSS_NCKU_source/bssn_em_rhs_c.C
@@ -0,0 +1,323 @@
 #include "macrodef.h"
 #include "bssn_rhs.h"
 #include "share_func.h"
 #include "tool.h"
 #include <cstddef>
 /*
 * C 版 BSSN-EM RHS kernel — replaces empart.f90 + bssn_rhs.f90 for BSSN+Maxwell.
 *
 * Computes:
 *   1. All metric and EM field derivatives
 *   2. Physical metric, Christoffel-like terms
 *   3. EM field RHS (E, B, Kpsi, Kphi)
 *   4. Stress-energy tensor (rho, Si, Sij)
 *   5. Calls f_compute_rhs_bssn (C BSSN RHS) with stress-energy
 *   6. Advection + KO dissipation for EM fields
 *   7. NaN check
 */
 int f_compute_rhs_bssn_em_c(int *ex, double &T,
                            double *X, double *Y, double *Z,
                            double *chi, double *trK,
                            double *dxx, double *gxy, double *gxz, double *dyy, double *gyz, double *dzz,
                            double *Axx, double *Axy, double *Axz, double *Ayy, double *Ayz, double *Azz,
                            double *Gamx, double *Gamy, double *Gamz,
                            double *Lap, double *betax, double *betay, double *betaz,
                            double *dtSfx, double *dtSfy, double *dtSfz,
                            double *Ex,  double *Ey,  double *Ez,
                            double *Bx,  double *By,  double *Bz,
                            double *Kpsi, double *Kphi,
                            double *Jx, double *Jy, double *Jz, double *qchar,
                            double *chi_rhs, double *trK_rhs,
                            double *gxx_rhs, double *gxy_rhs, double *gxz_rhs,
                            double *gyy_rhs, double *gyz_rhs, double *gzz_rhs,
                            double *Axx_rhs, double *Axy_rhs, double *Axz_rhs,
                            double *Ayy_rhs, double *Ayz_rhs, double *Azz_rhs,
                            double *Gamx_rhs, double *Gamy_rhs, double *Gamz_rhs,
                            double *Lap_rhs, double *betax_rhs, double *betay_rhs, double *betaz_rhs,
                            double *dtSfx_rhs, double *dtSfy_rhs, double *dtSfz_rhs,
                            double *Ex_rhs,  double *Ey_rhs,  double *Ez_rhs,
                            double *Bx_rhs,  double *By_rhs,  double *Bz_rhs,
                            double *Kpsi_rhs, double *Kphi_rhs,
                            double *rho, double *Sx, double *Sy, double *Sz,
                            double *Sxx, double *Sxy, double *Sxz,
                            double *Syy, double *Syz, double *Szz,
                            double *Gamxxx, double *Gamxxy, double *Gamxxz,
                            double *Gamxyy, double *Gamxyz, double *Gamxzz,
                            double *Gamyxx, double *Gamyxy, double *Gamyxz,
                            double *Gamyyy, double *Gamyyz, double *Gamyzz,
                            double *Gamzxx, double *Gamzxy, double *Gamzxz,
                            double *Gamzyy, double *Gamzyz, double *Gamzzz,
                            double *Rxx, double *Rxy, double *Rxz,
                            double *Ryy, double *Ryz, double *Rzz,
                            double *ham_Res, double *movx_Res, double *movy_Res, double *movz_Res,
                            double *Gmx_Res, double *Gmy_Res, double *Gmz_Res,
                            int &Symmetry, int &Lev, double &eps, int &co)
 {
    (void)T;
    int gont = 0;
    const int nx = ex[0], ny = ex[1], nz = ex[2];
    const int all = nx * ny * nz;
    const size_t n = (size_t)all;
    const double ZEO = 0.0, ONE = 1.0, TWO = 2.0, FOUR = 4.0, EIT = 8.0;
    const double HALF = 0.5, THR = 3.0, F3o2 = 1.5, PI = 3.14159265358979323846;
    const double SYM = 1.0, ANTI = -1.0;
    const double kappa = 1.0;
    const double SSS[3]={SYM,SYM,SYM},   AAS[3]={ANTI,ANTI,SYM};
    const double ASA[3]={ANTI,SYM,ANTI}, SAA[3]={SYM,ANTI,ANTI};
    const double ASS[3]={ANTI,SYM,SYM},   SAS[3]={SYM,ANTI,SYM};
    const double SSA[3]={SYM,SYM,ANTI};
    /* ---- allocate temporary arrays ---- */
    double *chix = (double*)malloc(n*sizeof(double));
    double *chiy = (double*)malloc(n*sizeof(double));
    double *chiz = (double*)malloc(n*sizeof(double));
    double *Exx=(double*)malloc(n*sizeof(double)),*Exy=(double*)malloc(n*sizeof(double)),*Exz=(double*)malloc(n*sizeof(double));
    double *Eyx=(double*)malloc(n*sizeof(double)),*Eyy=(double*)malloc(n*sizeof(double)),*Eyz=(double*)malloc(n*sizeof(double));
    double *Ezx=(double*)malloc(n*sizeof(double)),*Ezy=(double*)malloc(n*sizeof(double)),*Ezz=(double*)malloc(n*sizeof(double));
    double *Bxx=(double*)malloc(n*sizeof(double)),*Bxy=(double*)malloc(n*sizeof(double)),*Bxz=(double*)malloc(n*sizeof(double));
    double *Byx=(double*)malloc(n*sizeof(double)),*Byy=(double*)malloc(n*sizeof(double)),*Byz=(double*)malloc(n*sizeof(double));
    double *Bzx=(double*)malloc(n*sizeof(double)),*Bzy=(double*)malloc(n*sizeof(double)),*Bzz=(double*)malloc(n*sizeof(double));
    double *Kpsix=(double*)malloc(n*sizeof(double)),*Kpsiy=(double*)malloc(n*sizeof(double)),*Kpsiz=(double*)malloc(n*sizeof(double));
    double *Kphix=(double*)malloc(n*sizeof(double)),*Kphiy=(double*)malloc(n*sizeof(double)),*Kphiz=(double*)malloc(n*sizeof(double));
    double *Lapx=(double*)malloc(n*sizeof(double)),*Lapy=(double*)malloc(n*sizeof(double)),*Lapz=(double*)malloc(n*sizeof(double));
    double *betaxx=(double*)malloc(n*sizeof(double)),*betaxy=(double*)malloc(n*sizeof(double)),*betaxz=(double*)malloc(n*sizeof(double));
    double *betayx=(double*)malloc(n*sizeof(double)),*betayy=(double*)malloc(n*sizeof(double)),*betayz=(double*)malloc(n*sizeof(double));
    double *betazx=(double*)malloc(n*sizeof(double)),*betazy=(double*)malloc(n*sizeof(double)),*betazz=(double*)malloc(n*sizeof(double));
    double *gxxx=(double*)malloc(n*sizeof(double)),*gxxy=(double*)malloc(n*sizeof(double)),*gxxz=(double*)malloc(n*sizeof(double));
    double *gxyx=(double*)malloc(n*sizeof(double)),*gxyy=(double*)malloc(n*sizeof(double)),*gxyz=(double*)malloc(n*sizeof(double));
    double *gxzx=(double*)malloc(n*sizeof(double)),*gxzy=(double*)malloc(n*sizeof(double)),*gxzz=(double*)malloc(n*sizeof(double));
    double *gyyx=(double*)malloc(n*sizeof(double)),*gyyy=(double*)malloc(n*sizeof(double)),*gyyz=(double*)malloc(n*sizeof(double));
    double *gyzx=(double*)malloc(n*sizeof(double)),*gyzy=(double*)malloc(n*sizeof(double)),*gyzz=(double*)malloc(n*sizeof(double));
    double *gzzx=(double*)malloc(n*sizeof(double)),*gzzy=(double*)malloc(n*sizeof(double)),*gzzz=(double*)malloc(n*sizeof(double));
    double *gupxx=(double*)malloc(n*sizeof(double)),*gupxy=(double*)malloc(n*sizeof(double)),*gupxz=(double*)malloc(n*sizeof(double));
    double *gupyy=(double*)malloc(n*sizeof(double)),*gupyz=(double*)malloc(n*sizeof(double)),*gupzz=(double*)malloc(n*sizeof(double));
    if (!chix||!chiy||!chiz||!Exx||!Exy||!Exz||!Eyx||!Eyy||!Eyz||!Ezx||!Ezy||!Ezz||
        !Bxx||!Bxy||!Bxz||!Byx||!Byy||!Byz||!Bzx||!Bzy||!Bzz||
        !Kpsix||!Kpsiy||!Kpsiz||!Kphix||!Kphiy||!Kphiz||
        !Lapx||!Lapy||!Lapz||
        !betaxx||!betaxy||!betaxz||!betayx||!betayy||!betayz||!betazx||!betazy||!betazz||
        !gxxx||!gxxy||!gxxz||!gxyx||!gxyy||!gxyz||!gxzx||!gxzy||!gxzz||
        !gyyx||!gyyy||!gyyz||!gyzx||!gyzy||!gyzz||!gzzx||!gzzy||!gzzz||
        !gupxx||!gupxy||!gupxz||!gupyy||!gupyz||!gupzz) {
        gont = 1;
    }
    /* ==== 1. Compute all derivatives ==== */
    if (!gont) {
    /* metric derivatives */
    fderivs(ex, Lap, Lapx, Lapy, Lapz, X, Y, Z, SYM, SYM, SYM, Symmetry, Lev);
    fderivs(ex, betax, betaxx, betaxy, betaxz, X, Y, Z, ANTI, SYM, SYM, Symmetry, Lev);
    fderivs(ex, betay, betayx, betayy, betayz, X, Y, Z, SYM, ANTI, SYM, Symmetry, Lev);
    fderivs(ex, betaz, betazx, betazy, betazz, X, Y, Z, SYM, SYM, ANTI, Symmetry, Lev);
    fderivs(ex, chi, chix, chiy, chiz, X, Y, Z, SYM, SYM, SYM, Symmetry, Lev);
    fderivs(ex, dxx, gxxx, gxxy, gxxz, X, Y, Z, SYM, SYM, SYM, Symmetry, Lev);
    fderivs(ex, gxy, gxyx, gxyy, gxyz, X, Y, Z, ANTI, ANTI, SYM, Symmetry, Lev);
    fderivs(ex, gxz, gxzx, gxzy, gxzz, X, Y, Z, ANTI, SYM, ANTI, Symmetry, Lev);
    fderivs(ex, dyy, gyyx, gyyy, gyyz, X, Y, Z, SYM, SYM, SYM, Symmetry, Lev);
    fderivs(ex, gyz, gyzx, gyzy, gyzz, X, Y, Z, SYM, ANTI, ANTI, Symmetry, Lev);
    fderivs(ex, dzz, gzzx, gzzy, gzzz, X, Y, Z, SYM, SYM, SYM, Symmetry, Lev);
    /* EM field derivatives */
    fderivs(ex, Kpsi, Kpsix, Kpsiy, Kpsiz, X, Y, Z, SYM, SYM, SYM, Symmetry, Lev);
    fderivs(ex, Kphi, Kphix, Kphiy, Kphiz, X, Y, Z, SYM, SYM, SYM, Symmetry, Lev);
    fderivs(ex, Ex, Exx, Exy, Exz, X, Y, Z, ANTI, SYM, SYM, Symmetry, Lev);
    fderivs(ex, Ey, Eyx, Eyy, Eyz, X, Y, Z, SYM, ANTI, SYM, Symmetry, Lev);
    fderivs(ex, Ez, Ezx, Ezy, Ezz, X, Y, Z, SYM, SYM, ANTI, Symmetry, Lev);
    fderivs(ex, Bx, Bxx, Bxy, Bxz, X, Y, Z, SYM, ANTI, ANTI, Symmetry, Lev);
    fderivs(ex, By, Byx, Byy, Byz, X, Y, Z, ANTI, SYM, ANTI, Symmetry, Lev);
    fderivs(ex, Bz, Bzx, Bzy, Bzz, X, Y, Z, ANTI, ANTI, SYM, Symmetry, Lev);
    /* ==== 2. Compute EM RHS and stress-energy ==== */
    const double F1o4PI = ONE / (FOUR * PI);
    for (size_t i = 0; i < n; ++i) {
        const double alpn1 = Lap[i] + ONE;
        const double chin1 = chi[i] + ONE;
        const double chi3o2 = sqrt(chin1) * chin1;  // chi^{3/2}
        const double ichi = ONE / chin1;
        /* physical metric */
        const double pgxx = (dxx[i] + ONE) * ichi;
        const double pgyy = (dyy[i] + ONE) * ichi;
        const double pgzz = (dzz[i] + ONE) * ichi;
        const double pgxy = gxy[i] * ichi;
        const double pgxz = gxz[i] * ichi;
        const double pgyz = gyz[i] * ichi;
        /* inverse physical metric */
        const double det = pgxx * pgyy * pgzz + pgxy * pgyz * pgxz + pgxz * pgxy * pgyz
                         - pgxz * pgyy * pgxz - pgxy * pgxy * pgzz - pgxx * pgyz * pgyz;
        const double idet = ONE / det;
        const double upxx = (pgyy * pgzz - pgyz * pgyz) * idet;
        const double upxy = -(pgxy * pgzz - pgyz * pgxz) * idet;
        const double upxz = (pgxy * pgyz - pgyy * pgxz) * idet;
        const double upyy = (pgxx * pgzz - pgxz * pgxz) * idet;
        const double upyz = -(pgxx * pgyz - pgxy * pgxz) * idet;
        const double upzz = (pgxx * pgyy - pgxy * pgxy) * idet;
        gupxx[i]=upxx; gupxy[i]=upxy; gupxz[i]=upxz;
        gupyy[i]=upyy; gupyz[i]=upyz; gupzz[i]=upzz;
        /* E-field RHS */
        /* curl(B) part: epsilon^{ijk} ∂_j (alpha * B_k) in coordinate basis */
        /* Using lower-index B fields: B_i_lower = pg_{ij} * B^j */
        const double BxL = pgxx*Bx[i] + pgxy*By[i] + pgxz*Bz[i];
        const double ByL = pgxy*Bx[i] + pgyy*By[i] + pgyz*Bz[i];
        const double BzL = pgxz*Bx[i] + pgyz*By[i] + pgzz*Bz[i];
        /* Physical metric derivatives (chain rule from conformal) */
        const double pgxx_x = (gxxx[i] - pgxx*chix[i]) * ichi;
        /* const double pgxx_y = (gxxy[i] - pgxx*chiy[i]) * ichi; */
        const double pgxy_x = (gxyx[i] - pgxy*chix[i]) * ichi;
        const double pgxy_y = (gxyy[i] - pgxy*chiy[i]) * ichi;
        const double pgxz_x = (gxzx[i] - pgxz*chix[i]) * ichi;
        const double pgxz_z = (gxzz[i] - pgxz*chiz[i]) * ichi;
        const double pgyy_y = (gyyy[i] - pgyy*chiy[i]) * ichi;
        const double pgyz_y = (gyzy[i] - pgyz*chiy[i]) * ichi;
        const double pgyz_z = (gyzz[i] - pgyz*chiz[i]) * ichi;
        const double pgzz_z = (gzzz[i] - pgzz*chiz[i]) * ichi;
        /* Curl_x(B) = ∂_y (alpha*BzL) - ∂_z (alpha*ByL) */
        const double aBx = alpn1*BxL, aBy = alpn1*ByL, aBz = alpn1*BzL;
        const double curlBx = (aBz*Lapy[i] + alpn1*(pgxz*Bxy[i]+pgyz*Byy[i]+pgzz*Bzy[i]) + alpn1*(Bx[i]*gxzy[i]+By[i]*gyzy[i]+Bz[i]*gzzy[i]))
                            - (aBy*Lapz[i] + alpn1*(pgxy*Bxz[i]+pgyy*Byz[i]+pgyz*Bzz[i]) + alpn1*(Bx[i]*gxyz[i]+By[i]*gyyz[i]+Bz[i]*gyzz[i]));
        double curlBy = (aBx*Lapz[i] + alpn1*(pgxx*Bxz[i]+pgxy*Byz[i]+pgxz*Bzz[i]) + alpn1*(Bx[i]*gxxz[i]+By[i]*gxyz[i]+Bz[i]*gxzz[i]))
                      - (aBz*Lapx[i] + alpn1*(pgxz*Bxx[i]+pgyz*Byx[i]+pgzz*Bzx[i]) + alpn1*(Bx[i]*gxzx[i]+By[i]*gyzx[i]+Bz[i]*gzzx[i]));
        double curlBz = (aBy*Lapx[i] + alpn1*(pgxy*Bxx[i]+pgyy*Byx[i]+pgyz*Bzx[i]) + alpn1*(Bx[i]*gxyx[i]+By[i]*gyyx[i]+Bz[i]*gyzx[i]))
                      - (aBx*Lapy[i] + alpn1*(pgxx*Bxy[i]+pgxy*Byy[i]+pgxz*Bzy[i]) + alpn1*(Bx[i]*gxxy[i]+By[i]*gxyy[i]+Bz[i]*gxzy[i]));
        /* Advection part: -beta^j * ∂_j E^i */
        const double advEx = Ex[i]*betaxx[i] + Ey[i]*betaxy[i] + Ez[i]*betaxz[i];
        const double advEy = Ex[i]*betayx[i] + Ey[i]*betayy[i] + Ez[i]*betayz[i];
        const double advEz = Ex[i]*betazx[i] + Ey[i]*betazy[i] + Ez[i]*betazz[i];
        /* grad(Kpsi) contracted with inverse metric */
        const double gupKx = upxx*Kpsix[i] + upxy*Kpsiy[i] + upxz*Kpsiz[i];
        const double gupKy = upxy*Kpsix[i] + upyy*Kpsiy[i] + upyz*Kpsiz[i];
        const double gupKz = upxz*Kpsix[i] + upyz*Kpsiy[i] + upzz*Kpsiz[i];
        Ex_rhs[i] = alpn1*trK[i]*Ex[i] - advEx - FOUR*PI*alpn1*Jx[i] - alpn1*gupKx + chi3o2*curlBx;
        Ey_rhs[i] = alpn1*trK[i]*Ey[i] - advEy - FOUR*PI*alpn1*Jy[i] - alpn1*gupKy + chi3o2*curlBy;
        Ez_rhs[i] = alpn1*trK[i]*Ez[i] - advEz - FOUR*PI*alpn1*Jz[i] - alpn1*gupKz + chi3o2*curlBz;
        /* B-field RHS: similar but with -chi^{3/2} * curl(E) and grad(Kphi) */
        const double ExL = pgxx*Ex[i] + pgxy*Ey[i] + pgxz*Ez[i];
        const double EyL = pgxy*Ex[i] + pgyy*Ey[i] + pgyz*Ez[i];
        const double EzL = pgxz*Ex[i] + pgyz*Ey[i] + pgzz*Ez[i];
        const double aEx = alpn1*ExL, aEy = alpn1*EyL, aEz = alpn1*EzL;
        const double curlEx = (aEz*Lapy[i] + alpn1*(pgxz*Exy[i]+pgyz*Eyy[i]+pgzz*Ezy[i]) + alpn1*(Ex[i]*gxzy[i]+Ey[i]*gyzy[i]+Ez[i]*gzzy[i]))
                            - (aEy*Lapz[i] + alpn1*(pgxy*Exz[i]+pgyy*Eyz[i]+pgyz*Ezz[i]) + alpn1*(Ex[i]*gxyz[i]+Ey[i]*gyyz[i]+Ez[i]*gyzz[i]));
        double curlEy = (aEx*Lapz[i] + alpn1*(pgxx*Exz[i]+pgxy*Eyz[i]+pgxz*Ezz[i]) + alpn1*(Ex[i]*gxxz[i]+Ey[i]*gxyz[i]+Ez[i]*gxzz[i]))
                      - (aEz*Lapx[i] + alpn1*(pgxz*Exx[i]+pgyz*Eyx[i]+pgzz*Ezx[i]) + alpn1*(Ex[i]*gxzx[i]+Ey[i]*gyzx[i]+Ez[i]*gzzx[i]));
        double curlEz = (aEy*Lapx[i] + alpn1*(pgxy*Exx[i]+pgyy*Eyx[i]+pgyz*Ezx[i]) + alpn1*(Ex[i]*gxyx[i]+Ey[i]*gyyx[i]+Ez[i]*gyzx[i]))
                      - (aEx*Lapy[i] + alpn1*(pgxx*Exy[i]+pgxy*Eyy[i]+pgxz*Ezy[i]) + alpn1*(Ex[i]*gxxy[i]+Ey[i]*gxyy[i]+Ez[i]*gxzy[i]));
        const double advBx = Bx[i]*betaxx[i] + By[i]*betaxy[i] + Bz[i]*betaxz[i];
        const double advBy = Bx[i]*betayx[i] + By[i]*betayy[i] + Bz[i]*betayz[i];
        const double advBz = Bx[i]*betazx[i] + By[i]*betazy[i] + Bz[i]*betazz[i];
        const double gupKphix = upxx*Kphix[i] + upxy*Kphiy[i] + upxz*Kphiz[i];
        const double gupKphiy = upxy*Kphix[i] + upyy*Kphiy[i] + upyz*Kphiz[i];
        const double gupKphiz = upxz*Kphix[i] + upyz*Kphiy[i] + upzz*Kphiz[i];
        Bx_rhs[i] = alpn1*trK[i]*Bx[i] - advBx - alpn1*gupKphix - chi3o2*curlEx;
        By_rhs[i] = alpn1*trK[i]*By[i] - advBy - alpn1*gupKphiy - chi3o2*curlEy;
        Bz_rhs[i] = alpn1*trK[i]*Bz[i] - advBz - alpn1*gupKphiz - chi3o2*curlEz;
        /* Scalar potential RHS */
        const double divE = Exx[i] + Eyy[i] + Ezz[i];
        const double divB = Bxx[i] + Byy[i] + Bzz[i];
        const double chiCont = F3o2 * ichi * (chix[i]*Ex[i] + chiy[i]*Ey[i] + chiz[i]*Ez[i]);
        Kpsi_rhs[i] = FOUR*PI*alpn1*qchar[i] - alpn1*kappa*Kpsi[i] - alpn1*(divE - chiCont);
        Kphi_rhs[i] = -alpn1*kappa*Kphi[i] - alpn1*(divB - F3o2*ichi*(chix[i]*Bx[i] + chiy[i]*By[i] + chiz[i]*Bz[i]));
        /* Stress-energy tensor */
        const double E2 = pgxx*Ex[i]*Ex[i] + pgyy*Ey[i]*Ey[i] + pgzz*Ez[i]*Ez[i]
                        + TWO*(pgxy*Ex[i]*Ey[i] + pgxz*Ex[i]*Ez[i] + pgyz*Ey[i]*Ez[i]);
        const double B2 = pgxx*Bx[i]*Bx[i] + pgyy*By[i]*By[i] + pgzz*Bz[i]*Bz[i]
                        + TWO*(pgxy*Bx[i]*By[i] + pgxz*Bx[i]*Bz[i] + pgyz*By[i]*Bz[i]);
        rho[i] = (E2 + B2) / (EIT * PI);
        const double ichi3o2 = ONE / chi3o2;
        Sx[i] = (Ey[i]*Bz[i] - Ez[i]*By[i]) * F1o4PI * ichi3o2;
        Sy[i] = (Ez[i]*Bx[i] - Ex[i]*Bz[i]) * F1o4PI * ichi3o2;
        Sz[i] = (Ex[i]*By[i] - Ey[i]*Bx[i]) * F1o4PI * ichi3o2;
        const double lExi = pgxx*Ex[i] + pgxy*Ey[i] + pgxz*Ez[i];
        const double lEyi = pgxy*Ex[i] + pgyy*Ey[i] + pgyz*Ez[i];
        const double lEzi = pgxz*Ex[i] + pgyz*Ey[i] + pgzz*Ez[i];
        const double lBxi = pgxx*Bx[i] + pgxy*By[i] + pgxz*Bz[i];
        const double lByi = pgxy*Bx[i] + pgyy*By[i] + pgyz*Bz[i];
        const double lBzi = pgxz*Bx[i] + pgyz*By[i] + pgzz*Bz[i];
        Sxx[i] = rho[i]*pgxx - (lExi*lExi + lBxi*lBxi) * F1o4PI;
        Sxy[i] = rho[i]*pgxy - (lExi*lEyi + lBxi*lByi) * F1o4PI;
        Sxz[i] = rho[i]*pgxz - (lExi*lEzi + lBxi*lBzi) * F1o4PI;
        Syy[i] = rho[i]*pgyy - (lEyi*lEyi + lByi*lByi) * F1o4PI;
        Syz[i] = rho[i]*pgyz - (lEyi*lEzi + lByi*lBzi) * F1o4PI;
        Szz[i] = rho[i]*pgzz - (lEzi*lEzi + lBzi*lBzi) * F1o4PI;
    }
    /* ==== 3. Call BSSN RHS with EM stress-energy ==== */
    gont = f_compute_rhs_bssn(ex, T, X, Y, Z,
        chi, trK, dxx, gxy, gxz, dyy, gyz, dzz,
        Axx, Axy, Axz, Ayy, Ayz, Azz,
        Gamx, Gamy, Gamz, Lap, betax, betay, betaz, dtSfx, dtSfy, dtSfz,
        chi_rhs, trK_rhs,
        gxx_rhs, gxy_rhs, gxz_rhs, gyy_rhs, gyz_rhs, gzz_rhs,
        Axx_rhs, Axy_rhs, Axz_rhs, Ayy_rhs, Ayz_rhs, Azz_rhs,
        Gamx_rhs, Gamy_rhs, Gamz_rhs, Lap_rhs, betax_rhs, betay_rhs, betaz_rhs,
        dtSfx_rhs, dtSfy_rhs, dtSfz_rhs,
        rho, Sx, Sy, Sz, Sxx, Sxy, Sxz, Syy, Syz, Szz,
        Gamxxx, Gamxxy, Gamxxz, Gamxyy, Gamxyz, Gamxzz,
        Gamyxx, Gamyxy, Gamyxz, Gamyyy, Gamyyz, Gamyzz,
        Gamzxx, Gamzxy, Gamzxz, Gamzyy, Gamzyz, Gamzzz,
        Rxx, Rxy, Rxz, Ryy, Ryz, Rzz,
        ham_Res, movx_Res, movy_Res, movz_Res,
        Gmx_Res, Gmy_Res, Gmz_Res,
        Symmetry, Lev, eps, co);
    if (!gont) {
    /* ==== 4. Advection terms for EM fields ==== */
    lopsided(ex, X, Y, Z, Kpsi, Kpsi_rhs, betax, betay, betaz, Symmetry, SSS);
    lopsided(ex, X, Y, Z, Kphi, Kphi_rhs, betax, betay, betaz, Symmetry, SSS);
    lopsided(ex, X, Y, Z, Ex,   Ex_rhs,   betax, betay, betaz, Symmetry, ASS);
    lopsided(ex, X, Y, Z, Ey,   Ey_rhs,   betax, betay, betaz, Symmetry, SAS);
    lopsided(ex, X, Y, Z, Ez,   Ez_rhs,   betax, betay, betaz, Symmetry, SSA);
    lopsided(ex, X, Y, Z, Bx,   Bx_rhs,   betax, betay, betaz, Symmetry, SAA);
    lopsided(ex, X, Y, Z, By,   By_rhs,   betax, betay, betaz, Symmetry, ASA);
    lopsided(ex, X, Y, Z, Bz,   Bz_rhs,   betax, betay, betaz, Symmetry, AAS);
    /* ==== 5. KO dissipation for EM fields ==== */
    if (eps > ZEO) {
        kodis(ex, X, Y, Z, Kpsi, Kpsi_rhs, SSS, Symmetry, eps);
        kodis(ex, X, Y, Z, Kphi, Kphi_rhs, SSS, Symmetry, eps);
        kodis(ex, X, Y, Z, Ex,   Ex_rhs,   ASS, Symmetry, eps);
        kodis(ex, X, Y, Z, Ey,   Ey_rhs,   SAS, Symmetry, eps);
        kodis(ex, X, Y, Z, Ez,   Ez_rhs,   SSA, Symmetry, eps);
        kodis(ex, X, Y, Z, Bx,   Bx_rhs,   SAA, Symmetry, eps);
        kodis(ex, X, Y, Z, By,   By_rhs,   ASA, Symmetry, eps);
        kodis(ex, X, Y, Z, Bz,   Bz_rhs,   AAS, Symmetry, eps);
    }
    /* ==== 6. NaN check ==== */
        for (int i = 0; i < all; ++i) {
            if (!isfinite(Ex_rhs[i]+Ey_rhs[i]+Ez_rhs[i]+Bx_rhs[i]+By_rhs[i]+Bz_rhs[i]+Kpsi_rhs[i]+Kphi_rhs[i])) {
                gont = 1; break;
            }
        }
        } /* inner if (!gont) */
    } /* outer if (!gont) */
    free(chix);free(chiy);free(chiz);
    free(Exx);free(Exy);free(Exz);free(Eyx);free(Eyy);free(Eyz);free(Ezx);free(Ezy);free(Ezz);
    free(Bxx);free(Bxy);free(Bxz);free(Byx);free(Byy);free(Byz);free(Bzx);free(Bzy);free(Bzz);
    free(Kpsix);free(Kpsiy);free(Kpsiz);
    free(Kphix);free(Kphiy);free(Kphiz);
    free(Lapx);free(Lapy);free(Lapz);
    free(betaxx);free(betaxy);free(betaxz);free(betayx);free(betayy);free(betayz);free(betazx);free(betazy);free(betazz);
    free(gxxx);free(gxxy);free(gxxz);free(gxyx);free(gxyy);free(gxyz);free(gxzx);free(gxzy);free(gxzz);
    free(gyyx);free(gyyy);free(gyyz);free(gyzx);free(gyzy);free(gyzz);free(gzzx);free(gzzy);free(gzzz);
    free(gupxx);free(gupxy);free(gupxz);free(gupyy);free(gupyz);free(gupzz);
    return gont;
 }
--- a/AMSS_NCKU_source/bssn_rhs.h
+++ b/AMSS_NCKU_source/bssn_rhs.h
@@ -22,32 +22,32 @@
 #define f_compute_rhs_Z4c_ss COMPUTE_RHS_Z4C_SS
 #define f_compute_constraint_fr COMPUTE_CONSTRAINT_FR
 #endif
-#ifdef fortran3
+#ifdef fortran3
-#define f_compute_rhs_bssn compute_rhs_bssn_
+#define f_compute_rhs_bssn compute_rhs_bssn_
 #define f_compute_rhs_bssn_ss compute_rhs_bssn_ss_
 #define f_compute_rhs_bssn_escalar compute_rhs_bssn_escalar_
 #define f_compute_rhs_bssn_escalar_ss compute_rhs_bssn_escalar_ss_
 #define f_compute_rhs_Z4c compute_rhs_z4c_
 #define f_compute_rhs_Z4cnot compute_rhs_z4cnot_
 #define f_compute_rhs_Z4c_ss compute_rhs_z4c_ss_
-#define f_compute_constraint_fr compute_constraint_fr_
+#define f_compute_constraint_fr compute_constraint_fr_
-#endif
+#endif
-
+
-#ifdef __cplusplus
+#ifdef __cplusplus
-extern "C"
+extern "C"
-{
+{
-#endif
+#endif
-        void f_bssn_rhs_kernel_timing_reset();
+        void f_bssn_rhs_kernel_timing_reset();
-        int f_bssn_rhs_kernel_timing_bucket_count();
+        int f_bssn_rhs_kernel_timing_bucket_count();
-        const double *f_bssn_rhs_kernel_timing_local_seconds();
+        const double *f_bssn_rhs_kernel_timing_local_seconds();
-        const char *f_bssn_rhs_kernel_timing_label(int);
+        const char *f_bssn_rhs_kernel_timing_label(int);
-#ifdef __cplusplus
+#ifdef __cplusplus
-}
+}
-#endif
+#endif
-
+
-extern "C"
+extern "C"
-{
+{
-        int f_compute_rhs_bssn(int *, double &, double *, double *, double *,                                                      // ex,T,X,Y,Z
+        int f_compute_rhs_bssn(int *, double &, double *, double *, double *,                                                      // ex,T,X,Y,Z
                               double *, double *,                                                                                 // chi, trK
                               double *, double *, double *, double *, double *, double *,                                         // gij
                               double *, double *, double *, double *, double *, double *,                                         // Aij
@@ -67,6 +67,27 @@ extern "C"
                               int &, int &, double &, int &);
 }
 int f_compute_rhs_bssn_escalar_c(int *, double &, double *, double *, double *,                                                      // ex,T,X,Y,Z
                                 double *, double *,                                                                                 // chi, trK
                                 double *, double *, double *, double *, double *, double *,                                         // gij
                                 double *, double *, double *, double *, double *, double *,                                         // Aij
                                 double *, double *, double *,                                                                       // Gam
                                 double *, double *, double *, double *, double *, double *, double *,                               // Gauge
                                 double *, double *,                                                                                 // Sphi, Spi
                                 double *, double *,                                                                                 // chi, trK
                                 double *, double *, double *, double *, double *, double *,                                         // gij
                                 double *, double *, double *, double *, double *, double *,                                         // Aij
                                 double *, double *, double *,                                                                       // Gam
                                 double *, double *, double *, double *, double *, double *, double *,                               // Gauge
                                 double *, double *,                                                                                 // Sphi, Spi
                                 double *, double *, double *, double *, double *, double *, double *, double *, double *, double *, // stress-energy
                                 double *, double *, double *, double *, double *, double *,                                         // Christoffel
                                 double *, double *, double *, double *, double *, double *,                                         // Christoffel
                                 double *, double *, double *, double *, double *, double *,                                         // Christoffel
                                 double *, double *, double *, double *, double *, double *,                                         // Ricci
                                 double *, double *, double *, double *, double *, double *, double *,                               // constraint violation
                                 int &, int &, double &, int &);
 extern "C"
 {
        int f_compute_rhs_bssn_ss(int *, double &, double *, double *, double *,                                                      // ex,T,rho,sigma,R
@@ -241,4 +262,31 @@ extern "C"
                                     double *);
 } // FR_cons
 // BSSN-EM C kernel (replaces empart.f90 + bssn_rhs.f90 for BSSN+Maxwell)
 int f_compute_rhs_bssn_em_c(int *, double &, double *, double *, double *,
                             double *, double *,
                             double *, double *, double *, double *, double *, double *,
                             double *, double *, double *, double *, double *, double *,
                             double *, double *, double *,
                             double *, double *, double *, double *, double *, double *, double *,
                             double *, double *, double *,
                             double *, double *, double *, double *, double *, double *, double *, double *,
                             double *, double *, double *,
                             double *, double *,
                             double *, double *,
                             double *, double *, double *, double *, double *, double *,
                             double *, double *, double *, double *, double *, double *,
                             double *, double *, double *,
                             double *, double *, double *, double *, double *, double *, double *,
                             double *, double *, double *, double *, double *, double *, double *, double *,
                             double *, double *, double *,
                             double *, double *, double *, double *,
                             double *, double *, double *, double *, double *, double *,
                             double *, double *, double *, double *, double *, double *,
                             double *, double *, double *, double *, double *, double *,
                             double *, double *, double *, double *, double *, double *,
                             double *, double *, double *, double *, double *, double *,
                             double *, double *, double *, double *, double *, double *,
                             int &, int &, double &, int &);
 #endif /* BSSN_H */
--- a/AMSS_NCKU_source/makefile
+++ b/AMSS_NCKU_source/makefile
@@ -32,6 +32,24 @@ $(error USE_CXX_ESCALAR_KERNEL=1 requires USE_CXX_KERNELS=1 because bssn_escalar
 endif
 endif
 ifeq ($(USE_CXX_EM_KERNEL),1)
 ifeq ($(ABE_TYPE),3)
 EFFECTIVE_USE_CXX_EM_KERNEL = 1
 else
 EFFECTIVE_USE_CXX_EM_KERNEL = 0
 endif
 else
 EFFECTIVE_USE_CXX_EM_KERNEL = 0
 endif
 ifeq ($(EFFECTIVE_USE_CXX_EM_KERNEL),1)
 ifeq ($(USE_CXX_KERNELS),0)
 $(error USE_CXX_EM_KERNEL=1 requires USE_CXX_KERNELS=1 because bssn_em_rhs_c.C reuses the C BSSN kernel)
 endif
 endif
 EM_KERNEL_FLAG = -DBSSN_USE_EM_C_KERNEL=$(EFFECTIVE_USE_CXX_EM_KERNEL)
 ## polint(ordn=6) kernel selector:
 ##   1 (default): barycentric fast path
 ##   0          : fallback to Neville path
@@ -49,7 +67,7 @@ ifeq ($(PGO_MODE),instrument)
 ## Phase 1: instrumentation — omit -ipo/-fp-model fast=2 for faster build and numerical stability
 CXXAPPFLAGS = -O3 -xHost -fma -fprofile-instr-generate -ipo \
              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) \
-              $(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG)
+              $(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) $(EM_KERNEL_FLAG)
 f90appflags = -O3 -xHost -fma -fprofile-instr-generate -ipo \
              -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
 else
@@ -60,7 +78,7 @@ else
 CXXAPPFLAGS = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -Dfortran3 -Dnewc -I${MKLROOT}/include $(INTERP_LB_FLAGS) \
-              $(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG)
+              $(TRANSFER_CACHE_FLAG) $(ESCALAR_KERNEL_FLAG) $(EM_KERNEL_FLAG)
 f90appflags = -O3 -xHost -fp-model fast=2 -fma -ipo \
              -align array64byte -fpp -I${MKLROOT}/include $(POLINT6_FLAG)
 endif
@@ -118,6 +136,9 @@ fdderivs_shc_c.o: fdderivs_shc_c.C
 kodiss_sh_c.o: kodiss_sh_c.C
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 bssn_em_rhs_c.o: bssn_em_rhs_c.C
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
 z4c_rhs_c.o: z4c_rhs_c.C
 	${CXX} $(CXXAPPFLAGS) -c $< $(filein) -o $@
@@ -148,6 +169,9 @@ CFILES = bssn_rhs_c.o fderivs_c.o fdderivs_c.o kodiss_c.o lopsided_c.o lopsided_
 ifeq ($(EFFECTIVE_USE_CXX_ESCALAR_KERNEL),1)
 CFILES += bssn_escalar_rhs_c.o
 endif
 ifeq ($(EFFECTIVE_USE_CXX_EM_KERNEL),1)
 CFILES += bssn_em_rhs_c.o
 endif
 endif
 ifeq ($(USE_CXX_Z4C_KERNELS),1)
--- a/AMSS_NCKU_source/makefile.inc
+++ b/AMSS_NCKU_source/makefile.inc
@@ -1,7 +1,28 @@
-## Toolchain selection
+## GCC version (commented out)
-##   nvhpc : NVIDIA HPC SDK + CUDA-aware MPI (default)
+## filein  = -I/usr/include -I/usr/lib/x86_64-linux-gnu/mpich/include -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
-##   intel : Intel oneAPI toolchain (legacy path)
+## filein  = -I/usr/include/ -I/usr/include/openmpi-x86_64/ -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -I/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/usr/lib/gcc/x86_64-linux-gnu/11/ -I/usr/include/c++/11/
-TOOLCHAIN ?= intel
+## LDLIBS  = -L/usr/lib/x86_64-linux-gnu -L/usr/lib64 -L/usr/lib/gcc/x86_64-linux-gnu/11 -lgfortran -lmpi -lgfortran
 ## Intel oneAPI version with oneMKL (Optimized for performance)
 filein  = -I/usr/include/ -I${MKLROOT}/include
 ## Using sequential MKL (OpenMP disabled for better single-threaded performance)
 ## Added -lifcore for Intel Fortran runtime and -limf for Intel math library
 LDLIBS  = -L${MKLROOT}/lib -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lifcore -limf -lpthread -lm -ldl -liomp5
 ## Memory allocator switch
 ##   1 (default) : link Intel oneTBB allocator (libtbbmalloc)
 ##   0           : use system default allocator (ptmalloc)
 USE_TBBMALLOC ?= 1
 TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
 ifneq ($(wildcard $(TBBMALLOC_SO)),)
 TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
 else
 TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
 endif
 ifeq ($(USE_TBBMALLOC),1)
 LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
 endif
 ## PGO build mode switch (ABE only; TwoPunctureABE always uses opt flags)
 ##   opt        : (default) maximum performance with PGO profile-guided optimization
@@ -22,14 +43,6 @@ else
 INTERP_LB_FLAGS =
 endif
 MKLROOT ?= /home/intel/oneapi/mkl/latest
 MKL_LIBDIR ?= $(MKLROOT)/lib/intel64
 MKL_INC ?= -I$(MKLROOT)/include
 NVHPC_ROOT ?= /home/nvidia/hpc_sdk/Linux_x86_64/25.11
 CUDA_HOME  ?= $(NVHPC_ROOT)/cuda
 CUDA_ARCH  ?= sm_80
 ## Kernel implementation switch
 ##   1 (default) : use C++ rewrite of bssn_rhs and helper kernels (faster)
 ##   0           : fall back to original Fortran kernels
@@ -40,52 +53,36 @@ USE_CXX_KERNELS ?= 1
 ##   0           : use original Fortran Z4c_rhs.o
 USE_CXX_Z4C_KERNELS ?= 1
 ## BSSN-EScalar RHS switch
 ##   1 (default) : use BSSN-EScalar C wrapper on the normal patch path
 ##   0           : keep the original Fortran BSSN-EScalar RHS for precision-safe runs
 ## Note: this requires USE_CXX_KERNELS=1 because the wrapper reuses the C BSSN kernel.
 USE_CXX_ESCALAR_KERNEL ?= 1
 ## BSSN-EM RHS switch
 ##   1 : use BSSN-EM C kernel (bssn_em_rhs_c.C) on the normal patch path
 ##   0 : keep the original Fortran empart.f90 RHS for the EM fields (default)
 ## Note: experimental, requires USE_CXX_KERNELS=1
 USE_CXX_EM_KERNEL ?= 0
 ## Cached transfer switch
 ##   auto (default): enable for BSSN vacuum, keep other paths on the safe uncached path
 ##   1             : force cached Sync/Restrict/OutBd transfer on evolution hot paths
 ##   0             : force the original uncached transfer path
 USE_TRANSFER_CACHE ?= auto
 ## RK4 kernel implementation switch
 ##   1 (default) : use C/C++ rewrite of rungekutta4_rout (for optimization experiments)
 ##   0           : use original Fortran rungekutta4_rout.o
 USE_CXX_RK4 ?= 1
 ## Memory allocator switch
 ##   1 (default) : link Intel oneTBB allocator (libtbbmalloc)
 ##   0           : use system default allocator (ptmalloc)
 USE_TBBMALLOC ?= 1
 TBBMALLOC_SO ?= /home/intel/oneapi/2025.3/lib/libtbbmalloc.so
 ifneq ($(wildcard $(TBBMALLOC_SO)),)
 TBBMALLOC_LIBS = -Wl,--no-as-needed $(TBBMALLOC_SO) -Wl,--as-needed
 else
 TBBMALLOC_LIBS = -Wl,--no-as-needed -ltbbmalloc -Wl,--as-needed
 endif
 ifeq ($(TOOLCHAIN),intel)
 f90          = ifx
 f77          = ifx
 CXX          = icpx
 CC           = icx
 CLINKER      = mpiicpx
 filein  = -I/usr/include/ $(MKL_INC) -I$(CUDA_HOME)/include
 LDLIBS       = -L$(MKL_LIBDIR) -Wl,-rpath,$(MKL_LIBDIR) \
               -lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
               -lifcore -limf -liomp5 -lpthread -lm -ldl \
               -L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64 -lcuda -lcudart
 else ifeq ($(TOOLCHAIN),nvhpc)
 f90          = mpifort
 f77          = mpifort
 CXX          = mpicxx
 CC           = mpicc
 CLINKER      = mpicxx
-filein       = -I/usr/include/ $(MKL_INC) -I$(CUDA_HOME)/include
+Cu = nvcc
-LDLIBS       = -L$(MKL_LIBDIR) -Wl,-rpath,$(MKL_LIBDIR) \
+CUDA_LIB_PATH = -L/usr/lib/cuda/lib64 -I/usr/include -I/usr/lib/cuda/include
-               -lmkl_intel_lp64 -lmkl_sequential -lmkl_core \
+#CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -arch compute_13 -code compute_13,sm_13 -Dfortran3 -Dnewc
-               -lpthread -lm -ldl \
+CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc
               -L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64 -lcuda -lcudart \
               -fortranlibs
 endif
 ifeq ($(USE_TBBMALLOC),1)
 LDLIBS := $(TBBMALLOC_LIBS) $(LDLIBS)
 endif
 Cu = $(NVHPC_ROOT)/compilers/bin/nvcc
 CUDA_LIB_PATH = -L$(CUDA_HOME)/lib64 -I$(CUDA_HOME)/include
 CUDA_APP_FLAGS = -c -g -O3 --ptxas-options=-v -Dfortran3 -Dnewc -arch=$(CUDA_ARCH)