diff --git a/baseline_mps_expectation.py b/baseline_mps_expectation.py
index c1f5fa1..8179a31 100644
--- a/baseline_mps_expectation.py
+++ b/baseline_mps_expectation.py
@@ -1,18 +1,17 @@
 """Baseline MPS expectation scan with the qmatchatea backend."""
 
 import argparse
+import json
 import logging
 import math
 import time
 
+import numpy as np
 from qibo import Circuit, gates, hamiltonians
 from qibo.symbols import X, Z
 
 from qibotn.backends.qmatchatea import QMatchaTeaBackend
-
-
-def parse_bonds(value):
-    return [int(item) for item in value.split(",") if item.strip()]
+from qibotn.backends.vidal_tebd import run_vidal_ring_xz
 
 
 def build_circuit(nqubits, nlayers, seed):
@@ -33,9 +32,8 @@ def build_circuit(nqubits, nlayers, seed):
 
 def build_observable(nqubits):
     form = 0
-    for qubit in range(nqubits - 1):
-        form += 0.5 * Z(qubit) * Z(qubit + 1)
-    form += 0.25 * X(0)
+    for qubit in range(nqubits):
+        form += 0.5 * X(qubit) * Z((qubit + 1) % nqubits)
     return hamiltonians.SymbolicHamiltonian(form=form)
 
 
@@ -43,86 +41,123 @@ def exact_expectation(circuit, nqubits):
     import numpy as np
 
     state = circuit().state(numpy=True).reshape(-1)
-    probabilities = np.abs(state) ** 2
-    indices = np.arange(state.size)
 
     value = 0.0
-    for qubit in range(nqubits - 1):
-        left = (indices >> (nqubits - 1 - qubit)) & 1
-        right = (indices >> (nqubits - 2 - qubit)) & 1
-        value += 0.5 * np.sum(probabilities * (1 - 2 * left) * (1 - 2 * right))
-
-    flip_q0 = 1 << (nqubits - 1)
-    value += 0.25 * np.vdot(state[indices ^ flip_q0], state).real
+    chunk_size = 1 << 20
+    for qubit in range(nqubits):
+        next_qubit = (qubit + 1) % nqubits
+        x_flip = 1 << (nqubits - 1 - qubit)
+        z_shift = nqubits - 1 - next_qubit
+        term = 0.0
+        for start in range(0, state.size, chunk_size):
+            stop = min(start + chunk_size, state.size)
+            indices = np.arange(start, stop, dtype=np.int64)
+            z_bit = (indices >> z_shift) & 1
+            z_phase = 1 - 2 * z_bit
+            term += np.vdot(state[indices ^ x_flip], z_phase * state[start:stop]).real
+        value += 0.5 * term
     return float(value)
 
 
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--nqubits", type=int, default=20)
-    parser.add_argument("--nlayers", type=int, default=8)
-    parser.add_argument("--bonds", type=parse_bonds, default=parse_bonds("2,4,8,16,32"))
+    parser.add_argument("--nqubits", type=int, default=40)
+    parser.add_argument("--nlayers", type=int, default=30)
+    parser.add_argument("--bond", "--bonds", dest="bond", type=int, default=512)
     parser.add_argument("--seed", type=int, default=42)
-    parser.add_argument("--cut-ratio", type=float, default=1e-12)
-    parser.add_argument("--svd-control", default="V")
-    parser.add_argument("--tensor-module", choices=("numpy", "torch"), default="numpy")
-    parser.add_argument("--torch-threads", type=int)
+    parser.add_argument("--tensor-module", choices=("numpy", "torch"), default="torch")
+    parser.add_argument("--torch-threads", type=int, default=32)
+    parser.add_argument(
+        "--executor", choices=("qmatchatea", "vidal"), default="qmatchatea"
+    )
+    parser.add_argument("--vidal-workers", type=int, default=1)
+    parser.add_argument("--vidal-batched", action="store_true")
+    parser.add_argument("--mpi-ct", action="store_true")
+    parser.add_argument("--mpi-barriers", type=int, default=-1)
+    parser.add_argument("--mpi-isometrization", type=int, default=-1)
     parser.add_argument("--exact", action="store_true")
     parser.add_argument("--exact-max-qubits", type=int, default=24)
-    parser.add_argument("--preprocess", action="store_true")
-    parser.add_argument("--compile-circuit", action="store_true")
-    parser.add_argument("--track-memory", action="store_true")
+    parser.add_argument("--reference-file")
     args = parser.parse_args()
     logging.getLogger("qibo.config").setLevel(logging.ERROR)
     logging.getLogger("qtealeaves").setLevel(logging.ERROR)
-    if args.torch_threads is not None:
-        import torch
+    import torch
 
-        torch.set_num_threads(args.torch_threads)
+    torch.set_num_threads(args.torch_threads)
+    rank = 0
+    size = 1
+    if args.mpi_ct:
+        from mpi4py import MPI
+
+        rank = MPI.COMM_WORLD.Get_rank()
+        size = MPI.COMM_WORLD.Get_size()
 
     circuit = build_circuit(args.nqubits, args.nlayers, args.seed)
     observable = build_observable(args.nqubits)
     exact = None
-    if args.exact:
+    if args.reference_file:
+        with open(args.reference_file, "r", encoding="utf-8") as f:
+            exact = float(json.load(f)["expectation"])
+    elif args.exact:
         if args.nqubits > args.exact_max_qubits:
             raise ValueError(
                 f"--exact is limited to {args.exact_max_qubits} qubits by default."
             )
         exact = exact_expectation(circuit, args.nqubits)
 
-    print(
-        f"nqubits={args.nqubits} nlayers={args.nlayers} "
-        f"seed={args.seed} preprocess={args.preprocess} "
-        f"tensor_module={args.tensor_module}"
-    )
-    if exact is not None:
-        print(f"exact={exact:.16e}")
-    print("bond_dim expval abs_error rel_error seconds")
+    if rank == 0:
+        mpi_label = f"MPIMPS/{size}" if args.mpi_ct else "SR"
+        print(
+            f"nqubits={args.nqubits} nlayers={args.nlayers} "
+            f"bond={args.bond} seed={args.seed} "
+            f"tensor_module={args.tensor_module} svd_control=E! "
+            f"compile_circuit=True mpi={mpi_label} executor={args.executor} "
+            f"vidal_workers={args.vidal_workers} vidal_batched={args.vidal_batched}"
+        )
+        if exact is not None:
+            print(f"exact={exact:.16e}")
+        print("expval abs_error rel_error seconds")
 
-    backend = QMatchaTeaBackend()
-    for bond in args.bonds:
+    start = time.perf_counter()
+    if args.executor == "vidal":
+        if args.mpi_ct:
+            raise ValueError("--executor vidal is a single-process executor.")
+        value = run_vidal_ring_xz(
+            circuit,
+            max_bond=args.bond,
+            cut_ratio=1e-12,
+            tensor_module=args.tensor_module,
+            workers=args.vidal_workers,
+            use_batched=args.vidal_batched,
+        )
+    else:
+        backend = QMatchaTeaBackend()
         backend.configure_tn_simulation(
             ansatz="MPS",
-            max_bond_dimension=bond,
-            cut_ratio=args.cut_ratio,
-            svd_control=args.svd_control,
+            max_bond_dimension=args.bond,
+            cut_ratio=1e-12,
+            svd_control="E!",
             tensor_module=args.tensor_module,
-            compile_circuit=args.compile_circuit,
-            track_memory=args.track_memory,
+            compile_circuit=True,
+            track_memory=False,
+            mpi_approach="CT" if args.mpi_ct else "SR",
+            mpi_num_procs=size,
+            mpi_where_barriers=args.mpi_barriers if args.mpi_ct else -1,
+            mpi_isometrization=args.mpi_isometrization,
         )
-        start = time.perf_counter()
-        value = float(
-            backend.expectation(
-                circuit,
-                observable,
-                preprocess=args.preprocess,
-                compile_circuit=args.compile_circuit,
-            ).real
+        value = backend.expectation(
+            circuit,
+            observable,
+            preprocess=False,
+            compile_circuit=True,
         )
-        elapsed = time.perf_counter() - start
-        abs_error = float("nan") if exact is None else abs(value - exact)
-        rel_error = float("nan") if exact is None else abs_error / max(abs(exact), 1e-15)
-        print(f"{bond:d} {value:.16e} {abs_error:.6e} {rel_error:.6e} {elapsed:.3f}")
+    if rank != 0:
+        return
+    value = float(np.real(value))
+    elapsed = time.perf_counter() - start
+    abs_error = float("nan") if exact is None else abs(value - exact)
+    rel_error = float("nan") if exact is None else abs_error / max(abs(exact), 1e-15)
+    print(f"{value:.16e} {abs_error:.6e} {rel_error:.6e} {elapsed:.3f}")
 
 
 if __name__ == "__main__":
diff --git a/qibojit_reference_expectation.py b/qibojit_reference_expectation.py
new file mode 100644
index 0000000..429855a
--- /dev/null
+++ b/qibojit_reference_expectation.py
@@ -0,0 +1,109 @@
+"""Compute and cache a qibojit state-vector reference for the ring-XZ observable."""
+
+import argparse
+import json
+import math
+import time
+from pathlib import Path
+
+import numpy as np
+import qibo
+from qibo import Circuit, gates
+
+
+def build_circuit(nqubits, nlayers, seed):
+    rng = np.random.default_rng(seed)
+    circuit = Circuit(nqubits)
+    for _ in range(nlayers):
+        for qubit in range(nqubits):
+            circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi)))
+            circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi)))
+        for qubit in range(0, nqubits - 1, 2):
+            circuit.add(gates.CNOT(qubit, qubit + 1))
+        for qubit in range(1, nqubits - 1, 2):
+            circuit.add(gates.CNOT(qubit, qubit + 1))
+    return circuit
+
+
+def ring_xz_expectation(state, nqubits, chunk_size):
+    value = 0.0
+    for qubit in range(nqubits):
+        next_qubit = (qubit + 1) % nqubits
+        x_flip = 1 << (nqubits - 1 - qubit)
+        z_shift = nqubits - 1 - next_qubit
+        term = 0.0
+        for start in range(0, state.size, chunk_size):
+            stop = min(start + chunk_size, state.size)
+            indices = np.arange(start, stop, dtype=np.int64)
+            z_bit = (indices >> z_shift) & 1
+            z_phase = 1 - 2 * z_bit
+            term += np.vdot(state[indices ^ x_flip], z_phase * state[start:stop]).real
+        value += 0.5 * term
+    return float(value)
+
+
+def default_output_path(nqubits, nlayers, seed):
+    return Path("references") / (
+        f"qibojit_ring_xz_n{nqubits}_l{nlayers}_seed{seed}.json"
+    )
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--nqubits", type=int, default=32)
+    parser.add_argument("--nlayers", type=int, default=3)
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--output")
+    parser.add_argument("--force", action="store_true")
+    parser.add_argument("--allow-large", action="store_true")
+    parser.add_argument("--max-state-gb", type=float, default=32.0)
+    parser.add_argument("--chunk-size", type=int, default=1 << 20)
+    args = parser.parse_args()
+
+    output = Path(args.output) if args.output else default_output_path(
+        args.nqubits, args.nlayers, args.seed
+    )
+    if output.exists() and not args.force:
+        with open(output, "r", encoding="utf-8") as f:
+            data = json.load(f)
+        print(f"loaded {output}")
+        print(f"expectation={float(data['expectation']):.16e}")
+        return
+
+    state_gb = (2**args.nqubits) * np.dtype(np.complex128).itemsize / (1024**3)
+    if state_gb > args.max_state_gb and not args.allow_large:
+        raise MemoryError(
+            f"Estimated state vector alone is {state_gb:.1f} GiB. "
+            "Pass --allow-large after confirming the node has enough memory."
+        )
+
+    qibo.set_backend("qibojit")
+    circuit = build_circuit(args.nqubits, args.nlayers, args.seed)
+
+    start = time.perf_counter()
+    state = circuit().state(numpy=True).reshape(-1)
+    expectation = ring_xz_expectation(state, args.nqubits, args.chunk_size)
+    elapsed = time.perf_counter() - start
+
+    data = {
+        "backend": "qibojit",
+        "observable": "0.5 * sum_i X_i Z_((i+1) mod n)",
+        "nqubits": args.nqubits,
+        "nlayers": args.nlayers,
+        "seed": args.seed,
+        "expectation": expectation,
+        "seconds": elapsed,
+        "state_vector_gib_estimate": state_gb,
+    }
+    output.parent.mkdir(parents=True, exist_ok=True)
+    with open(output, "w", encoding="utf-8") as f:
+        json.dump(data, f, indent=2, sort_keys=True)
+        f.write("\n")
+
+    print(f"saved {output}")
+    print(f"expectation={expectation:.16e}")
+    print(f"seconds={elapsed:.3f}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/qibotn/backends/qmatchatea.py b/src/qibotn/backends/qmatchatea.py
index 6358728..41381dc 100644
--- a/src/qibotn/backends/qmatchatea.py
+++ b/src/qibotn/backends/qmatchatea.py
@@ -9,6 +9,7 @@ import qmatchatea
 import qtealeaves
 from qibo.backends import NumpyBackend
 from qibo.config import raise_error
+from qmatchatea.utils import MPISettings
 
 from qibotn.backends.abstract import QibotnBackend
 from qibotn.observables import check_observable
@@ -43,6 +44,10 @@ class QMatchaTeaBackend(QibotnBackend, NumpyBackend):
         compile_circuit: bool = False,
         cache_gate_tensors: bool = True,
         track_memory: bool = False,
+        mpi_approach: str = "SR",
+        mpi_num_procs: int = 1,
+        mpi_where_barriers: int = -1,
+        mpi_isometrization: int = -1,
     ):
         """Configure TN simulation given Quantum Matcha Tea interface.
 
@@ -84,6 +89,12 @@ class QMatchaTeaBackend(QibotnBackend, NumpyBackend):
         self.compile_circuit = compile_circuit
         self.cache_gate_tensors = cache_gate_tensors
         self.track_memory = track_memory
+        self.mpi_settings = MPISettings(
+            mpi_approach=mpi_approach,
+            num_procs=mpi_num_procs,
+            where_barriers=mpi_where_barriers,
+            isometrization=mpi_isometrization,
+        )
         if hasattr(self, "qmatchatea_backend"):
             self._setup_backend_specifics()
 
@@ -99,12 +110,12 @@ class QMatchaTeaBackend(QibotnBackend, NumpyBackend):
             else "Z" if self.precision == "double" else "A"
         )
 
-        # TODO: once MPI is available for Python, integrate it here
         self.qmatchatea_backend = qmatchatea.QCBackend(
             precision=qmatchatea_precision,
             device=qmatchatea_device,
             ansatz=self.ansatz,
             tensor_module=self.tensor_module,
+            mpi_settings=self.mpi_settings,
         )
         self.qmatchatea_backend.cache_gate_tensors = self.cache_gate_tensors
         self.qmatchatea_backend.track_memory = self.track_memory
@@ -254,6 +265,12 @@ class QMatchaTeaBackend(QibotnBackend, NumpyBackend):
             operators=operators,
         )
 
+        if self.qmatchatea_backend.mpi_approach != "SR":
+            from qtealeaves.tooling.mpisupport import MPI
+
+            if MPI is not None and MPI.COMM_WORLD.Get_rank() != 0:
+                return np.nan
+
         return np.real(results.observables["custom_hamiltonian"])
 
     def _qibocirc_to_qiskitcirc(
diff --git a/src/qibotn/backends/vidal_tebd.py b/src/qibotn/backends/vidal_tebd.py
new file mode 100644
index 0000000..fe324a9
--- /dev/null
+++ b/src/qibotn/backends/vidal_tebd.py
@@ -0,0 +1,419 @@
+"""Vidal/TEBD MPS executor for layer-parallel circuit simulation.
+
+This module is intentionally small and focused on the circuit family used by the
+MPS benchmarks: one-qubit gates and adjacent two-qubit gates on a 1D chain.  It
+keeps the state in Vidal form, so gates acting on disjoint bonds can be applied
+in parallel without moving a global mixed-canonical center.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import numpy as np
+
+
+def _backend_module(tensor_module):
+    if tensor_module == "torch":
+        import torch
+
+        return torch
+    if tensor_module == "numpy":
+        return np
+    raise ValueError(f"Unsupported tensor module {tensor_module!r}.")
+
+
+def _asarray(xp, value, dtype):
+    if xp is np:
+        return np.asarray(value, dtype=dtype)
+    return xp.as_tensor(value, dtype=dtype)
+
+
+def _ones(xp, size, dtype, device=None):
+    if xp is np:
+        return np.ones(size, dtype=np.float64 if dtype == np.complex128 else np.float32)
+    real_dtype = xp.float64 if dtype == xp.complex128 else xp.float32
+    return xp.ones(size, dtype=real_dtype, device=device)
+
+
+def _eye(xp, size, dtype, device=None):
+    if xp is np:
+        return np.eye(size, dtype=dtype)
+    return xp.eye(size, dtype=dtype, device=device)
+
+
+def _conj(xp, tensor):
+    return np.conjugate(tensor) if xp is np else tensor.conj()
+
+
+def _transpose(xp, tensor, axes):
+    return np.transpose(tensor, axes) if xp is np else tensor.permute(*axes)
+
+
+def _vdot_real(xp, left, right):
+    if xp is np:
+        return np.vdot(left.reshape(-1), right.reshape(-1)).real
+    return xp.vdot(left.reshape(-1), right.reshape(-1)).real
+
+
+def _to_float(x):
+    if hasattr(x, "detach"):
+        return float(x.detach().cpu().item())
+    return float(x)
+
+
+def _svd(xp, matrix):
+    return _svd_eigh(xp, matrix)
+
+
+def _svd_eigh(xp, matrix):
+    """SVD through Hermitian eigendecomposition.
+
+    This mirrors the E-style path that is fast for the benchmark matrices and
+    avoids torch's slower general-purpose SVD for many small/medium splits.
+    """
+
+    m_dim, n_dim = matrix.shape
+    if m_dim <= n_dim:
+        gram = matrix @ _conj(xp, matrix).T
+        eigvals, eigvecs = _eigh(xp, gram)
+        eigvals, eigvecs = _sort_eigh_desc(xp, eigvals, eigvecs)
+        singvals = _sqrt_clamped(xp, eigvals)
+        inv_s = _safe_inverse(xp, singvals)
+        vh = (_conj(xp, eigvecs).T @ matrix) * inv_s.reshape(-1, 1)
+        return eigvecs, singvals, vh
+
+    gram = _conj(xp, matrix).T @ matrix
+    eigvals, eigvecs = _eigh(xp, gram)
+    eigvals, eigvecs = _sort_eigh_desc(xp, eigvals, eigvecs)
+    singvals = _sqrt_clamped(xp, eigvals)
+    inv_s = _safe_inverse(xp, singvals)
+    umat = (matrix @ eigvecs) * inv_s.reshape(1, -1)
+    return umat, singvals, _conj(xp, eigvecs).T
+
+
+def _batched_svd_eigh(xp, matrices):
+    """Batched EVD SVD for a stack of same-shape torch matrices."""
+
+    m_dim, n_dim = matrices.shape[-2:]
+    if m_dim <= n_dim:
+        grams = matrices @ _conj(xp, matrices).transpose(-1, -2)
+        eigvals, eigvecs = xp.linalg.eigh(grams)
+        idx = xp.argsort(eigvals, dim=-1, descending=True)
+        eigvals = xp.gather(eigvals, -1, idx)
+        eigvecs = xp.gather(eigvecs, -1, idx.unsqueeze(-2).expand_as(eigvecs))
+        singvals = _sqrt_clamped(xp, eigvals)
+        inv_s = _safe_inverse(xp, singvals)
+        vhs = (eigvecs.conj().transpose(-1, -2) @ matrices) * inv_s.unsqueeze(-1)
+        return eigvecs, singvals, vhs
+
+    grams = _conj(xp, matrices).transpose(-1, -2) @ matrices
+    eigvals, eigvecs = xp.linalg.eigh(grams)
+    idx = xp.argsort(eigvals, dim=-1, descending=True)
+    eigvals = xp.gather(eigvals, -1, idx)
+    eigvecs = xp.gather(eigvecs, -1, idx.unsqueeze(-2).expand_as(eigvecs))
+    singvals = _sqrt_clamped(xp, eigvals)
+    inv_s = _safe_inverse(xp, singvals)
+    umats = (matrices @ eigvecs) * inv_s.unsqueeze(-2)
+    return umats, singvals, eigvecs.conj().transpose(-1, -2)
+
+
+def _eigh(xp, matrix):
+    if xp is np:
+        return np.linalg.eigh(matrix)
+    return xp.linalg.eigh(matrix)
+
+
+def _sort_eigh_desc(xp, eigvals, eigvecs):
+    if xp is np:
+        idx = np.argsort(eigvals)[::-1].copy()
+        return eigvals[idx], eigvecs[:, idx]
+    idx = xp.argsort(eigvals, descending=True)
+    return eigvals[idx], eigvecs[:, idx]
+
+
+def _sqrt_clamped(xp, eigvals):
+    if xp is np:
+        return np.sqrt(np.maximum(eigvals.real, 0.0))
+    return xp.sqrt(xp.clamp(eigvals.real, min=0.0))
+
+
+def _safe_inverse(xp, values):
+    if xp is np:
+        return np.where(values > 1e-300, 1.0 / values, 0.0)
+    return xp.where(values > 1e-300, 1.0 / values, xp.zeros_like(values))
+
+
+@dataclass
+class VidalTEBDExecutor:
+    nqubits: int
+    max_bond: int
+    cut_ratio: float = 1e-12
+    tensor_module: str = "torch"
+    workers: int = 1
+    use_batched: bool = False
+
+    def __post_init__(self):
+        self.xp = _backend_module(self.tensor_module)
+        if self.xp is np:
+            self.dtype = np.complex128
+            self.device = None
+        else:
+            self.dtype = self.xp.complex128
+            self.device = self.xp.device("cpu")
+
+        self.gammas = []
+        for _ in range(self.nqubits):
+            tensor = _asarray(self.xp, [[[1.0 + 0.0j], [0.0 + 0.0j]]], self.dtype)
+            self.gammas.append(tensor)
+        self.lambdas = [
+            _ones(self.xp, 1, self.dtype, self.device) for _ in range(self.nqubits + 1)
+        ]
+
+    def run_circuit(self, circuit):
+        for batch in _disjoint_batches(circuit.queue):
+            if (
+                self.use_batched
+                and _is_two_qubit_batch(batch)
+                and self.workers > 1
+                and len(batch) > 1
+            ):
+                self.apply_two_site_batch(batch)
+            else:
+                for gate in batch:
+                    self._apply_gate(gate)
+
+    def _apply_gate(self, gate):
+        sites = _gate_sites(gate)
+        matrix = _asarray(self.xp, gate.matrix(), self.dtype)
+        if len(sites) == 1:
+            self.apply_one_site(matrix, sites[0])
+        elif len(sites) == 2:
+            if abs(sites[0] - sites[1]) != 1:
+                raise NotImplementedError("VidalTEBDExecutor supports adjacent gates only.")
+            self.apply_two_site(matrix, sites[0], sites[1])
+        else:
+            raise NotImplementedError("Only one- and two-qubit gates are supported.")
+
+    def apply_one_site(self, op, pos):
+        # op[out_phys, in_phys] * gamma[left, in_phys, right]
+        self.gammas[pos] = self.xp.einsum("st,atb->asb", op, self.gammas[pos])
+
+    def apply_two_site(self, op, left_pos, right_pos):
+        item = self._build_two_site_matrix(op, left_pos, right_pos)
+        umat, singvals, vh = _svd(self.xp, item["matrix"])
+        self._install_two_site_split(item, umat, singvals, vh)
+
+    def apply_two_site_batch(self, batch):
+        items = []
+        for gate in batch:
+            sites = _gate_sites(gate)
+            op = _asarray(self.xp, gate.matrix(), self.dtype)
+            items.append(self._build_two_site_matrix(op, sites[0], sites[1]))
+
+        if self.xp is not np:
+            grouped = {}
+            for idx, item in enumerate(items):
+                grouped.setdefault(tuple(item["matrix"].shape), []).append(idx)
+
+            for indices in grouped.values():
+                if len(indices) == 1:
+                    item = items[indices[0]]
+                    umat, singvals, vh = _svd(self.xp, item["matrix"])
+                    item["split"] = (umat, singvals, vh)
+                    continue
+
+                matrices = self.xp.stack([items[idx]["matrix"] for idx in indices])
+                umats, singvals, vhs = _batched_svd_eigh(self.xp, matrices)
+                for out_idx, item_idx in enumerate(indices):
+                    items[item_idx]["split"] = (
+                        umats[out_idx],
+                        singvals[out_idx],
+                        vhs[out_idx],
+                    )
+        else:
+            for item in items:
+                item["split"] = _svd(self.xp, item["matrix"])
+
+        for item in items:
+            self._install_two_site_split(item, *item["split"])
+
+    def _build_two_site_matrix(self, op, left_pos, right_pos):
+        if left_pos > right_pos:
+            left_pos, right_pos = right_pos, left_pos
+            op = _transpose(self.xp, op.reshape(2, 2, 2, 2), (1, 0, 3, 2)).reshape(
+                4, 4
+            )
+
+        i = left_pos
+        lam_left = self.lambdas[i]
+        lam_mid = self.lambdas[i + 1]
+        lam_right = self.lambdas[i + 2]
+        gamma_left = self.gammas[i]
+        gamma_right = self.gammas[i + 1]
+
+        theta = self.xp.einsum(
+            "a,asb,b,btc,c->astc",
+            lam_left,
+            gamma_left,
+            lam_mid,
+            gamma_right,
+            lam_right,
+        )
+        gate = op.reshape(2, 2, 2, 2)
+        theta = self.xp.einsum("uvst,astc->auvc", gate, theta)
+
+        chi_left = theta.shape[0]
+        chi_right = theta.shape[3]
+        matrix = theta.reshape(chi_left * 2, 2 * chi_right)
+        return {
+            "site": i,
+            "chi_left": chi_left,
+            "chi_right": chi_right,
+            "lam_left": lam_left,
+            "lam_right": lam_right,
+            "matrix": matrix,
+        }
+
+    def _install_two_site_split(self, item, umat, singvals, vh):
+        keep = self._choose_bond(singvals)
+        umat = umat[:, :keep]
+        kept = singvals[:keep]
+        cut = singvals[keep:]
+        vh = vh[:keep, :]
+
+        if cut.shape[0] > 0:
+            norm_kept = (kept * kept).sum()
+            norm_cut = (cut * cut).sum()
+            kept = kept / self.xp.sqrt(norm_kept / (norm_kept + norm_cut))
+
+        new_left = umat.reshape(item["chi_left"], 2, keep)
+        new_right = vh.reshape(keep, 2, item["chi_right"])
+        new_left = self._divide_left_lambda(new_left, item["lam_left"])
+        new_right = self._divide_right_lambda(new_right, item["lam_right"])
+
+        i = item["site"]
+        self.gammas[i] = new_left
+        self.gammas[i + 1] = new_right
+        self.lambdas[i + 1] = kept
+
+    def _choose_bond(self, singvals):
+        max_possible = int(singvals.shape[0])
+        keep = min(max_possible, self.max_bond)
+        if self.cut_ratio > 0 and max_possible > 0:
+            threshold = singvals[0] * self.cut_ratio
+            if self.xp is np:
+                ratio_keep = int(np.count_nonzero(singvals > threshold))
+            else:
+                ratio_keep = int((singvals > threshold).sum().detach().cpu().item())
+            keep = min(keep, max(1, ratio_keep))
+        return keep
+
+    def _divide_left_lambda(self, tensor, lambdas):
+        if self.xp is np:
+            safe = np.where(np.abs(lambdas) > 1e-300, lambdas, 1.0)
+        else:
+            safe = self.xp.where(
+                self.xp.abs(lambdas) > 1e-300,
+                lambdas,
+                self.xp.ones_like(lambdas),
+            )
+        return tensor / safe.reshape(-1, 1, 1)
+
+    def _divide_right_lambda(self, tensor, lambdas):
+        if self.xp is np:
+            safe = np.where(np.abs(lambdas) > 1e-300, lambdas, 1.0)
+        else:
+            safe = self.xp.where(
+                self.xp.abs(lambdas) > 1e-300,
+                lambdas,
+                self.xp.ones_like(lambdas),
+            )
+        return tensor / safe.reshape(1, 1, -1)
+
+    def expectation_ring_xz(self):
+        xmat = _asarray(self.xp, [[0, 1], [1, 0]], self.dtype)
+        zmat = _asarray(self.xp, [[1, 0], [0, -1]], self.dtype)
+        value = 0.0
+        for site in range(self.nqubits - 1):
+            value += 0.5 * _to_float(self._expect_adjacent(site, xmat, zmat))
+        value += 0.5 * _to_float(
+            self.expect_product_operators({self.nqubits - 1: xmat, 0: zmat})
+        )
+        return value / self.norm()
+
+    def _expect_adjacent(self, site, op_left, op_right):
+        theta = self.xp.einsum(
+            "a,asb,b,btc,c->astc",
+            self.lambdas[site],
+            self.gammas[site],
+            self.lambdas[site + 1],
+            self.gammas[site + 1],
+            self.lambdas[site + 2],
+        )
+        op_theta = self.xp.einsum("us,vt,astc->auvc", op_left, op_right, theta)
+        return _vdot_real(self.xp, theta, op_theta)
+
+    def expect_product_operators(self, operators):
+        env = _asarray(self.xp, [[1.0 + 0.0j]], self.dtype)
+        identity = _eye(self.xp, 2, self.dtype, self.device)
+        for site in range(self.nqubits):
+            tensor = self.gammas[site] * self.lambdas[site + 1].reshape(1, 1, -1)
+            op = operators.get(site, identity)
+            env = self.xp.einsum(
+                "xy,xsb,st,ytd->bd", env, _conj(self.xp, tensor), op, tensor
+            )
+        return env.reshape(-1)[0].real
+
+    def norm(self):
+        return _to_float(self.expect_product_operators({}))
+
+
+def _gate_sites(gate):
+    controls = tuple(getattr(gate, "control_qubits", ()))
+    targets = tuple(getattr(gate, "target_qubits", ()))
+    if controls:
+        return controls + targets
+    return targets
+
+
+def _disjoint_batches(gates):
+    batches = []
+    current = []
+    touched = set()
+    for gate in gates:
+        sites = _gate_sites(gate)
+        site_set = set(sites)
+        if current and touched & site_set:
+            batches.append(current)
+            current = []
+            touched = set()
+        current.append(gate)
+        touched |= site_set
+    if current:
+        batches.append(current)
+    return batches
+
+
+def _is_two_qubit_batch(batch):
+    return batch and all(len(_gate_sites(gate)) == 2 for gate in batch)
+
+
+def run_vidal_ring_xz(
+    circuit,
+    max_bond,
+    cut_ratio=1e-12,
+    tensor_module="torch",
+    workers=1,
+    use_batched=False,
+):
+    executor = VidalTEBDExecutor(
+        nqubits=circuit.nqubits,
+        max_bond=max_bond,
+        cut_ratio=cut_ratio,
+        tensor_module=tensor_module,
+        workers=workers,
+        use_batched=use_batched,
+    )
+    executor.run_circuit(circuit)
+    return executor.expectation_ring_xz()