numpy换为torch；修复torch后端计算方式不支持的问题，加速svd；发现并行化err，尝试修复；当前加速比11x,但是最新的并行方式不稳定，可能有精度问题

2026-05-10 22:33:46 +08:00
parent fea8e5abc0
commit aa122964b4
4 changed files with 638 additions and 58 deletions
--- a/baseline_mps_expectation.py
+++ b/baseline_mps_expectation.py
@@ -1,18 +1,17 @@
 """Baseline MPS expectation scan with the qmatchatea backend."""
 import argparse
 import json
 import logging
 import math
 import time
 import numpy as np
 from qibo import Circuit, gates, hamiltonians
 from qibo.symbols import X, Z
 from qibotn.backends.qmatchatea import QMatchaTeaBackend
-
+from qibotn.backends.vidal_tebd import run_vidal_ring_xz
 def parse_bonds(value):
    return [int(item) for item in value.split(",") if item.strip()]
 def build_circuit(nqubits, nlayers, seed):
@@ -33,9 +32,8 @@ def build_circuit(nqubits, nlayers, seed):
 def build_observable(nqubits):
    form = 0
-    for qubit in range(nqubits - 1):
+    for qubit in range(nqubits):
-        form += 0.5 * Z(qubit) * Z(qubit + 1)
+        form += 0.5 * X(qubit) * Z((qubit + 1) % nqubits)
    form += 0.25 * X(0)
    return hamiltonians.SymbolicHamiltonian(form=form)
@@ -43,86 +41,123 @@ def exact_expectation(circuit, nqubits):
    import numpy as np
    state = circuit().state(numpy=True).reshape(-1)
    probabilities = np.abs(state) ** 2
    indices = np.arange(state.size)
    value = 0.0
-    for qubit in range(nqubits - 1):
+    chunk_size = 1 << 20
-        left = (indices >> (nqubits - 1 - qubit)) & 1
+    for qubit in range(nqubits):
-        right = (indices >> (nqubits - 2 - qubit)) & 1
+        next_qubit = (qubit + 1) % nqubits
-        value += 0.5 * np.sum(probabilities * (1 - 2 * left) * (1 - 2 * right))
+        x_flip = 1 << (nqubits - 1 - qubit)
-
+        z_shift = nqubits - 1 - next_qubit
-    flip_q0 = 1 << (nqubits - 1)
+        term = 0.0
-    value += 0.25 * np.vdot(state[indices ^ flip_q0], state).real
+        for start in range(0, state.size, chunk_size):
            stop = min(start + chunk_size, state.size)
            indices = np.arange(start, stop, dtype=np.int64)
            z_bit = (indices >> z_shift) & 1
            z_phase = 1 - 2 * z_bit
            term += np.vdot(state[indices ^ x_flip], z_phase * state[start:stop]).real
        value += 0.5 * term
    return float(value)
 def main():
    parser = argparse.ArgumentParser()
-    parser.add_argument("--nqubits", type=int, default=20)
+    parser.add_argument("--nqubits", type=int, default=40)
-    parser.add_argument("--nlayers", type=int, default=8)
+    parser.add_argument("--nlayers", type=int, default=30)
-    parser.add_argument("--bonds", type=parse_bonds, default=parse_bonds("2,4,8,16,32"))
+    parser.add_argument("--bond", "--bonds", dest="bond", type=int, default=512)
    parser.add_argument("--seed", type=int, default=42)
-    parser.add_argument("--cut-ratio", type=float, default=1e-12)
+    parser.add_argument("--tensor-module", choices=("numpy", "torch"), default="torch")
-    parser.add_argument("--svd-control", default="V")
+    parser.add_argument("--torch-threads", type=int, default=32)
-    parser.add_argument("--tensor-module", choices=("numpy", "torch"), default="numpy")
+    parser.add_argument(
-    parser.add_argument("--torch-threads", type=int)
+        "--executor", choices=("qmatchatea", "vidal"), default="qmatchatea"
    )
    parser.add_argument("--vidal-workers", type=int, default=1)
    parser.add_argument("--vidal-batched", action="store_true")
    parser.add_argument("--mpi-ct", action="store_true")
    parser.add_argument("--mpi-barriers", type=int, default=-1)
    parser.add_argument("--mpi-isometrization", type=int, default=-1)
    parser.add_argument("--exact", action="store_true")
    parser.add_argument("--exact-max-qubits", type=int, default=24)
-    parser.add_argument("--preprocess", action="store_true")
+    parser.add_argument("--reference-file")
    parser.add_argument("--compile-circuit", action="store_true")
    parser.add_argument("--track-memory", action="store_true")
    args = parser.parse_args()
    logging.getLogger("qibo.config").setLevel(logging.ERROR)
    logging.getLogger("qtealeaves").setLevel(logging.ERROR)
    if args.torch_threads is not None:
    import torch
    torch.set_num_threads(args.torch_threads)
    rank = 0
    size = 1
    if args.mpi_ct:
        from mpi4py import MPI
        rank = MPI.COMM_WORLD.Get_rank()
        size = MPI.COMM_WORLD.Get_size()
    circuit = build_circuit(args.nqubits, args.nlayers, args.seed)
    observable = build_observable(args.nqubits)
    exact = None
-    if args.exact:
+    if args.reference_file:
        with open(args.reference_file, "r", encoding="utf-8") as f:
            exact = float(json.load(f)["expectation"])
    elif args.exact:
        if args.nqubits > args.exact_max_qubits:
            raise ValueError(
                f"--exact is limited to {args.exact_max_qubits} qubits by default."
            )
        exact = exact_expectation(circuit, args.nqubits)
    if rank == 0:
        mpi_label = f"MPIMPS/{size}" if args.mpi_ct else "SR"
        print(
            f"nqubits={args.nqubits} nlayers={args.nlayers} "
-        f"seed={args.seed} preprocess={args.preprocess} "
+            f"bond={args.bond} seed={args.seed} "
-        f"tensor_module={args.tensor_module}"
+            f"tensor_module={args.tensor_module} svd_control=E! "
            f"compile_circuit=True mpi={mpi_label} executor={args.executor} "
            f"vidal_workers={args.vidal_workers} vidal_batched={args.vidal_batched}"
        )
        if exact is not None:
            print(f"exact={exact:.16e}")
-    print("bond_dim expval abs_error rel_error seconds")
+        print("expval abs_error rel_error seconds")
    start = time.perf_counter()
    if args.executor == "vidal":
        if args.mpi_ct:
            raise ValueError("--executor vidal is a single-process executor.")
        value = run_vidal_ring_xz(
            circuit,
            max_bond=args.bond,
            cut_ratio=1e-12,
            tensor_module=args.tensor_module,
            workers=args.vidal_workers,
            use_batched=args.vidal_batched,
        )
    else:
        backend = QMatchaTeaBackend()
    for bond in args.bonds:
        backend.configure_tn_simulation(
            ansatz="MPS",
-            max_bond_dimension=bond,
+            max_bond_dimension=args.bond,
-            cut_ratio=args.cut_ratio,
+            cut_ratio=1e-12,
-            svd_control=args.svd_control,
+            svd_control="E!",
            tensor_module=args.tensor_module,
-            compile_circuit=args.compile_circuit,
+            compile_circuit=True,
-            track_memory=args.track_memory,
+            track_memory=False,
            mpi_approach="CT" if args.mpi_ct else "SR",
            mpi_num_procs=size,
            mpi_where_barriers=args.mpi_barriers if args.mpi_ct else -1,
            mpi_isometrization=args.mpi_isometrization,
        )
-        start = time.perf_counter()
+        value = backend.expectation(
        value = float(
            backend.expectation(
            circuit,
            observable,
-                preprocess=args.preprocess,
+            preprocess=False,
-                compile_circuit=args.compile_circuit,
+            compile_circuit=True,
            ).real
        )
    if rank != 0:
        return
    value = float(np.real(value))
    elapsed = time.perf_counter() - start
    abs_error = float("nan") if exact is None else abs(value - exact)
    rel_error = float("nan") if exact is None else abs_error / max(abs(exact), 1e-15)
-        print(f"{bond:d} {value:.16e} {abs_error:.6e} {rel_error:.6e} {elapsed:.3f}")
+    print(f"{value:.16e} {abs_error:.6e} {rel_error:.6e} {elapsed:.3f}")
 if __name__ == "__main__":
--- a/qibojit_reference_expectation.py
+++ b/qibojit_reference_expectation.py
@@ -0,0 +1,109 @@
 """Compute and cache a qibojit state-vector reference for the ring-XZ observable."""
 import argparse
 import json
 import math
 import time
 from pathlib import Path
 import numpy as np
 import qibo
 from qibo import Circuit, gates
 def build_circuit(nqubits, nlayers, seed):
    rng = np.random.default_rng(seed)
    circuit = Circuit(nqubits)
    for _ in range(nlayers):
        for qubit in range(nqubits):
            circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi)))
            circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi)))
        for qubit in range(0, nqubits - 1, 2):
            circuit.add(gates.CNOT(qubit, qubit + 1))
        for qubit in range(1, nqubits - 1, 2):
            circuit.add(gates.CNOT(qubit, qubit + 1))
    return circuit
 def ring_xz_expectation(state, nqubits, chunk_size):
    value = 0.0
    for qubit in range(nqubits):
        next_qubit = (qubit + 1) % nqubits
        x_flip = 1 << (nqubits - 1 - qubit)
        z_shift = nqubits - 1 - next_qubit
        term = 0.0
        for start in range(0, state.size, chunk_size):
            stop = min(start + chunk_size, state.size)
            indices = np.arange(start, stop, dtype=np.int64)
            z_bit = (indices >> z_shift) & 1
            z_phase = 1 - 2 * z_bit
            term += np.vdot(state[indices ^ x_flip], z_phase * state[start:stop]).real
        value += 0.5 * term
    return float(value)
 def default_output_path(nqubits, nlayers, seed):
    return Path("references") / (
        f"qibojit_ring_xz_n{nqubits}_l{nlayers}_seed{seed}.json"
    )
 def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--nqubits", type=int, default=32)
    parser.add_argument("--nlayers", type=int, default=3)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--output")
    parser.add_argument("--force", action="store_true")
    parser.add_argument("--allow-large", action="store_true")
    parser.add_argument("--max-state-gb", type=float, default=32.0)
    parser.add_argument("--chunk-size", type=int, default=1 << 20)
    args = parser.parse_args()
    output = Path(args.output) if args.output else default_output_path(
        args.nqubits, args.nlayers, args.seed
    )
    if output.exists() and not args.force:
        with open(output, "r", encoding="utf-8") as f:
            data = json.load(f)
        print(f"loaded {output}")
        print(f"expectation={float(data['expectation']):.16e}")
        return
    state_gb = (2**args.nqubits) * np.dtype(np.complex128).itemsize / (1024**3)
    if state_gb > args.max_state_gb and not args.allow_large:
        raise MemoryError(
            f"Estimated state vector alone is {state_gb:.1f} GiB. "
            "Pass --allow-large after confirming the node has enough memory."
        )
    qibo.set_backend("qibojit")
    circuit = build_circuit(args.nqubits, args.nlayers, args.seed)
    start = time.perf_counter()
    state = circuit().state(numpy=True).reshape(-1)
    expectation = ring_xz_expectation(state, args.nqubits, args.chunk_size)
    elapsed = time.perf_counter() - start
    data = {
        "backend": "qibojit",
        "observable": "0.5 * sum_i X_i Z_((i+1) mod n)",
        "nqubits": args.nqubits,
        "nlayers": args.nlayers,
        "seed": args.seed,
        "expectation": expectation,
        "seconds": elapsed,
        "state_vector_gib_estimate": state_gb,
    }
    output.parent.mkdir(parents=True, exist_ok=True)
    with open(output, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, sort_keys=True)
        f.write("\n")
    print(f"saved {output}")
    print(f"expectation={expectation:.16e}")
    print(f"seconds={elapsed:.3f}")
 if __name__ == "__main__":
    main()
--- a/src/qibotn/backends/qmatchatea.py
+++ b/src/qibotn/backends/qmatchatea.py
@@ -9,6 +9,7 @@ import qmatchatea
 import qtealeaves
 from qibo.backends import NumpyBackend
 from qibo.config import raise_error
 from qmatchatea.utils import MPISettings
 from qibotn.backends.abstract import QibotnBackend
 from qibotn.observables import check_observable
@@ -43,6 +44,10 @@ class QMatchaTeaBackend(QibotnBackend, NumpyBackend):
        compile_circuit: bool = False,
        cache_gate_tensors: bool = True,
        track_memory: bool = False,
        mpi_approach: str = "SR",
        mpi_num_procs: int = 1,
        mpi_where_barriers: int = -1,
        mpi_isometrization: int = -1,
    ):
        """Configure TN simulation given Quantum Matcha Tea interface.
@@ -84,6 +89,12 @@ class QMatchaTeaBackend(QibotnBackend, NumpyBackend):
        self.compile_circuit = compile_circuit
        self.cache_gate_tensors = cache_gate_tensors
        self.track_memory = track_memory
        self.mpi_settings = MPISettings(
            mpi_approach=mpi_approach,
            num_procs=mpi_num_procs,
            where_barriers=mpi_where_barriers,
            isometrization=mpi_isometrization,
        )
        if hasattr(self, "qmatchatea_backend"):
            self._setup_backend_specifics()
@@ -99,12 +110,12 @@ class QMatchaTeaBackend(QibotnBackend, NumpyBackend):
            else "Z" if self.precision == "double" else "A"
        )
        # TODO: once MPI is available for Python, integrate it here
        self.qmatchatea_backend = qmatchatea.QCBackend(
            precision=qmatchatea_precision,
            device=qmatchatea_device,
            ansatz=self.ansatz,
            tensor_module=self.tensor_module,
            mpi_settings=self.mpi_settings,
        )
        self.qmatchatea_backend.cache_gate_tensors = self.cache_gate_tensors
        self.qmatchatea_backend.track_memory = self.track_memory
@@ -254,6 +265,12 @@ class QMatchaTeaBackend(QibotnBackend, NumpyBackend):
            operators=operators,
        )
        if self.qmatchatea_backend.mpi_approach != "SR":
            from qtealeaves.tooling.mpisupport import MPI
            if MPI is not None and MPI.COMM_WORLD.Get_rank() != 0:
                return np.nan
        return np.real(results.observables["custom_hamiltonian"])
    def _qibocirc_to_qiskitcirc(
--- a/src/qibotn/backends/vidal_tebd.py
+++ b/src/qibotn/backends/vidal_tebd.py
@@ -0,0 +1,419 @@
 """Vidal/TEBD MPS executor for layer-parallel circuit simulation.
 This module is intentionally small and focused on the circuit family used by the
 MPS benchmarks: one-qubit gates and adjacent two-qubit gates on a 1D chain.  It
 keeps the state in Vidal form, so gates acting on disjoint bonds can be applied
 in parallel without moving a global mixed-canonical center.
 """
 from __future__ import annotations
 from dataclasses import dataclass
 import numpy as np
 def _backend_module(tensor_module):
    if tensor_module == "torch":
        import torch
        return torch
    if tensor_module == "numpy":
        return np
    raise ValueError(f"Unsupported tensor module {tensor_module!r}.")
 def _asarray(xp, value, dtype):
    if xp is np:
        return np.asarray(value, dtype=dtype)
    return xp.as_tensor(value, dtype=dtype)
 def _ones(xp, size, dtype, device=None):
    if xp is np:
        return np.ones(size, dtype=np.float64 if dtype == np.complex128 else np.float32)
    real_dtype = xp.float64 if dtype == xp.complex128 else xp.float32
    return xp.ones(size, dtype=real_dtype, device=device)
 def _eye(xp, size, dtype, device=None):
    if xp is np:
        return np.eye(size, dtype=dtype)
    return xp.eye(size, dtype=dtype, device=device)
 def _conj(xp, tensor):
    return np.conjugate(tensor) if xp is np else tensor.conj()
 def _transpose(xp, tensor, axes):
    return np.transpose(tensor, axes) if xp is np else tensor.permute(*axes)
 def _vdot_real(xp, left, right):
    if xp is np:
        return np.vdot(left.reshape(-1), right.reshape(-1)).real
    return xp.vdot(left.reshape(-1), right.reshape(-1)).real
 def _to_float(x):
    if hasattr(x, "detach"):
        return float(x.detach().cpu().item())
    return float(x)
 def _svd(xp, matrix):
    return _svd_eigh(xp, matrix)
 def _svd_eigh(xp, matrix):
    """SVD through Hermitian eigendecomposition.
    This mirrors the E-style path that is fast for the benchmark matrices and
    avoids torch's slower general-purpose SVD for many small/medium splits.
    """
    m_dim, n_dim = matrix.shape
    if m_dim <= n_dim:
        gram = matrix @ _conj(xp, matrix).T
        eigvals, eigvecs = _eigh(xp, gram)
        eigvals, eigvecs = _sort_eigh_desc(xp, eigvals, eigvecs)
        singvals = _sqrt_clamped(xp, eigvals)
        inv_s = _safe_inverse(xp, singvals)
        vh = (_conj(xp, eigvecs).T @ matrix) * inv_s.reshape(-1, 1)
        return eigvecs, singvals, vh
    gram = _conj(xp, matrix).T @ matrix
    eigvals, eigvecs = _eigh(xp, gram)
    eigvals, eigvecs = _sort_eigh_desc(xp, eigvals, eigvecs)
    singvals = _sqrt_clamped(xp, eigvals)
    inv_s = _safe_inverse(xp, singvals)
    umat = (matrix @ eigvecs) * inv_s.reshape(1, -1)
    return umat, singvals, _conj(xp, eigvecs).T
 def _batched_svd_eigh(xp, matrices):
    """Batched EVD SVD for a stack of same-shape torch matrices."""
    m_dim, n_dim = matrices.shape[-2:]
    if m_dim <= n_dim:
        grams = matrices @ _conj(xp, matrices).transpose(-1, -2)
        eigvals, eigvecs = xp.linalg.eigh(grams)
        idx = xp.argsort(eigvals, dim=-1, descending=True)
        eigvals = xp.gather(eigvals, -1, idx)
        eigvecs = xp.gather(eigvecs, -1, idx.unsqueeze(-2).expand_as(eigvecs))
        singvals = _sqrt_clamped(xp, eigvals)
        inv_s = _safe_inverse(xp, singvals)
        vhs = (eigvecs.conj().transpose(-1, -2) @ matrices) * inv_s.unsqueeze(-1)
        return eigvecs, singvals, vhs
    grams = _conj(xp, matrices).transpose(-1, -2) @ matrices
    eigvals, eigvecs = xp.linalg.eigh(grams)
    idx = xp.argsort(eigvals, dim=-1, descending=True)
    eigvals = xp.gather(eigvals, -1, idx)
    eigvecs = xp.gather(eigvecs, -1, idx.unsqueeze(-2).expand_as(eigvecs))
    singvals = _sqrt_clamped(xp, eigvals)
    inv_s = _safe_inverse(xp, singvals)
    umats = (matrices @ eigvecs) * inv_s.unsqueeze(-2)
    return umats, singvals, eigvecs.conj().transpose(-1, -2)
 def _eigh(xp, matrix):
    if xp is np:
        return np.linalg.eigh(matrix)
    return xp.linalg.eigh(matrix)
 def _sort_eigh_desc(xp, eigvals, eigvecs):
    if xp is np:
        idx = np.argsort(eigvals)[::-1].copy()
        return eigvals[idx], eigvecs[:, idx]
    idx = xp.argsort(eigvals, descending=True)
    return eigvals[idx], eigvecs[:, idx]
 def _sqrt_clamped(xp, eigvals):
    if xp is np:
        return np.sqrt(np.maximum(eigvals.real, 0.0))
    return xp.sqrt(xp.clamp(eigvals.real, min=0.0))
 def _safe_inverse(xp, values):
    if xp is np:
        return np.where(values > 1e-300, 1.0 / values, 0.0)
    return xp.where(values > 1e-300, 1.0 / values, xp.zeros_like(values))
@dataclass
 class VidalTEBDExecutor:
    nqubits: int
    max_bond: int
    cut_ratio: float = 1e-12
    tensor_module: str = "torch"
    workers: int = 1
    use_batched: bool = False
    def __post_init__(self):
        self.xp = _backend_module(self.tensor_module)
        if self.xp is np:
            self.dtype = np.complex128
            self.device = None
        else:
            self.dtype = self.xp.complex128
            self.device = self.xp.device("cpu")
        self.gammas = []
        for _ in range(self.nqubits):
            tensor = _asarray(self.xp, [[[1.0 + 0.0j], [0.0 + 0.0j]]], self.dtype)
            self.gammas.append(tensor)
        self.lambdas = [
            _ones(self.xp, 1, self.dtype, self.device) for _ in range(self.nqubits + 1)
        ]
    def run_circuit(self, circuit):
        for batch in _disjoint_batches(circuit.queue):
            if (
                self.use_batched
                and _is_two_qubit_batch(batch)
                and self.workers > 1
                and len(batch) > 1
            ):
                self.apply_two_site_batch(batch)
            else:
                for gate in batch:
                    self._apply_gate(gate)
    def _apply_gate(self, gate):
        sites = _gate_sites(gate)
        matrix = _asarray(self.xp, gate.matrix(), self.dtype)
        if len(sites) == 1:
            self.apply_one_site(matrix, sites[0])
        elif len(sites) == 2:
            if abs(sites[0] - sites[1]) != 1:
                raise NotImplementedError("VidalTEBDExecutor supports adjacent gates only.")
            self.apply_two_site(matrix, sites[0], sites[1])
        else:
            raise NotImplementedError("Only one- and two-qubit gates are supported.")
    def apply_one_site(self, op, pos):
        # op[out_phys, in_phys] * gamma[left, in_phys, right]
        self.gammas[pos] = self.xp.einsum("st,atb->asb", op, self.gammas[pos])
    def apply_two_site(self, op, left_pos, right_pos):
        item = self._build_two_site_matrix(op, left_pos, right_pos)
        umat, singvals, vh = _svd(self.xp, item["matrix"])
        self._install_two_site_split(item, umat, singvals, vh)
    def apply_two_site_batch(self, batch):
        items = []
        for gate in batch:
            sites = _gate_sites(gate)
            op = _asarray(self.xp, gate.matrix(), self.dtype)
            items.append(self._build_two_site_matrix(op, sites[0], sites[1]))
        if self.xp is not np:
            grouped = {}
            for idx, item in enumerate(items):
                grouped.setdefault(tuple(item["matrix"].shape), []).append(idx)
            for indices in grouped.values():
                if len(indices) == 1:
                    item = items[indices[0]]
                    umat, singvals, vh = _svd(self.xp, item["matrix"])
                    item["split"] = (umat, singvals, vh)
                    continue
                matrices = self.xp.stack([items[idx]["matrix"] for idx in indices])
                umats, singvals, vhs = _batched_svd_eigh(self.xp, matrices)
                for out_idx, item_idx in enumerate(indices):
                    items[item_idx]["split"] = (
                        umats[out_idx],
                        singvals[out_idx],
                        vhs[out_idx],
                    )
        else:
            for item in items:
                item["split"] = _svd(self.xp, item["matrix"])
        for item in items:
            self._install_two_site_split(item, *item["split"])
    def _build_two_site_matrix(self, op, left_pos, right_pos):
        if left_pos > right_pos:
            left_pos, right_pos = right_pos, left_pos
            op = _transpose(self.xp, op.reshape(2, 2, 2, 2), (1, 0, 3, 2)).reshape(
                4, 4
            )
        i = left_pos
        lam_left = self.lambdas[i]
        lam_mid = self.lambdas[i + 1]
        lam_right = self.lambdas[i + 2]
        gamma_left = self.gammas[i]
        gamma_right = self.gammas[i + 1]
        theta = self.xp.einsum(
            "a,asb,b,btc,c->astc",
            lam_left,
            gamma_left,
            lam_mid,
            gamma_right,
            lam_right,
        )
        gate = op.reshape(2, 2, 2, 2)
        theta = self.xp.einsum("uvst,astc->auvc", gate, theta)
        chi_left = theta.shape[0]
        chi_right = theta.shape[3]
        matrix = theta.reshape(chi_left * 2, 2 * chi_right)
        return {
            "site": i,
            "chi_left": chi_left,
            "chi_right": chi_right,
            "lam_left": lam_left,
            "lam_right": lam_right,
            "matrix": matrix,
        }
    def _install_two_site_split(self, item, umat, singvals, vh):
        keep = self._choose_bond(singvals)
        umat = umat[:, :keep]
        kept = singvals[:keep]
        cut = singvals[keep:]
        vh = vh[:keep, :]
        if cut.shape[0] > 0:
            norm_kept = (kept * kept).sum()
            norm_cut = (cut * cut).sum()
            kept = kept / self.xp.sqrt(norm_kept / (norm_kept + norm_cut))
        new_left = umat.reshape(item["chi_left"], 2, keep)
        new_right = vh.reshape(keep, 2, item["chi_right"])
        new_left = self._divide_left_lambda(new_left, item["lam_left"])
        new_right = self._divide_right_lambda(new_right, item["lam_right"])
        i = item["site"]
        self.gammas[i] = new_left
        self.gammas[i + 1] = new_right
        self.lambdas[i + 1] = kept
    def _choose_bond(self, singvals):
        max_possible = int(singvals.shape[0])
        keep = min(max_possible, self.max_bond)
        if self.cut_ratio > 0 and max_possible > 0:
            threshold = singvals[0] * self.cut_ratio
            if self.xp is np:
                ratio_keep = int(np.count_nonzero(singvals > threshold))
            else:
                ratio_keep = int((singvals > threshold).sum().detach().cpu().item())
            keep = min(keep, max(1, ratio_keep))
        return keep
    def _divide_left_lambda(self, tensor, lambdas):
        if self.xp is np:
            safe = np.where(np.abs(lambdas) > 1e-300, lambdas, 1.0)
        else:
            safe = self.xp.where(
                self.xp.abs(lambdas) > 1e-300,
                lambdas,
                self.xp.ones_like(lambdas),
            )
        return tensor / safe.reshape(-1, 1, 1)
    def _divide_right_lambda(self, tensor, lambdas):
        if self.xp is np:
            safe = np.where(np.abs(lambdas) > 1e-300, lambdas, 1.0)
        else:
            safe = self.xp.where(
                self.xp.abs(lambdas) > 1e-300,
                lambdas,
                self.xp.ones_like(lambdas),
            )
        return tensor / safe.reshape(1, 1, -1)
    def expectation_ring_xz(self):
        xmat = _asarray(self.xp, [[0, 1], [1, 0]], self.dtype)
        zmat = _asarray(self.xp, [[1, 0], [0, -1]], self.dtype)
        value = 0.0
        for site in range(self.nqubits - 1):
            value += 0.5 * _to_float(self._expect_adjacent(site, xmat, zmat))
        value += 0.5 * _to_float(
            self.expect_product_operators({self.nqubits - 1: xmat, 0: zmat})
        )
        return value / self.norm()
    def _expect_adjacent(self, site, op_left, op_right):
        theta = self.xp.einsum(
            "a,asb,b,btc,c->astc",
            self.lambdas[site],
            self.gammas[site],
            self.lambdas[site + 1],
            self.gammas[site + 1],
            self.lambdas[site + 2],
        )
        op_theta = self.xp.einsum("us,vt,astc->auvc", op_left, op_right, theta)
        return _vdot_real(self.xp, theta, op_theta)
    def expect_product_operators(self, operators):
        env = _asarray(self.xp, [[1.0 + 0.0j]], self.dtype)
        identity = _eye(self.xp, 2, self.dtype, self.device)
        for site in range(self.nqubits):
            tensor = self.gammas[site] * self.lambdas[site + 1].reshape(1, 1, -1)
            op = operators.get(site, identity)
            env = self.xp.einsum(
                "xy,xsb,st,ytd->bd", env, _conj(self.xp, tensor), op, tensor
            )
        return env.reshape(-1)[0].real
    def norm(self):
        return _to_float(self.expect_product_operators({}))
 def _gate_sites(gate):
    controls = tuple(getattr(gate, "control_qubits", ()))
    targets = tuple(getattr(gate, "target_qubits", ()))
    if controls:
        return controls + targets
    return targets
 def _disjoint_batches(gates):
    batches = []
    current = []
    touched = set()
    for gate in gates:
        sites = _gate_sites(gate)
        site_set = set(sites)
        if current and touched & site_set:
            batches.append(current)
            current = []
            touched = set()
        current.append(gate)
        touched |= site_set
    if current:
        batches.append(current)
    return batches
 def _is_two_qubit_batch(batch):
    return batch and all(len(_gate_sites(gate)) == 2 for gate in batch)
 def run_vidal_ring_xz(
    circuit,
    max_bond,
    cut_ratio=1e-12,
    tensor_module="torch",
    workers=1,
    use_batched=False,
 ):
    executor = VidalTEBDExecutor(
        nqubits=circuit.nqubits,
        max_bond=max_bond,
        cut_ratio=cut_ratio,
        tensor_module=tensor_module,
        workers=workers,
        use_batched=use_batched,
    )
    executor.run_circuit(circuit)
    return executor.expectation_ring_xz()