代码封装

2026-05-18 22:58:57 +08:00
parent eed42dcfa9
commit f93c95b3a1
56 changed files with 3414 additions and 5849 deletions
--- a/.venv/lib/python3.12/site-packages/quimb/tensor/circuit.py
+++ b/.venv/lib/python3.12/site-packages/quimb/tensor/circuit.py
@@ -1573,6 +1573,23 @@ def _combine_1q_gate_run(gates, array_fn=None):
    return Gate.from_raw(G, gates[0].qubits)
 def _combine_2q_gate_run(gates, array_fn=None):
    """Fuse a run of two qubit gates into one gate, in application order.

    ``gates`` is an iterable of ``(index, gate)`` pairs; only the gate of
    each pair is used. Every gate array is optionally passed through
    ``array_fn`` and then viewed as a 4x4 matrix. Later gates are multiplied
    on the left, so the product corresponds to applying the gates in the
    order given. The result is reshaped to ``(2, 2, 2, 2)`` and attached to
    the qubits of the first gate in the run.
    """
    def as_matrix(gate):
        # Optionally convert the raw array, then flatten it to a 4x4 matrix.
        arr = gate.array
        if array_fn is not None:
            arr = array_fn(arr)
        return reshape(arr, (4, 4))

    run = tuple(gate for _, gate in gates)
    first = run[0]
    product = as_matrix(first)
    for later in run[1:]:
        product = as_matrix(later) @ product
    return Gate.from_raw(reshape(product, (2, 2, 2, 2)), first.qubits)
 def _can_merge_1q_gate(gate):
    return (
        (gate.controls is None)
@@ -1583,48 +1600,96 @@ def _can_merge_1q_gate(gate):
    )
-def _iter_gates_with_merged_1q_runs(gates):
+def _can_merge_2q_gate(gate):
    return (
        (gate.controls is None)
        and (not gate.special)
        and (not gate.parametrize)
        and (gate.qubits is not None)
        and (len(gate.qubits) == 2)
    )
 def _iter_gates_with_merged_runs(gates, merge_1q=True, merge_2q=True):
    """Yield ``(gate_to_apply, gates_to_record)``, merging adjacent runs of
-    single qubit gates that are not interrupted by any operation touching the
+    local gates that are not interrupted by any operation touching the same
-    same qubit.
+    qubits.
    """
-    pending = {}
+    pending_1q = {}
    pending_2q = {}
    def flush_qubit(q):
-        run = pending.pop(q, None)
+        run = pending_1q.pop(q, None)
        if run is None:
            return
        if len(run) == 1:
            return run[0][1], run
        return None, run
    def flush_pair(pair):
        run = pending_2q.pop(pair, None)
        if run is None:
            return
        if len(run) == 1:
            return run[0][1], run
        return None, run
    def flush_touched(touched, keep_qubit=None, keep_pair=None):
        for q in tuple(pending_1q):
            if q == keep_qubit:
                continue
            if q in touched:
                item = flush_qubit(q)
                if item is not None:
                    yield item
        for pair in tuple(pending_2q):
            if pair == keep_pair:
                continue
            if touched.intersection(pair):
                item = flush_pair(pair)
                if item is not None:
                    yield item
    def flush_all():
-        for q in tuple(pending):
+        for q in tuple(pending_1q):
            item = flush_qubit(q)
            if item is not None:
                yield item
        for pair in tuple(pending_2q):
            item = flush_pair(pair)
            if item is not None:
                yield item
    for i, gate in enumerate(gates):
-        if _can_merge_1q_gate(gate):
+        if merge_1q and _can_merge_1q_gate(gate):
            (q,) = gate.qubits
-            pending.setdefault(q, []).append((i, gate))
+            yield from flush_touched({q}, keep_qubit=q)
            pending_1q.setdefault(q, []).append((i, gate))
            continue
        if merge_2q and _can_merge_2q_gate(gate):
            pair = gate.qubits
            yield from flush_touched(set(pair), keep_pair=pair)
            pending_2q.setdefault(pair, []).append((i, gate))
            continue
        touched = set(gate.qubits or ())
        if gate.controls:
            touched.update(gate.controls)
-        for q in tuple(pending):
+        yield from flush_touched(touched)
            if q in touched:
                item = flush_qubit(q)
                if item is not None:
                    yield item
        yield gate, ((i, gate),)
    yield from flush_all()
 _iter_gates_with_merged_1q_runs = functools.partial(
    _iter_gates_with_merged_runs, merge_1q=True, merge_2q=False
 )
 # --------------------------- main circuit class ---------------------------- #
@@ -2103,6 +2168,24 @@ class Circuit:
        self._psi.gate_(G, gates[0][1].qubits, tags=tags, **opts)
    def _apply_merged_2q_gate_run(self, gates, gate_number_offset=0, **gate_opts):
        """Apply a run of two qubit gates as a single fused gate.

        ``gates`` is a sequence of ``(index, gate)`` pairs. The tags of every
        gate in the run (numbered ``gate_number_offset + index``) are merged
        with any ``tags`` passed in ``gate_opts`` and attached to the fused
        gate, which is applied on the qubits of the first gate in the run.
        """
        merged_tags = tags_to_oset(gate_opts.pop("tags", None))
        for index, gate in gates:
            merged_tags |= self._gate_tags_for_record(
                gate, gate_number=gate_number_offset + index
            )
        # Per-call options take precedence over the circuit-wide defaults.
        apply_opts = {**self.gate_opts, **gate_opts}
        # Only convert gate arrays up front when eager conversion is enabled.
        array_fn = self._maybe_convert_gate_array if self.convert_eager else None
        fused = _combine_2q_gate_run(gates, array_fn=array_fn)
        first_gate = gates[0][1]
        self._psi.gate_(
            fused.array, first_gate.qubits, tags=merged_tags, **apply_opts
        )
    def apply_gate(
        self,
        gate_id,
@@ -2178,11 +2261,14 @@ class Circuit:
            Supplied to :meth:`~quimb.tensor.circuit.Circuit.apply_gate`.
        """
        merge_1q = gate_opts.pop("merge_1q", "auto")
        merge_2q = gate_opts.pop("merge_2q", "auto")
        if merge_1q == "auto":
            merge_1q = True
        if merge_2q == "auto":
            merge_2q = True
-        if merge_1q:
+        if merge_1q or merge_2q:
            gates = tuple(
                gate if isinstance(gate, Gate) else parse_to_gate(gate)
                for gate in gates
@@ -2195,15 +2281,22 @@ class Circuit:
                pbar = _progbar(total=len(gates))
            gate_number_offset = len(self._gates)
-            for gate, gates_to_record in _iter_gates_with_merged_1q_runs(
+            for gate, gates_to_record in _iter_gates_with_merged_runs(
-                gates
+                gates, merge_1q=merge_1q, merge_2q=merge_2q
            ):
                if gate is None:
-                    self._apply_merged_1q_gate_run(
+                    if len(gates_to_record[0][1].qubits) == 1:
-                        gates_to_record,
+                        self._apply_merged_1q_gate_run(
-                        gate_number_offset=gate_number_offset,
+                            gates_to_record,
-                        **gate_opts,
+                            gate_number_offset=gate_number_offset,
-                    )
+                            **gate_opts,
                        )
                    else:
                        self._apply_merged_2q_gate_run(
                            gates_to_record,
                            gate_number_offset=gate_number_offset,
                            **gate_opts,
                        )
                else:
                    self._apply_gate(
                        gate,
@@ -4892,11 +4985,16 @@ class CircuitMPS(Circuit):
    def apply_gates(self, gates, progbar=False, **gate_opts):
        merge_1q = gate_opts.pop("merge_1q", "auto")
        merge_2q = gate_opts.pop("merge_2q", "auto")
        if merge_1q == "auto":
            merge_1q = True
        if merge_2q == "auto":
            # MPS truncation semantics are sensitive to when a 2q gate is
            # materialized, so keep the default conservative here.
            merge_2q = False
-        if merge_1q:
+        if merge_1q or merge_2q:
            gates = tuple(
                gate if isinstance(gate, Gate) else parse_to_gate(gate)
                for gate in gates
@@ -4913,15 +5011,22 @@ class CircuitMPS(Circuit):
                )
            gate_number_offset = len(self._gates)
-            for gate, gates_to_record in _iter_gates_with_merged_1q_runs(
+            for gate, gates_to_record in _iter_gates_with_merged_runs(
-                gates
+                gates, merge_1q=merge_1q, merge_2q=merge_2q
            ):
                if gate is None:
-                    self._apply_merged_1q_gate_run(
+                    if len(gates_to_record[0][1].qubits) == 1:
-                        gates_to_record,
+                        self._apply_merged_1q_gate_run(
-                        gate_number_offset=gate_number_offset,
+                            gates_to_record,
-                        **gate_opts,
+                            gate_number_offset=gate_number_offset,
-                    )
+                            **gate_opts,
                        )
                    else:
                        self._apply_merged_2q_gate_run(
                            gates_to_record,
                            gate_number_offset=gate_number_offset,
                            **gate_opts,
                        )
                    gate_for_progress = gates_to_record[-1][1]
                else:
                    self._apply_gate(
--- a/.venv/lib/python3.12/site-packages/quimb/tensor/tn1d/core.py
+++ b/.venv/lib/python3.12/site-packages/quimb/tensor/tn1d/core.py
@@ -5050,8 +5050,6 @@ class TNLinearOperator1D(spla.LinearOperator):
        if self.is_conj:
            T = T.conj()
        print(T)
        assert(0)
        return T.to_dense(self.left_inds, self.right_inds)
    def toarray(self):
--- a/README.md
+++ b/README.md
@@ -28,15 +28,24 @@ Currently, the supported tensor network libraries are:
 ## CPU expectation benchmarks
-The current CPU expectation entrypoint is:
+Use the library APIs directly:
-```sh
+```py
-python -u benchmark_cpu_expectation.py --ansatz mps --nqubits 40 --nlayers 10 --bond 2048 --circuits brickwall_cnot --observables ring_xz
+import qibotn
 records = qibotn.run_cpu_benchmark_cases(
    ansatz="mps",
    nqubits=40,
    nlayers=10,
    bond=2048,
    circuits=("brickwall_cnot",),
    observables=("ring_xz",),
 )
 ```
-Use `--ansatz tn` for the generic TN path and `--mpi` under `mpiexec` for MPI runs.
+For generic TN use `ansatz="tn"`.  Contest/custom runners are available as
-Reusable circuit and observable builders live in `src/qibotn/benchmark_cases.py`; execution logic lives in `src/qibotn/expectation_runner.py`.
+`qibotn.run_contest_tn_case`, `qibotn.run_custom_tn_expectation`,
-For Vidal/MPS 1D-chain scale tests, use `run_vidal_mps_cases.sh`.
+`qibotn.run_contest_mps_case`, and `qibotn.run_vidal_validation_cases`.
 ## Installation
--- a/benchmark_cpu_expectation.py
+++ b/benchmark_cpu_expectation.py
@@ -1,285 +0,0 @@
 """CLI for CPU TN/MPS expectation benchmarks."""
 from __future__ import annotations
 import argparse
 import os
 import subprocess
 from pathlib import Path
 from urllib.parse import urlparse
 from qibotn.benchmark_cases import (
    CIRCUITS,
    OBSERVABLES,
    build_circuit,
    observable_terms,
    parse_names,
    terms_to_dict,
 )
 from qibotn.expectation_runner import (
    ExpectationConfig,
    exact_for_observable,
    run_cpu_expectation,
 )
 def optional_int(text):
    """Convert ``text`` to ``int``, or to ``None`` for an "unlimited" word.

    The strings ``none``, ``null``, ``inf`` and ``unlimited`` (in any letter
    case) map to ``None``; anything else is handed to ``int``.
    """
    unlimited_words = {"none", "null", "inf", "unlimited"}
    is_unlimited = isinstance(text, str) and text.lower() in unlimited_words
    return None if is_unlimited else int(text)
 def optional_float(text):
    """Convert ``text`` to ``float``, or to ``None`` for an "unlimited" word.

    The strings ``none``, ``null``, ``inf`` and ``unlimited`` (in any letter
    case) map to ``None``; anything else is handed to ``float``.
    """
    unlimited_words = {"none", "null", "inf", "unlimited"}
    is_unlimited = isinstance(text, str) and text.lower() in unlimited_words
    return None if is_unlimited else float(text)
 def format_optional(value, fmt="g"):
    """Format ``value`` with format spec ``fmt``; render ``None`` as "None"."""
    if value is None:
        return "None"
    return format(value, fmt)
 def should_stop_dask(args):
    """Return whether the external dask cluster should be stopped.

    True only when ``keep_dask`` is not set, the search backend is
    ``"dask"``, a ``dask_address`` was given, and no tree is being loaded
    via ``tn_load_tree``.
    """
    if args.keep_dask:
        return False
    if args.tn_search_backend != "dask":
        return False
    if args.dask_address is None:
        return False
    return args.tn_load_tree is None
 def stop_dask_cluster(args, rank):
    """Stop the external dask cluster through the management script.

    Does nothing unless ``rank`` is 0 and ``should_stop_dask(args)`` is true.
    The script ``tools/manage_tn_dask_cluster.sh`` next to this file is run
    with the ``stop`` argument; if it is missing a message is printed and the
    call returns. The scheduler host and port parsed from
    ``args.dask_address`` are offered to the script through the environment
    without overriding values that are already set.
    """
    if rank != 0:
        return
    if not should_stop_dask(args):
        return

    here = Path(__file__).resolve().parent
    stop_script = here / "tools" / "manage_tn_dask_cluster.sh"
    if not stop_script.exists():
        print(f"dask_stop_skipped reason=missing_script path={stop_script}", flush=True)
        return

    child_env = os.environ.copy()
    scheduler = urlparse(args.dask_address)
    host = scheduler.hostname
    if host:
        child_env.setdefault("SCHEDULER_HOST", host)
    port = scheduler.port
    if port:
        child_env.setdefault("SCHEDULER_PORT", str(port))

    print("dask_stop_after_search start", flush=True)
    # Best effort: a failing stop script must not fail the benchmark run.
    subprocess.run(
        [str(stop_script), "stop"],
        cwd=str(stop_script.parent.parent),
        env=child_env,
        check=False,
    )
    print("dask_stop_after_search done", flush=True)
 def build_parallel_opts(args):
    """Translate parsed CLI arguments into the ``parallel_opts`` dict.

    The returned dict always holds ``slicing_opts`` (``None`` when no slicing
    target was given), ``search_workers`` (falling back to ``torch_threads``),
    ``max_repeats``, ``max_time`` and ``print_stats``. The remaining keys are
    added only when the matching argument is set.
    """
    slicing_opts = {
        key: value
        for key, value in (
            ("target_slices", args.tn_target_slices),
            ("target_size", args.tn_target_size),
        )
        if value is not None
    }
    opts = {
        "slicing_opts": slicing_opts or None,
        "search_workers": args.tn_search_workers or args.torch_threads,
        "max_repeats": args.tn_search_repeats,
        "max_time": args.tn_search_time,
        "print_stats": not args.no_tn_stats,
    }
    # (option key, argparse attribute, is_flag). Flags are stored as True
    # when set; value options are copied whenever they are not None.
    optional_entries = (
        ("search_backend", "tn_search_backend", False),
        ("dask_address", "dask_address", False),
        ("save_tree_path", "tn_save_tree", False),
        ("load_tree_path", "tn_load_tree", False),
        ("search_only", "tn_search_only", True),
        ("debug_trials", "tn_debug_trials", True),
        ("contract_implementation", "tn_contract_implementation", False),
        ("dask_close_workers", "dask_close_workers", True),
    )
    for key, attr, is_flag in optional_entries:
        value = getattr(args, attr)
        if is_flag:
            if value:
                opts[key] = True
        elif value is not None:
            opts[key] = value
    return opts
 def main():
    """Parse the command line and run the CPU TN/MPS expectation benchmarks.

    For every requested circuit and observable (or the single
    ``--pauli-pattern`` observable) the expectation value is computed with
    ``run_cpu_expectation`` and one result line is printed, followed by one
    ``tn_term_summary`` line per entry of ``result.parallel_stats``. With
    ``--exact`` a reference value is computed on rank 0 and the absolute and
    relative errors are reported; otherwise they print as ``nan``.

    Raises:
        ValueError: if ``--exact`` is requested with more qubits than
            ``--exact-max-qubits`` (raised on rank 0 only).
    """
    parser = argparse.ArgumentParser()
    # Problem size, truncation and numerical backend options.
    parser.add_argument("--nqubits", type=int, default=40)
    parser.add_argument("--nlayers", type=int, default=30)
    parser.add_argument("--bond", "--bonds", dest="bond", type=optional_int, default=1024)
    parser.add_argument("--cut-ratio", type=optional_float, default=1e-12)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--torch-threads", type=int, default=8)
    parser.add_argument("--quimb-backend", choices=("numpy", "torch"), default="torch")
    parser.add_argument(
        "--dtype",
        choices=("complex128", "complex64"),
        default="complex128",
    )
    # Ansatz / execution mode selection.
    parser.add_argument("--ansatz", choices=("tn", "mps"), default=None)
    parser.add_argument("--mps", action="store_true")
    parser.add_argument("--mpi", action="store_true")
    parser.add_argument("--exact", action="store_true")
    parser.add_argument("--exact-max-qubits", type=int, default=24)
    # Which circuits and observables to run.
    parser.add_argument("--circuits", nargs="+", default=["brickwall_cnot"])
    parser.add_argument("--observables", nargs="+", default=["ring_xz"])
    parser.add_argument("--pauli-pattern")
    # TN path-search and slicing options (consumed by build_parallel_opts).
    parser.add_argument("--tn-target-slices", type=int)
    parser.add_argument("--tn-target-size", type=int,default=2**32)
    parser.add_argument("--tn-search-workers", type=int)
    parser.add_argument("--tn-search-repeats", type=int, default=128)
    parser.add_argument("--tn-search-time", type=float, default=60.0)
    parser.add_argument(
        "--no-tn-stats",
        action="store_true",
        help="Do not print per-term TN search/contraction diagnostics.",
    )
    parser.add_argument(
        "--tn-search-backend",
        choices=("processpool", "dask"),
        default="dask",
        help="Path-search backend. In MPI mode, dask search runs only on rank 0 and broadcasts the tree.",
    )
    parser.add_argument(
        "--dask-address",
        help="Dask scheduler address, for example tcp://host:8786. If omitted with dask search, a local cluster is created.",
    )
    parser.add_argument(
        "--dask-close-workers",
        action="store_true",
        help="After dask path search, ask the scheduler to close all currently connected workers.",
    )
    parser.add_argument(
        "--keep-dask",
        action="store_true",
        help=(
            "Keep an external dask cluster running after search. By default, "
            "tools/manage_tn_dask_cluster.sh stop is called after search when "
            "--dask-address is used."
        ),
    )
    parser.add_argument(
        "--tn-save-tree",
        help="Save searched cotengra contraction tree(s) to this pickle file.",
    )
    parser.add_argument(
        "--tn-load-tree",
        help="Load cotengra contraction tree(s) from this pickle file and skip path search.",
    )
    parser.add_argument(
        "--tn-search-only",
        action="store_true",
        help="Only run path search and optional --tn-save-tree; skip contraction.",
    )
    parser.add_argument(
        "--tn-debug-trials",
        action="store_true",
        help="Print dask worker summary and per-trial worker start/done logs.",
    )
    parser.add_argument(
        "--tn-contract-implementation",
        choices=("auto", "cotengra", "autoray", "cpp"),
        help="cotengra contraction implementation for TN contraction.",
    )
    args = parser.parse_args()
    # --mps wins over --ansatz; with neither given the ansatz is "tn".
    ansatz = "mps" if args.mps else (args.ansatz or "tn")
    circuits = parse_names(args.circuits, CIRCUITS, "circuits")
    # A Pauli pattern replaces the named observables entirely.
    observables = [] if args.pauli_pattern else parse_names(
        args.observables, OBSERVABLES, "observables"
    )
    rank = 0
    if args.mpi:
        # mpi4py is imported only when MPI mode is requested.
        from mpi4py import MPI
        rank = MPI.COMM_WORLD.Get_rank()
    config = ExpectationConfig(
        ansatz=ansatz,
        mpi=args.mpi,
        bond=args.bond,
        cut_ratio=args.cut_ratio,
        tensor_module="torch",
        quimb_backend=args.quimb_backend,
        dtype=args.dtype,
        torch_threads=args.torch_threads,
        parallel_opts=build_parallel_opts(args),
    )
    # Only rank 0 prints the run summary and the column header.
    if rank == 0:
        mode = "MPI" if args.mpi else "serial"
        print(
            f"backend=cpu ansatz={ansatz.upper()} mode={mode} "
            f"nqubits={args.nqubits} nlayers={args.nlayers} "
            f"bond={format_optional(args.bond)} "
            f"cut_ratio={format_optional(args.cut_ratio)} seed={args.seed} "
            f"quimb_backend={args.quimb_backend} dtype={args.dtype} "
            f"torch_threads={args.torch_threads} "
            f"tn_search_backend={args.tn_search_backend}"
        )
        print("circuit observable exact value abs_error rel_error seconds")
    try:
        for circuit_kind in circuits:
            circuit = build_circuit(circuit_kind, args.nqubits, args.nlayers, args.seed)
            # Either the single pattern observable or one entry per named observable.
            named_observables = (
                [(f"pattern:{args.pauli_pattern}", {"pauli_string_pattern": args.pauli_pattern})]
                if args.pauli_pattern
                else [
                    (obs_kind, terms_to_dict(observable_terms(obs_kind, args.nqubits)))
                    for obs_kind in observables
                ]
            )
            for obs_name, observable in named_observables:
                exact = None
                # The exact reference is computed on rank 0 only.
                # NOTE(review): in MPI mode the ValueError below is raised on
                # rank 0 alone; confirm the other ranks cannot be left waiting.
                if args.exact and rank == 0:
                    if args.nqubits > args.exact_max_qubits:
                        raise ValueError(
                            f"--exact is limited to {args.exact_max_qubits} qubits by default."
                        )
                    exact = exact_for_observable(circuit, observable, args.nqubits)
                result = run_cpu_expectation(circuit, observable, config)
                # In MPI mode only the result reported for rank 0 is printed.
                if args.mpi and result.rank != 0:
                    continue
                abs_error = float("nan") if exact is None else abs(result.value - exact)
                # The denominator is clamped to avoid dividing by a zero exact value.
                rel_error = (
                    float("nan")
                    if exact is None
                    else abs_error / max(abs(exact), 1e-15)
                )
                exact_text = "nan" if exact is None else f"{exact:.16e}"
                print(
                    f"{circuit_kind} {obs_name} {exact_text} {result.value:.16e} "
                    f"{abs_error:.6e} {rel_error:.6e} {result.seconds:.3f}"
                )
                # One diagnostic line per stats entry; "path_cost" and its
                # keys are required, the other fields fall back to defaults.
                for stat in result.parallel_stats or ():
                    cost = stat["path_cost"]
                    search_stats = stat.get("search_stats", {})
                    print(
                        "tn_term_summary "
                        f"term={stat.get('term_index', 0)} "
                        f"search_seconds={stat.get('search_seconds', float('nan')):.3f} "
                        f"contract_seconds={stat.get('contract_seconds', float('nan')):.3f} "
                        f"completed_trials={search_stats.get('completed_trials', 'na')} "
                        f"finite_trials={search_stats.get('finite_trials', 'na')} "
                        f"failed_trials={search_stats.get('failed_trials', 'na')} "
                        f"requested_trials={search_stats.get('requested_trials', 'na')} "
                        f"best_score={search_stats.get('best_score', float('nan')):.6g} "
                        f"slices={cost['nslices']} "
                        f"log10_flops={cost['log10_flops']:.3f} "
                        f"log10_write={cost['log10_write']:.3f} "
                        f"log2_size={cost['log2_size']:.3f} "
                        f"log10_combo={cost['log10_combo']:.3f} "
                        f"peak_memory_gib={cost['peak_memory_gib']:.6g} "
                        f"slicing_overhead={cost['slicing_overhead']:.6g} "
                        f"rank_slices={stat.get('rank_slices', 'na')}"
                    )
    finally:
        # Always attempt the dask shutdown, even if a benchmark case failed.
        stop_dask_cluster(args, rank)
 # Run the CLI only when this file is executed as a script, not on import.
 if __name__ == "__main__":
    main()
--- a/docs/contest_runners.md
+++ b/docs/contest_runners.md
@@ -1,88 +1,12 @@
-# TN
+# Contest Runners
 ```bash
 # search + contract，Open MPI 多节点：每节点 2 rank，每 rank 绑定 1 个 NUMA。
 # MPI_HOSTS 里每个节点写 :2，MPI_RANKS = 节点数 * 2。
 # 每个 rank 使用 MPI_PE 个 core；这台 2-NUMA AMD 节点用 MPI_PE=128。
-NQUBITS=40 \
+The reusable implementations live in `src/qibotn/backends/`.
 TN_DEBUG_TRIALS=1 \
 SCHEDULER_HOST=10.20.1.100 \
 DASK_ADDRESS=tcp://10.20.1.100:8786 \
 WORKER_HOSTS="10.20.1.100 10.20.1.101 10.20.1.102 10.20.1.103" \
 CASE=main1 \
 OBSERVABLES=long_z_string \
 TORCH_THREADS=80 \
 MPI_PE=80 \
 MPI_MAP_BY=ppr:1:numa:PE=80 \
 MPI_BIND_TO=core \
 OMP_NUM_THREADS=80 \
 MKL_NUM_THREADS=80 \
 BLIS_NUM_THREADS=80 \
 MPI_HOSTS="node-0:2,node-1:2,node-2:2,node-3:2" \
 MPI_RANKS=8 \
 NWORKERS=96 \
 TN_TARGET_SIZE=17179869184 \
 tools/run_tn_dask_mpi_all.sh
-# 单独缩并contract计算
+- `qibotn.run_contest_tn_case`: quimb+torch TN search/contract cases.
 - `qibotn.run_contest_mps_case`: Vidal/MPS contest expectation cases.
 - `qibotn.run_vidal_mpi_contest_case`: direct Vidal MPI observable sweep.
 - `qibotn.run_custom_tn_expectation`: custom quimb+torch TN cases.
-mpirun --map-by ppr:1:numa:PE=80 --bind-to core --report-bindings \
+`src/qibotn/backends/quimb.py` holds the TN helpers,
-  -x LD_PRELOAD=/home/aocc/aocl/5.2.0/aocc/lib_LP64/libblis-mt.so.5 \
+`src/qibotn/backends/qmatchatea.py` holds the qmatchatea MPS helpers,
-  -x BLIS_NUM_THREADS=80 \
+and `src/qibotn/backends/vidal.py` holds the Vidal helpers.
  -x OMP_NUM_THREADS=80 \
  -x MKL_NUM_THREADS=80 \
  -x OMP_PROC_BIND=close \
  -x OMP_PLACES=cores \
  -np 8 \
  -host node-0:2,node-1:2,node-2:2,node-3:2 \
  .venv/bin/python -u tools/tn_contest_runner.py contract \
  --mpi \
  --case main1 \
  --nqubits 34 \
  --nlayers 20 \
  --observables long_z_string \
  --tree-dir trees/contest_tn \
  --torch-threads 80 \
  --dtype complex64
 ```
 # MPS
 ```
 cd /home/qibo/qibotn
 MPIEXEC=mpirun \
 MPI_HOSTS="node-2:4,node-3:4" \
 MPI_RANKS=8 \
 MPI_PE=48 \
 MPI_MAP_BY=ppr:2:numa:PE=48 \
 MPI_BIND_TO=core \
 MPI_REPORT_BINDINGS=1 \
 TORCH_THREADS=48 \
 OMP_NUM_THREADS=48 \
 MKL_NUM_THREADS=48 \
 BLIS_NUM_THREADS=48 \
 OBS_FILTER=ring_xz \
 MAIN1_NQ=128 \
 MAIN1_LAYERS=24 \
 MAIN1_BOND=1024 \
 tools/run_vidal_mpi_contest_cases.sh main1
 MPIEXEC=mpirun \
 MPI_HOSTS="node-2:4" \
 MPI_RANKS=4 \
 MPI_PE=48 \
 MPI_MAP_BY=ppr:2:numa:PE=48 \
 MPI_BIND_TO=core \
 MPI_REPORT_BINDINGS=1 \
 TORCH_THREADS=48 \
 OMP_NUM_THREADS=48 \
 MKL_NUM_THREADS=48 \
 BLIS_NUM_THREADS=48 \
 OBS_FILTER=ring_xz \
 MAIN1_NQ=128 \
 MAIN1_LAYERS=24 \
 MAIN1_BOND=1024 \
 tools/run_vidal_mpi_contest_cases.sh main1
 ```
--- a/docs/home.md
+++ b/docs/home.md
@@ -0,0 +1,26 @@
 # qibotn
 Core reusable code lives under `src/qibotn/`. Prefer importing from `qibotn`
 or `qibotn.backends.*`; benchmark and runner helpers have been folded into the
 package instead of being kept as standalone scripts.
 - `backends/quimb.py`: TN + torch helpers for quimb.
 - `backends/qmatchatea.py`: qmatchatea + torch MPS helpers.
 - `backends/vidal.py`: Vidal + torch helpers.
 - `contest_cases.py`: shared contest circuits, observables, and case specs.
 - `torch_utils.py`: shared torch array/thread helpers.
 Quimb TN reusable entrypoints include `build_quimb_backend_circuit`,
 `build_expectation_tn`, `run_quimb_torch_expectation`,
 `compare_quimb_gate_merge`, `compare_quimb_gate_merge_expectation`,
 `profile_quimb_torch_expectation`, and `time_quimb_contract_implementations`.
 Common public imports include `qibotn.cpu_expectation`,
 `qibotn.mps_expectation`, `qibotn.run_qmatchatea_expectation`,
 `qibotn.run_vidal_expectation`, `qibotn.build_contest_circuit`, and
 `qibotn.build_contest_observable`.
 Former script entrypoints are available as importable functions:
 `qibotn.run_cpu_benchmark_cases`, `qibotn.run_contest_tn_case`,
 `qibotn.run_custom_tn_expectation`, `qibotn.run_contest_mps_case`,
 `qibotn.run_vidal_mpi_contest_case`, and `qibotn.run_vidal_validation_cases`.
--- a/docs/xianchang.md
+++ b/docs/xianchang.md
@@ -1,42 +0,0 @@
 mpirun --map-by ppr:1:numa:PE=80 --bind-to core --report-bindings \
  -x LD_PRELOAD=/home/aocc/aocl/5.2.0/aocc/lib_LP64/libblis-mt.so.5 \
  -x BLIS_NUM_THREADS=80 \
  -x OMP_NUM_THREADS=80 \
  -x MKL_NUM_THREADS=80 \
  -x OMP_PROC_BIND=close \
  -x OMP_PLACES=cores \
  -np 4 \
  -host node-0:2,node-1:2,node-2:2,node-3:2 \
  .venv/bin/python -u tools/tn_contest_runner.py contract \
  --mpi \
  --case main1 \
  --nqubits 34 \
  --nlayers 20 \
  --observables long_z_string \
  --tree-dir trees/contest_tn \
  --torch-threads 80 \
  --dtype complex64
 SEARCH_TIME=300  NQUBITS=40 TN_DEBUG_TRIALS=1 SCHEDULER_HOST=10.20.1.102 DASK_ADDRESS=tcp://10.20.1.102:8786 WORKER_HOSTS="10.20.1.102 10.20.1.103" CASE=main1 OBSERVABLES=long_z_string TORCH_THREADS=80 MPI_PE=80 MPI_MAP_BY=ppr:1:numa:PE=80 MPI_BIND_TO=core OMP_NUM_THREADS=80 MKL_NUM_THREADS=80 BLIS_NUM_THREADS=80 MPI_HOSTS="node-2:2,node-3:2" MPI_RANKS=4 NWORKERS=128  TN_TARGET_SIZE=17179869184 tools/run_tn_dask_mpi_all.sh
 NQUBITS=40 \
 TN_DEBUG_TRIALS=1 \
 SCHEDULER_HOST=10.20.1.102 \
 DASK_ADDRESS=tcp://10.20.1.102:8786 \
 WORKER_HOSTS="10.20.1.102 10.20.1.103" \
 CASE=main1 \
 OBSERVABLES=long_z_string \
 TORCH_THREADS=80 \
 MPI_PE=80 \
 MPI_MAP_BY=ppr:1:numa:PE=80 \
 MPI_BIND_TO=core \
 OMP_NUM_THREADS=80 \
 MKL_NUM_THREADS=80 \
 BLIS_NUM_THREADS=80 \
 MPI_HOSTS="node-2:2,node-3:2" \
 MPI_RANKS=4 \
 NWORKERS=96 \
 TN_TARGET_SIZE=17179869184 \
 tools/run_tn_dask_mpi_all.sh
--- a/requirements.txt
+++ b/requirements.txt
@@ -60,7 +60,7 @@ mpmath==1.3.0
 msgpack==1.1.2
 networkx==3.6.1
 numba==0.61.2
-numpy==2.0.1
+numpy @ file:///home/yx/numpy
 openqasm3==1.0.1
 opt_einsum==3.4.0
 optuna==4.8.0
@@ -93,7 +93,7 @@ python-multipart==0.0.26
 PyYAML==6.0.3
 qibo==0.3.2
 qibojit==0.1.15
-e git+https://git.nudt.space/jaunatisblue/qibotn.git@4c7a10d026d514897dcc501b507fa604fb4e52d4#egg=qibotn
+-e git+https://git.nudt.space/jaunatisblue/qibotn.git@eed42dcfa9739c609a58f7367fe403abf2e992a9#egg=qibotn
 qiskit==1.4.5
 qmatchatea==1.5.8
 qredtea==0.3.15
@@ -106,7 +106,7 @@ regex==2026.4.4
 requests==2.33.1
 rpds-py==0.30.0
 rustworkx==0.17.1
-scipy==1.17.1
+scipy @ file:///home/yx/scipy
 setuptools==70.2.0
 six==1.17.0
 sniffio==1.3.1
@@ -118,13 +118,15 @@ stack-data==0.6.3
 starlette==1.0.0
 stevedore==5.7.0
 symengine==0.13.0
-sympy==1.13.1
+sympy==1.14.0
 tabulate==0.9.0
 tblib==3.2.2
 texttable==1.7.0
 threadpoolctl==3.6.0
 toolz==1.1.0
-torch @ file:///home/qibo/qibotn/wheels/torch-2.10.0a0+a36e1d3-cp312-cp312-linux_x86_64.whl
+torch==2.11.0+cpu
 torchaudio==2.11.0+cpu
 torchvision==0.26.0+cpu
 tornado==6.5.5
 tqdm==4.67.3
 traitlets==5.14.3
@@ -135,4 +137,3 @@ uvicorn==0.46.0
 wcwidth==0.6.0
 webencodings==0.5.1
 zict==3.0.0
--- a/run_vidal_mps_cases.sh
+++ b/run_vidal_mps_cases.sh
@@ -1,135 +0,0 @@
 #!/usr/bin/env bash
 set -euo pipefail
 # Focused Vidal/MPS expectation test cases for 1D chain circuits.
 #
 # These cases intentionally avoid qmatchatea and generic TN paths.  They target
 # the current supported scope: one-qubit gates, adjacent two-qubit gates, and
 # Pauli-sum expectation values on a 1D chain.
 ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 cd "$ROOT_DIR"
 PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
 MPIEXEC="${MPIEXEC:-mpiexec}"
 HOSTFILE="${HOSTFILE:-hostfile}"
 THREADS="${THREADS:-32}"
 MPI_RANKS="${MPI_RANKS:-16}"
 MPI_THREADS="${MPI_THREADS:-12}"
 export OMP_NUM_THREADS="${OMP_NUM_THREADS:-1}"
 export MKL_NUM_THREADS="${MKL_NUM_THREADS:-1}"
 source "$ROOT_DIR/tools/qibotn_torch_mt_env.sh"
 # run CMD [ARGS...]
 # Print a blank line and a banner showing the full command line, then
 # execute the command with its arguments passed through unchanged. The
 # command's exit status becomes the function's exit status.
 run() {
  echo
  echo "--------------------------------------------------------------------------------"
  echo "$*"
  echo "--------------------------------------------------------------------------------"
  "$@"
 }
 case "${1:-help}" in
  smoke)
    # Short correctness-oriented run.  Useful before starting long jobs.
    run "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
      --mps \
      --nqubits 40 \
      --nlayers 10 \
      --bond 2048 \
      --torch-threads "$THREADS" \
      --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \
      --observables ring_xz open_zz range2_xx long_z_string
    ;;
  convergence)
    # Same circuit/observable, increasing bond.  Check value convergence.
    for bond in ${BONDS:-4096 16384 65536}; do
      run "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
        --mps \
        --nqubits "${NQ:-80}" \
        --nlayers "${LAYERS:-16}" \
        --bond "$bond" \
        --torch-threads "$THREADS" \
        --circuits "${CIRCUIT:-brickwall_cnot}" \
        --observables "${OBSERVABLE:-ring_xz}"
    done
    ;;
  single-long)
    # Single long Vidal run.  On node-3, a similar n=40,l=30,bond=2048 case
    # took about 9 minutes for one expectation.  This one is meant to be longer.
    run "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
      --mps \
      --nqubits "${NQ:-80}" \
      --nlayers "${LAYERS:-16}" \
      --bond "${BOND:-65536}" \
      --torch-threads "$THREADS" \
      --circuits "${CIRCUIT:-brickwall_cnot}" \
      --observables "${OBSERVABLE:-ring_xz}"
    ;;
  suite-long)
    # Application-style multi-circuit, multi-observable MPS run.
    # This is intentionally multi-term and should run much longer than single-long.
    run "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
      --mps \
      --nqubits "${NQ:-80}" \
      --nlayers "${LAYERS:-16}" \
      --bond "${BOND:-65536}" \
      --torch-threads "$THREADS" \
      --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \
      --observables ring_xz open_zz mixed_local range2_xx long_z_string
    ;;
  mpi-long)
    # Multi-node Vidal segmented MPS run.  Uses HOSTFILE.
    run "$MPIEXEC" -hostfile "$HOSTFILE" -n "$MPI_RANKS" "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
      --mpi --mps \
      --nqubits "${NQ:-80}" \
      --nlayers "${LAYERS:-16}" \
      --bond "${BOND:-65536}" \
      --torch-threads "$MPI_THREADS" \
      --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \
      --observables ring_xz open_zz mixed_local range2_xx long_z_string
    ;;
  stress)
    # Heavier entanglement.  Start only after single-long is stable.
    run "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
      --mps \
      --nqubits "${NQ:-80}" \
      --nlayers "${LAYERS:-18}" \
      --bond "${BOND:-262144}" \
      --torch-threads "${THREADS:-48}" \
      --circuits "${CIRCUIT:-rxx_rzz}" \
      --observables ring_xz open_zz range2_xx
    ;;
  help|*)
    cat <<'EOF'
 Usage: ./run_vidal_mps_cases.sh [smoke|convergence|single-long|suite-long|mpi-long|stress]
 Common overrides:
  PYTHON_BIN=.venv/bin/python
  THREADS=32
  OMP_NUM_THREADS=1 MKL_NUM_THREADS=1
 Single-node scale overrides:
  NQ=80 LAYERS=16 BOND=65536
  CIRCUIT=brickwall_cnot
  OBSERVABLE=ring_xz
  BONDS="4096 16384 65536"   # for convergence mode
 Multi-node overrides:
  HOSTFILE=hostfile
  MPI_RANKS=16 MPI_THREADS=12
 Recommended first runs:
  ./run_vidal_mps_cases.sh smoke
  ./run_vidal_mps_cases.sh convergence
  ./run_vidal_mps_cases.sh single-long
 EOF
    ;;
 esac
--- a/src/qibotn/init.py
+++ b/src/qibotn/init.py
@@ -8,6 +8,108 @@ _LAZY_EXPORTS = {
    "cpu_expectation": ("qibotn.expectation_runner", "cpu_expectation"),
    "mps_expectation": ("qibotn.expectation_runner", "mps_expectation"),
    "cpu_runcard": ("qibotn.expectation_runner", "cpu_runcard"),
    "ExpectationConfig": ("qibotn.expectation_runner", "ExpectationConfig"),
    "exact_for_observable": ("qibotn.expectation_runner", "exact_for_observable"),
    "run_cpu_expectation": ("qibotn.expectation_runner", "run_cpu_expectation"),
    "cpu_benchmark_parallel_opts": (
        "qibotn.expectation_runner",
        "cpu_benchmark_parallel_opts",
    ),
    "run_cpu_benchmark_cases": (
        "qibotn.expectation_runner",
        "run_cpu_benchmark_cases",
    ),
    "build_benchmark_circuit": ("qibotn.benchmark_cases", "build_circuit"),
    "benchmark_observable_terms": ("qibotn.benchmark_cases", "observable_terms"),
    "exact_pauli_sum": ("qibotn.benchmark_cases", "exact_pauli_sum"),
    "ring_xz_statevector_expectation": (
        "qibotn.benchmark_cases",
        "ring_xz_statevector_expectation",
    ),
    "terms_to_dict": ("qibotn.benchmark_cases", "terms_to_dict"),
    "build_contest_circuit": ("qibotn.contest_cases", "build_contest_circuit"),
    "build_contest_observable": (
        "qibotn.contest_cases",
        "build_contest_observable",
    ),
    "contest_cases": ("qibotn.contest_cases", "CASES"),
    "analyze_contraction_tree": ("qibotn.parallel", "analyze_contraction_tree"),
    "load_tree_payload": ("qibotn.parallel", "load_tree_payload"),
    "save_tree_payload": ("qibotn.parallel", "save_tree_payload"),
    "slice_tree_payload": ("qibotn.parallel", "slice_tree_payload"),
    "make_qmatchatea_backend": (
        "qibotn.backends.qmatchatea",
        "make_qmatchatea_backend",
    ),
    "build_qmatchatea_backend": (
        "qibotn.backends.qmatchatea",
        "build_qmatchatea_backend",
    ),
    "benchmark_qmatchatea_svd_control": (
        "qibotn.backends.qmatchatea",
        "benchmark_qmatchatea_svd_control",
    ),
    "run_qmatchatea_expectation": (
        "qibotn.backends.qmatchatea",
        "run_qmatchatea_expectation",
    ),
    "exact_mps_expectation": (
        "qibotn.backends.qmatchatea",
        "exact_mps_expectation",
    ),
    "make_vidal_backend": ("qibotn.backends.vidal", "make_vidal_backend"),
    "compare_vidal_backend_qmatchatea": (
        "qibotn.backends.vidal",
        "compare_vidal_backend_qmatchatea",
    ),
    "run_vidal_expectation": ("qibotn.backends.vidal", "run_vidal_expectation"),
    "run_segmented_vidal_ring_xz": (
        "qibotn.backends.vidal",
        "run_segmented_vidal_ring_xz",
    ),
    "build_expectation_tn": ("qibotn.backends.quimb", "build_expectation_tn"),
    "build_quimb_circuit_stats": (
        "qibotn.backends.quimb",
        "build_quimb_circuit_stats",
    ),
    "compare_quimb_gate_merge": (
        "qibotn.backends.quimb",
        "compare_quimb_gate_merge",
    ),
    "compare_quimb_gate_merge_expectation": (
        "qibotn.backends.quimb",
        "compare_quimb_gate_merge_expectation",
    ),
    "contract_tn": ("qibotn.backends.quimb", "contract_tn"),
    "load_custom_case_module": ("qibotn.backends.quimb", "load_custom_case_module"),
    "profile_quimb_torch_expectation": (
        "qibotn.backends.quimb",
        "profile_quimb_torch_expectation",
    ),
    "qibo_circuit_to_quimb_torch": (
        "qibotn.backends.quimb",
        "qibo_circuit_to_quimb_torch",
    ),
    "search_contraction_tree": ("qibotn.backends.quimb", "search_contraction_tree"),
    "sorted_tree": ("qibotn.backends.quimb", "sorted_tree"),
    "run_contest_tn_case": ("qibotn.backends.quimb", "run_contest_tn_case"),
    "run_custom_tn_expectation": (
        "qibotn.backends.quimb",
        "run_custom_tn_expectation",
    ),
    "time_quimb_contract_implementations": (
        "qibotn.backends.quimb",
        "time_quimb_contract_implementations",
    ),
    "run_contest_mps_case": ("qibotn.backends.vidal", "run_contest_mps_case"),
    "run_vidal_mpi_contest_case": (
        "qibotn.backends.vidal",
        "run_vidal_mpi_contest_case",
    ),
    "run_vidal_validation_cases": (
        "qibotn.backends.vidal",
        "run_vidal_validation_cases",
    ),
    "pauli_pattern": ("qibotn.observables", "pauli_pattern"),
    "pauli_sum": ("qibotn.observables", "pauli_sum"),
 }
--- a/src/qibotn/backends/cpu.py
+++ b/src/qibotn/backends/cpu.py
@@ -18,6 +18,7 @@ from qibotn.backends.vidal import (
    _unsupported_reason,
 )
 from qibotn.observables import check_observable
 from qibotn.torch_utils import arrays_to_backend, torch_cpu_array, torch_dtype
 def _as_bool_or_dict(value, name):
@@ -310,10 +311,12 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
    def _quimb_backend(self):
        import qibotn.backends.quimb as qmb
-        return qmb.BACKENDS[self.quimb_backend](
+        backend = qmb.BACKENDS[self.quimb_backend](
            quimb_backend=self.quimb_backend,
            contraction_optimizer=self.contraction_optimizer,
        )
        backend.dtype = self.dtype
        return backend
    def _bind_rank_to_numa_domain(self, rank):
        self.numa_domain = _bind_numa_node(rank)
@@ -375,6 +378,12 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
        dask_close_workers = bool(opts.get("dask_close_workers", False))
        print_stats = bool(opts.get("print_stats", False))
        debug_trials = bool(opts.get("debug_trials", False))
        search_seed = int(opts.get("search_seed", 0))
        merge_1q = opts.get("merge_1q", "auto")
        merge_2q = opts.get("merge_2q", "auto")
        sort_contract_indices = opts.get("sort_contract_indices", "auto")
        if sort_contract_indices == "auto":
            sort_contract_indices = self.quimb_backend == "torch"
        search_only = bool(opts.get("search_only", False))
        save_tree_path = opts.get("save_tree_path")
        load_tree_path = opts.get("load_tree_path")
@@ -382,6 +391,38 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
        saved_trees = []
        saved_costs = []
        def term_stats(
            term_index,
            factors,
            path_cost,
            search_stats,
            tree_slices,
            slice_assignment,
            rank_slices,
            search_seconds,
            contract_seconds,
        ):
            """Assemble the per-term statistics record.

            The positional arguments describe one observable term; the
            remaining entries (search settings, gate-merge flags, dask address
            and NUMA domain) are read from the enclosing scope so every call
            site reports them identically.
            """
            record = dict(
                term_index=term_index,
                term_factors=tuple(factors),
                path_cost=path_cost,
                search_stats=search_stats,
                tree_slices=tree_slices,
                slice_assignment=slice_assignment,
                rank_slices=rank_slices,
                search_seconds=search_seconds,
                contract_seconds=contract_seconds,
                search_workers=search_workers,
                search_repeats=search_repeats,
                search_time=search_time,
                # Fall back to the generic method name when no explicit
                # search backend was configured.
                search_backend=search_backend or method,
                search_seed=search_seed,
                merge_1q=merge_1q,
                merge_2q=merge_2q,
                dask_address=dask_address,
                # The attribute may not have been set on this instance.
                numa_domain=getattr(self, "numa_domain", None),
            )
            return record
        if load_tree_path:
            with Path(load_tree_path).open("rb") as f:
                payload = pickle.load(f)
@@ -396,6 +437,8 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
                "max_bond": self.max_bond_dimension,
                "cutoff": self.cut_ratio,
            },
            merge_1q=merge_1q,
            merge_2q=merge_2q,
        )
        total_value = 0.0 + 0.0j
@@ -415,6 +458,8 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
                )
            else:
                op, where = _pauli_term_to_dense_operator(factors)
                if self.quimb_backend == "torch":
                    op = torch_cpu_array(op, dtype=torch_dtype(self.dtype))
                tn = qc.local_expectation(
                    op,
                    where,
@@ -455,10 +500,18 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
                    debug_trials=debug_trials,
                    dask_close_workers=dask_close_workers,
                    expected_workers=dask_expected_workers,
                    search_seed=search_seed,
                )
                search_seconds = time.perf_counter() - search_start
            if tree is None:
                raise RuntimeError("Failed to find a contraction tree for CPU TN MPI.")
            if sort_contract_indices and hasattr(tree, "sort_contraction_indices"):
                tree.sort_contraction_indices(
                    priority=opts.get("sort_contract_indices_priority", "flops"),
                    make_output_contig=True,
                    make_contracted_contig=True,
                    reset=True,
                )
            if self.parallel_opts.get("contract_implementation") == "cpp":
                from qibotn.torch_contractor import prepare_torch_cpp_contractor
@@ -490,23 +543,17 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
            if search_only:
                self.parallel_stats.append(
-                    {
+                    term_stats(
-                        "term_index": term_index,
+                        term_index,
-                        "term_factors": tuple(factors),
+                        factors,
-                        "path_cost": path_cost,
+                        path_cost,
-                        "search_stats": search_stats,
+                        search_stats,
-                        "tree_slices": int(getattr(tree, "multiplicity", 1)),
+                        int(getattr(tree, "multiplicity", 1)),
-                        "slice_assignment": "search_only",
+                        "search_only",
-                        "rank_slices": [],
+                        [],
-                        "search_seconds": search_seconds,
+                        search_seconds,
-                        "contract_seconds": 0.0,
+                        0.0,
-                        "search_workers": search_workers,
+                    )
                        "search_repeats": search_repeats,
                        "search_time": search_time,
                        "search_backend": search_backend or method,
                        "dask_address": dask_address,
                        "numa_domain": getattr(self, "numa_domain", None),
                    }
                )
                continue
@@ -523,23 +570,17 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
                            flush=True,
                        )
                    self.parallel_stats.append(
-                        {
+                        term_stats(
-                            "term_index": term_index,
+                            term_index,
-                            "term_factors": tuple(factors),
+                            factors,
-                            "path_cost": path_cost,
+                            path_cost,
-                            "search_stats": search_stats,
+                            search_stats,
-                            "tree_slices": 1,
+                            1,
-                            "slice_assignment": "root",
+                            "root",
-                            "rank_slices": [1] + [0] * (size - 1),
+                            [1] + [0] * (size - 1),
-                            "search_seconds": search_seconds,
+                            search_seconds,
-                            "contract_seconds": contract_seconds,
+                            contract_seconds,
-                            "search_workers": search_workers,
+                        )
                            "search_repeats": search_repeats,
                            "search_time": search_time,
                            "search_backend": search_backend or method,
                            "dask_address": dask_address,
                            "numa_domain": getattr(self, "numa_domain", None),
                        }
                    )
                    total_value += coeff * complex(value)
                continue
@@ -556,36 +597,31 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
                        flush=True,
                    )
                self.parallel_stats.append(
-                    {
+                    term_stats(
-                        "term_index": term_index,
+                        term_index,
-                        "term_factors": tuple(factors),
+                        factors,
-                        "path_cost": path_cost,
+                        path_cost,
-                        "search_stats": search_stats,
+                        search_stats,
-                        "tree_slices": int(getattr(tree, "multiplicity", 1)),
+                        int(getattr(tree, "multiplicity", 1)),
-                        "slice_assignment": "local",
+                        "local",
-                        "rank_slices": [int(getattr(tree, "multiplicity", 1))],
+                        [int(getattr(tree, "multiplicity", 1))],
-                        "search_seconds": search_seconds,
+                        search_seconds,
-                        "contract_seconds": contract_seconds,
+                        contract_seconds,
-                        "search_workers": search_workers,
+                    )
                        "search_repeats": search_repeats,
                        "search_time": search_time,
                        "search_backend": search_backend or method,
                        "dask_address": dask_address,
                        "numa_domain": getattr(self, "numa_domain", None),
                    }
                )
                total_value += coeff * complex(np.asarray(value).reshape(-1)[0])
                continue
            contract_start = time.perf_counter()
            arrays = self._term_arrays(tn, backend)
            contract_implementation = self._contract_implementation(backend)
            value, stats = parallel_contract(
                tree,
                arrays,
                method="mpi",
                comm=comm,
                return_stats=True,
-                implementation=self.parallel_opts.get("contract_implementation"),
+                implementation=contract_implementation,
            )
            contract_seconds = time.perf_counter() - contract_start
            gathered_stats = comm.gather(stats, root=0)
@@ -598,25 +634,17 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
                        flush=True,
                    )
                self.parallel_stats.append(
-                    {
+                    term_stats(
-                        "term_index": term_index,
+                        term_index,
-                        "term_factors": tuple(factors),
+                        factors,
-                        "path_cost": path_cost,
+                        path_cost,
-                        "search_stats": search_stats,
+                        search_stats,
-                        "tree_slices": stats.nslices,
+                        stats.nslices,
-                        "slice_assignment": stats.assignment,
+                        stats.assignment,
-                        "rank_slices": [
+                        [item.local_slices for item in gathered_stats],
-                            item.local_slices for item in gathered_stats
+                        search_seconds,
-                        ],
+                        contract_seconds,
-                        "search_seconds": search_seconds,
+                    )
                        "contract_seconds": contract_seconds,
                        "search_workers": search_workers,
                        "search_repeats": search_repeats,
                        "search_time": search_time,
                        "search_backend": search_backend or method,
                        "dask_address": dask_address,
                        "numa_domain": getattr(self, "numa_domain", None),
                    }
                )
                total_value += coeff * complex(np.asarray(value).reshape(-1)[0])
@@ -644,18 +672,20 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
        return np.nan if rank != 0 else float(np.real(total_value))
    def _contract_implementation(self, backend):
        implementation = self.parallel_opts.get("contract_implementation")
        if implementation is None and backend.backend == "torch":
            return "autoray"
        return implementation
    def _contract_term_unsliced(self, tn, tree, backend):
-        contract_implementation = self.parallel_opts.get("contract_implementation")
+        contract_implementation = self._contract_implementation(backend)
        if contract_implementation == "cpp":
            if backend.backend != "torch":
                raise ValueError("contract_implementation='cpp' requires torch backend.")
            from qibotn.backends.quimb import _torch_cpu_array, _torch_dtype
            from qibotn.torch_contractor import contract_tree_cpp
-            arrays = [
+            arrays = arrays_to_backend(tn.arrays, "torch", dtype=self.dtype)
                _torch_cpu_array(array, dtype=_torch_dtype(self.dtype))
                for array in tn.arrays
            ]
            nslices = int(getattr(tree, "multiplicity", 1))
            if nslices > 1:
                total = None
@@ -666,12 +696,10 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
            return contract_tree_cpp(tree, arrays)
        if backend.backend == "torch":
            from qibotn.backends.quimb import _torch_cpu_array, _torch_dtype
            for tensor in tn.tensors:
-                tensor._data = _torch_cpu_array(
+                tensor._data = torch_cpu_array(
                    tensor._data,
-                    dtype=_torch_dtype(self.dtype),
+                    dtype=torch_dtype(self.dtype),
                )
            return tn.contract(
                all,
@@ -693,13 +721,9 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
        return None if user_slicing_opts is None else dict(user_slicing_opts)
    def _term_arrays(self, tn, backend):
-        if backend.backend == "torch":
+        return arrays_to_backend(
-            from qibotn.backends.quimb import _torch_cpu_array, _torch_dtype
+            tn.arrays,
-
+            backend.backend,
-            return [
+            engine=backend.engine,
-                _torch_cpu_array(array, dtype=_torch_dtype(self.dtype))
+            dtype=self.dtype,
-                for array in tn.arrays
+        )
            ]
        from qibotn.backends.quimb import _numpy_dtype
        return [backend.engine.asarray(array, dtype=_numpy_dtype(self.dtype)) for array in tn.arrays]
--- a/src/qibotn/backends/cutensornet_helpers.py
+++ b/src/qibotn/backends/cutensornet_helpers.py
@@ -0,0 +1,321 @@
 """cuTensorNet circuit and MPS conversion helpers."""
 from __future__ import annotations
 import numpy as np
 try:
    import cupy as cp
    import cuquantum.bindings.cutensornet as cutn
    from cuquantum.tensornet import contract, contract_path
    from cuquantum.tensornet.experimental import contract_decompose
 except ImportError:  # pragma: no cover - exercised on CPU-only installations
    cp = None
    cutn = None
    contract = None
    contract_path = None
    contract_decompose = None
 def _require_cupy():
    """Return the ``cupy`` module, or raise ``ImportError`` if it failed to import."""
    if cp is not None:
        return cp
    raise ImportError(
        "The cuQuantum circuit converter requires cupy. "
        "Install the GPU dependencies or use the CPU backend."
    )
 def _require_cutensornet():
    """Raise ``ImportError`` unless both cupy and the cutensornet bindings are loaded."""
    gpu_stack_ready = cp is not None and cutn is not None
    if gpu_stack_ready:
        return
    raise ImportError(
        "The cuQuantum MPS converter requires cupy and cuquantum. "
        "Install the GPU dependencies or use the CPU backend."
    )
 def _require_tensornet_mps():
    """Raise ``ImportError`` unless cupy, ``contract`` and ``contract_decompose`` are loaded."""
    helpers_ready = (
        cp is not None
        and contract is not None
        and contract_decompose is not None
    )
    if helpers_ready:
        return
    raise ImportError(
        "The cuQuantum MPS helpers require cupy and cuquantum. "
        "Install the GPU dependencies or use the CPU backend."
    )
 def _require_contract():
    """Raise ``ImportError`` unless ``contract`` and ``contract_path`` are loaded."""
    contraction_ready = contract is not None and contract_path is not None
    if contraction_ready:
        return
    raise ImportError(
        "The cuQuantum MPS contraction helper requires cuquantum. "
        "Install the GPU dependencies or use the CPU backend."
    )
 class QiboCircuitToEinsum:
    """Convert a Qibo circuit to cuQuantum interleaved TN operands.

    The converter stores one tensor per gate, reshaped to ``(2, 2) * k`` for a
    ``k``-qubit gate, and produces "interleaved" operand lists of the form
    ``[tensor, modes, tensor, modes, ..., output_modes]``.

    Args:
        circuit: Object exposing ``queue`` (iterable of gates with
            ``control_qubits``, ``target_qubits`` and ``matrix()``),
            ``nqubits`` and ``invert()``.
        dtype: Name of the array dtype, looked up as an attribute of the
            array backend (e.g. ``"complex128"``).

    Raises:
        ImportError: If cupy is not available.
    """
    def __init__(self, circuit, dtype="complex128"):
        self.backend = _require_cupy()
        self.dtype = getattr(self.backend, dtype)
        self.init_basis_map(self.backend, dtype)
        self.init_intermediate_circuit(circuit)
        self.circuit = circuit
    def state_vector_operands(self):
        """Return the interleaved operands for the output state.

        The network starts from a "0" basis tensor on every active qubit,
        applies all stored gate tensors, and leaves the final mode of each
        active qubit open as the output.
        """
        input_bitstring = "0" * len(self.active_qubits)
        input_operands = self._get_bitstring_tensors(input_bitstring)
        mode_labels, qubits_frontier, next_frontier = self._init_mode_labels_from_qubits(
            self.active_qubits
        )
        gate_mode_labels, gate_operands = self._parse_gates_to_mode_labels_operands(
            self.gate_tensors, qubits_frontier, next_frontier
        )
        operands = input_operands + gate_operands
        mode_labels += gate_mode_labels
        # After parsing, the frontier holds the last (open) mode of each qubit.
        out_list = [qubits_frontier[key] for key in qubits_frontier]
        operand_exp_interleave = [x for y in zip(operands, mode_labels) for x in y]
        operand_exp_interleave.append(out_list)
        return operand_exp_interleave
    def _init_mode_labels_from_qubits(self, qubits):
        """Assign mode ``i`` to the ``i``-th qubit.

        Returns:
            Tuple ``(mode_labels, frontier, next_free_mode)`` where
            ``frontier`` maps each qubit to its current mode label.
        """
        nqubits = len(qubits)
        frontier_dict = {q: i for i, q in enumerate(qubits)}
        mode_labels = [[i] for i in range(nqubits)]
        return mode_labels, frontier_dict, nqubits
    def _get_bitstring_tensors(self, bitstring):
        """Return one basis tensor per character of ``bitstring``."""
        return [self.basis_map[ibit] for ibit in bitstring]
    def _parse_gates_to_mode_labels_operands(self, gates, qubits_frontier, next_frontier):
        """Assign mode labels to a sequence of ``(tensor, qubits)`` gates.

        Each gate consumes the current frontier mode of every qubit it acts on
        and produces a fresh mode for it. ``qubits_frontier`` is updated in
        place; ``next_frontier`` is a plain integer, so the caller's copy is
        not advanced.

        Returns:
            Tuple ``(mode_labels, operands)``; each mode label list is the
            gate's output modes followed by its input modes.
        """
        mode_labels = []
        operands = []
        for tensor, gate_qubits in gates:
            operands.append(tensor)
            input_mode_labels = []
            output_mode_labels = []
            for qubit in gate_qubits:
                input_mode_labels.append(qubits_frontier[qubit])
                output_mode_labels.append(next_frontier)
                qubits_frontier[qubit] = next_frontier
                next_frontier += 1
            mode_labels.append(output_mode_labels + input_mode_labels)
        return mode_labels, operands
    def op_shape_from_qubits(self, nqubits):
        """Return the tensor shape ``(2, 2) * nqubits`` of an ``nqubits`` gate."""
        return (2, 2) * nqubits
    def init_intermediate_circuit(self, circuit):
        """Store the gate tensors of ``circuit`` and the qubits they touch.

        Sets ``self.gate_tensors`` (list of ``(tensor, qubits)``, with qubits
        ordered controls first, then targets) and ``self.active_qubits``.
        """
        self.gate_tensors = []
        gates_qubits = []
        for gate in circuit.queue:
            gate_qubits = gate.control_qubits + gate.target_qubits
            gates_qubits.extend(gate_qubits)
            required_shape = self.op_shape_from_qubits(len(gate_qubits))
            self.gate_tensors.append(
                (
                    self.backend.asarray(gate.matrix(), dtype=self.dtype).reshape(
                        required_shape
                    ),
                    gate_qubits,
                )
            )
        self.active_qubits = np.unique(gates_qubits)
    def init_basis_map(self, backend, dtype):
        """Create the single-qubit basis tensors keyed by ``"0"`` and ``"1"``."""
        asarray = backend.asarray
        self.basis_map = {
            "0": asarray([1, 0], dtype=dtype),
            "1": asarray([0, 1], dtype=dtype),
        }
    def init_inverse_circuit(self, circuit):
        """Store the gate tensors of an (already inverted) ``circuit``.

        Sets ``self.gate_tensors_inverse`` and ``self.active_qubits_inverse``.
        Tensors are converted with ``self.dtype`` so they match the tensors
        built by ``init_intermediate_circuit``.
        """
        self.gate_tensors_inverse = []
        gates_qubits_inverse = []
        for gate in circuit.queue:
            gate_qubits = gate.control_qubits + gate.target_qubits
            gates_qubits_inverse.extend(gate_qubits)
            required_shape = self.op_shape_from_qubits(len(gate_qubits))
            self.gate_tensors_inverse.append(
                (
                    # Use the configured dtype; without it the inverse tensors
                    # keep whatever dtype ``gate.matrix()`` returns and can
                    # disagree with the forward-circuit tensors.
                    self.backend.asarray(gate.matrix(), dtype=self.dtype).reshape(
                        required_shape
                    ),
                    gate_qubits,
                )
            )
        self.active_qubits_inverse = np.unique(gates_qubits_inverse)
    def get_pauli_gates(self, pauli_map, dtype="complex128", backend=None):
        """Build single-qubit Pauli gate tensors.

        Args:
            pauli_map: Mapping ``qubit -> "I" | "X" | "Y" | "Z"``.
            dtype: Dtype passed to the backend's ``asarray``.
            backend: Array module; defaults to cupy.

        Returns:
            List of ``(matrix, (qubit,))`` tuples in ``pauli_map`` order.

        Raises:
            ValueError: If a character is not one of I/X/Y/Z.
            ImportError: If ``backend`` is ``None`` and cupy is unavailable.
        """
        if backend is None:
            backend = _require_cupy()
        asarray = backend.asarray
        operand_map = {
            "I": asarray([[1, 0], [0, 1]], dtype=dtype),
            "X": asarray([[0, 1], [1, 0]], dtype=dtype),
            "Y": asarray([[0, -1j], [1j, 0]], dtype=dtype),
            "Z": asarray([[1, 0], [0, -1]], dtype=dtype),
        }
        gates = []
        for qubit, pauli_char in pauli_map.items():
            operand = operand_map.get(pauli_char)
            if operand is None:
                raise ValueError("pauli string character must be one of I/X/Y/Z")
            gates.append((operand, (qubit,)))
        return gates
    def expectation_operands(self, ham_gates):
        """Return the interleaved operands of a closed expectation network.

        The network is: "0" basis tensors on all ``circuit.nqubits`` qubits,
        the circuit gates, ``ham_gates``, the gates of ``circuit.invert()``,
        and finally the same basis tensors again to close every open mode.
        The output mode list is empty, i.e. the contraction yields a scalar.

        Args:
            ham_gates: List of ``(tensor, qubits)`` tuples, e.g. from
                ``get_pauli_gates``.
        """
        input_bitstring = "0" * self.circuit.nqubits
        input_operands = self._get_bitstring_tensors(input_bitstring)
        mode_labels, qubits_frontier, next_frontier = self._init_mode_labels_from_qubits(
            range(self.circuit.nqubits)
        )
        gate_mode_labels, gate_operands = self._parse_gates_to_mode_labels_operands(
            self.gate_tensors, qubits_frontier, next_frontier
        )
        operands = input_operands + gate_operands
        mode_labels += gate_mode_labels
        self.init_inverse_circuit(self.circuit.invert())
        # The parser advanced only its local counter, so recompute the next
        # free mode from the (in-place updated) frontier.
        next_frontier = max(qubits_frontier.values()) + 1
        gates_inverse = ham_gates + self.gate_tensors_inverse
        gate_mode_labels_inverse, gate_operands_inverse = (
            self._parse_gates_to_mode_labels_operands(
                gates_inverse, qubits_frontier, next_frontier
            )
        )
        mode_labels = (
            mode_labels
            + gate_mode_labels_inverse
            + [[qubits_frontier[ix]] for ix in range(self.circuit.nqubits)]
        )
        # Re-use the initial basis tensors to close the network.
        operands = operands + gate_operands_inverse + operands[: self.circuit.nqubits]
        operand_exp_interleave = [x for y in zip(operands, mode_labels) for x in y]
        operand_exp_interleave.append([])
        return operand_exp_interleave
 def initial_mps(num_qubits, dtype):
    """Return ``num_qubits`` MPS site tensors of shape ``(1, 2, 1)`` built from ``[1, 0]``.

    Every list entry refers to the same array object.
    """
    _require_tensornet_mps()
    zero_site = cp.asarray([1, 0], dtype=dtype).reshape(1, 2, 1)
    return [zero_site for _ in range(num_qubits)]
 def mps_site_right_swap(mps_tensors, i, **kwargs):
    """Swap the physical indices of sites ``i`` and ``i + 1`` in place.

    The two site tensors are contracted and re-decomposed with their physical
    modes exchanged. ``algorithm`` and ``options`` keyword arguments are
    forwarded to ``contract_decompose``. Returns the same ``mps_tensors`` list.
    """
    _require_tensornet_mps()
    neighbours = mps_tensors[i : i + 2]
    algorithm = kwargs.get("algorithm")
    options = kwargs.get("options")
    # The middle output of the decomposition is discarded.
    new_left, _middle, new_right = contract_decompose(
        "ipj,jqk->iqj,jpk",
        *neighbours,
        algorithm=algorithm,
        options=options,
    )
    mps_tensors[i : i + 2] = (new_left, new_right)
    return mps_tensors
 def apply_mps_gate(mps_tensors, gate, qubits, **kwargs):
    """Apply a one- or two-qubit gate tensor to ``mps_tensors`` in place.

    Two-qubit gates on non-adjacent sites are handled by swapping the left
    site towards the right one, applying the gate recursively, and swapping
    back. ``algorithm`` and ``options`` keyword arguments are forwarded to the
    cuQuantum calls.

    Raises:
        NotImplementedError: If the gate acts on more than two qubits (or none).
    """
    _require_tensornet_mps()
    arity = len(qubits)
    if arity not in (1, 2):
        raise NotImplementedError("Only one- and two-qubit gates supported")
    options = kwargs.get("options")
    if arity == 1:
        site = qubits[0]
        mps_tensors[site] = contract(
            "ipj,qp->iqj",
            mps_tensors[site],
            gate,
            options=options,
        )
        return None
    left, right = qubits
    if left > right:
        # Reorder the qubits and permute the gate axes to match.
        return apply_mps_gate(
            mps_tensors, gate.transpose(1, 0, 3, 2), (right, left), **kwargs
        )
    if left + 1 != right:
        # Non-adjacent sites: move the left site one step, recurse, move back.
        mps_site_right_swap(mps_tensors, left, **kwargs)
        apply_mps_gate(mps_tensors, gate, (left + 1, right), **kwargs)
        mps_site_right_swap(mps_tensors, left, **kwargs)
        return None
    # Adjacent sites: contract both tensors with the gate and split again.
    a_tensor, _middle, b_tensor = contract_decompose(
        "ipj,jqk,rspq->irj,jsk",
        *mps_tensors[left : left + 2],
        gate,
        algorithm=kwargs.get("algorithm"),
        options=options,
    )
    mps_tensors[left : left + 2] = (a_tensor, b_tensor)
    return None
 class QiboCircuitToMPS:
    """Convert a Qibo circuit to a cuTensorNet MPS representation.

    On construction the circuit's gate tensors are applied one by one to an
    initial MPS; the result is kept in ``self.mps_tensors``. A cutensornet
    handle is created in ``__init__`` and destroyed in ``__del__``.

    Args:
        circ_qibo: Circuit exposing ``nqubits`` and the attributes needed by
            ``QiboCircuitToEinsum``.
        gate_algo: Passed as ``algorithm`` to every gate application.
        dtype: Dtype name used for the MPS and gate tensors.
        rand_seed: Seed applied to both the numpy and cupy global RNGs.
    """
    def __init__(self, circ_qibo, gate_algo, dtype="complex128", rand_seed=0):
        _require_cutensornet()
        # Seeds the *global* numpy and cupy random generators.
        np.random.seed(rand_seed)
        cp.random.seed(rand_seed)
        self.num_qubits = circ_qibo.nqubits
        self.handle = cutn.create()
        self.dtype = dtype
        self.mps_tensors = initial_mps(self.num_qubits, dtype=dtype)
        converter = QiboCircuitToEinsum(circ_qibo, dtype=dtype)
        for gate_tensor, gate_qubits in converter.gate_tensors:
            apply_mps_gate(
                self.mps_tensors,
                gate_tensor,
                gate_qubits,
                algorithm=gate_algo,
                options={"handle": self.handle},
            )
    def __del__(self):
        # The bindings may be missing, and ``handle`` is absent when
        # ``__init__`` failed before creating it.
        if cutn is None:
            return
        handle = getattr(self, "handle", None)
        if handle is not None:
            cutn.destroy(handle)
 class MPSContractionHelper:
    """Contract cuTensorNet MPS tensors to norms, states, or expectations.

    Mode labelling for site ``i``: the "bra" copy uses ``(2i, 2i+1, 2i+2)`` and
    the "ket" (conjugated) copy uses ``(i+o, 2i+1, i+1+o)`` with
    ``o = 2 * num_qubits + 1``, so both copies share the physical mode
    ``2i+1``.
    """
    def __init__(self, num_qubits):
        self.num_qubits = num_qubits
        ket_base = 2 * num_qubits + 1
        self.bra_modes = []
        self.ket_modes = []
        for site in range(num_qubits):
            physical = 2 * site + 1
            self.bra_modes.append((physical - 1, physical, physical + 1))
            self.ket_modes.append((ket_base + site, physical, ket_base + site + 1))
    def contract_norm(self, mps_tensors, options=None):
        """Return the real part of the full contraction of the MPS with its conjugate."""
        network = []
        for site, tensor in enumerate(mps_tensors):
            network += [
                tensor,
                self.bra_modes[site],
                tensor.conj(),
                self.ket_modes[site],
            ]
        network.append([])
        overlap = self._contract(network, options=options)
        return overlap.real
    def contract_state_vector(self, mps_tensors, options=None):
        """Contract the MPS, keeping one physical mode per site as output."""
        network = []
        for site, tensor in enumerate(mps_tensors):
            network += [tensor, self.bra_modes[site]]
        physical_modes = tuple(modes[1] for modes in self.bra_modes)
        network.append(physical_modes)
        return self._contract(network, options=options)
    def contract_expectation(
        self, mps_tensors, operator, qubits, options=None, normalize=False
    ):
        """Contract ``operator`` acting on ``qubits`` between the MPS and its conjugate.

        The operator's first ``len(qubits)`` modes attach to the conjugated
        tensors through fresh mode labels; the remaining modes attach to the
        physical modes of the unconjugated tensors. When ``normalize`` is true
        the result is divided by ``contract_norm``.
        """
        next_free_mode = 3 * self.num_qubits + 2
        operator_modes = [None] * len(qubits) + [self.bra_modes[q][1] for q in qubits]
        qubits = list(qubits)
        network = []
        for site, tensor in enumerate(mps_tensors):
            network += [tensor, self.bra_modes[site]]
            conj_modes = self.ket_modes[site]
            if site in qubits:
                # Detach the conjugated physical leg so the operator sits
                # between the two copies.
                conj_modes = (conj_modes[0], next_free_mode, conj_modes[2])
                operator_modes[qubits.index(site)] = next_free_mode
                next_free_mode += 1
            network += [tensor.conj(), conj_modes]
        network += [operator, tuple(operator_modes)]
        network.append([])
        if normalize:
            norm = self.contract_norm(mps_tensors, options=options)
        else:
            norm = 1
        return self._contract(network, options=options) / norm
    def _contract(self, interleaved_inputs, options=None):
        """Find a contraction path and contract the interleaved operand list."""
        _require_contract()
        path_and_info = contract_path(*interleaved_inputs, options=options)
        return contract(
            *interleaved_inputs,
            options=options,
            optimize={"path": path_and_info[0]},
        )
--- a/src/qibotn/backends/qmatchatea.py
+++ b/src/qibotn/backends/qmatchatea.py
@@ -1,6 +1,9 @@
 """Implementation of Quantum Matcha Tea backend."""
 from __future__ import annotations
 import re
 import time
 from dataclasses import dataclass
 import numpy as np
@@ -12,6 +15,7 @@ from qibo.config import raise_error
 from qmatchatea.utils import MPISettings
 from qibotn.backends.abstract import QibotnBackend
 from qibotn.benchmark_cases import exact_pauli_sum
 from qibotn.observables import check_observable
 from qibotn.result import TensorNetworkResult
@@ -364,3 +368,207 @@ class QMatchaTeaBackend(QibotnBackend, NumpyBackend):
            use_itpo=False,
        )
        return obs_sum
@dataclass(frozen=True)
 class QMatchaTeaExpectationResult:
    """Immutable record produced by ``run_qmatchatea_expectation``."""
    # Real part of the expectation value as a Python float.
    value: float
    # Wall-clock seconds spent in the ``expectation`` call (backend build excluded).
    seconds: float
    # The configured backend instance that produced ``value``.
    backend: object
@dataclass(frozen=True)
 class QMatchaTeaBuildResult:
    """Immutable record produced by ``build_qmatchatea_backend``."""
    # The configured backend instance.
    backend: object
    # Wall-clock seconds spent constructing and configuring the backend.
    build_seconds: float
@dataclass(frozen=True)
 class QMatchaTeaSvdControlResult:
    """Immutable record produced by ``benchmark_qmatchatea_svd_control``."""
    # SVD control setting that was benchmarked (passed as ``svd_ctrl``).
    ctrl: str
    # Value forwarded as ``contract_singvals`` to ``split_svd``.
    contract_singvals: str
    # "ok" when every repeat succeeded, "error" when a repeat raised.
    status: str
    # Median / minimum split time in milliseconds; NaN when no repeat completed.
    median_ms: float
    min_ms: float
    # Relative reconstruction error from the last repeat; None if it never ran.
    rel_error: float | None
    # Number of singular values returned by the last repeat; None if it never ran.
    kept: int | None
    # ``repr`` of the exception that stopped the benchmark, or "" on success.
    error: str
 def make_qmatchatea_backend(
     *,
     bond=10,
     cut_ratio=1e-9,
     tensor_module="torch",
     svd_control="E!",
     compile_circuit=True,
     track_memory=False,
     mpi_approach="SR",
     mpi_num_procs=1,
     mpi_where_barriers=-1,
     mpi_isometrization=-1,
 ):
     """Create a ``QMatchaTeaBackend`` configured for an MPS simulation.

     All arguments are keyword-only and are forwarded to
     ``configure_tn_simulation``; ``bond`` is passed as
     ``max_bond_dimension`` and the ansatz is fixed to ``"MPS"``.

     Returns:
         The configured backend instance.
     """
     settings = {
         "ansatz": "MPS",
         "max_bond_dimension": bond,
         "cut_ratio": cut_ratio,
         "svd_control": svd_control,
         "tensor_module": tensor_module,
         "compile_circuit": compile_circuit,
         "track_memory": track_memory,
         "mpi_approach": mpi_approach,
         "mpi_num_procs": mpi_num_procs,
         "mpi_where_barriers": mpi_where_barriers,
         "mpi_isometrization": mpi_isometrization,
     }
     configured = QMatchaTeaBackend()
     configured.configure_tn_simulation(**settings)
     return configured
 def build_qmatchatea_backend(
     *,
     bond=10,
     cut_ratio=1e-9,
     tensor_module="torch",
     svd_control="E!",
     compile_circuit=True,
     track_memory=False,
     mpi_approach="SR",
     mpi_num_procs=1,
     mpi_where_barriers=-1,
     mpi_isometrization=-1,
 ):
     """Build a Quantum Matcha Tea backend and report how long it took.

     Accepts the same keyword arguments as ``make_qmatchatea_backend`` and
     forwards them unchanged.

     Returns:
         ``QMatchaTeaBuildResult`` holding the backend and the elapsed
         wall-clock seconds of the build.
     """
     began = time.perf_counter()
     forwarded = {
         "bond": bond,
         "cut_ratio": cut_ratio,
         "tensor_module": tensor_module,
         "svd_control": svd_control,
         "compile_circuit": compile_circuit,
         "track_memory": track_memory,
         "mpi_approach": mpi_approach,
         "mpi_num_procs": mpi_num_procs,
         "mpi_where_barriers": mpi_where_barriers,
         "mpi_isometrization": mpi_isometrization,
     }
     backend = make_qmatchatea_backend(**forwarded)
     elapsed = time.perf_counter() - began
     return QMatchaTeaBuildResult(backend=backend, build_seconds=elapsed)
 def exact_mps_expectation(circuit, observable, nqubits):
     """Return a reference expectation value for ``observable``.

     Two observable forms are handled:

     * a dict containing a ``"terms"`` key, where each term provides
       ``"coefficient"`` and ``"operators"`` (pairs of name and site); these
       are evaluated with ``exact_pauli_sum``;
     * anything else, which is converted with ``check_observable`` and
       evaluated on the state obtained by executing ``circuit``.

     Returns:
         The expectation value as a float.
     """
     has_terms = isinstance(observable, dict) and "terms" in observable
     if not has_terms:
         hamiltonian = check_observable(observable, nqubits)
         state = circuit().state(numpy=True)
         return float(hamiltonian.expectation_from_state(state).real)

     pauli_terms = []
     for term in observable["terms"]:
         coefficient = term["coefficient"]
         operators = tuple((name, site) for name, site in term["operators"])
         pauli_terms.append((coefficient, operators))
     return exact_pauli_sum(circuit, pauli_terms, nqubits)
 def run_qmatchatea_expectation(
     circuit,
     observable,
     *,
     bond=10,
     cut_ratio=1e-9,
     tensor_module="torch",
     svd_control="E!",
     compile_circuit=True,
     preprocess=True,
     track_memory=False,
     mpi_approach="SR",
     mpi_num_procs=1,
     mpi_where_barriers=-1,
     mpi_isometrization=-1,
 ):
     """Build a Quantum Matcha Tea backend and time one expectation call.

     The backend is built through ``build_qmatchatea_backend``; only the
     ``expectation`` call (plus the conversion of its result to a float) is
     included in the reported time.

     Returns:
         ``QMatchaTeaExpectationResult`` with the real part of the value,
         the elapsed seconds and the backend that was used.
     """
     backend = build_qmatchatea_backend(
         bond=bond,
         cut_ratio=cut_ratio,
         tensor_module=tensor_module,
         svd_control=svd_control,
         compile_circuit=compile_circuit,
         track_memory=track_memory,
         mpi_approach=mpi_approach,
         mpi_num_procs=mpi_num_procs,
         mpi_where_barriers=mpi_where_barriers,
         mpi_isometrization=mpi_isometrization,
     ).backend

     began = time.perf_counter()
     raw_value = backend.expectation(
         circuit,
         observable,
         preprocess=preprocess,
         compile_circuit=compile_circuit,
     )
     real_value = float(np.real(raw_value))
     elapsed = time.perf_counter() - began
     return QMatchaTeaExpectationResult(
         value=real_value,
         seconds=elapsed,
         backend=backend,
     )
 def benchmark_qmatchatea_svd_control(matrix, *, ctrl, max_bond, contract_singvals, repeats):
    """Time repeated ``split_svd`` calls on ``matrix`` for one SVD control mode.

    Args:
        matrix: Tensor to split; it is wrapped with
            ``QteaTorchTensor.from_elem_array`` and also used directly in
            torch arithmetic, so presumably a 2-D ``torch.Tensor`` -- TODO confirm.
        ctrl: Value passed as ``svd_ctrl`` to the convergence parameters.
        max_bond: Value passed as ``max_bond_dimension``.
        contract_singvals: Forwarded to ``split_svd``.
        repeats: Number of timed repetitions.

    Returns:
        ``QMatchaTeaSvdControlResult``. Timings are NaN when no repetition
        completed; ``rel_error`` and ``kept`` are only filled in by the last
        repetition and stay ``None`` otherwise.
    """
    import gc
    import statistics
    import torch
    from qredtea.torchapi import QteaTorchTensor
    # NOTE(review): relies on a module-level ``qmatchatea`` name that is not
    # visible in this hunk -- confirm it is imported at the top of the file.
    conv = qmatchatea.QCConvergenceParameters(
        max_bond_dimension=max_bond,
        cut_ratio=0.0,
        svd_ctrl=ctrl,
    )
    qtensor = QteaTorchTensor.from_elem_array(matrix, dtype=matrix.dtype, device="cpu")
    times = []
    rel_error = None
    kept = None
    status = "ok"
    error = ""
    for i in range(repeats):
        # Collect garbage and drain pending CUDA work before starting the clock.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        t0 = time.perf_counter()
        try:
            left, right, singvals, _ = qtensor.split_svd(
                [0],
                [1],
                contract_singvals=contract_singvals,
                conv_params=conv,
            )
        except Exception as exc:  # noqa: BLE001
            # Any failure ends the benchmark; it is reported in the result
            # instead of being propagated to the caller.
            status = "error"
            error = repr(exc)
            break
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        times.append(time.perf_counter() - t0)
        # Accuracy is measured once, on the final repetition, outside the timed span.
        if i == repeats - 1:
            left_matrix = left.elem.reshape(matrix.shape[0], -1)
            right_matrix = right.elem.reshape(-1, matrix.shape[1])
            recon = left_matrix @ right_matrix
            # NOTE(review): the product only approximates ``matrix`` if the
            # singular values were contracted into one factor -- confirm for
            # every ``contract_singvals`` setting that is benchmarked.
            rel_error = (
                torch.linalg.vector_norm(matrix - recon)
                / torch.linalg.vector_norm(matrix)
            ).item()
            kept = int(singvals.numel())
    return QMatchaTeaSvdControlResult(
        ctrl=ctrl,
        contract_singvals=contract_singvals,
        status=status,
        median_ms=float("nan") if not times else statistics.median(times) * 1000,
        min_ms=float("nan") if not times else min(times) * 1000,
        rel_error=rel_error,
        kept=kept,
        error=error,
    )
--- a/src/qibotn/backends/quimb.py
+++ b/src/qibotn/backends/quimb.py
--- a/src/qibotn/backends/vidal.py
+++ b/src/qibotn/backends/vidal.py
@@ -9,6 +9,7 @@ usable while the fast path is expanded.
 from __future__ import annotations
 import re
 import time
 from dataclasses import dataclass
 import numpy as np
@@ -475,3 +476,511 @@ class VidalBackend(QibotnBackend, NumpyBackend):
            return_array=return_array,
            **prob_kwargs,
        )
@dataclass(frozen=True)
 class VidalExpectationResult:
    """Immutable record of one timed expectation-value evaluation."""
    # Real part of the expectation value as a Python float.
    value: float
    # Wall-clock seconds measured around the evaluation.
    seconds: float
    # Backend that produced the value; ``run_segmented_vidal_ring_xz`` stores
    # its timings object here instead.
    backend: object
@dataclass(frozen=True)
 class VidalBackendComparisonResult:
    """Immutable record produced by ``compare_vidal_backend_qmatchatea``."""
    # Inputs the comparison was run on.
    circuit: object
    observable: object
    # Reference value supplied by the caller, or None.
    exact: float | None
    # Quantum Matcha Tea run; None when that run was skipped.
    qmatchatea: VidalExpectationResult | None
    # Vidal backend run (always present).
    vidal: VidalExpectationResult
    # Absolute deviations from ``exact``; None when not computable.
    qmatchatea_error: float | None
    vidal_error: float | None
@dataclass(frozen=True)
 class VidalProfileResult:
    """Immutable record produced by ``profile_vidal_expectation``."""
    # Expectation value computed while profiling.
    value: float
    # Destination of the exported Chrome trace.
    trace_path: object
    # Destination of the written text report.
    table_path: object
    # The text report itself.
    table: str
 def make_vidal_backend(
     *,
     bond=10,
     cut_ratio=1e-9,
     tensor_module="torch",
     compile_circuit=False,
     mpi_approach="SR",
     mpi_num_procs=1,
     mpi_where_barriers=-1,
     mpi_isometrization=-1,
     mpi_term_batch_size=None,
     fallback=True,
 ):
     """Create a ``VidalBackend`` and configure its tensor-network simulation.

     All arguments are keyword-only and forwarded to
     ``configure_tn_simulation``; ``bond`` is passed as
     ``max_bond_dimension``.

     Returns:
         The configured backend instance.
     """
     settings = {
         "max_bond_dimension": bond,
         "cut_ratio": cut_ratio,
         "tensor_module": tensor_module,
         "compile_circuit": compile_circuit,
         "mpi_approach": mpi_approach,
         "mpi_num_procs": mpi_num_procs,
         "mpi_where_barriers": mpi_where_barriers,
         "mpi_isometrization": mpi_isometrization,
         "mpi_term_batch_size": mpi_term_batch_size,
         "fallback": fallback,
     }
     configured = VidalBackend()
     configured.configure_tn_simulation(**settings)
     return configured
 def run_vidal_expectation(
     circuit,
     observable,
     *,
     bond=10,
     cut_ratio=1e-9,
     tensor_module="torch",
     compile_circuit=False,
     preprocess=True,
     mpi_approach="SR",
     mpi_num_procs=1,
     mpi_where_barriers=-1,
     mpi_isometrization=-1,
     mpi_term_batch_size=None,
     fallback=True,
 ):
     """Configure a Vidal backend and time one expectation call.

     The backend is created with ``make_vidal_backend`` before the clock
     starts, so only the ``expectation`` call (plus converting its result to
     a float) is timed.

     Returns:
         ``VidalExpectationResult`` with the real part of the value, the
         elapsed seconds and the backend that was used.
     """
     backend_options = {
         "bond": bond,
         "cut_ratio": cut_ratio,
         "tensor_module": tensor_module,
         "compile_circuit": compile_circuit,
         "mpi_approach": mpi_approach,
         "mpi_num_procs": mpi_num_procs,
         "mpi_where_barriers": mpi_where_barriers,
         "mpi_isometrization": mpi_isometrization,
         "mpi_term_batch_size": mpi_term_batch_size,
         "fallback": fallback,
     }
     backend = make_vidal_backend(**backend_options)

     began = time.perf_counter()
     raw_value = backend.expectation(
         circuit,
         observable,
         preprocess=preprocess,
         compile_circuit=compile_circuit,
     )
     real_value = float(np.real(raw_value))
     elapsed = time.perf_counter() - began
     return VidalExpectationResult(
         value=real_value,
         seconds=elapsed,
         backend=backend,
     )
 def run_segmented_vidal_ring_xz(
     circuit,
     *,
     max_bond=10,
     cut_ratio=1e-9,
     tensor_module="torch",
     comm,
 ):
     """Time ``run_segment_vidal_mpi_ring_xz`` on ``circuit``.

     Args:
         circuit: Circuit handed to the segmented MPI runner.
         max_bond, cut_ratio, tensor_module: Forwarded unchanged.
         comm: Communicator forwarded unchanged (required keyword).

     Returns:
         ``VidalExpectationResult`` whose ``backend`` field carries the
         timings object returned by the runner.
     """
     from qibotn.backends.vidal_mpi_segment import run_segment_vidal_mpi_ring_xz

     began = time.perf_counter()
     outcome = run_segment_vidal_mpi_ring_xz(
         circuit,
         max_bond=max_bond,
         cut_ratio=cut_ratio,
         tensor_module=tensor_module,
         comm=comm,
     )
     value, timings = outcome
     real_value = float(np.real(value))
     elapsed = time.perf_counter() - began
     return VidalExpectationResult(
         value=real_value,
         seconds=elapsed,
         backend=timings,
     )
 def compare_vidal_backend_qmatchatea(
     circuit,
     observable,
     *,
     bond=512,
     cut_ratio=1e-12,
     tensor_module="torch",
     exact=None,
     skip_qmatchatea=False,
     qmatchatea_compile_circuit=True,
     qmatchatea_svd_control="E!",
     vidal_compile_circuit=True,
     vidal_fallback=True,
 ):
     """Evaluate one observable with both MPS backends and compare the results.

     The Quantum Matcha Tea run happens first (unless ``skip_qmatchatea`` is
     true), followed by the Vidal run. Both use ``preprocess=False`` and the
     same bond dimension, cut ratio and tensor module. When ``exact`` is
     given, the absolute deviation of each available result is reported.

     Returns:
         ``VidalBackendComparisonResult`` holding inputs, both timed results
         and the optional errors.
     """

     def _timed_expectation(backend, compile_flag):
         # Time a single expectation call on an already configured backend.
         began = time.perf_counter()
         raw = backend.expectation(
             circuit,
             observable,
             preprocess=False,
             compile_circuit=compile_flag,
         )
         return VidalExpectationResult(
             value=float(np.real(raw)),
             seconds=time.perf_counter() - began,
             backend=backend,
         )

     qmatchatea_result = None
     if not skip_qmatchatea:
         qmatchatea_backend = QMatchaTeaBackend()
         qmatchatea_backend.configure_tn_simulation(
             ansatz="MPS",
             max_bond_dimension=bond,
             cut_ratio=cut_ratio,
             svd_control=qmatchatea_svd_control,
             tensor_module=tensor_module,
             compile_circuit=qmatchatea_compile_circuit,
             track_memory=False,
         )
         qmatchatea_result = _timed_expectation(
             qmatchatea_backend, qmatchatea_compile_circuit
         )

     vidal_backend = VidalBackend()
     vidal_backend.configure_tn_simulation(
         ansatz="MPS",
         max_bond_dimension=bond,
         cut_ratio=cut_ratio,
         tensor_module=tensor_module,
         compile_circuit=vidal_compile_circuit,
         fallback=vidal_fallback,
     )
     vidal_result = _timed_expectation(vidal_backend, vidal_compile_circuit)

     have_reference = exact is not None
     qmatchatea_error = (
         abs(qmatchatea_result.value - exact)
         if have_reference and qmatchatea_result is not None
         else None
     )
     vidal_error = abs(vidal_result.value - exact) if have_reference else None

     return VidalBackendComparisonResult(
         circuit=circuit,
         observable=observable,
         exact=exact,
         qmatchatea=qmatchatea_result,
         vidal=vidal_result,
         qmatchatea_error=qmatchatea_error,
         vidal_error=vidal_error,
     )
 def profile_vidal_expectation(
     circuit,
     observable,
     *,
     bond=512,
     cut_ratio=1e-12,
     torch_threads=32,
     trace_path,
     table_path,
     profile_memory=False,
     rows=60,
 ):
     """Profile one CPU expectation run with the torch profiler.

     Writes a text report to ``table_path`` and a Chrome trace to
     ``trace_path``. The parent directories of both outputs are created when
     missing, and both paths may be given as ``str`` or ``pathlib.Path``.

     Args:
         circuit: Circuit passed to ``run_cpu_expectation``.
         observable: Observable passed to ``run_cpu_expectation``.
         bond: Bond dimension stored in the ``ExpectationConfig``.
         cut_ratio: Cut ratio stored in the ``ExpectationConfig``.
         torch_threads: Thread count set via ``torch.set_num_threads`` and
             stored in the config.
         trace_path: Destination of the Chrome trace (required keyword).
         table_path: Destination of the text report (required keyword).
         profile_memory: Enables shape recording, memory profiling and stack
             capture together.
         rows: Row limit of each profiler table.

     Returns:
         ``VidalProfileResult`` with the value, the two paths exactly as they
         were passed in, and the report text.
     """
     from pathlib import Path

     import torch
     from torch.profiler import ProfilerActivity, profile
     from qibotn.expectation_runner import ExpectationConfig, run_cpu_expectation

     # Normalise once so ``str`` inputs work and both parents can be created.
     trace_file = Path(trace_path)
     table_file = Path(table_path)

     torch.set_num_threads(torch_threads)
     config = ExpectationConfig(
         ansatz="mps",
         bond=bond,
         cut_ratio=cut_ratio,
         tensor_module="torch",
         torch_threads=torch_threads,
     )
     with profile(
         activities=[ProfilerActivity.CPU],
         record_shapes=profile_memory,
         profile_memory=profile_memory,
         with_stack=profile_memory,
     ) as prof:
         result = run_cpu_expectation(circuit, observable, config)

     # Aggregate once; both tables are different sort orders of the same data.
     averages = prof.key_averages()
     by_self_time = averages.table(sort_by="self_cpu_time_total", row_limit=rows)
     by_total_time = averages.table(sort_by="cpu_time_total", row_limit=rows)
     table = (
         f"expval={result.value:.16e}\n\n"
         f"# sorted by self_cpu_time_total\n"
         f"{by_self_time}\n\n"
         f"# sorted by cpu_time_total\n"
         f"{by_total_time}\n"
     )

     # The trace may live in a different, not yet existing directory than the
     # table, so make sure both parents exist before writing.
     for destination in (table_file, trace_file):
         destination.parent.mkdir(parents=True, exist_ok=True)
     table_file.write_text(table, encoding="utf-8")
     prof.export_chrome_trace(str(trace_file))

     return VidalProfileResult(
         value=result.value,
         trace_path=trace_path,
         table_path=table_path,
         table=table,
     )
 # Bond dimension used by ``run_contest_mps_case`` when ``bond`` is left at
 # "case-default"; case names missing here fall back to 1024 in that function.
 CONTEST_MPS_BONDS = {"main1": 512, "main2": 1024, "strong": 2048}
 # Observable names swept, in this order, by ``run_vidal_mpi_contest_case``.
 CONTEST_VIDAL_OBSERVABLES = (
    "boundary_ZZ_q1",
    "boundary_ZZ_q2",
    "boundary_ZZ_q3",
    "long_Z_5_sites",
    "mixed_XZYZX",
    "ring_xz",
    "open_zz",
    "range2_xx",
    "complex_iZ0",
    "dense2_mid",
    "dense3_spread",
 )
 def run_contest_mps_case(
     case_name="main1",
     *,
     observables=None,
     obs_filter="",
     nqubits=None,
     nlayers=None,
     bond="case-default",
     cut_ratio=1e-12,
     seed=None,
     torch_threads=8,
     exact=False,
     exact_max_qubits=24,
 ):
     """Run a shared contest-style Vidal/MPS expectation case.

     Every MPI rank executes the simulation; only rank 0 computes the optional
     exact reference and collects records.

     Args:
         case_name: Key into ``CASES``.
         observables: Observable names to evaluate. When ``None`` the
             comma-separated ``obs_filter`` is used, falling back to the
             case's own observables if the filter is empty.
         obs_filter: Comma-separated observable names (whitespace ignored).
         nqubits, nlayers, seed: Overrides for the case defaults.
         bond: Bond dimension, or ``"case-default"`` to look it up in
             ``CONTEST_MPS_BONDS`` (1024 for unknown cases).
         cut_ratio: Truncation cut ratio for the backend.
         torch_threads: Passed to ``set_torch_threads``.
         exact: Also compute an exact reference on rank 0.
         exact_max_qubits: Largest ``nqubits`` for which ``exact`` is allowed.

     Returns:
         List of record dicts on rank 0; an empty list on other ranks.

     Raises:
         ValueError: If ``exact`` is requested for more than
             ``exact_max_qubits`` qubits. This is raised on every rank before
             any simulation starts.
     """
     from qibotn.contest_cases import CASES, build_contest_circuit, build_contest_observable
     from qibotn.expectation_runner import exact_for_observable
     from qibotn.torch_utils import set_torch_threads
     from mpi4py import MPI

     set_torch_threads(torch_threads)
     comm = MPI.COMM_WORLD
     rank = comm.Get_rank()
     size = comm.Get_size()

     case = CASES[case_name]
     nqubits = case.nqubits if nqubits is None else nqubits
     nlayers = case.nlayers if nlayers is None else nlayers
     seed = case.seed if seed is None else seed
     if bond == "case-default":
         bond = CONTEST_MPS_BONDS.get(case_name, 1024)
     if observables is None:
         observables = tuple(x.strip() for x in obs_filter.split(",") if x.strip()) or case.observables

     # Validate on ALL ranks and before any work: raising on rank 0 alone would
     # leave the remaining ranks blocked forever in the Barrier below.
     if exact and nqubits > exact_max_qubits:
         raise ValueError(f"exact reference is limited to {exact_max_qubits} qubits.")

     circuit = build_contest_circuit(case.circuit_kind, nqubits, nlayers, seed)
     records = []
     for obs_name in observables:
         observable = build_contest_observable(obs_name, nqubits, seed)
         exact_value = None
         if exact and rank == 0:
             exact_value = exact_for_observable(circuit, observable, nqubits)

         # A fresh backend per observable keeps truncation statistics separate.
         backend = VidalBackend()
         backend.configure_tn_simulation(
             max_bond_dimension=bond,
             cut_ratio=cut_ratio,
             tensor_module="torch",
             mpi_approach="CT",
             mpi_num_procs=size,
             fallback=False,
         )
         # Align all ranks so the timing starts from a common point.
         comm.Barrier()
         start = time.perf_counter()
         value = backend.expectation(
             circuit,
             observable,
             preprocess=True,
             compile_circuit=False,
         )
         seconds = time.perf_counter() - start
         if rank != 0:
             continue

         abs_error = None
         rel_error = None
         if exact_value is not None:
             abs_error = abs(value - exact_value)
             # Floor the denominator so a vanishing reference cannot divide by zero.
             rel_error = abs_error / max(abs(exact_value), 1e-15)
         records.append(
             {
                 # NOTE(review): stores the CASES entry itself, not ``case_name``.
                 "case": case,
                 "observable": obs_name,
                 "value": value,
                 "seconds": seconds,
                 "exact": exact_value,
                 "abs_error": abs_error,
                 "rel_error": rel_error,
                 "truncation_error": backend.last_truncation_error,
                 "max_truncation_error": backend.last_max_truncation_error,
             }
         )
     return records
 def run_vidal_mpi_contest_case(
     *,
     label,
     kind,
     nqubits,
     nlayers,
     bond,
     cut_ratio,
     seed,
     torch_threads,
     obs_filter="",
 ):
     """Run the direct Vidal MPI contest observable sweep.

     Args:
         label: Accepted for interface compatibility; not used.
         kind, nqubits, nlayers, seed: Passed to ``build_contest_circuit``.
         bond: Maximum bond dimension for the backend.
         cut_ratio: Truncation cut ratio for the backend.
         torch_threads: Passed to ``set_torch_threads``.
         obs_filter: Optional comma-separated subset of
             ``CONTEST_VIDAL_OBSERVABLES``. Whitespace around names is
             ignored; names not in the sweep are ignored.

     Returns:
         List of record dicts on rank 0; an empty list on other ranks.

     Raises:
         ValueError: If ``obs_filter`` is non-empty but matches no observable.
     """
     from qibotn.contest_cases import build_contest_circuit, build_contest_observable
     from qibotn.torch_utils import set_torch_threads
     from mpi4py import MPI

     del label
     set_torch_threads(torch_threads)
     comm = MPI.COMM_WORLD
     rank = comm.Get_rank()
     size = comm.Get_size()
     circuit = build_contest_circuit(kind, nqubits, nlayers, seed)

     names = CONTEST_VIDAL_OBSERVABLES
     if obs_filter:
         # Strip each token so "a, b" selects both names, consistent with the
         # filter parsing in ``run_contest_mps_case``.
         wanted = {token.strip() for token in obs_filter.split(",")}
         names = tuple(name for name in names if name in wanted)
         if not names:
             raise ValueError(f"obs_filter matched no observables: {obs_filter!r}")

     records = []
     for obs_name in names:
         observable = build_contest_observable(obs_name, nqubits, seed)
         # A fresh backend per observable keeps truncation statistics separate.
         backend = VidalBackend()
         backend.configure_tn_simulation(
             max_bond_dimension=bond,
             cut_ratio=cut_ratio,
             tensor_module="torch",
             mpi_approach="CT",
             mpi_num_procs=size,
             fallback=False,
         )
         # Align all ranks so the timing starts from a common point.
         comm.Barrier()
         start = time.perf_counter()
         value = backend.expectation(
             circuit,
             observable,
             preprocess=True,
             compile_circuit=False,
         )
         seconds = time.perf_counter() - start
         if rank == 0:
             records.append(
                 {
                     "observable": obs_name,
                     "value": value,
                     "seconds": seconds,
                     "truncation_error": backend.last_truncation_error,
                     "max_truncation_error": backend.last_max_truncation_error,
                 }
             )
     return records
 def build_vidal_validation_circuit(kind, nqubits, nlayers, seed):
     """Build a validation circuit, translating short circuit-kind aliases.

     ``"brickwall"`` is mapped to ``"brickwall_cnot"``; every other kind is
     passed to ``build_circuit`` unchanged.
     """
     from qibotn.benchmark_cases import build_circuit

     canonical_kind = {"brickwall": "brickwall_cnot"}.get(kind, kind)
     return build_circuit(canonical_kind, nqubits, nlayers, seed)
 def run_vidal_validation_cases(
     *,
     nqubits=16,
     nlayers=6,
     bond=512,
     seed=42,
     tensor_module="torch",
     torch_threads=32,
     mpi=False,
     circuits=("brickwall", "reversed_cnot", "rx_ry_cz"),
     observables=("ring_xz", "open_zz", "mixed_local"),
     cut_ratio=1e-12,
 ):
     """Run Vidal/TEBD correctness checks against dense statevector references.

     For every circuit kind and observable, the circuit is simulated with a
     fresh executor and the Pauli-sum expectation is compared with
     ``exact_pauli_sum``.

     Args:
         nqubits, nlayers, seed: Circuit construction parameters.
         bond: Maximum bond dimension given to the executor.
         tensor_module: Tensor library name given to the executor.
         torch_threads: Passed to ``set_torch_threads``.
         mpi: Use ``SegmentVidalMPIExecutor`` on ``MPI.COMM_WORLD`` instead of
             the single-process ``VidalTEBDExecutor``.
         circuits: Circuit kinds to validate.
         observables: Observable kinds to validate.
         cut_ratio: Truncation cut ratio given to the executor (previously
             fixed at ``1e-12``, which remains the default).

     Returns:
         List of record dicts (circuit, observable, exact, value, abs_error,
         seconds) on rank 0; an empty list on other ranks.
     """
     from qibotn.benchmark_cases import exact_pauli_sum, observable_terms
     from qibotn.backends.vidal_tebd import VidalTEBDExecutor
     from qibotn.torch_utils import set_torch_threads

     set_torch_threads(torch_threads)
     comm = None
     rank = 0
     if mpi:
         from mpi4py import MPI
         from qibotn.backends.vidal_mpi_segment import SegmentVidalMPIExecutor

         comm = MPI.COMM_WORLD
         rank = comm.Get_rank()
     else:
         SegmentVidalMPIExecutor = None

     records = []
     for circuit_kind in circuits:
         circuit = build_vidal_validation_circuit(circuit_kind, nqubits, nlayers, seed)
         # The dense reference is computed once, on rank 0, and shared.
         if rank == 0:
             exact_values = {
                 obs: exact_pauli_sum(circuit, observable_terms(obs, nqubits), nqubits)
                 for obs in observables
             }
         else:
             exact_values = None
         if comm is not None:
             exact_values = comm.bcast(exact_values, root=0)

         for obs_kind in observables:
             terms = observable_terms(obs_kind, nqubits)
             # The timed span covers executor construction, the circuit run
             # and the expectation evaluation.
             start = time.perf_counter()
             if mpi:
                 executor = SegmentVidalMPIExecutor(
                     nqubits=nqubits,
                     max_bond=bond,
                     cut_ratio=cut_ratio,
                     tensor_module=tensor_module,
                     comm=comm,
                 )
                 executor.run_circuit(circuit)
                 value = executor.expectation_pauli_sum_root(terms)
             else:
                 executor = VidalTEBDExecutor(
                     nqubits=nqubits,
                     max_bond=bond,
                     cut_ratio=cut_ratio,
                     tensor_module=tensor_module,
                 )
                 executor.run_circuit(circuit)
                 value = float(executor.expectation_pauli_sum(terms))
             # Only rank 0 records results.
             if rank != 0:
                 continue
             seconds = time.perf_counter() - start
             exact = exact_values[obs_kind]
             records.append(
                 {
                     "circuit": circuit_kind,
                     "observable": obs_kind,
                     "exact": exact,
                     "value": value,
                     "abs_error": abs(value - exact),
                     "seconds": seconds,
                 }
             )
     return records
--- a/src/qibotn/benchmark_cases.py
+++ b/src/qibotn/benchmark_cases.py
@@ -12,6 +12,7 @@ CIRCUITS = (
    "brickwall_cnot",
    "reversed_cnot",
    "shifted_cz",
    "rx_ry_cz",
    "rxx_rzz",
    "swap_scramble",
    "ghz_ladder",
@@ -49,14 +50,14 @@ def build_circuit(kind, nqubits, nlayers, seed):
        for qubit in range(nqubits):
            circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi)))
            circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi)))
-            if kind in ("rxx_rzz", "swap_scramble"):
+            if kind in ("rx_ry_cz", "rxx_rzz", "swap_scramble"):
                circuit.add(gates.RX(qubit, theta=rng.uniform(-math.pi, math.pi)))
        if kind == "brickwall_cnot":
            add_brickwall(circuit, nqubits, gates.CNOT, layer, reverse=False)
        elif kind == "reversed_cnot":
            add_brickwall(circuit, nqubits, gates.CNOT, layer, reverse=True)
-        elif kind == "shifted_cz":
+        elif kind in ("shifted_cz", "rx_ry_cz"):
            for qubit in range(layer % 2, nqubits - 1, 2):
                circuit.add(gates.CZ(qubit, qubit + 1))
        elif kind == "rxx_rzz":
@@ -149,3 +150,22 @@ def exact_pauli_sum(circuit, terms, nqubits):
                raise ValueError(f"Unsupported Pauli {name!r}.")
        value += coeff * np.vdot(state[flipped], phase * state)
    return float(value.real)
 def ring_xz_statevector_expectation(state, nqubits, chunk_size=1 << 20):
     """Compute ``0.5 * sum_i X_i Z_(i+1)`` from a dense state vector.

     The sum runs over a ring, so the last qubit pairs with qubit 0. Qubit 0
     is the most significant bit of the basis index. The state is processed
     in index chunks to bound the size of temporary arrays.

     Args:
         state: Array-like with ``2**nqubits`` amplitudes (any shape; it is
             flattened).
         nqubits: Number of qubits.
         chunk_size: Number of basis indices processed per step (>= 1).

     Returns:
         The real part of the expectation value as a float.

     Raises:
         ValueError: If the state size is not ``2**nqubits`` or
             ``chunk_size`` is smaller than 1.
     """
     state = np.asarray(state).reshape(-1)
     if state.size != 1 << nqubits:
         raise ValueError(
             f"state has {state.size} amplitudes, expected {1 << nqubits} "
             f"for {nqubits} qubits."
         )
     if chunk_size < 1:
         raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}.")

     value = 0.0
     for qubit in range(nqubits):
         next_qubit = (qubit + 1) % nqubits
         # X on ``qubit`` flips its bit; Z on ``next_qubit`` contributes a sign.
         x_flip = 1 << (nqubits - 1 - qubit)
         z_shift = nqubits - 1 - next_qubit
         term = 0.0
         for start in range(0, state.size, chunk_size):
             stop = min(start + chunk_size, state.size)
             indices = np.arange(start, stop, dtype=np.int64)
             z_bit = (indices >> z_shift) & 1
             z_phase = 1 - 2 * z_bit
             # <psi| X Z |psi> restricted to this chunk of basis indices.
             term += np.vdot(state[indices ^ x_flip], z_phase * state[start:stop]).real
         value += 0.5 * term
     return float(value)
--- a/src/qibotn/circuit_convertor.py
+++ b/src/qibotn/circuit_convertor.py
@@ -1,263 +0,0 @@
 import numpy as np
 try:
    import cupy as cp
 except ImportError:  # pragma: no cover - exercised on CPU-only installations
    cp = None
 def _require_cupy():
     """Return the imported ``cupy`` module, or raise if it is unavailable.

     Raises:
         ImportError: If the optional ``cupy`` import at module load failed.
     """
     if cp is not None:
         return cp
     raise ImportError(
         "The cuQuantum circuit converter requires cupy. "
         "Install the GPU dependencies or use the CPU backend."
     )
 # Reference: https://github.com/NVIDIA/cuQuantum/tree/main/python/samples/cutensornet/circuit_converter
 class QiboCircuitToEinsum:
     """Convert a circuit to a Tensor Network (TN) representation.

     The circuit is first processed to an intermediate form by grouping each
     gate matrix with the qubits it acts on. It is then converted to an
     equivalent TN expression, following the Einstein summation convention in
     the interleaved format, by ``state_vector_operands()`` or
     ``expectation_operands()``.

     See the cuQuantum ``contract`` documentation for the format:
     https://docs.nvidia.com/cuda/cuquantum/python/api/generated/cuquantum.contract.html

     Parameters:
         circuit: The quantum circuit object (its ``queue`` is read).
         dtype (str): Name of the array dtype, looked up on the array module.
     """

     def __init__(self, circuit, dtype="complex128"):
         self.backend = _require_cupy()
         self.dtype = getattr(self.backend, dtype)
         self.init_basis_map(self.backend, dtype)
         self.init_intermediate_circuit(circuit)
         self.circuit = circuit

     def state_vector_operands(self):
         """Create the operands for dense vector computation in the interleave
         format.

         Returns:
             Operands for the contraction in the interleave format.
         """
         input_bitstring = "0" * len(self.active_qubits)
         input_operands = self._get_bitstring_tensors(input_bitstring)

         (
             mode_labels,
             qubits_frontier,
             next_frontier,
         ) = self._init_mode_labels_from_qubits(self.active_qubits)

         gate_mode_labels, gate_operands = self._parse_gates_to_mode_labels_operands(
             self.gate_tensors, qubits_frontier, next_frontier
         )

         operands = input_operands + gate_operands
         mode_labels += gate_mode_labels

         # The open modes are the final frontier label of every qubit.
         out_list = list(qubits_frontier.values())

         operand_exp_interleave = [x for y in zip(operands, mode_labels) for x in y]
         operand_exp_interleave.append(out_list)
         return operand_exp_interleave

     def _init_mode_labels_from_qubits(self, qubits):
         """Assign one initial mode label per qubit.

         Returns:
             ``(mode_labels, frontier_dict, next_label)`` where
             ``frontier_dict`` maps each qubit to its current open label.
         """
         n = len(qubits)
         frontier_dict = {q: i for i, q in enumerate(qubits)}
         mode_labels = [[i] for i in range(n)]
         return mode_labels, frontier_dict, n

     def _get_bitstring_tensors(self, bitstring):
         """Return the basis-state tensor for every character of ``bitstring``."""
         return [self.basis_map[ibit] for ibit in bitstring]

     def _parse_gates_to_mode_labels_operands(
         self, gates, qubits_frontier, next_frontier
     ):
         """Turn ``(tensor, qubits)`` pairs into mode labels and operands.

         ``qubits_frontier`` is updated in place: every qubit a gate touches
         gets a fresh output label that becomes its new frontier.
         """
         mode_labels = []
         operands = []

         for tensor, gate_qubits in gates:
             operands.append(tensor)
             input_mode_labels = []
             output_mode_labels = []
             for q in gate_qubits:
                 input_mode_labels.append(qubits_frontier[q])
                 output_mode_labels.append(next_frontier)
                 qubits_frontier[q] = next_frontier
                 next_frontier += 1
             mode_labels.append(output_mode_labels + input_mode_labels)
         return mode_labels, operands

     def op_shape_from_qubits(self, nqubits):
         """Modify tensor to cuQuantum shape.

         Parameters:
             nqubits (int): The number of qubits the gate acts on.

         Returns:
             (qubit_states,input_output) * nqubits
         """
         return (2, 2) * nqubits

     def _gate_tensors_from_queue(self, queue):
         """Extract ``(tensor, qubits)`` pairs and the touched qubits from gates.

         Every gate matrix is converted with the array module, cast to
         ``self.dtype`` and reshaped to the cuQuantum operand shape, so all
         operands built by this class share one precision.
         Reference for the layout:
         https://github.com/NVIDIA/cuQuantum/blob/6b6339358f859ea930907b79854b90b2db71ab92/python/cuquantum/cutensornet/_internal/circuit_parser_utils_cirq.py#L32

         Returns:
             ``(gate_tensors, active_qubits)`` where ``active_qubits`` is the
             sorted array of qubits with at least one gate acting on them.
         """
         gate_tensors = []
         touched_qubits = []
         for gate in queue:
             gate_qubits = gate.control_qubits + gate.target_qubits
             touched_qubits.extend(gate_qubits)
             required_shape = self.op_shape_from_qubits(len(gate_qubits))
             matrix = self.backend.asarray(gate.matrix(), dtype=self.dtype)
             gate_tensors.append((matrix.reshape(required_shape), gate_qubits))
         return gate_tensors, np.unique(touched_qubits)

     def init_intermediate_circuit(self, circuit):
         """Initialize the intermediate circuit representation.

         Extracts gate matrices and qubit IDs from the given quantum circuit
         into ``self.gate_tensors`` and ``self.active_qubits``.

         Parameters:
             circuit (object): The quantum circuit object.
         """
         self.gate_tensors, self.active_qubits = self._gate_tensors_from_queue(
             circuit.queue
         )

     def init_basis_map(self, backend, dtype):
         """Initialize the basis map for the quantum circuit.

         Maps the characters ``"0"`` and ``"1"`` to the corresponding
         single-qubit basis state vectors.

         Parameters:
             backend (object): The backend object providing the array conversion method.
             dtype (object): The data type for the quantum state vectors.
         """
         asarray = backend.asarray
         state_0 = asarray([1, 0], dtype=dtype)
         state_1 = asarray([0, 1], dtype=dtype)

         self.basis_map = {"0": state_0, "1": state_1}

     def init_inverse_circuit(self, circuit):
         """Initialize the inverse circuit representation.

         Extracts gate matrices and qubit IDs from the given (already
         inverted) circuit into ``self.gate_tensors_inverse`` and
         ``self.active_qubits_inverse``. The tensors are cast to
         ``self.dtype`` so they match the forward-circuit operands.

         Parameters:
             circuit (object): The quantum circuit object.
         """
         (
             self.gate_tensors_inverse,
             self.active_qubits_inverse,
         ) = self._gate_tensors_from_queue(circuit.queue)

     def get_pauli_gates(self, pauli_map, dtype="complex128", backend=None):
         """Populate the gates for all pauli operators.

         Parameters:
             pauli_map: A dictionary mapping qubits to pauli operators.
             dtype: Data type for the tensor operands.
             backend: The package the tensor operands belong to.

         Returns:
             A sequence of pauli gates.

         Raises:
             ValueError: If a character is not one of I/X/Y/Z.
         """
         if backend is None:
             backend = _require_cupy()
         asarray = backend.asarray
         pauli_i = asarray([[1, 0], [0, 1]], dtype=dtype)
         pauli_x = asarray([[0, 1], [1, 0]], dtype=dtype)
         pauli_y = asarray([[0, -1j], [1j, 0]], dtype=dtype)
         pauli_z = asarray([[1, 0], [0, -1]], dtype=dtype)

         operand_map = {"I": pauli_i, "X": pauli_x, "Y": pauli_y, "Z": pauli_z}
         gates = []
         for qubit, pauli_char in pauli_map.items():
             operand = operand_map.get(pauli_char)
             if operand is None:
                 raise ValueError("pauli string character must be one of I/X/Y/Z")
             gates.append((operand, (qubit,)))
         return gates

     def expectation_operands(self, ham_gates):
         """Create the operands for pauli string expectation computation in the
         interleave format.

         The network is: initial basis states, the circuit gates, the
         hamiltonian gates, the inverted circuit gates, and the initial basis
         states again to close the open legs.

         Parameters:
             ham_gates: A list of gates derived from Qibo hamiltonian object.

         Returns:
             Operands for the contraction in the interleave format.
         """
         input_bitstring = "0" * self.circuit.nqubits

         input_operands = self._get_bitstring_tensors(input_bitstring)

         (
             mode_labels,
             qubits_frontier,
             next_frontier,
         ) = self._init_mode_labels_from_qubits(range(self.circuit.nqubits))

         gate_mode_labels, gate_operands = self._parse_gates_to_mode_labels_operands(
             self.gate_tensors, qubits_frontier, next_frontier
         )

         operands = input_operands + gate_operands
         mode_labels += gate_mode_labels

         self.init_inverse_circuit(self.circuit.invert())

         # Continue numbering after the highest label handed out so far.
         next_frontier = max(qubits_frontier.values()) + 1

         gates_inverse = ham_gates + self.gate_tensors_inverse

         (
             gate_mode_labels_inverse,
             gate_operands_inverse,
         ) = self._parse_gates_to_mode_labels_operands(
             gates_inverse, qubits_frontier, next_frontier
         )
         mode_labels = (
             mode_labels
             + gate_mode_labels_inverse
             + [[qubits_frontier[ix]] for ix in range(self.circuit.nqubits)]
         )
         # Reuse the initial basis-state tensors to close the network.
         operands = operands + gate_operands_inverse + operands[: self.circuit.nqubits]

         operand_exp_interleave = [x for y in zip(operands, mode_labels) for x in y]

         return operand_exp_interleave
--- a/src/qibotn/circuit_to_mps.py
+++ b/src/qibotn/circuit_to_mps.py
@@ -1,63 +0,0 @@
 import numpy as np
 from qibotn.circuit_convertor import QiboCircuitToEinsum
 from qibotn.mps_utils import apply_gate, initial
 try:
    import cupy as cp
    import cuquantum.bindings.cutensornet as cutn
 except ImportError:  # pragma: no cover - exercised on CPU-only installations
    cp = None
    cutn = None
 def _require_cuquantum():
    """Raise ``ImportError`` unless both optional GPU modules were imported.

    ``cp`` and ``cutn`` are set to ``None`` by the guarded import at the top
    of the module when the import fails.
    """
    if any(module is None for module in (cp, cutn)):
        raise ImportError(
            "The cuQuantum MPS converter requires cupy and cuquantum. "
            "Install the GPU dependencies or use the CPU backend."
        )
 class QiboCircuitToMPS:
    """Convert a Qibo circuit to MPS tensors by applying its gates in order.

    Parameters:
        circ_qibo: The quantum circuit object.
        gate_algo(dict): Dictionary for SVD and QR settings.
        dtype (str): Either single ("complex64") or double ("complex128") precision.
        rand_seed(int): Seed for random number generator.
    """
    def __init__(self, circ_qibo, gate_algo, dtype="complex128", rand_seed=0):
        _require_cuquantum()
        # Seed both the NumPy and the CuPy global generators.
        np.random.seed(rand_seed)
        cp.random.seed(rand_seed)
        self.num_qubits = circ_qibo.nqubits
        # The handle is created before the tensors so that ``__del__`` can
        # release it even when a later step of construction fails.
        self.handle = cutn.create()
        self.dtype = dtype
        self.mps_tensors = initial(self.num_qubits, dtype=dtype)
        converter = QiboCircuitToEinsum(circ_qibo, dtype=dtype)
        for gate_tensor, gate_qubits in converter.gate_tensors:
            # ``apply_gate`` updates ``self.mps_tensors`` in place.
            apply_gate(
                self.mps_tensors,
                gate_tensor,
                gate_qubits,
                algorithm=gate_algo,
                options={"handle": self.handle},
            )
    def __del__(self):
        """Destroy the cutensornet handle if one was created."""
        handle = getattr(self, "handle", None)
        if cutn is None or handle is None:
            return
        cutn.destroy(handle)
--- a/src/qibotn/contest_cases.py
+++ b/src/qibotn/contest_cases.py
@@ -0,0 +1,241 @@
 """Shared contest-style circuits and observables for qibotn tools."""
 from __future__ import annotations
 import math
 from dataclasses import dataclass
 from pathlib import Path
 import numpy as np
 from qibo import Circuit, gates, hamiltonians
 from qibo.symbols import X, Y, Z
 from qibotn.backends.quimb import quimb_torch_parallel_opts
@dataclass(frozen=True)
 class CaseSpec:
    """Immutable bundle of default parameters for one named contest case."""
    # Circuit family name (presumably a ``kind`` understood by
    # ``build_contest_circuit`` -- confirm against callers).
    circuit_kind: str
    # Names of the observables evaluated for this case by default.
    observables: tuple[str, ...]
    # Default number of qubits.
    nqubits: int
    # Default number of circuit layers.
    nlayers: int
    # Default random seed.
    seed: int
    # Default slicing target; ``None`` leaves it unspecified.
    target_slices: int | None = None
 # Registry of the named contest cases, keyed by case name.
 CASES = dict(
    main1=CaseSpec(
        nqubits=37,
        nlayers=20,
        seed=31001,
        circuit_kind="rxx_rzz_chain",
        observables=("ring_xz",),
        target_slices=None,
    ),
    main2=CaseSpec(
        nqubits=36,
        nlayers=18,
        seed=31002,
        circuit_kind="scramble_chain",
        observables=("open_zz", "range2_xx"),
        target_slices=None,
    ),
    strong=CaseSpec(
        nqubits=40,
        nlayers=24,
        seed=41001,
        circuit_kind="reversed_cnot",
        observables=("ring_xz", "long_z_string"),
        target_slices=None,
    ),
 )
 def _add_single_qubit_layer(circuit, nqubits, rng, include_rx=False):
    """Append random single-qubit rotations to every qubit of ``circuit``.

    Each qubit receives an RY then an RZ gate, plus an RX gate when
    ``include_rx`` is true. Every angle is a fresh ``rng.uniform(-pi, pi)``
    draw, taken gate by gate and qubit by qubit.
    """
    rotations = [gates.RY, gates.RZ]
    if include_rx:
        rotations.append(gates.RX)
    for target in range(nqubits):
        for rotation in rotations:
            angle = rng.uniform(-math.pi, math.pi)
            circuit.add(rotation(target, theta=angle))
 def _add_brickwall(circuit, nqubits, gate, layer, reverse=False):
    """Append one brickwall layer of two-qubit gates to ``circuit``.

    Gates are first placed on pairs starting at even qubits, then on pairs
    starting at odd qubits. With ``reverse`` set, the argument order of
    ``gate`` is flipped on the even pairs for odd ``layer`` values and on
    the odd pairs for even ``layer`` values.
    """
    odd_layer = layer % 2
    sublayers = (
        (0, reverse and odd_layer),
        (1, reverse and not odd_layer),
    )
    for start, flipped in sublayers:
        for left in range(start, nqubits - 1, 2):
            pair = (left + 1, left) if flipped else (left, left + 1)
            circuit.add(gate(*pair))
 def build_contest_circuit(kind, nqubits, nlayers, seed):
    """Build one of the contest-style benchmark circuits.

    Parameters:
        kind: Circuit family name. One of ``"ghz_ladder"``,
            ``"brickwall_cnot"``, ``"reversed_cnot"``, ``"shifted_cz"``,
            ``"rxx_rzz"``, ``"swap_scramble"``, ``"rxx_rzz_chain"``,
            ``"scramble_chain"`` or ``"scramble"``.
        nqubits: Number of qubits of the circuit.
        nlayers: Number of layers; not used by ``"ghz_ladder"``.
        seed: Seed for ``numpy.random.default_rng``, which supplies every
            rotation angle.

    Returns:
        The populated ``Circuit``.

    Raises:
        ValueError: If ``kind`` is not a known family. The check runs before
            any layer is built, so it also triggers when ``nlayers`` is 0.
    """
    # Families whose single-qubit layer is RY/RZ only.
    yz_only_kinds = {"brickwall_cnot", "reversed_cnot", "shifted_cz"}
    # Families entangled with RXX/RZZ, mapped to the half-width of the
    # uniform range their angles are drawn from.
    entangler_bound = {
        "rxx_rzz": 0.7,
        "rxx_rzz_chain": 0.9,
        "scramble_chain": 0.8,
        "scramble": 0.8,
    }
    # Families whose single-qubit layer also contains RX.
    rx_kinds = set(entangler_bound) | {"swap_scramble"}
    if kind != "ghz_ladder" and kind not in yz_only_kinds | rx_kinds:
        raise ValueError(f"Unknown circuit kind {kind!r}.")
    rng = np.random.default_rng(seed)
    circuit = Circuit(nqubits)
    if kind == "ghz_ladder":
        circuit.add(gates.H(0))
        for qubit in range(nqubits - 1):
            circuit.add(gates.CNOT(qubit, qubit + 1))
        return circuit
    for layer in range(nlayers):
        _add_single_qubit_layer(circuit, nqubits, rng, include_rx=kind in rx_kinds)
        if kind in {"brickwall_cnot", "reversed_cnot"}:
            _add_brickwall(
                circuit, nqubits, gates.CNOT, layer, reverse=(kind == "reversed_cnot")
            )
            continue
        # Remaining families act on pairs whose start alternates with the layer.
        for qubit in range(layer % 2, nqubits - 1, 2):
            if kind in entangler_bound:
                bound = entangler_bound[kind]
                # Draw order (RXX angle, then RZZ angle) is part of the
                # seeded reproducibility contract.
                circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-bound, bound)))
                circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-bound, bound)))
                add_swap = kind in {"scramble_chain", "scramble"} and layer % 5 == 4
            else:
                circuit.add(gates.CZ(qubit, qubit + 1))
                add_swap = kind == "swap_scramble" and layer % 4 == 3
            if add_swap:
                circuit.add(gates.SWAP(qubit, qubit + 1))
    return circuit
 def _dense_observable(nqubits, qubits, seed, dim):
    """Build a random Hermitian ``dim x dim`` observable of unit norm.

    ``nqubits`` is accepted but not used. The result is a dict holding the
    matrix under ``"matrix"`` and ``list(qubits)`` under ``"qubits"``.
    """
    del nqubits
    rng = np.random.default_rng(seed)
    shape = (dim, dim)
    # Real part is drawn before the imaginary part.
    real_part = rng.normal(size=shape)
    imag_part = rng.normal(size=shape)
    raw = real_part + 1j * imag_part
    hermitian = (raw + raw.conj().T) / 2.0
    normalized = hermitian / np.linalg.norm(hermitian)
    return {"matrix": normalized, "qubits": list(qubits)}
 def build_contest_observable(kind, nqubits, seed=0):
    """Build one of the shared contest observables.

    Parameters:
        kind: Name of the observable to build.
        nqubits: Number of qubits; also fixes the quarter, half and
            three-quarter positions used by several observables.
        seed: Base seed, used only by the dense observables.

    Returns:
        A ``SymbolicHamiltonian`` for the symbolic kinds, or the dict
        produced by ``_dense_observable`` for ``"dense2_mid"`` and
        ``"dense3_spread"``.

    Raises:
        ValueError: If ``kind`` is not recognised.
    """
    quarter = nqubits // 4
    half = nqubits // 2
    three_quarters = (3 * nqubits) // 4
    final = nqubits - 1
    make = hamiltonians.SymbolicHamiltonian
    if kind == "ring_xz":
        ring = sum(
            0.5 * X(site) * Z((site + 1) % nqubits) for site in range(nqubits)
        )
        return make(form=ring)
    if kind == "open_zz":
        weight = 1.0 / max(1, nqubits - 1)
        chain = sum(weight * Z(site) * Z(site + 1) for site in range(nqubits - 1))
        return make(form=chain)
    if kind == "range2_xx":
        weight = 1.0 / max(1, nqubits - 2)
        hops = sum(weight * X(site) * X(site + 2) for site in range(nqubits - 2))
        return make(form=hops)
    if kind == "mixed_local":
        mixed = (
            0.25 * X(0)
            - 0.5 * Z(final)
            + 0.125 * X(quarter) * Z(half) * Y(three_quarters)
        )
        return make(form=mixed)
    if kind == "long_z_string":
        stride = max(1, nqubits // 16)
        product = None
        for site in range(0, nqubits, stride):
            if product is None:
                product = Z(site)
            else:
                product = product * Z(site)
        return make(form=product)
    # Nearest-neighbour ZZ across each of the three reference positions.
    for label, site in (
        ("boundary_ZZ_q1", quarter),
        ("boundary_ZZ_q2", half),
        ("boundary_ZZ_q3", three_quarters),
    ):
        if kind == label:
            return make(form=Z(site - 1) * Z(site))
    if kind == "long_Z_5_sites":
        return make(form=Z(0) * Z(quarter) * Z(half) * Z(three_quarters) * Z(final))
    if kind == "mixed_XZYZX":
        return make(form=X(0) * Z(quarter) * Y(half) * Z(three_quarters) * X(final))
    if kind == "complex_iZ0":
        return make(form=1.0j * Z(0))
    if kind == "dense2_mid":
        return _dense_observable(nqubits, (half - 1, half), seed + 101, 4)
    if kind == "dense3_spread":
        return _dense_observable(
            nqubits, (quarter, half, three_quarters), seed + 202, 8
        )
    raise ValueError(f"Unknown observable kind {kind!r}.")
 def tree_path(tree_dir, case_name, obs_name, nqubits, nlayers, target_slices, merge_gates=True):
    """Return the pickle path used for a contraction tree.

    The file name encodes the case, observable, problem size, the slicing
    target (``"auto"`` when ``target_slices`` is ``None``) and whether gate
    merging is enabled; it is placed inside ``tree_dir``.
    """
    if target_slices is None:
        slice_label = "auto"
    else:
        slice_label = f"s{target_slices}"
    merge_label = "nomerge"
    if merge_gates:
        merge_label = "merge"
    filename = "{}_{}_{}q{}l_{}_{}.pkl".format(
        case_name, obs_name, nqubits, nlayers, slice_label, merge_label
    )
    return Path(tree_dir).joinpath(filename)
 def selected_observables(args, case):
    """Pick the observable names to evaluate for ``case``.

    Precedence: a non-empty ``args.observables``, then the non-blank
    comma-separated entries of ``args.obs_filter``, then
    ``case.observables``.
    """
    explicit = args.observables
    if explicit:
        return tuple(explicit)
    pattern = args.obs_filter
    if not pattern:
        return case.observables
    names = []
    for chunk in pattern.split(","):
        name = chunk.strip()
        if name:
            names.append(name)
    return tuple(names)
 def apply_case_defaults(args):
    """Fill unset fields of ``args`` from the case named by ``args.case``.

    ``nqubits``, ``nlayers``, ``seed`` and ``tn_target_slices`` are replaced
    only when they are ``None``; ``args.observables`` is always recomputed
    through ``selected_observables``. ``args`` is modified in place.
    """
    case = CASES[args.case]
    # (attribute on args, field on the case spec)
    fallbacks = (
        ("nqubits", "nqubits"),
        ("nlayers", "nlayers"),
        ("seed", "seed"),
        ("tn_target_slices", "target_slices"),
    )
    for arg_name, case_field in fallbacks:
        if getattr(args, arg_name) is None:
            setattr(args, arg_name, getattr(case, case_field))
    args.observables = selected_observables(args, case)
 def build_parallel_opts(args, tree_file=None, search_only=False):
    """Translate parsed CLI ``args`` into ``quimb_torch_parallel_opts`` options.

    When ``tree_file`` is given, its string form is used as both the save
    and the load path of the contraction tree. Statistics printing is
    always disabled.
    """
    # (keyword of quimb_torch_parallel_opts, attribute on args)
    forwarded = (
        ("target_slices", "tn_target_slices"),
        ("target_size", "tn_target_size"),
        ("search_workers", "tn_search_workers"),
        ("torch_threads", "torch_threads"),
        ("search_repeats", "tn_search_repeats"),
        ("search_time", "tn_search_time"),
        ("search_seed", "tn_search_seed"),
        ("merge_gates", "merge_gates"),
        ("search_backend", "tn_search_backend"),
        ("dask_address", "dask_address"),
        ("dask_expected_workers", "dask_expected_workers"),
        ("dask_close_workers", "dask_close_workers"),
        ("debug_trials", "tn_debug_trials"),
    )
    options = {keyword: getattr(args, attribute) for keyword, attribute in forwarded}
    options["search_only"] = search_only
    tree_location = None if tree_file is None else str(tree_file)
    options["save_tree_path"] = tree_location
    options["load_tree_path"] = tree_location
    options["print_stats"] = False
    return quimb_torch_parallel_opts(**options)
--- a/src/qibotn/eval.py
+++ b/src/qibotn/eval.py
@@ -1,8 +1,10 @@
 from mpi4py import MPI
-from qibotn.circuit_convertor import QiboCircuitToEinsum
+from qibotn.backends.cutensornet_helpers import (
-from qibotn.circuit_to_mps import QiboCircuitToMPS
+    MPSContractionHelper,
-from qibotn.mps_contraction_helper import MPSContractionHelper
+    QiboCircuitToEinsum,
    QiboCircuitToMPS,
 )
 from qibotn.observables import (
    build_observable,
    check_observable,
--- a/src/qibotn/expectation_runner.py
+++ b/src/qibotn/expectation_runner.py
@@ -8,7 +8,15 @@ from dataclasses import dataclass
 import numpy as np
 from qibo.backends import construct_backend
-from qibotn.benchmark_cases import exact_pauli_sum
+from qibotn.benchmark_cases import (
    CIRCUITS,
    OBSERVABLES,
    build_circuit,
    exact_pauli_sum,
    observable_terms,
    parse_names,
    terms_to_dict,
 )
 from qibotn.observables import check_observable
@@ -77,6 +85,18 @@ class ExpectationResult:
    parallel_stats: list | None = None
@dataclass
 class BenchmarkExpectationRecord:
    """One result row of a benchmark run: a circuit/observable pair and its outcome."""
    # Name of the circuit family that was run.
    circuit: str
    # Name of the observable (or a ``pattern:...`` label).
    observable: str
    # Computed expectation value.
    value: float
    # Run time in seconds, as reported by the expectation result.
    seconds: float
    # Exact reference value, when one was computed.
    exact: float | None = None
    # Absolute difference between ``value`` and ``exact``, when available.
    abs_error: float | None = None
    # ``abs_error`` relative to the magnitude of ``exact``, when available.
    rel_error: float | None = None
    # Parallel search/contraction statistics, when available.
    parallel_stats: list | None = None
 def _config_from_kwargs(**kwargs):
    fields = ExpectationConfig.__dataclass_fields__
    config_kwargs = {name: kwargs.pop(name) for name in list(kwargs) if name in fields}
@@ -155,3 +175,148 @@ def mps_expectation(circuit, observable=None, *, return_result=False, **kwargs):
        return_result=return_result,
        **kwargs,
    )
 def cpu_benchmark_parallel_opts(
    *,
    target_slices=None,
    target_size=2**32,
    search_workers=None,
    torch_threads=8,
    search_repeats=128,
    search_time=60.0,
    search_backend="dask",
    dask_address=None,
    dask_close_workers=False,
    save_tree_path=None,
    load_tree_path=None,
    search_only=False,
    debug_trials=False,
    contract_implementation=None,
    print_stats=True,
 ):
    """Build parallel TN options for the CPU expectation backend.

    Returns a dict that always contains ``slicing_opts`` (``None`` when no
    slicing target is set), ``search_workers`` (falling back to
    ``torch_threads``), ``max_repeats``, ``max_time`` and ``print_stats``.
    The remaining keys are included only when the matching argument is set.
    """
    slicing_opts = {
        name: value
        for name, value in (
            ("target_slices", target_slices),
            ("target_size", target_size),
        )
        if value is not None
    }
    opts = dict(
        slicing_opts=slicing_opts if slicing_opts else None,
        search_workers=search_workers if search_workers else torch_threads,
        max_repeats=search_repeats,
        max_time=search_time,
        print_stats=print_stats,
    )
    # Values forwarded as-is when they are not None.
    for key, value in (
        ("search_backend", search_backend),
        ("dask_address", dask_address),
        ("save_tree_path", save_tree_path),
        ("load_tree_path", load_tree_path),
    ):
        if value is not None:
            opts[key] = value
    # Flags recorded as a literal True when truthy.
    for key, flag in (("search_only", search_only), ("debug_trials", debug_trials)):
        if flag:
            opts[key] = True
    if contract_implementation is not None:
        opts["contract_implementation"] = contract_implementation
    if dask_close_workers:
        opts["dask_close_workers"] = True
    return opts
 def run_cpu_benchmark_cases(
    *,
    nqubits=40,
    nlayers=30,
    bond=1024,
    cut_ratio=1e-12,
    seed=42,
    torch_threads=8,
    quimb_backend="torch",
    dtype="complex128",
    ansatz="tn",
    mpi=False,
    exact=False,
    exact_max_qubits=24,
    circuits=("brickwall_cnot",),
    observables=("ring_xz",),
    pauli_pattern=None,
    parallel_opts=None,
 ):
    """Run the reusable CPU TN/MPS benchmark cases.

    This is the importable library entrypoint for reusable CPU benchmark cases.

    Every selected circuit is combined with every selected observable (or
    with the single ``pauli_pattern`` observable when one is given), and one
    ``BenchmarkExpectationRecord`` is collected per pair.

    Returns:
        The list of records. Under MPI, pairs whose result reports a
        non-zero rank are skipped.

    Raises:
        ValueError: If ``exact`` is requested for more than
            ``exact_max_qubits`` qubits. This is checked once, on every
            rank and before any circuit is built, so that MPI ranks cannot
            diverge (one raising while the others enter the computation).
    """
    selected_circuits = parse_names(list(circuits), CIRCUITS, "circuits")
    selected_observables = (
        []
        if pauli_pattern
        else parse_names(list(observables), OBSERVABLES, "observables")
    )
    rank = 0
    if mpi:
        from mpi4py import MPI
        rank = MPI.COMM_WORLD.Get_rank()
    config = ExpectationConfig(
        ansatz=ansatz,
        mpi=mpi,
        bond=bond,
        cut_ratio=cut_ratio,
        tensor_module="torch",
        quimb_backend=quimb_backend,
        dtype=dtype,
        torch_threads=torch_threads,
        parallel_opts=parallel_opts or {},
    )
    # Loop-invariant precondition: fail fast and identically on all ranks.
    if exact and nqubits > exact_max_qubits:
        raise ValueError(
            f"exact reference is limited to {exact_max_qubits} qubits."
        )
    records = []
    for circuit_kind in selected_circuits:
        circuit = build_circuit(circuit_kind, nqubits, nlayers, seed)
        # Observables are rebuilt per circuit so no object is shared
        # between runs.
        named_observables = (
            [(f"pattern:{pauli_pattern}", {"pauli_string_pattern": pauli_pattern})]
            if pauli_pattern
            else [
                (obs_kind, terms_to_dict(observable_terms(obs_kind, nqubits)))
                for obs_kind in selected_observables
            ]
        )
        for obs_name, observable in named_observables:
            exact_value = None
            # The exact reference is only computed on rank 0.
            if exact and rank == 0:
                exact_value = exact_for_observable(circuit, observable, nqubits)
            result = run_cpu_expectation(circuit, observable, config)
            if mpi and result.rank != 0:
                continue
            abs_error = None if exact_value is None else abs(result.value - exact_value)
            rel_error = (
                None
                if exact_value is None
                # Floor the denominator to avoid dividing by zero.
                else abs_error / max(abs(exact_value), 1e-15)
            )
            records.append(
                BenchmarkExpectationRecord(
                    circuit=circuit_kind,
                    observable=obs_name,
                    value=result.value,
                    seconds=result.seconds,
                    exact=exact_value,
                    abs_error=abs_error,
                    rel_error=rel_error,
                    parallel_stats=result.parallel_stats,
                )
            )
    return records
--- a/src/qibotn/mps_contraction_helper.py
+++ b/src/qibotn/mps_contraction_helper.py
@@ -1,131 +0,0 @@
 try:
    from cuquantum.tensornet import contract, contract_path
 except ImportError:  # pragma: no cover - exercised on CPU-only installations
    contract = None
    contract_path = None
 def _require_cuquantum():
    """Raise ``ImportError`` unless the cuQuantum contraction functions were imported.

    ``contract`` and ``contract_path`` are set to ``None`` by the guarded
    import at the top of the module when the import fails.
    """
    if any(function is None for function in (contract, contract_path)):
        raise ImportError(
            "The cuQuantum MPS contraction helper requires cuquantum. "
            "Install the GPU dependencies or use the CPU backend."
        )
 # Reference: https://github.com/NVIDIA/cuQuantum/blob/main/python/samples/cutensornet/tn_algorithms/mps_algorithms.ipynb
 class MPSContractionHelper:
    """Assemble and run tensor-network contractions for a given MPS.

    Interleaved format is used to construct the input args for `cuquantum.contract`.

    Reference: https://github.com/NVIDIA/cuQuantum/blob/main/python/samples/cutensornet/tn_algorithms/mps_algorithms.ipynb

    The following compute quantities are supported:
        - the norm of the MPS.
        - the equivalent state vector from the MPS.
        - the expectation value for a given operator.

    Parameters:
        num_qubits: The number of qubits for the MPS.
    """
    def __init__(self, num_qubits):
        self.num_qubits = num_qubits
        # Site i uses modes (2i, 2i+1, 2i+2); the middle one is the physical
        # mode. The second set of tensors shares the physical mode but gets
        # its own bond modes, numbered from ``ket_offset`` upwards.
        ket_offset = 2 * num_qubits + 1
        self.bra_modes = []
        self.ket_modes = []
        for site in range(num_qubits):
            physical = 2 * site + 1
            self.bra_modes.append((physical - 1, physical, physical + 1))
            self.ket_modes.append(
                (site + ket_offset, physical, site + ket_offset + 1)
            )
    def contract_norm(self, mps_tensors, options=None):
        """Contract the MPS with its conjugate into a scalar.

        Parameters:
            mps_tensors: A list of rank-3 ndarray-like tensor objects.
                The indices of the ith tensor are expected to be bonding index to the i-1 tensor,
                the physical mode, and then the bonding index to the i+1th tensor.
            options: Specify the contract and decompose options.

        Returns:
            The real part of the fully contracted network.
        """
        network = []
        for site, tensor in enumerate(mps_tensors):
            network += [
                tensor,
                self.bra_modes[site],
                tensor.conj(),
                self.ket_modes[site],
            ]
        network.append([])  # scalar output
        value = self._contract(network, options=options)
        return value.real
    def contract_state_vector(self, mps_tensors, options=None):
        """Contract the MPS into its state-vector representation.

        Parameters:
            mps_tensors: A list of rank-3 ndarray-like tensor objects.
                The indices of the ith tensor are expected to be bonding index to the i-1 tensor,
                the physical mode, and then the bonding index to the i+1th tensor.
            options: Specify the contract and decompose options.

        Returns:
            An ndarray-like object as the state vector, with one mode per
            site in site order.
        """
        network = []
        for site, tensor in enumerate(mps_tensors):
            network.append(tensor)
            network.append(self.bra_modes[site])
        physical_modes = tuple(modes[1] for modes in self.bra_modes)
        network.append(physical_modes)  # output keeps every physical mode
        return self._contract(network, options=options)
    def contract_expectation(
        self, mps_tensors, operator, qubits, options=None, normalize=False
    ):
        """Contract the network for the expectation value of ``operator``.

        Parameters:
            mps_tensors: A list of rank-3 ndarray-like tensor objects.
                The indices of the ith tensor are expected to be bonding index to the i-1 tensor,
                the physical mode, and then the bonding index to the i+1th tensor.
            operator: A ndarray-like tensor object.
                The modes of the operator are expected to be output qubits followed by input qubits, e.g,
                ``A, B, a, b`` where ``a, b`` denotes the inputs and ``A, B`` denotes the outputs.
            qubits: A sequence of integers specifying the qubits that the operator is acting on.
            options: Specify the contract and decompose options.
            normalize: Whether to divide the result by ``contract_norm``.

        Returns:
            The contracted scalar, divided by the norm when ``normalize`` is set.
        """
        network = []
        next_free_mode = 3 * self.num_qubits + 2
        # First half: operator output modes, filled in below. Second half:
        # operator input modes, tied to the physical modes of the tensors.
        operator_modes = [None] * len(qubits) + [
            self.bra_modes[q][1] for q in qubits
        ]
        acted_on = list(qubits)
        for site, tensor in enumerate(mps_tensors):
            network.extend([tensor, self.bra_modes[site]])
            conj_modes = self.ket_modes[site]
            if site in acted_on:
                # Detach the conjugate tensor's physical mode and route it
                # through the operator's output mode instead.
                left_bond, _, right_bond = conj_modes
                conj_modes = (left_bond, next_free_mode, right_bond)
                operator_modes[acted_on.index(site)] = next_free_mode
                next_free_mode += 1
            network.extend([tensor.conj(), conj_modes])
        network.extend([operator, tuple(operator_modes)])
        network.append([])  # scalar output
        norm = self.contract_norm(mps_tensors, options=options) if normalize else 1
        return self._contract(network, options=options) / norm
    def _contract(self, interleaved_inputs, options=None):
        """Find a contraction path, then contract along it."""
        _require_cuquantum()
        path_info = contract_path(*interleaved_inputs, options=options)
        return contract(
            *interleaved_inputs, options=options, optimize={"path": path_info[0]}
        )
--- a/src/qibotn/mps_utils.py
+++ b/src/qibotn/mps_utils.py
@@ -1,111 +0,0 @@
 try:
    import cupy as cp
    from cuquantum.tensornet import contract
    from cuquantum.tensornet.experimental import contract_decompose
 except ImportError:  # pragma: no cover - exercised on CPU-only installations
    cp = None
    contract = None
    contract_decompose = None
 def _require_cuquantum():
    """Raise ``ImportError`` unless cupy and the cuQuantum functions were imported.

    ``cp``, ``contract`` and ``contract_decompose`` are set to ``None`` by
    the guarded import at the top of the module when the import fails.
    """
    if any(name is None for name in (cp, contract, contract_decompose)):
        raise ImportError(
            "The cuQuantum MPS helpers require cupy and cuquantum. "
            "Install the GPU dependencies or use the CPU backend."
        )
 def initial(num_qubits, dtype):
    r"""Generate the MPS with an initial state of :math:`\ket{00...00}`

    Parameters:
        num_qubits: Number of qubits in the Quantum Circuit.
        dtype: Either single ("complex64") or double ("complex128") precision.

    Returns:
        The initial MPS tensors: a list with one ``(1, 2, 1)`` array per
        qubit, each holding the amplitudes ``[1, 0]``.

    Raises:
        ImportError: If cupy or cuquantum is not available.
    """
    _require_cuquantum()
    zero_site = cp.asarray([1, 0], dtype=dtype).reshape(1, 2, 1)
    # Give every site its own array. ``[zero_site] * num_qubits`` would make
    # all sites alias one buffer, so an in-place edit of any site would
    # silently change every other site as well.
    return [zero_site.copy() for _ in range(num_qubits)]
 def mps_site_right_swap(mps_tensors, i, **kwargs):
    """Swap the physical modes of the ith and i+1th MPS tensors in place.

    Parameters:
        mps_tensors: Tensors representing MPS
        i (int): index of the tensor to swap
        algorithm: Optional keyword forwarded to ``contract_decompose``.
        options: Optional keyword forwarded to ``contract_decompose``.

    Returns:
        The updated MPS tensors (the same list object).
    """
    _require_cuquantum()
    neighbours = mps_tensors[i : i + 2]
    # Contract the pair and split it again with the physical modes exchanged;
    # the middle return value is not used.
    swapped_left, _, swapped_right = contract_decompose(
        "ipj,jqk->iqj,jpk",
        *neighbours,
        algorithm=kwargs.get("algorithm"),
        options=kwargs.get("options"),
    )
    mps_tensors[i : i + 2] = (swapped_left, swapped_right)
    return mps_tensors
 def apply_gate(mps_tensors, gate, qubits, **kwargs):
    """Apply the gate operand to the MPS tensors in-place.

    # Reference: https://github.com/NVIDIA/cuQuantum/blob/main/python/samples/cutensornet/tn_algorithms/mps_algorithms.ipynb

    Parameters:
        mps_tensors: A list of rank-3 ndarray-like tensor objects.
            The indices of the ith tensor are expected to be the bonding index to the i-1 tensor,
            the physical mode, and then the bonding index to the i+1th tensor.
        gate: A ndarray-like tensor object representing the gate operand.
            The modes of the gate is expected to be output qubits followed by input qubits, e.g,
            ``A, B, a, b`` where ``a, b`` denotes the inputs and ``A, B`` denotes the outputs.
        qubits: A sequence of integers denoting the qubits that the gate is applied onto.
        algorithm: The contract and decompose algorithm to use for gate application.
            Can be either a `dict` or a `ContractDecomposeAlgorithm`.
        options: Specify the contract and decompose options.

    Returns:
        The updated MPS tensors (the same list object that was passed in).

    Raises:
        ValueError: If a two-qubit gate names the same qubit twice.
        NotImplementedError: If the gate acts on more than two qubits.
    """
    _require_cuquantum()
    algorithm = kwargs.get("algorithm", None)
    options = kwargs.get("options", None)
    n_qubits = len(qubits)
    if n_qubits == 1:
        # single-qubit gate
        i = qubits[0]
        mps_tensors[i] = contract(
            "ipj,qp->iqj", mps_tensors[i], gate, options=options
        )  # in-place update
    elif n_qubits == 2:
        # two-qubit gate
        i, j = qubits
        if i == j:
            # Without this guard the swap recursion below would end up
            # applying the gate to sites (i, i + 1) instead of failing.
            raise ValueError(
                f"A two-qubit gate needs two distinct qubits, got {tuple(qubits)}."
            )
        if i > j:
            # swap qubits order
            return apply_gate(mps_tensors, gate.transpose(1, 0, 3, 2), (j, i), **kwargs)
        if i + 1 == j:
            # two adjacent qubits
            a, _, b = contract_decompose(
                "ipj,jqk,rspq->irj,jsk",
                *mps_tensors[i : i + 2],
                gate,
                algorithm=algorithm,
                options=options,
            )
            mps_tensors[i : i + 2] = (a, b)  # in-place update
        else:
            # non-adjacent two-qubit gate
            # step 1: swap i with i+1
            mps_site_right_swap(mps_tensors, i, **kwargs)
            # step 2: apply gate to (i+1, j) pair. This amounts to a recursive swap until the two qubits are adjacent
            apply_gate(mps_tensors, gate, (i + 1, j), **kwargs)
            # step 3: swap back i and i+1
            mps_site_right_swap(mps_tensors, i, **kwargs)
    else:
        raise NotImplementedError("Only one- and two-qubit gates supported")
    # Return the list, as documented; previously every path returned None.
    return mps_tensors
--- a/src/qibotn/observables.py
+++ b/src/qibotn/observables.py
@@ -35,7 +35,17 @@ def check_observable(observable, circuit_nqubit):
    if isinstance(observable, dict):
        return create_hamiltonian_from_dict(observable, circuit_nqubit)
    if isinstance(observable, hamiltonians.SymbolicHamiltonian):
-        return observable
+        if observable.nqubits == circuit_nqubit:
            return observable
        if observable.nqubits > circuit_nqubit:
            raise ValueError(
                "Observable has more qubits than the circuit: "
                f"{observable.nqubits} > {circuit_nqubit}."
            )
        return hamiltonians.SymbolicHamiltonian(
            form=observable.form,
            nqubits=circuit_nqubit,
        )
    try:
        return hamiltonians.SymbolicHamiltonian(form=observable)
    except Exception as exc:
--- a/src/qibotn/parallel.py
+++ b/src/qibotn/parallel.py
@@ -1,12 +1,16 @@
 """Parallel path search and contraction utilities for tensor networks."""
 import importlib
 import os
 import pickle
 import signal
 import time
-from math import log2, log10
+from collections import Counter, defaultdict
 import numpy as np
 from dataclasses import dataclass
 from concurrent.futures import ProcessPoolExecutor, TimeoutError, as_completed
 from dataclasses import dataclass
 from math import log2, log10
 from pathlib import Path
 import numpy as np
 try:
    from mpi4py import MPI
@@ -40,6 +44,12 @@ def _optimizer_search_stats(opt):
    }
 def _tree_search_stats(tree):
    """Return the search statistics attached to ``tree``.

    An empty dict is returned when ``tree`` is ``None``, has no
    ``qibotn_search_stats`` attribute, or the attribute is falsy.
    """
    if tree is not None:
        stats = getattr(tree, "qibotn_search_stats", {})
        if stats:
            return stats
    return {}
 def _attach_search_stats(tree, opt):
    try:
        tree.qibotn_search_stats = _optimizer_search_stats(opt)
@@ -48,6 +58,47 @@ def _attach_search_stats(tree, opt):
    return tree
 def _search_seed_kwargs(optlib, seed):
    """Return the optimizer keyword arguments that carry ``seed``.

    With no ``optlib`` the seed is nested under ``sampler_opts``; for
    ``"random"`` it is passed as ``seed``; for any other ``optlib`` no
    keyword is produced.
    """
    if optlib is None:
        return {"sampler_opts": {"seed": seed}}
    return {"seed": seed} if optlib == "random" else {}
 def _fallback_greedy_tree(tn, output_inds, slicing_opts=None, error=None):
    """Build a contraction tree with cotengra's greedy optimizer.

    When ``slicing_opts`` is given, the tree is sliced by ``target_size``
    if that is set, otherwise by ``target_slices`` if that is set. A
    placeholder statistics dict marking the greedy fallback (and the repr
    of ``error``, if any) is attached to the tree on a best-effort basis.
    """
    import cotengra as ctg
    greedy = ctg.GreedyOptimizer()
    tree = tn.contraction_tree(output_inds=output_inds, optimize=greedy)
    if slicing_opts:
        size_limit = slicing_opts.get("target_size")
        slice_count = slicing_opts.get("target_slices")
        # ``target_size`` takes precedence when both are present.
        if size_limit is not None:
            tree.slice_(target_size=size_limit)
        elif slice_count is not None:
            tree.slice_(target_slices=slice_count)
    try:
        # Built inside the ``try`` so that a failing ``repr(error)`` is
        # tolerated in the same way as a failing attribute assignment.
        tree.qibotn_search_stats = dict(
            completed_trials=0,
            finite_trials=0,
            failed_trials=0,
            requested_trials=0,
            trial_seconds_sum=0.0,
            best_score=float("nan"),
            best_flops=float("nan"),
            best_write=float("nan"),
            best_size=float("nan"),
            fallback="greedy",
            fallback_error=None if error is None else repr(error),
        )
    except Exception:
        pass
    return tree
 def _dask_worker_slots(client):
    info = client.scheduler_info(n_workers=-1)
    workers = info.get("workers", {})
@@ -218,13 +269,18 @@ def _search_chunk(
    slicing_opts,
    optlib=None,
 ):
-    import random, cotengra as ctg
+    import random
    import cotengra as ctg
    seed = int(seed)
    random.seed(seed)
    np.random.seed(seed % (2**32))
    tn = pickle.loads(tn_bytes)
    kwargs = {}
    if optlib is not None:
        kwargs["optlib"] = optlib
    kwargs.update(_search_seed_kwargs(optlib, seed))
    opt = ctg.HyperOptimizer(
        methods=SEARCH_METHODS,
        max_repeats=repeats,
@@ -266,7 +322,15 @@ def _kill_pool(pool):
    pool.shutdown(wait=False)
-def _serial_search(tn_bytes, output_inds, repeats, seed, max_time, slicing_opts=None, trial_timeout=None):
+def _serial_search(
    tn_bytes,
    output_inds,
    repeats,
    seed,
    max_time,
    slicing_opts=None,
    trial_timeout=None,
 ):
    import time
    if trial_timeout is None:
@@ -287,7 +351,13 @@ def _serial_search(tn_bytes, output_inds, repeats, seed, max_time, slicing_opts=
            break
        timeout = min(trial_timeout, deadline - time.time())
        pool = ProcessPoolExecutor(max_workers=1)
-        fut = pool.submit(_run_single_trial, tn_bytes, output_inds, seed * 10000 + i, slicing_opts)
+        fut = pool.submit(
            _run_single_trial,
            tn_bytes,
            output_inds,
            seed * 10000 + i,
            slicing_opts,
        )
        try:
            cost, tree = fut.result(timeout=timeout)
            if cost < best_cost:
@@ -304,15 +374,30 @@ def _split_repeats(total_repeats, n_workers):
    n_workers = max(1, int(n_workers))
    total_repeats = max(1, int(total_repeats))
    chunk, extra = divmod(total_repeats, n_workers)
-    return [chunk + (1 if i < extra else 0) for i in range(n_workers) if chunk + (1 if i < extra else 0) > 0]
+    return [
        chunk + (1 if i < extra else 0)
        for i in range(n_workers)
        if chunk + (1 if i < extra else 0) > 0
    ]
-def _processpool_search(tn, output_inds, total_repeats, n_workers, max_time, slicing_opts=None, trial_timeout=None):
+def _processpool_search(
    tn,
    output_inds,
    total_repeats,
    n_workers,
    max_time,
    slicing_opts=None,
    trial_timeout=None,
    search_seed=0,
 ):
    tn_bytes = pickle.dumps(tn)
    repeat_chunks = _split_repeats(total_repeats, n_workers)
    pool = ProcessPoolExecutor(max_workers=len(repeat_chunks))
    futures = []
-    for seed, repeats in enumerate(repeat_chunks):
+    errors = []
    for worker_id, repeats in enumerate(repeat_chunks):
        seed = int(search_seed) + worker_id
        futures.append(
            pool.submit(
                _serial_search,
@@ -334,14 +419,34 @@ def _processpool_search(tn, output_inds, total_repeats, n_workers, max_time, sli
                cost, tree = fut.result()
                if cost < best_cost:
                    best_cost, best_tree = cost, tree
-            except Exception:
+            except Exception as exc:
-                pass
+                errors.append(repr(exc))
    except TimeoutError:
-        pass
+        errors.append("TimeoutError()")
    finally:
        for fut in futures:
            fut.cancel()
        _kill_pool(pool)
    if best_tree is None:
        if errors:
            print(
                "qibotn_search_failed "
                f"backend=processpool errors={errors[:3]} "
                f"num_errors={len(errors)} fallback=greedy",
                flush=True,
            )
        else:
            print(
                "qibotn_search_failed "
                "backend=processpool errors=[] fallback=greedy",
                flush=True,
            )
        return _fallback_greedy_tree(
            tn,
            output_inds,
            slicing_opts=slicing_opts,
            error=errors[:3],
        )
    return best_tree
@@ -357,6 +462,7 @@ def _dask_search(
    debug_trials=False,
    close_workers=False,
    expected_workers=None,
    search_seed=0,
 ):
    """Run one centralized cotengra hyper-optimizer over a dask pool.
@@ -371,8 +477,14 @@ def _dask_search(
            "`pip install distributed` or the package extra that provides it."
        ) from exc
    import random
    import cotengra as ctg
    search_seed = int(search_seed)
    random.seed(search_seed)
    np.random.seed(search_seed % (2**32))
    _patch_cotengra_dask_as_completed()
    _patch_cotengra_dask_submit(debug_trials=debug_trials)
@@ -400,6 +512,7 @@ def _dask_search(
    kwargs = {}
    if optlib is not None:
        kwargs["optlib"] = optlib
    kwargs.update(_search_seed_kwargs(optlib, search_seed))
    retire_workers = []
    try:
@@ -470,10 +583,12 @@ def _mpi_search(
    dask_address=None,
    debug_trials=False,
    dask_close_workers=False,
    search_seed=0,
 ):
    comm = MPI.COMM_WORLD
    rank, size = comm.Get_rank(), comm.Get_size()
    search_backend = search_backend or "processpool"
    search_seed = int(search_seed)
    if search_backend == "dask":
        if not dask_address:
@@ -496,6 +611,7 @@ def _mpi_search(
                    n_workers=n_workers,
                    debug_trials=debug_trials,
                    close_workers=dask_close_workers,
                    search_seed=search_seed,
                )
                payload = ("ok", tree)
            except Exception as exc:
@@ -518,6 +634,7 @@ def _mpi_search(
        max_time,
        slicing_opts,
        trial_timeout,
        search_seed=search_seed + rank * max(1, n_workers or 1),
    )
    local_cost = local_tree.combo_cost(factor=256) if local_tree else float("inf")
@@ -531,11 +648,22 @@ def _mpi_search(
    return comm.bcast(best_tree, root=0)
-def parallel_path_search(tn, output_inds, method='processpool', total_repeats=1024,
+def parallel_path_search(
-                         max_time=300, n_workers=48, slicing_opts=None,
+    tn,
-                         trial_timeout=None, search_backend=None,
+    output_inds,
-                         dask_address=None, debug_trials=False,
+    method="processpool",
-                         dask_close_workers=False, expected_workers=None):
+    total_repeats=1024,
    max_time=300,
    n_workers=48,
    slicing_opts=None,
    trial_timeout=None,
    search_backend=None,
    dask_address=None,
    debug_trials=False,
    dask_close_workers=False,
    expected_workers=None,
    search_seed=0,
 ):
    """Parallel contraction path search.
    Args:
@@ -546,11 +674,32 @@ def parallel_path_search(tn, output_inds, method='processpool', total_repeats=10
        slicing_opts: cotengra slicing options for memory control
        trial_timeout: Per-trial timeout (seconds); kills and skips hung trials
    """
-    if method == 'serial':
+    if method == "serial":
        tn_bytes = pickle.dumps(tn)
-        _, tree = _serial_search(tn_bytes, output_inds, total_repeats, 0, max_time, slicing_opts, trial_timeout)
+        try:
            _, tree = _serial_search(
                tn_bytes,
                output_inds,
                total_repeats,
                search_seed,
                max_time,
                slicing_opts,
                trial_timeout,
            )
        except Exception as exc:
            print(
                "qibotn_search_failed "
                f"backend=serial error={exc!r} fallback=greedy",
                flush=True,
            )
            return _fallback_greedy_tree(
                tn,
                output_inds,
                slicing_opts=slicing_opts,
                error=exc,
            )
        return tree
-    elif method == 'mpi':
+    if method == "mpi":
        if not _HAVE_MPI:
            raise ImportError("mpi4py not available")
        return _mpi_search(
@@ -565,10 +714,20 @@ def parallel_path_search(tn, output_inds, method='processpool', total_repeats=10
            dask_address=dask_address,
            debug_trials=debug_trials,
            dask_close_workers=dask_close_workers,
            search_seed=search_seed,
        )
-    elif method == 'processpool':
+    if method == "processpool":
-        return _processpool_search(tn, output_inds, total_repeats, n_workers, max_time, slicing_opts, trial_timeout)
+        return _processpool_search(
-    elif method == 'dask':
+            tn,
            output_inds,
            total_repeats,
            n_workers,
            max_time,
            slicing_opts,
            trial_timeout,
            search_seed=search_seed,
        )
    if method == "dask":
        return _dask_search(
            tn,
            output_inds,
@@ -580,9 +739,9 @@ def parallel_path_search(tn, output_inds, method='processpool', total_repeats=10
            debug_trials=debug_trials,
            close_workers=dask_close_workers,
            expected_workers=expected_workers,
            search_seed=search_seed,
        )
-    else:
+    raise ValueError(f"Unknown method: {method}")
        raise ValueError(f"Unknown method: {method}")
 def contraction_tree_costs(tree, dtype_bytes=16, combo_factor=256):
@@ -615,6 +774,171 @@ def contraction_tree_costs(tree, dtype_bytes=16, combo_factor=256):
    }
 def load_tree_payload(path, index=0):
    """Unpickle a tree payload and pick one tree out of it.

    The payload is either a dict that keeps its trees under ``"trees"`` or
    the trees object itself. A value that is not a list or tuple is treated
    as a one-element list.

    Args:
        path: Location of the pickle file.
        index: Position of the tree to return.

    Returns:
        Tuple ``(payload, tree)``: the unpickled payload, untouched, and the
        selected tree.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with Path(path).open("rb") as handle:
        payload = pickle.load(handle)
    if isinstance(payload, dict):
        candidates = payload["trees"]
    else:
        candidates = payload
    if isinstance(candidates, (list, tuple)):
        return payload, candidates[index]
    # A lone tree behaves like a list of length one.
    return payload, [candidates][index]
 def save_tree_payload(path, payload):
    """Pickle *payload* to *path*, creating missing parent directories.

    Args:
        path: Destination file; anything accepted by ``Path``.
        payload: Object to serialize with ``pickle.dump``.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("wb") as handle:
        pickle.dump(payload, handle)
 def slice_tree_payload(path, output_path, *, term=0, target_slices=2, max_repeats=64, seed=42):
    """Slice one tree of a stored payload and write the updated payload.

    Args:
        path: Pickle file holding the input payload.
        output_path: Where the updated payload is written.
        term: Index of the tree to slice.
        target_slices: Forwarded to ``tree.slice``.
        max_repeats: Forwarded to ``tree.slice``.
        seed: Forwarded to ``tree.slice``.

    Returns:
        A ``TreePayloadSliceResult`` with the loaded payload, the tree before
        and after slicing, and the cost summaries of both.
    """
    payload, tree = load_tree_payload(path, index=term)
    original_costs = contraction_tree_costs(tree)
    sliced_tree = tree.slice(
        target_slices=target_slices,
        max_repeats=max_repeats,
        seed=seed,
    )
    sliced_costs = contraction_tree_costs(sliced_tree)

    payload_is_dict = isinstance(payload, dict)
    stored = payload["trees"] if payload_is_dict else payload
    if not isinstance(stored, (list, tuple)):
        stored = [stored]
    new_trees = list(stored)
    new_trees[term] = sliced_tree

    if payload_is_dict:
        # Shallow copy so the loaded payload handed back to the caller is
        # left unmodified; derived entries are refreshed for the new trees.
        out_payload = dict(payload)
        out_payload["trees"] = new_trees
        out_payload["costs"] = [contraction_tree_costs(t) for t in new_trees]
        out_payload["nterms"] = len(new_trees)
    else:
        out_payload = new_trees

    save_tree_payload(output_path, out_payload)
    return TreePayloadSliceResult(
        payload=payload,
        tree=tree,
        sliced_tree=sliced_tree,
        original_costs=original_costs,
        sliced_costs=sliced_costs,
    )
 def _prod(values):
    """Return the product of *values*, each coerced with ``int``.

    An empty iterable yields ``1``.
    """
    result = 1
    for factor in map(int, values):
        result *= factor
    return result
 def _broadcast_batch(a_batch, b_batch):
    """Return the number of batch entries after combining two batch shapes.

    Identical shapes, or one empty shape, reduce to the product of the
    non-empty one. Otherwise the shorter shape is left-padded with ones and
    the per-axis maximum is multiplied up.
    """
    if a_batch == b_batch:
        return _prod(a_batch)
    if not a_batch:
        return _prod(b_batch)
    if not b_batch:
        return _prod(a_batch)
    rank = max(len(a_batch), len(b_batch))
    padded = [
        (1,) * (rank - len(dims)) + tuple(dims)
        for dims in (a_batch, b_batch)
    ]
    return _prod(max(pair) for pair in zip(*padded))
 def analyze_contraction_tree(tree):
    """Summarize every pairwise contraction of *tree* as a matmul-like op.

    Args:
        tree: Contraction tree exposing ``size_dict``, ``get_inds``,
            ``get_flops`` and ``get_size`` (presumably a cotengra
            ``ContractionTree`` -- confirm against callers).

    Returns:
        A ``TreeInspectionResult`` holding one ``ContractionOpInfo`` per
        pairwise contraction plus per-slice and all-slice totals.
    """
    # Resolved by dotted name at call time rather than via a top-level import.
    contract_mod = importlib.import_module("cotengra.contract")
    contractions = contract_mod.extract_contractions(tree)
    size_dict = tree.size_dict
    ops = []
    counts = Counter()
    for op_index, (parent, left, right, tdot, arg, perm) in enumerate(contractions):
        # Entries without children are only counted, not analyzed; op_index
        # still advances, so recorded indices refer to the full list.
        if left is None and right is None:
            counts["preprocess"] += 1
            continue
        left_inds = tree.get_inds(left)
        right_inds = tree.get_inds(right)
        parent_inds = tree.get_inds(parent)
        left_shape = tuple(size_dict[ix] for ix in left_inds)
        right_shape = tuple(size_dict[ix] for ix in right_inds)
        # NOTE(review): both parsers are underscore-prefixed helpers of
        # cotengra.contract; the 7-tuple unpacked below must track their
        # return layout -- confirm when upgrading cotengra.
        if tdot:
            parsed = contract_mod._parse_tensordot_axes_to_matmul(
                arg,
                left_shape,
                right_shape,
            )
        else:
            parsed = contract_mod._parse_eq_to_batch_matmul(
                arg,
                left_shape,
                right_shape,
            )
        (
            _eq_a,
            _eq_b,
            new_shape_a,
            new_shape_b,
            _new_shape_ab,
            _perm_ab,
            pure_multiplication,
        ) = parsed
        matmul_shape = None
        matmul_flops = 0
        if pure_multiplication:
            # No matmul is recorded for this case: shape stays None, flops 0.
            kind = "mul"
        else:
            # Fall back to the raw operand shape when the parser returned a
            # falsy reshaped shape.
            a_shape = tuple(new_shape_a or left_shape)
            b_shape = tuple(new_shape_b or right_shape)
            # Everything before the last two axes is treated as batch.
            batch = _broadcast_batch(a_shape[:-2], b_shape[:-2])
            m, k, n = int(a_shape[-2]), int(a_shape[-1]), int(b_shape[-1])
            kind = "mm" if batch == 1 else "bmm"
            matmul_shape = (batch, m, k, n)
            matmul_flops = batch * m * k * n
        tree_flops = int(tree.get_flops(parent))
        out_size = int(tree.get_size(parent))
        ops.append(
            ContractionOpInfo(
                index=op_index,
                kind=kind,
                matmul_shape=matmul_shape,
                matmul_flops=matmul_flops,
                tree_flops=tree_flops,
                out_size=out_size,
                left_shape=left_shape,
                right_shape=right_shape,
                left_rank=len(left_inds),
                right_rank=len(right_inds),
                out_rank=len(parent_inds),
                perm=perm,
            )
        )
        counts[kind] += 1
    # Trees without a ``multiplicity`` attribute count as a single slice.
    nslices = int(getattr(tree, "multiplicity", 1))
    per_slice_flops = sum(op.tree_flops for op in ops)
    per_slice_write = sum(op.out_size for op in ops)
    max_out = max((op.out_size for op in ops), default=0)
    # Hard-coded element size for the GiB estimate; presumably complex128
    # (16 bytes per element) -- confirm if other dtypes are used.
    dtype_bytes = 16
    return TreeInspectionResult(
        tree=tree,
        contractions=tuple(contractions),
        operations=tuple(ops),
        counts=dict(counts),
        nslices=nslices,
        per_slice_flops=per_slice_flops,
        per_slice_write=per_slice_write,
        max_output_size=max_out,
        all_slice_flops=per_slice_flops * nslices,
        all_slice_write=per_slice_write * nslices,
        dtype_bytes=dtype_bytes,
        max_output_gib=max_out * dtype_bytes / 1024**3,
    )
@dataclass(frozen=True)
 class SlicePlan:
    """Slice ownership for one MPI rank."""
@@ -637,6 +961,49 @@ class SlicedContractStats:
    assignment: str
@dataclass(frozen=True)
 class TreePayloadSliceResult:
    """Result of slicing one tree stored in a tree payload."""
    # Payload exactly as loaded from disk (not the updated copy that is saved).
    payload: object
    # Selected tree before slicing.
    tree: object
    # Tree returned by ``tree.slice(...)``.
    sliced_tree: object
    # ``contraction_tree_costs`` of the tree before slicing.
    original_costs: dict
    # ``contraction_tree_costs`` of the sliced tree.
    sliced_costs: dict
@dataclass(frozen=True)
 class ContractionOpInfo:
    """Statistics for one pairwise contraction, as built by
    ``analyze_contraction_tree``."""
    # Position in the extracted contraction list (skipped entries included).
    index: int
    # "mul", "mm" or "bmm".
    kind: str
    # (batch, m, k, n) for "mm"/"bmm"; None for "mul".
    matmul_shape: tuple | None
    # batch * m * k * n for "mm"/"bmm"; 0 for "mul".
    matmul_flops: int
    # ``tree.get_flops(parent)`` as an int.
    tree_flops: int
    # ``tree.get_size(parent)`` as an int.
    out_size: int
    # Operand shapes looked up from the tree's ``size_dict``.
    left_shape: tuple
    right_shape: tuple
    # Number of indices on each operand and on the output.
    left_rank: int
    right_rank: int
    out_rank: int
    # Permutation entry passed through unchanged from the contraction tuple.
    perm: object
@dataclass(frozen=True)
 class TreeInspectionResult:
    """Aggregate report produced by ``analyze_contraction_tree``."""
    # The inspected tree.
    tree: object
    # Tuple of the contractions extracted from the tree.
    contractions: tuple
    # Tuple of ``ContractionOpInfo``, one per analyzed contraction.
    operations: tuple
    # Number of entries per kind ("preprocess", "mul", "mm", "bmm").
    counts: dict
    # ``tree.multiplicity`` when present, otherwise 1.
    nslices: int
    # Sum of ``tree_flops`` over all operations.
    per_slice_flops: int
    # Sum of ``out_size`` over all operations.
    per_slice_write: int
    # Largest ``out_size`` among the operations (0 when there are none).
    max_output_size: int
    # Per-slice totals multiplied by ``nslices``.
    all_slice_flops: int
    all_slice_write: int
    # Element size in bytes used for ``max_output_gib``.
    dtype_bytes: int
    # ``max_output_size * dtype_bytes`` expressed in GiB.
    max_output_gib: float
 def mpi_slice_plan(nslices, rank, size, assignment="block"):
    """Return the contraction slice ids assigned to one MPI rank.
--- a/src/qibotn/torch_utils.py
+++ b/src/qibotn/torch_utils.py
@@ -0,0 +1,90 @@
 """Shared torch helpers for qibotn CPU tensor-network code."""
 from __future__ import annotations
 import numpy as np
 def torch_dtype(dtype):
    """Return the torch dtype used by qibotn complex CPU contractions.

    Args:
        dtype: ``"complex64"``, ``"single"``, ``np.complex64`` or
            ``torch.complex64`` select single precision; every other value
            selects double precision.

    Returns:
        ``torch.complex64`` or ``torch.complex128``.
    """
    import torch
    # A torch dtype never compares equal to the string/numpy spellings below,
    # so without this check ``torch.complex64`` would be silently promoted to
    # ``torch.complex128``.
    if dtype is torch.complex64:
        return torch.complex64
    if dtype in ("complex64", "single", np.complex64):
        return torch.complex64
    return torch.complex128
 def numpy_dtype(dtype):
    """Map a qibotn complex dtype name to the matching numpy scalar type.

    ``"complex64"``, ``"single"`` and ``np.complex64`` give ``np.complex64``;
    anything else gives ``np.complex128``.
    """
    single_precision = dtype in ("complex64", "single", np.complex64)
    return np.complex64 if single_precision else np.complex128
 def torch_cpu_array(data, dtype=None):
    """Convert array-like data to a contiguous CPU torch tensor.

    Non-tensor input goes through ``np.asarray``. Arrays with negative
    strides are made contiguous and read-only arrays are copied before the
    hand-off to ``torch.from_numpy``.

    Args:
        data: A ``torch.Tensor`` or anything ``np.asarray`` accepts.
        dtype: ``None`` keeps the input dtype and a ``torch.dtype`` is used
            as given. Any other value (a qibotn dtype name or numpy complex
            type) is resolved through ``torch_dtype``.

    Returns:
        A contiguous CPU tensor. It may share memory with *data* when no
        copy or conversion was needed.
    """
    import torch
    if isinstance(data, torch.Tensor):
        tensor = data
    else:
        array = np.asarray(data)
        if any(stride < 0 for stride in array.strides):
            array = np.ascontiguousarray(array)
        elif not array.flags.writeable:
            array = array.copy()
        tensor = torch.from_numpy(array)
    if tensor.device.type != "cpu":
        tensor = tensor.cpu()
    # Only torch dtypes may reach ``Tensor.to``. Previously just ``str`` was
    # normalized, so ``np.complex64`` (accepted by the sibling helpers)
    # raised here.
    if dtype is None or isinstance(dtype, torch.dtype):
        target_dtype = dtype
    else:
        target_dtype = torch_dtype(dtype)
    if target_dtype is not None and tensor.dtype != target_dtype:
        tensor = tensor.to(target_dtype)
    if not tensor.is_contiguous():
        tensor = tensor.contiguous()
    return tensor
 def arrays_to_torch(arrays, dtype="complex128"):
    """Return a list of CPU torch tensors, one per item of *arrays*.

    *dtype* is resolved once through ``torch_dtype`` and applied to every
    item.
    """
    resolved = torch_dtype(dtype)
    tensors = []
    for item in arrays:
        tensors.append(torch_cpu_array(item, dtype=resolved))
    return tensors
 def arrays_to_numpy(arrays, dtype="complex128"):
    """Return a list of numpy arrays, one per item of *arrays*.

    *dtype* is a qibotn dtype name resolved through ``numpy_dtype``.
    """
    resolved = numpy_dtype(dtype)
    converted = []
    for item in arrays:
        converted.append(np.asarray(item, dtype=resolved))
    return converted
 def arrays_to_backend(arrays, backend, engine=None, dtype="complex128"):
    """Convert *arrays* to the representation selected by *backend*.

    ``backend == "torch"`` yields CPU torch tensors. Otherwise a supplied
    *engine* converts each item with ``engine.asarray``. With no engine,
    plain numpy arrays are returned.
    """
    if backend == "torch":
        return arrays_to_torch(arrays, dtype=dtype)
    if engine is None:
        return arrays_to_numpy(arrays, dtype=dtype)
    resolved = numpy_dtype(dtype)
    return [engine.asarray(item, dtype=resolved) for item in arrays]
 def set_torch_threads(nthreads=None, interop_threads=None):
    """Apply torch CPU thread settings and report the intra-op thread count.

    Args:
        nthreads: Intra-op thread count, clamped to at least 1; ``None``
            leaves it untouched.
        interop_threads: Inter-op thread count, clamped to at least 1;
            ``None`` leaves it untouched. A ``RuntimeError`` raised by torch
            while applying it is ignored.

    Returns:
        The value of ``torch.get_num_threads()`` after the updates.
    """
    import torch
    if nthreads is not None:
        intra_op = max(1, int(nthreads))
        torch.set_num_threads(intra_op)
    if interop_threads is None:
        return torch.get_num_threads()
    try:
        torch.set_num_interop_threads(max(1, int(interop_threads)))
    except RuntimeError:
        # Best effort: keep whatever inter-op setting is already active.
        pass
    return torch.get_num_threads()
 def is_torch_array(value):
    """Report whether the type of *value* lives in a module named ``torch*``.

    Only the type's ``__module__`` string is inspected, so torch itself is
    never imported.
    """
    module_name = type(value).__module__
    return module_name.startswith("torch")
--- a/tests/test_cpu_backend.py
+++ b/tests/test_cpu_backend.py
@@ -10,6 +10,11 @@ from qibotn.benchmark_cases import (
    exact_pauli_sum,
 )
 from qibotn import cpu_expectation, mps_expectation, pauli_pattern, pauli_sum
 from qibotn.backends.quimb import (
    build_expectation_tn,
    contract_tn,
    search_contraction_tree,
 )
 def build_circuit(nqubits=6):
@@ -61,6 +66,31 @@ def test_public_cpu_expectation_api_matches_statevector():
    assert math.isclose(value, exact, abs_tol=1e-12)
 def test_public_quimb_torch_pipeline_matches_statevector():
    """Build, search and contract an X0*Z1 expectation TN on 4 qubits and
    compare the result with the state-vector reference."""
    nqubits = 4
    circuit = build_circuit(nqubits=nqubits)
    observable = hamiltonians.SymbolicHamiltonian(form=X(0) * Z(1))
    pauli_terms = [(1.0, (("X", 0), ("Z", 1)))]
    exact = exact_pauli_sum(circuit, pauli_terms, nqubits)
    built = build_expectation_tn(
        circuit,
        observable,
        dtype="complex128",
        merge_1q=True,
        merge_2q=True,
    )
    search = search_contraction_tree(
        built.tn,
        method="serial",
        total_repeats=1,
        max_time=30,
        n_workers=1,
        search_seed=0,
    )
    contracted = complex(contract_tn(built.tn, search.tree))
    value = built.coeff * contracted
    assert math.isclose(value.real, exact, abs_tol=1e-12)
 def test_public_mps_expectation_api_accepts_pauli_pattern():
    circuit = build_circuit()
    exact_hamiltonian = hamiltonians.SymbolicHamiltonian(
--- a/tools/README.md
+++ b/tools/README.md
@@ -1,19 +0,0 @@
 # Tools
 Auxiliary scripts for profiling, legacy comparisons, and scale probes.
 The main CPU expectation entrypoint is `../benchmark_cpu_expectation.py`.
 For the current Vidal/MPS 1D-chain tests, prefer `../run_vidal_mps_cases.sh`.
 Files here are intentionally secondary:
 - `compare_vidal_backend_qmatchatea.py`: diagnostic comparison against QMatchaTea.
 - `profile_vidal_chrome.py`: PyTorch CPU profiler for the Vidal path.
 - `run_cpu_single_cases.sh`: single-node scale probes.
 - `run_cpu_large_cases.sh`: two-node MPI scale probes.
 - `run_vidal_segment_mpi_scan.sh`: rank/thread scaling scan for Vidal segmented MPI.
 - `baseline_mps_expectation.py`: legacy MPS comparison CLI kept for old commands.
 - `benchmark_tn_mpi.py`, `benchmark_search.py`, `benchmark_slice.py`, `benchmark_contract_sliced.py`, `check_tree.py`: old TN path-search/slicing experiments.
 - `qibojit_reference_expectation.py`: state-vector reference helper.
 - `validate_vidal_mpi_correctness.py`: focused Vidal MPI correctness helper.
 - `mpi_torch_thread_probe.py`: MPI + torch OpenMP affinity and threading probe.
--- a/tools/baseline_mps_expectation.py
+++ b/tools/baseline_mps_expectation.py
@@ -1,201 +0,0 @@
 """MPS expectation benchmark for qmatchatea and Vidal backends."""
 import argparse
 import json
 import logging
 import os
 import socket
 import time
 import numpy as np
 from qibotn.benchmark_cases import (
    build_circuit as build_benchmark_circuit,
    exact_pauli_sum,
    observable_terms,
    terms_to_dict,
 )
 from qibotn.backends.qmatchatea import QMatchaTeaBackend
 from qibotn.backends.vidal_tebd import run_vidal_ring_xz
 def optional_int(text):
    """Parse *text* as an int, mapping "unlimited" spellings to ``None``.

    The strings ``none``, ``null``, ``inf`` and ``unlimited`` (any case)
    return ``None``; every other value is passed to ``int``.
    """
    unlimited = {"none", "null", "inf", "unlimited"}
    if not isinstance(text, str) or text.lower() not in unlimited:
        return int(text)
    return None
 def optional_float(text):
    """Parse *text* as a float, mapping "unlimited" spellings to ``None``.

    The strings ``none``, ``null``, ``inf`` and ``unlimited`` (any case)
    return ``None``; every other value is passed to ``float``.
    """
    unlimited = {"none", "null", "inf", "unlimited"}
    if not isinstance(text, str) or text.lower() not in unlimited:
        return float(text)
    return None
 def format_optional(value, fmt="g"):
    """Format *value* with format spec *fmt*; ``None`` becomes ``"None"``."""
    if value is None:
        return "None"
    return format(value, fmt)
 def build_circuit(nqubits, nlayers, seed):
    """Build the ``"brickwall_cnot"`` benchmark circuit for this CLI."""
    case_name = "brickwall_cnot"
    return build_benchmark_circuit(case_name, nqubits, nlayers, seed)
 def build_observable(nqubits):
    """Return the ``"ring_xz"`` observable terms converted by ``terms_to_dict``."""
    terms = observable_terms("ring_xz", nqubits)
    return terms_to_dict(terms)
 def exact_expectation(circuit, nqubits):
    """Evaluate the ``"ring_xz"`` observable on *circuit* via ``exact_pauli_sum``."""
    terms = observable_terms("ring_xz", nqubits)
    return exact_pauli_sum(circuit, terms, nqubits)
 def main():
    """Run one MPS expectation benchmark and print the result on rank 0.

    Parses the CLI, builds the circuit and observable, optionally obtains a
    reference value (from ``--reference-file`` or ``--exact``), runs the
    selected executor, and prints the value, errors and elapsed seconds.
    With ``--mpi-ct`` every rank takes part in the collectives; only rank 0
    prints.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--nqubits", type=int, default=40)
    parser.add_argument("--nlayers", type=int, default=30)
    parser.add_argument("--bond", "--bonds", dest="bond", type=optional_int, default=512)
    parser.add_argument("--cut-ratio", type=optional_float, default=1e-12)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--tensor-module", choices=("numpy", "torch"), default="torch")
    parser.add_argument("--torch-threads", type=int, default=32)
    parser.add_argument(
        "--executor",
        choices=("qmatchatea", "vidal", "vidal-mpi"),
        default="qmatchatea",
    )
    parser.add_argument("--mpi-ct", action="store_true")
    parser.add_argument("--mpi-barriers", type=int, default=-1)
    parser.add_argument("--mpi-isometrization", type=int, default=-1)
    parser.add_argument("--exact", action="store_true")
    parser.add_argument("--exact-max-qubits", type=int, default=24)
    parser.add_argument("--reference-file")
    parser.add_argument(
        "--mpi-rank-map",
        action="store_true",
        help="Print MPI rank, host, pid, and torch thread placement metadata.",
    )
    args = parser.parse_args()
    # Silence informational logging from the simulation libraries.
    logging.getLogger("qibo.config").setLevel(logging.ERROR)
    logging.getLogger("qtealeaves").setLevel(logging.ERROR)
    import torch
    torch.set_num_threads(args.torch_threads)
    # Single-process defaults; overwritten below when MPI is requested.
    rank = 0
    size = 1
    if args.mpi_ct:
        # mpi4py is only imported on the MPI path, so ``MPI`` is bound only
        # when ``--mpi-ct`` is given.
        from mpi4py import MPI
        rank = MPI.COMM_WORLD.Get_rank()
        size = MPI.COMM_WORLD.Get_size()
        if args.mpi_rank_map:
            rank_info = {
                "rank": rank,
                "size": size,
                "host": socket.gethostname(),
                "pid": os.getpid(),
                "torch_threads": args.torch_threads,
                "omp_num_threads": os.environ.get("OMP_NUM_THREADS", ""),
                "mkl_num_threads": os.environ.get("MKL_NUM_THREADS", ""),
            }
            # Collective: every rank contributes, rank 0 receives the list.
            rank_infos = MPI.COMM_WORLD.gather(rank_info, root=0)
            if rank == 0:
                print("mpi_rank_map")
                for item in sorted(rank_infos, key=lambda row: row["rank"]):
                    print(
                        "rank={rank} size={size} host={host} pid={pid} "
                        "torch_threads={torch_threads} "
                        "OMP_NUM_THREADS={omp_num_threads} "
                        "MKL_NUM_THREADS={mkl_num_threads}".format(**item)
                    )
    circuit = build_circuit(args.nqubits, args.nlayers, args.seed)
    observable = build_observable(args.nqubits)
    # Reference value: a stored file takes precedence over --exact.
    exact = None
    if args.reference_file:
        with open(args.reference_file, "r", encoding="utf-8") as f:
            exact = float(json.load(f)["expectation"])
    elif args.exact:
        if args.nqubits > args.exact_max_qubits:
            raise ValueError(
                f"--exact is limited to {args.exact_max_qubits} qubits by default."
            )
        exact = exact_expectation(circuit, args.nqubits)
    if rank == 0:
        if args.mpi_ct and args.executor in ("vidal", "vidal-mpi"):
            mpi_label = f"VidalSegment/{size}"
        else:
            mpi_label = f"MPIMPS/{size}" if args.mpi_ct else "SR"
        print(
            f"nqubits={args.nqubits} nlayers={args.nlayers} "
            f"bond={format_optional(args.bond)} "
            f"cut_ratio={format_optional(args.cut_ratio)} seed={args.seed} "
            f"tensor_module={args.tensor_module} svd_control=E! "
            f"compile_circuit=True mpi={mpi_label} executor={args.executor}"
        )
        if exact is not None:
            print(f"exact={exact:.16e}")
        print("expval abs_error rel_error seconds")
    # Timed region starts here; circuit construction and the reference value
    # above are excluded.
    start = time.perf_counter()
    timings = None
    if args.executor in ("vidal", "vidal-mpi"):
        if args.executor == "vidal-mpi" and not args.mpi_ct:
            raise ValueError("--executor vidal-mpi requires --mpi-ct.")
        if args.mpi_ct:
            from qibotn.backends.vidal_mpi_segment import run_segment_vidal_mpi_ring_xz
            value, timings = run_segment_vidal_mpi_ring_xz(
                circuit,
                max_bond=args.bond,
                cut_ratio=args.cut_ratio,
                tensor_module=args.tensor_module,
                comm=MPI.COMM_WORLD,
            )
        else:
            value = run_vidal_ring_xz(
                circuit,
                max_bond=args.bond,
                cut_ratio=args.cut_ratio,
                tensor_module=args.tensor_module,
            )
    else:
        backend = QMatchaTeaBackend()
        backend.configure_tn_simulation(
            ansatz="MPS",
            max_bond_dimension=args.bond,
            cut_ratio=args.cut_ratio,
            svd_control="E!",
            tensor_module=args.tensor_module,
            compile_circuit=True,
            track_memory=False,
            mpi_approach="CT" if args.mpi_ct else "SR",
            mpi_num_procs=size,
            mpi_where_barriers=args.mpi_barriers if args.mpi_ct else -1,
            mpi_isometrization=args.mpi_isometrization,
        )
        value = backend.expectation(
            circuit,
            observable,
            preprocess=False,
            compile_circuit=True,
        )
    # ``timings`` is only set on the segmented Vidal MPI path, where ``MPI``
    # is bound. The reduce is a collective, so it must run before non-root
    # ranks return below.
    max_timings = None
    if timings:
        max_timings = {
            key: MPI.COMM_WORLD.reduce(local_value, op=MPI.MAX, root=0)
            for key, local_value in timings.items()
        }
    if rank != 0:
        return
    value = float(np.real(value))
    elapsed = time.perf_counter() - start
    # Errors are NaN without a reference; the denominator floor avoids a
    # division by zero when the reference is 0.
    abs_error = float("nan") if exact is None else abs(value - exact)
    rel_error = float("nan") if exact is None else abs_error / max(abs(exact), 1e-15)
    print(f"{value:.16e} {abs_error:.6e} {rel_error:.6e} {elapsed:.3f}")
    if max_timings:
        print("timing_section max_seconds")
        for key, max_value in max_timings.items():
            print(f"{key} {max_value:.6f}")
 if __name__ == "__main__":
    main()
--- a/tools/benchmark_contract_sliced.py
+++ b/tools/benchmark_contract_sliced.py
@@ -1,56 +0,0 @@
 """MPI parallel sliced contraction using pre-sliced tree."""
 import time, pickle, os
 import numpy as np
 from mpi4py import MPI
 # Fixed benchmark configuration: problem size and threads per rank.
 NQUBITS, NLAYERS, NCORES = 25, 10, 48
 comm = MPI.COMM_WORLD
 rank, size = comm.Get_rank(), comm.Get_size()
 # Set before torch is imported; presumably so the OpenMP/MKL runtimes pick
 # the values up when they are loaded -- TODO confirm.
 os.environ['OMP_NUM_THREADS'] = str(NCORES)
 os.environ['MKL_NUM_THREADS'] = str(NCORES)
 import torch
 import qibo, quimb as qu
 from qibotn.observables import build_random_circuit
 torch.set_num_threads(NCORES)
 circuit = build_random_circuit(NQUBITS, NLAYERS)
 qibo.set_backend("qibotn", platform="quimb")
 backend = qibo.get_backend()
 backend.configure_tn_simulation(ansatz="tn")
 qc = backend._qibo_circuit_to_quimb(circuit, backend.circuit_ansatz)
 # rehearse='tn' asks for the tensor network rather than a contracted value
 # (per quimb's rehearse option -- confirm against the quimb version in use).
 tn = qc.local_expectation(qu.pauli('x') & qu.pauli('z'), (0, 1), rehearse='tn')
 # Only rank 0 reads the pickled tree; it is then broadcast to all ranks.
 # NOTE: unpickling can execute arbitrary code; only load trusted files.
 if rank == 0:
    with open(f"data/tree_q{NQUBITS}_l{NLAYERS}_sliced.pkl", 'rb') as f:
        tree = pickle.load(f)
 else:
    tree = None
 tree = comm.bcast(tree, root=0)
 arrays = [torch.from_numpy(np.asarray(t._data)) for t in tn.tensors]
 n_slices = tree.multiplicity
 if rank == 0:
    print(f"Slices: {n_slices}, Ranks: {size}, "
          f"Peak: {tree.max_size() * 16 / 1e9:.2f} GB, "
          f"Threads/rank: {NCORES}, Backend: torch")
 t0 = time.time()
 result = None
 # Round-robin slice ownership: rank r handles slices r, r + size, ...
 for i in range(rank, n_slices, size):
    val = tree.contract_slice(arrays, i, backend='torch')
    val_np = val.cpu().numpy().reshape(-1)
    result = val_np if result is None else result + val_np
 # Ranks that owned no slice still need a buffer for the reduction.
 if result is None:
    result = np.zeros(1, dtype=np.complex128)
 total = np.zeros_like(result) if rank == 0 else None
 comm.Reduce(result, total, root=0)
 if rank == 0:
    # NOTE(review): the 0.5 factor is not explained in this script -- confirm
    # its origin before reusing the printed value.
    print(f"Contract: {time.time() - t0:.4f}s  Expectation: {0.5 * total[0].real:.10f}")
--- a/tools/benchmark_qredtea_svd_controls.py
+++ b/tools/benchmark_qredtea_svd_controls.py
@@ -1,157 +0,0 @@
 #!/usr/bin/env python
 """Benchmark qredtea/qtealeaves SVD control modes.
 This isolates the tensor split used by MPS updates: a rank-2 tensor is split
 with singular values contracted either left or right, then reconstructed to
 measure numerical error and timing.
 """
 from __future__ import annotations
 import argparse
 import gc
 import statistics
 import time
 import torch
 import qmatchatea
 from qredtea.torchapi import QteaTorchTensor
 def _dtype(name: str):
    """Look up the torch dtype for one of the supported dtype names.

    Raises:
        KeyError: If *name* is not one of the four supported names.
    """
    supported = {
        "complex64": torch.complex64,
        "complex128": torch.complex128,
        "float64": torch.float64,
        "float32": torch.float32,
    }
    return supported[name]
 def _random_matrix(shape, dtype, seed):
    """Draw a seeded standard-normal CPU tensor of the given shape and dtype.

    Complex dtypes are assembled from two independent real draws (real part
    first, then imaginary part) taken from the same generator.
    """
    rng = torch.Generator(device="cpu")
    rng.manual_seed(seed)
    if not dtype.is_complex:
        return torch.randn(shape, dtype=dtype, generator=rng)
    if dtype == torch.complex64:
        component_dtype = torch.float32
    else:
        component_dtype = torch.float64
    real_part = torch.randn(shape, dtype=component_dtype, generator=rng)
    imag_part = torch.randn(shape, dtype=component_dtype, generator=rng)
    return torch.complex(real_part, imag_part).to(dtype)
 def _sync():
    """Call ``torch.cuda.synchronize()`` when CUDA is available; otherwise
    do nothing."""
    if not torch.cuda.is_available():
        return
    torch.cuda.synchronize()
 def run_one(matrix, ctrl, max_bond, contract_singvals, repeats):
    """Time ``split_svd`` for one SVD control mode and measure its accuracy.

    Args:
        matrix: Rank-2 torch tensor to split.
        ctrl: SVD control mode passed as ``svd_ctrl``.
        max_bond: Maximum bond dimension for the convergence parameters.
        contract_singvals: Forwarded to ``split_svd``.
        repeats: Number of timed splits.

    Returns:
        Dict with the mode, status, median/min time in milliseconds (NaN if
        no split succeeded), and -- from the last repeat only -- the
        relative reconstruction error and the number of kept singular
        values. The first exception ends the run and is stored under
        ``"error"``.
    """
    convergence = qmatchatea.QCConvergenceParameters(
        max_bond_dimension=max_bond,
        cut_ratio=0.0,
        svd_ctrl=ctrl,
    )
    qtensor = QteaTorchTensor.from_elem_array(matrix, dtype=matrix.dtype, device="cpu")
    durations = []
    status, error = "ok", ""
    rel_error = kept = None
    for attempt in range(repeats):
        gc.collect()
        _sync()
        started = time.perf_counter()
        try:
            left, right, singvals, _ = qtensor.split_svd(
                [0],
                [1],
                contract_singvals=contract_singvals,
                conv_params=convergence,
            )
        except Exception as exc:  # noqa: BLE001 - benchmark should keep going
            status, error = "error", repr(exc)
            break
        _sync()
        durations.append(time.perf_counter() - started)
        if attempt + 1 != repeats:
            continue
        # Accuracy is only checked on the final repeat, outside the timing.
        left_matrix = left.elem.reshape(matrix.shape[0], -1)
        right_matrix = right.elem.reshape(-1, matrix.shape[1])
        recon = left_matrix @ right_matrix
        residual_norm = torch.linalg.vector_norm(matrix - recon)
        rel_error = (residual_norm / torch.linalg.vector_norm(matrix)).item()
        kept = int(singvals.numel())
    if durations:
        median_ms = statistics.median(durations) * 1000
        min_ms = min(durations) * 1000
    else:
        median_ms = float("nan")
        min_ms = float("nan")
    return {
        "ctrl": ctrl,
        "contract_singvals": contract_singvals,
        "status": status,
        "median_ms": median_ms,
        "min_ms": min_ms,
        "rel_error": rel_error,
        "kept": kept,
        "error": error,
    }
 def main():
    """Benchmark every requested shape, contraction side and control mode.

    Prints one header line, one column-legend line, then one ``row`` line
    per (shape, contraction side, control) combination.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--shapes", nargs="+", default=("256x1024", "1024x256", "512x512"))
    parser.add_argument("--max-bond", type=int, default=128)
    parser.add_argument("--dtype", choices=("complex64", "complex128", "float32", "float64"), default="complex128")
    parser.add_argument("--threads", type=int, default=8)
    parser.add_argument("--repeats", type=int, default=3)
    parser.add_argument(
        "--controls",
        nargs="+",
        default=("A", "D", "V", "R", "E", "E!", "X", "X!"),
    )
    args = parser.parse_args()
    torch.set_num_threads(args.threads)
    dtype = _dtype(args.dtype)
    print(
        "svd_benchmark "
        f"dtype={args.dtype} threads={torch.get_num_threads()} "
        f"max_bond={args.max_bond} repeats={args.repeats}",
        flush=True,
    )
    print(
        "columns shape contract ctrl status median_ms min_ms kept rel_error error",
        flush=True,
    )
    for shape_text in args.shapes:
        # Shapes are given as "<rows>x<cols>"; the separator is matched
        # case-insensitively.
        m_text, n_text = shape_text.lower().split("x", 1)
        shape = (int(m_text), int(n_text))
        # The seed is derived from the shape, so a given shape always gets
        # the same matrix, shared by every control mode.
        matrix = _random_matrix(shape, dtype, seed=sum(shape))
        for contract_singvals in ("L", "R"):
            for ctrl in args.controls:
                result = run_one(
                    matrix,
                    ctrl=ctrl,
                    max_bond=args.max_bond,
                    contract_singvals=contract_singvals,
                    repeats=args.repeats,
                )
                print(
                    f"row shape={shape_text} "
                    f"contract={contract_singvals} "
                    f"ctrl={ctrl} "
                    f"status={result['status']} "
                    f"median_ms={result['median_ms']:.3f} "
                    f"min_ms={result['min_ms']:.3f} "
                    f"kept={result['kept']} "
                    f"rel_error={result['rel_error']} "
                    f"error={result['error']}",
                    flush=True,
                )
 if __name__ == "__main__":
    main()
--- a/tools/benchmark_search.py
+++ b/tools/benchmark_search.py
@@ -1,34 +0,0 @@
 """Search contraction path and save."""
 import time, os, pickle
 from qibotn.parallel import parallel_path_search
 from qibotn.observables import build_random_circuit
 import qibo, quimb as qu
 from mpi4py import MPI
 # Problem size (qubits, layers) and the worker count handed to the path search.
 NQUBITS, NLAYERS, WORKERS = 20, 10, 96
 comm = MPI.COMM_WORLD
 rank, size = comm.Get_rank(), comm.Get_size()
 # More than one MPI rank selects the 'mpi' search method; a single rank
 # selects 'processpool'.
 method = 'mpi' if size > 1 else 'processpool'
 circuit = build_random_circuit(NQUBITS, NLAYERS)
 # Route qibo through the qibotn/quimb backend and convert the circuit to a
 # quimb circuit object.
 qibo.set_backend("qibotn", platform="quimb")
 backend = qibo.get_backend()
 backend.configure_tn_simulation(ansatz="tn")
 qc = backend._qibo_circuit_to_quimb(circuit, backend.circuit_ansatz)
 # NOTE(review): rehearse='tn' presumably returns the uncontracted tensor
 # network for the X(0) Z(1) expectation value -- confirm against quimb docs.
 tn = qc.local_expectation(qu.pauli('x') & qu.pauli('z'), (0, 1), rehearse='tn')
 if rank == 0:
    print(f"Searching {NQUBITS}q {NLAYERS}l, method={method}, ranks={size}, workers/rank={WORKERS}...")
 # Wall-clock timing of the path search only.
 t0 = time.time()
 tree = parallel_path_search(tn, tn.outer_inds(), method=method,
    total_repeats=1024, max_time=300, n_workers=WORKERS,trial_timeout=60)
 t_search = time.time() - t0
 # Only rank 0 writes the tree to disk and reports.
 if rank == 0:
    os.makedirs('data', exist_ok=True)
    path = f"data/tree_q{NQUBITS}_l{NLAYERS}.pkl"
    with open(path, 'wb') as f:
        pickle.dump(tree, f)
    # "* 16" turns an element count into bytes, assuming 16-byte (complex128)
    # elements -- TODO confirm the dtype used at contraction time.
    print(f"Search: {t_search:.2f}s  Peak: {tree.max_size() * 16 / 1e9:.2f} GB  Saved: {path}")
--- a/tools/benchmark_slice.py
+++ b/tools/benchmark_slice.py
@@ -1,16 +0,0 @@
 """Slice saved tree and save."""
 import pickle
 # These values select which saved tree file is read; they must match the
 # values that were used when the tree was written.
 NQUBITS, NLAYERS = 25, 10
 # NOTE: pickle.load executes arbitrary code from the file; only load trees
 # produced by a trusted run.
 with open(f"data/tree_q{NQUBITS}_l{NLAYERS}.pkl", 'rb') as f:
    tree = pickle.load(f)
 # "* 16" assumes 16-byte (complex128) elements -- TODO confirm.
 print(f"Original peak: {tree.max_size() * 16 / 1e9:.2f} GB")
 # Re-slice with a target size of 2**28; presumably this is a per-intermediate
 # element count -- confirm against the cotengra documentation.
 tree_sliced = tree.slice_and_reconfigure(target_size=2**28)
 with open(f"data/tree_q{NQUBITS}_l{NLAYERS}_sliced.pkl", 'wb') as f:
    pickle.dump(tree_sliced, f)
 print(f"Sliced peak: {tree_sliced.max_size() * 16 / 1e9:.2f} GB  Slices: {tree_sliced.multiplicity}")
--- a/tools/benchmark_tn_mpi.py
+++ b/tools/benchmark_tn_mpi.py
@@ -1,378 +0,0 @@
 """MPI-parallel TN benchmark: path search + contraction via MPI."""
 import json
 import pickle
 import time
 import argparse
 import numpy as np
 import cotengra as ctg
 import qibo
 from qibo import Circuit, gates
 from mpi4py import MPI
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from qibotn.observables import check_observable, extract_gates_and_qubits
 def _load_observable(observable_file=None, observable_json=None):
    if observable_file:
        with open(observable_file, "r", encoding="utf8") as f:
            return json.load(f)
    if observable_json:
        return json.loads(observable_json)
    return None
 def _term_to_quimb_operator(term):
    """Translate one extracted Hamiltonian term into ``(coeff, op, where)``.

    Every entry of ``term`` is unpacked as ``(qubit, gate_name, _)``.
    Identity factors (``"I"``) are skipped; the remaining factors are turned
    into ``quimb.pauli`` matrices and combined with ``&`` in iteration order,
    while their qubit indices are collected into ``where``.

    The coefficient is read from the third field of the *first* entry and
    defaults to ``1.0`` for an empty term. ``op`` is ``None`` when the term
    contains no non-identity factor.
    """
    import quimb as qu

    coeff = 1.0
    if term:
        coeff = complex(term[0][2])
    operator = None
    sites = []
    for qubit, gate_name, _ in term:
        qubit = int(qubit)
        label = str(gate_name).upper()
        if label == "I":
            continue
        sites.append(qubit)
        factor = qu.pauli(label.lower())
        operator = factor if operator is None else operator & factor
    return complex(coeff), operator, tuple(sites)
 def _run_serial_search(tn_bytes, output_inds, repeats, seed, num_slices, n_ranks, max_time):
    """Run one non-parallel cotengra hyper-optimisation; return ``(cost, tree)``.

    ``tn_bytes`` is a pickled tensor network (unpickled here, so it must come
    from a trusted source). ``seed`` seeds Python's ``random`` module before
    the optimiser is built. ``repeats`` and ``max_time`` bound the search.
    ``num_slices`` and ``n_ranks`` are accepted but not used in the body.

    The returned cost is ``tree.combo_cost(factor=256)``.
    """
    import pickle
    import random

    import cotengra as ctg

    random.seed(seed)
    network = pickle.loads(tn_bytes)
    search_methods = ['kahypar', 'kahypar-agglom', 'spinglass']
    slicing = {'target_size': 2**29, 'allow_outer': True}
    optimizer = ctg.HyperOptimizer(
        methods=search_methods,
        max_repeats=repeats,
        parallel=False,
        minimize='combo-256',
        max_time=max_time,
        optlib="random",
        slicing_opts=slicing,
        progbar=False,
    )
    found = network.contraction_tree(optimize=optimizer, output_inds=output_inds)
    cost = found.combo_cost(factor=256)
    return cost, found
 def parallel_search(tn, output_inds, total_repeats, n_workers, num_slices, n_ranks,
                     timeout):
    """Search for a contraction tree using ``n_workers`` independent processes.

    The network is pickled once and handed to every worker, each of which runs
    ``_run_serial_search`` with its own seed (``0 .. n_workers - 1``) and
    ``total_repeats // n_workers`` repeats (at least one). The tree with the
    lowest reported cost wins.

    With ``n_workers <= 1`` the search runs in the calling process with seed 0.

    ``timeout`` is passed to each worker as its ``max_time`` and, plus a
    5 second grace period, bounds how long results are collected. Whatever
    finished by then is used; workers still running are killed.

    Returns the best tree found, or ``None`` when no worker produced a result.
    """
    import os
    import pickle
    import signal
    from concurrent.futures import ProcessPoolExecutor, as_completed
    # Before Python 3.11 this is a distinct class from the builtin
    # TimeoutError, and it is the one as_completed() raises.
    from concurrent.futures import TimeoutError as FuturesTimeoutError

    tn_bytes = pickle.dumps(tn)
    if n_workers <= 1:
        return _run_serial_search(
            tn_bytes, output_inds, total_repeats, 0, num_slices, n_ranks, timeout
        )[1]
    repeats_per = max(1, total_repeats // n_workers)
    best_cost, best_tree = float('inf'), None
    pool = ProcessPoolExecutor(max_workers=n_workers)
    futures = []
    try:
        # Submitting inside the try block guarantees the pool is torn down
        # even if a submission itself fails.
        for seed in range(n_workers):
            futures.append(
                pool.submit(_run_serial_search, tn_bytes, output_inds,
                            repeats_per, seed, num_slices, n_ranks, timeout)
            )
        for fut in as_completed(futures, timeout=timeout + 5):
            try:
                cost, tree = fut.result()
                if cost < best_cost:
                    best_cost, best_tree = cost, tree
            except Exception as e:
                # Best effort: one failed worker must not discard the others.
                print(f"  [worker failed] {e}")
    except FuturesTimeoutError:
        # Collection deadline reached: keep the best tree seen so far.
        pass
    finally:
        for fut in futures:
            fut.cancel()
        # Running workers cannot be cancelled, so kill them outright.
        # ``_processes`` is a private attribute and may be None.
        for pid in list((pool._processes or {}).keys()):
            try:
                os.kill(pid, signal.SIGKILL)
            except ProcessLookupError:
                pass
        pool.shutdown(wait=False)
    return best_tree
 def make_circuit(circuit_type, nqubits, nlayers=1):
    """Build one of the benchmark circuits on ``nqubits`` qubits.

    Supported ``circuit_type`` values:

    * ``"qft"`` -- returns ``qibo.models.QFT(nqubits)``; ``nlayers`` is ignored.
    * ``"variational"`` -- per layer, an RY on every qubit followed by CZ
      gates on neighbouring pairs whose starting qubit alternates with the
      layer parity.
    * ``"ghz"`` -- H on qubit 0 followed by a CNOT chain.
    * ``"brickwork"`` -- H on every qubit, then per layer a CNOT on each
      alternating neighbouring pair followed by an RZ on both of its qubits.

    Rotation angles are drawn from NumPy's global random state, uniformly
    from ``[0, 2*pi)``.

    Raises ``ValueError`` for any other ``circuit_type``.
    """
    circ = Circuit(nqubits)

    def random_angle():
        return np.random.uniform(0, 2 * np.pi)

    if circuit_type == "qft":
        from qibo.models import QFT
        return QFT(nqubits)

    if circuit_type == "variational":
        for layer in range(nlayers):
            for q in range(nqubits):
                circ.add(gates.RY(q, theta=random_angle()))
            for q in range(layer % 2, nqubits - 1, 2):
                circ.add(gates.CZ(q, q + 1))
        return circ

    if circuit_type == "ghz":
        circ.add(gates.H(0))
        for q in range(nqubits - 1):
            circ.add(gates.CNOT(q, q + 1))
        return circ

    if circuit_type == "brickwork":
        for q in range(nqubits):
            circ.add(gates.H(q))
        for layer in range(nlayers):
            for q in range(layer % 2, nqubits - 1, 2):
                circ.add(gates.CNOT(q, q + 1))
                circ.add(gates.RZ(q, theta=random_angle()))
                circ.add(gates.RZ(q + 1, theta=random_angle()))
        return circ

    raise ValueError(f"Unknown circuit: {circuit_type}")
 def _contract_mpi(tree, arrays, comm, root=0):
    rank = comm.Get_rank()
    size = comm.Get_size()
    is_torch = type(arrays[0]).__module__.startswith("torch")
    result_np = None
    for i in range(rank, tree.multiplicity, size):
        x = tree.contract_slice(arrays, i)
        x_np = np.asfortranarray(x.detach().cpu().numpy() if is_torch else np.asarray(x))
        result_np = x_np if result_np is None else result_np + x_np
    if result_np is None:
        result_np = np.zeros(1, dtype=np.complex128)
    result = np.zeros_like(result_np) if rank == root else None
    comm.Reduce(result_np, result, root=root)
    if rank == root:
        import torch
        return torch.from_numpy(np.asarray(result)) if is_torch else result
    return None
 def run_mpi(circuit, nqubits, num_slices, total_repeats=1024,
             load_path=None, save_path=None):
    """Find (or load) a contraction tree, then contract it across MPI ranks.

    Without ``load_path`` every rank runs ``parallel_search`` with
    ``total_repeats // size`` repeats (at least one) and 48 worker processes.
    The ``(cost, tree, network)`` triples are gathered on rank 0, which keeps
    the cheapest one. With ``load_path`` rank 0 instead unpickles a previously
    saved ``{"tree", "psi"}`` dict and the search time is reported as 0.

    If ``save_path`` is given (and no ``load_path``), rank 0 pickles the
    chosen tree and network and the function returns right after the search,
    without contracting.

    Otherwise the tree and network are broadcast from rank 0 and all ranks
    contract their share of the slices via ``_contract_mpi``.

    ``nqubits`` is not referenced in the body.

    Returns ``(statevector, seconds)``: the flattened result on rank 0 and
    ``None`` elsewhere. ``seconds`` is search plus contraction time, or the
    search time alone in the save-only case.
    """
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    qibo.set_backend("qibotn", platform="quimb")
    b = qibo.get_backend()
    b.configure_tn_simulation(ansatz="tn")
    import torch
    qc = b._qibo_circuit_to_quimb(circuit, quimb_circuit_type=b.circuit_ansatz,
                                   gate_opts={"max_bond": None, "cutoff": 1e-10})
    qc.to_backend = lambda x: torch.from_numpy(x).to(torch.complex128)
    # --- path search: every rank searches, best result is gathered on rank 0 ---
    if load_path:
        if rank == 0:
            # NOTE: pickle.load executes code from the file; only load trusted files.
            with open(load_path, "rb") as f:
                saved = pickle.load(f)
            tree, psi, t_search = saved["tree"], saved["psi"], 0.0
            print(f"  [path loaded]  {load_path}")
        else:
            # Filled in by the broadcasts below.
            tree = psi = None
            t_search = 0.0
    else:
        rank_repeats = max(1, total_repeats // size)
        t0 = time.time()
        # get TN object first (no contraction), then run parallel search
        psi_tn = qc.to_dense(rehearse="tn")
        local_tree = parallel_search(
            psi_tn, psi_tn.outer_inds(), rank_repeats, n_workers=48,
            num_slices=num_slices, n_ranks=size, timeout=600,
        )
        t_search = time.time() - t0
        local_psi = psi_tn
        # NOTE(review): parallel_search can return None when no worker
        # finishes; local_tree.combo_cost would then raise AttributeError.
        all_results = comm.gather((local_tree.combo_cost(factor=256), local_tree, local_psi), root=0)
        if rank == 0:
            # Keep the (tree, network) pair with the lowest combo cost.
            _, tree, psi = min(all_results, key=lambda x: x[0])
            print(f"  [path search]  {t_search:.3f}s  "
                  f"flops~2^{tree.contraction_cost(log=2):.2f}  "
                  f"size~2^{tree.contraction_width():.2f}  "
                  f"slices={tree.multiplicity}")
            if save_path:
                with open(save_path, "wb") as f:
                    pickle.dump({"tree": tree, "psi": psi}, f)
                print(f"  [path saved]   {save_path}")
        else:
            tree = psi = None
        if save_path:
            # Save-only run: report rank 0's search time and skip contraction.
            t_search = comm.bcast(t_search, root=0)
            return None, t_search
    # Collective broadcasts: every rank must reach these three calls.
    tree = comm.bcast(tree, root=0)
    psi = comm.bcast(psi, root=0)
    t_search = comm.bcast(t_search, root=0)
    # --- contraction: all ranks work in parallel ---
    import torch
    # NOTE(review): 96 presumably is the core count of the target node,
    # divided evenly between ranks -- confirm for other machines.
    torch.set_num_threads(max(1, 96 // size))
    arrays = [torch.from_numpy(np.asarray(a)).to(torch.complex128) for a in psi.arrays]
    t0 = time.time()
    sv = _contract_mpi(tree, arrays, comm, root=0)
    t_contract = time.time() - t0
    if rank == 0:
        print(f"  [contraction]  {t_contract:.3f}s")
        return np.array(sv).reshape(-1), t_search + t_contract
    return None, t_search + t_contract
 def run_mpi_expval(
    circuit,
    nqubits,
    observable=None,
    total_repeats=1024,
    search_workers=1,
    search_timeout=300,
 ):
    """Compute a Hamiltonian expectation value directly from TN via MPI.

    The Hamiltonian terms are dealt round-robin to the MPI ranks
    (``terms[rank::size]``). For each of its terms a rank builds the local
    expectation network, searches a contraction tree with ``parallel_search``
    (using ``search_workers`` processes and the full ``total_repeats``), and
    sums all slices locally. Terms without any non-identity operator
    contribute their coefficient directly. The per-rank partial sums are
    gathered and added on rank 0.

    Returns ``(expval, seconds)``: ``expval`` is the total on rank 0 and
    ``None`` elsewhere. ``seconds`` is this rank's own wall-clock time for its
    terms, measured before the gather.

    Raises ``RuntimeError`` when no contraction tree is found for a term.
    """
    import torch
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    qibo.set_backend("qibotn", platform="quimb")
    b = qibo.get_backend()
    b.configure_tn_simulation(ansatz="tn")
    observable = check_observable(observable, nqubits)
    ham_gate_map = extract_gates_and_qubits(observable)
    qc = b._qibo_circuit_to_quimb(circuit, quimb_circuit_type=b.circuit_ansatz,
                                   gate_opts={"max_bond": None, "cutoff": 1e-10})
    # Round-robin assignment of Hamiltonian terms to ranks.
    my_terms = ham_gate_map[rank::size]
    # NOTE(review): 96 presumably is the core count of the target node -- confirm.
    torch.set_num_threads(max(1, 96 // size))
    t0 = time.time()
    my_exp = 0.0 + 0.0j
    for term in my_terms:
        coeff, op, where = _term_to_quimb_operator(term)
        if op is None:
            # Pure-identity term: contributes its coefficient only.
            my_exp += coeff
            continue
        tn = qc.local_expectation_tn(op, where=where)
        if len(tn.outer_inds()) == 0:
            val = complex(tn.contract())
        else:
            tree = parallel_search(
                tn,
                tn.outer_inds(),
                total_repeats,
                n_workers=search_workers,
                num_slices=1,
                n_ranks=size,
                timeout=search_timeout,
            )
            if tree is None:
                # NOTE(review): raising on one rank while the others proceed to
                # comm.gather below may leave them waiting -- verify the
                # launcher aborts the whole job on an uncaught exception.
                raise RuntimeError("Failed to find a contraction tree for expectation TN.")
            arrays = [torch.from_numpy(np.asarray(a)).to(torch.complex128) for a in tn.arrays]
            # All slices of this term are contracted on this rank.
            acc = sum(tree.contract_slice(arrays, i) for i in range(tree.multiplicity))
            val = complex(acc.item() if hasattr(acc, 'item') else acc)
        my_exp += coeff * val
    t_total = time.time() - t0
    all_results = comm.gather(my_exp, root=0)
    if rank == 0:
        total_exp = sum(all_results)
        print(f"\n[TN expval]  time={t_total:.4f}s  expval={total_exp.real:.12f}")
        return np.real_if_close(total_exp), t_total
    return None, t_total
 def main():
    """Command-line entry point for the MPI tensor-network benchmark.

    Builds the requested circuit with NumPy's global seed fixed to 42, then
    runs one of two modes:

    * ``--mode expval``: compute the expectation value of the observable
      given by ``--observable-file`` / ``--observable-json`` and save it to
      ``data/expval_tn_<circuit><nqubits>.npy`` on rank 0.
    * ``--mode sv`` (default): compute the state vector, save it to
      ``data/sv_tn_<circuit><nqubits>_mpi.npy`` on rank 0 and, unless
      ``--no-compare`` is given, compare it against a qibojit reference run.

    The ``data`` directory is created on demand so that a finished run is not
    lost to a missing output directory. Failures are reported on rank 0 and
    re-raised on every rank.
    """
    import os

    parser = argparse.ArgumentParser()
    parser.add_argument("--nqubits", type=int, default=30)
    parser.add_argument("--circuit", type=str, default="qft",
                        choices=["qft", "variational", "ghz", "brickwork"])
    parser.add_argument("--nlayers", type=int, default=3)
    parser.add_argument("--num-slices", type=int, default=1)
    parser.add_argument("--total-repeats", type=int, default=1024)
    parser.add_argument("--search-workers", type=int, default=1)
    parser.add_argument("--search-timeout", type=int, default=300)
    parser.add_argument("--observable-file", type=str, default=None)
    parser.add_argument("--observable-json", type=str, default=None)
    parser.add_argument("--save-path", type=str, default=None)
    parser.add_argument("--load-path", type=str, default=None)
    parser.add_argument("--no-compare", action="store_true")
    parser.add_argument("--mode", type=str, default="sv", choices=["sv", "expval"])
    args = parser.parse_args()
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    if rank == 0:
        print(f"Circuit: {args.circuit}, nqubits={args.nqubits}, "
              f"nlayers={args.nlayers}, ranks={comm.Get_size()}")
    # Same seed on every rank so that all ranks build the same circuit.
    np.random.seed(42)
    circuit = make_circuit(args.circuit, args.nqubits, args.nlayers)
    observable = _load_observable(args.observable_file, args.observable_json)
    if args.mode == "expval":
        try:
            expval, t_total = run_mpi_expval(
                circuit,
                args.nqubits,
                observable=observable,
                total_repeats=args.total_repeats,
                search_workers=args.search_workers,
                search_timeout=args.search_timeout,
            )
        except Exception as e:
            if rank == 0:
                print(f"[FAILED] {e}")
            raise
        if rank == 0:
            os.makedirs("data", exist_ok=True)
            np.save(f"data/expval_tn_{args.circuit}{args.nqubits}.npy", np.asarray(expval))
            if not args.no_compare:
                print("No built-in reference comparison for arbitrary observables.")
        return
    try:
        sv, t_total = run_mpi(circuit, args.nqubits, args.num_slices,
                              total_repeats=args.total_repeats,
                              load_path=args.load_path, save_path=args.save_path)
    except Exception as e:
        if rank == 0:
            print(f"[FAILED] {e}")
        raise
    if rank == 0 and sv is not None:
        print(f"\n[quimb TN MPI]  time={t_total:.4f}s  shape={sv.shape}")
        os.makedirs("data", exist_ok=True)
        np.save(f"data/sv_tn_{args.circuit}{args.nqubits}_mpi.npy", sv)
        if not args.no_compare:
            from qibotn.bak.benchmark_tn import run_qibojit
            import gc
            # Re-seed so the reference circuit gets the same random angles.
            np.random.seed(42)
            circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
            sv_ref, t_ref = run_qibojit(circuit_ref)
            np.save(f"data/sv_qibojit_{args.circuit}{args.nqubits}.npy", sv_ref)
            print(f"[qibojit]       time={t_ref:.4f}s")
            # free memory before loading via mmap for expval comparison
            del sv, sv_ref
            gc.collect()
            from compare_jit_tn_quimb import check_results
            ref_path = f"data/sv_qibojit_{args.circuit}{args.nqubits}.npy"
            tn_path  = f"data/sv_tn_{args.circuit}{args.nqubits}_mpi.npy"
            check_results(ref_path, tn_path, args.nqubits)
            if t_total > 0:
                print(f"Speedup  : {t_ref/t_total:.2f}x")
 if __name__ == "__main__":
    main()
--- a/tools/check_tree.py
+++ b/tools/check_tree.py
@@ -1,25 +0,0 @@
 """Check contraction tree statistics."""
 import pickle, sys
 # Tree file: first command-line argument, or a default path.
 path = sys.argv[1] if len(sys.argv) > 1 else "data/tree_q25_l10.pkl"
 # NOTE: pickle.load executes code from the file; only inspect trusted files.
 with open(path, 'rb') as f:
    tree = pickle.load(f)
 # Hardware model used for the time estimate.
 # Intel 8558P: 96 cores, 2.1GHz, AVX-512 (16 FP64/cycle), FMA x2
 # complex128 multiply-add = 6 real FLOPs
 # NOTE(review): a complex multiply (6 real FLOPs) plus a complex add (2) is
 # commonly counted as 8 real FLOPs -- confirm which convention is intended.
 CORES = 96
 FREQ = 2.1e9
 AVX512_FP64 = 16
 TFLOPS = CORES * FREQ * AVX512_FP64 * 2 / 1e12  # ~6.45 TFLOPS real FP64
 COMPLEX_FLOPS = TFLOPS / 6  # complex128 effective
 flops = tree.total_flops()
 slices = tree.multiplicity
 # NOTE(review): if tree.total_flops() already accounts for all slices, the
 # multiplication by `slices` here and in the report below double-counts --
 # check the cotengra documentation for total_flops.
 est_seconds = flops * slices / (COMPLEX_FLOPS * 1e12) 
 print(f"File: {path}")
 # "* 16" assumes 16-byte (complex128) elements.
 print(f"Peak memory (GB): {tree.max_size() * 16 / 1e9:.2f}")
 print(f"Total FLOPs: {flops:.2e}  x{slices} slices = {flops*slices:.2e}")
 print(f"Contraction width: {tree.contraction_width()}")
 print(f"Multiplicity (slices): {slices}")
 print(f"Estimated time (96 cores): {est_seconds:.1f}s  ({est_seconds/3600:.2f}h)")
--- a/tools/compare_vidal_backend_qmatchatea.py
+++ b/tools/compare_vidal_backend_qmatchatea.py
@@ -1,137 +0,0 @@
 """Compare QMatchaTeaBackend with the VidalBackend fast path."""
 from __future__ import annotations
 import argparse
 import json
 import math
 import time
 import numpy as np
 import torch
 from qibo import Circuit, gates, hamiltonians
 from qibo.symbols import X, Y, Z
 from qibotn.backends.qmatchatea import QMatchaTeaBackend
 from qibotn.backends.vidal import VidalBackend
 def build_circuit(nqubits, nlayers, seed, kind):
    """Build a layered test circuit with seeded random single-qubit rotations.

    Every layer applies RY then RZ to each qubit, with angles drawn uniformly
    from ``[-pi, pi)`` by ``numpy.random.default_rng(seed)``, followed by an
    entangling pattern chosen by ``kind``:

    * ``"brickwall"`` -- CNOT(q, q+1) on even-start pairs, then odd-start pairs.
    * ``"shifted-cz"`` -- CZ(q, q+1) on pairs starting at the layer parity.
    * ``"reversed-cnot"`` -- CNOT(q+1, q) on even-start pairs, then
      CNOT(q, q+1) on odd-start pairs.

    Raises ``ValueError`` for any other ``kind`` (checked inside the layer
    loop, so only when ``nlayers > 0``).
    """
    rng = np.random.default_rng(seed)
    circuit = Circuit(nqubits)
    last = nqubits - 1

    def angle():
        return rng.uniform(-math.pi, math.pi)

    for layer in range(nlayers):
        for q in range(nqubits):
            circuit.add(gates.RY(q, theta=angle()))
            circuit.add(gates.RZ(q, theta=angle()))

        if kind == "brickwall":
            for q in [*range(0, last, 2), *range(1, last, 2)]:
                circuit.add(gates.CNOT(q, q + 1))
            continue
        if kind == "shifted-cz":
            for q in range(layer % 2, last, 2):
                circuit.add(gates.CZ(q, q + 1))
            continue
        if kind == "reversed-cnot":
            for q in range(0, last, 2):
                circuit.add(gates.CNOT(q + 1, q))
            for q in range(1, last, 2):
                circuit.add(gates.CNOT(q, q + 1))
            continue
        raise ValueError(f"Unknown circuit kind {kind!r}.")
    return circuit
 def build_observable(nqubits, kind):
    """Return a ``SymbolicHamiltonian`` for one of the predefined observables.

    * ``"ring-xz"`` -- ``0.5 * X(q) * Z(q+1)`` for every qubit, wrapping
      around at the end.
    * ``"open-zz"`` -- ``Z(q) * Z(q+1)`` on every neighbouring pair, divided
      by the number of pairs.
    * ``"mixed"`` -- ``0.25 * X(0) - 0.5 * Z(last)`` plus ``0.125 * Y(q) *
      Y(q+1)`` on every third neighbouring pair.

    Raises ``ValueError`` for any other ``kind``.
    """
    if kind == "ring-xz":
        terms = [0.5 * X(q) * Z((q + 1) % nqubits) for q in range(nqubits)]
    elif kind == "open-zz":
        terms = [Z(q) * Z(q + 1) / (nqubits - 1) for q in range(nqubits - 1)]
    elif kind == "mixed":
        terms = [0.25 * X(0) - 0.5 * Z(nqubits - 1)]
        terms += [0.125 * Y(q) * Y(q + 1) for q in range(0, nqubits - 1, 3)]
    else:
        raise ValueError(f"Unknown observable kind {kind!r}.")
    # sum() starts from 0 and adds left to right, like a running "+=".
    return hamiltonians.SymbolicHamiltonian(form=sum(terms))
 def run_backend(backend, circuit, observable):
    """Evaluate ``observable`` on ``circuit`` with ``backend`` and time the call.

    Calls ``backend.expectation`` with ``preprocess=False`` and
    ``compile_circuit=True``. Returns ``(value, seconds)`` where ``value`` is
    the real part of the result as a ``float`` and ``seconds`` is the elapsed
    ``time.perf_counter`` time.
    """
    t_begin = time.perf_counter()
    raw = backend.expectation(
        circuit,
        observable,
        preprocess=False,
        compile_circuit=True,
    )
    real_part = float(np.real(raw))
    elapsed = time.perf_counter() - t_begin
    return real_part, elapsed
 def main():
    """Run one comparison case and print one result row per backend.

    Builds the circuit and observable selected on the command line, optionally
    reads a reference value from the ``"expectation"`` key of the JSON file
    given by ``--reference-file``, then evaluates the QMatchaTea backend
    (unless ``--skip-qmatchatea``) and the Vidal backend. Each row holds the
    backend name, the value, the absolute error against the reference (``nan``
    without one) and the elapsed seconds.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--nqubits", type=int, default=34)
    parser.add_argument("--nlayers", type=int, default=20)
    parser.add_argument("--bond", "--bonds", dest="bond", type=int, default=512)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--tensor-module", choices=("torch", "numpy"), default="torch")
    parser.add_argument("--torch-threads", type=int, default=32)
    parser.add_argument(
        "--circuit-kind",
        choices=("brickwall", "shifted-cz", "reversed-cnot"),
        default="brickwall",
    )
    parser.add_argument(
        "--observable-kind",
        choices=("ring-xz", "open-zz", "mixed"),
        default="ring-xz",
    )
    parser.add_argument("--reference-file")
    parser.add_argument("--skip-qmatchatea", action="store_true")
    args = parser.parse_args()

    torch.set_num_threads(args.torch_threads)
    circuit = build_circuit(args.nqubits, args.nlayers, args.seed, args.circuit_kind)
    observable = build_observable(args.nqubits, args.observable_kind)

    if args.reference_file:
        with open(args.reference_file, "r", encoding="utf-8") as handle:
            exact = float(json.load(handle)["expectation"])
    else:
        exact = None

    def abs_error(measured):
        # Without a reference value there is nothing to compare against.
        if exact is None:
            return float("nan")
        return abs(measured - exact)

    print(
        f"nqubits={args.nqubits} nlayers={args.nlayers} bond={args.bond} "
        f"circuit={args.circuit_kind} observable={args.observable_kind} "
        f"tensor_module={args.tensor_module} torch_threads={args.torch_threads}"
    )
    if exact is not None:
        print(f"exact={exact:.16e}")
    print("backend value abs_error seconds")

    # Settings that both backends are configured with.
    shared = {
        "ansatz": "MPS",
        "max_bond_dimension": args.bond,
        "cut_ratio": 1e-12,
        "tensor_module": args.tensor_module,
        "compile_circuit": True,
    }

    if not args.skip_qmatchatea:
        qmt = QMatchaTeaBackend()
        qmt.configure_tn_simulation(svd_control="E!", track_memory=False, **shared)
        value, seconds = run_backend(qmt, circuit, observable)
        error = abs_error(value)
        print(f"qmatchatea {value:.16e} {error:.6e} {seconds:.3f}")

    vidal = VidalBackend()
    vidal.configure_tn_simulation(fallback=True, **shared)
    value, seconds = run_backend(vidal, circuit, observable)
    error = abs_error(value)
    print(f"vidal {value:.16e} {error:.6e} {seconds:.3f}")
 if __name__ == "__main__":
    main()
--- a/tools/example_tn_case.py
+++ b/tools/example_tn_case.py
@@ -1,33 +0,0 @@
 """Example custom case for tools/run_tn_custom.py."""
 from __future__ import annotations
 import math
 import numpy as np
 from qibo import Circuit, gates
 def build_circuit(nqubits, nlayers, seed):
    """Build the example circuit: random rotations plus RXX/RZZ couplings.

    Each layer applies RY then RZ to every qubit (angles uniform in
    ``[-pi, pi)``), followed by RXX then RZZ (angles uniform in
    ``[-0.7, 0.7)``) on neighbouring pairs whose first qubit starts at the
    layer parity. All angles come from ``numpy.random.default_rng(seed)``.
    """
    rng = np.random.default_rng(seed)
    circuit = Circuit(nqubits)

    def rotation_angle():
        return rng.uniform(-math.pi, math.pi)

    def coupling_angle():
        return rng.uniform(-0.7, 0.7)

    for layer in range(nlayers):
        for qubit in range(nqubits):
            circuit.add(gates.RY(qubit, theta=rotation_angle()))
            circuit.add(gates.RZ(qubit, theta=rotation_angle()))
        first_pair = layer % 2
        for qubit in range(first_pair, nqubits - 1, 2):
            neighbour = qubit + 1
            circuit.add(gates.RXX(qubit, neighbour, theta=coupling_angle()))
            circuit.add(gates.RZZ(qubit, neighbour, theta=coupling_angle()))
    return circuit
 def build_observable(nqubits, seed):
    """Return the example observable as a plain dictionary.

    The result has one ``"terms"`` entry per neighbouring pair
    ``(site, site + 1)``, each a ZZ operator with coefficient
    ``1 / max(1, nqubits - 1)``. ``seed`` is accepted but not used.
    """
    weight = 1.0 / max(1, nqubits - 1)
    terms = []
    for site in range(nqubits - 1):
        terms.append(
            {
                "coefficient": weight,
                "operators": [("Z", site), ("Z", site + 1)],
            }
        )
    return {"terms": terms}
--- a/tools/inspect_contraction_tree.py
+++ b/tools/inspect_contraction_tree.py
@@ -1,208 +0,0 @@
 """Inspect cotengra contraction trees for dominant torch matmul shapes."""
 from __future__ import annotations
 import argparse
 import importlib
 import math
 import pickle
 from collections import Counter, defaultdict
 from pathlib import Path
 def _prod(values):
    out = 1
    for value in values:
        out *= int(value)
    return out
 def _broadcast_batch(a_batch, b_batch):
    """Return the number of batch elements after combining two batch shapes.

    Equal shapes, or one empty shape, give the product of the non-empty
    shape. Otherwise the shorter shape is left-padded with ones and the
    result is the product of the element-wise maxima.
    """
    if a_batch == b_batch:
        return _prod(a_batch)
    if not (a_batch and b_batch):
        # At least one side has no batch dimensions: use the other side.
        return _prod(a_batch if a_batch else b_batch)

    rank = max(len(a_batch), len(b_batch))

    def left_pad(dims):
        return (1,) * (rank - len(dims)) + tuple(dims)

    return _prod(max(pair) for pair in zip(left_pad(a_batch), left_pad(b_batch)))
 def _load_tree(path, index):
    with Path(path).open("rb") as f:
        payload = pickle.load(f)
    trees = payload["trees"] if isinstance(payload, dict) else payload
    if not isinstance(trees, (list, tuple)):
        trees = [trees]
    return trees[index]
 def _analyze_tree(tree):
    """Describe every pairwise contraction of ``tree`` as a matmul-like op.

    Returns ``(contractions, ops, counts)``:

    * ``contractions`` -- the raw list from ``cotengra.contract.extract_contractions``.
    * ``ops`` -- one dict per pairwise contraction with its kind (``"mul"``,
      ``"mm"`` or ``"bmm"``), the ``(batch, m, k, n)`` matmul shape and its
      ``batch * m * k * n`` product (both unset/zero for ``"mul"``), the
      tree's own flop and output-size figures, operand shapes and ranks, and
      the output permutation.
    * ``counts`` -- a ``Counter`` of op kinds, plus ``"preprocess"`` for
      entries that have neither a left nor a right operand.

    NOTE(review): this relies on private cotengra helpers
    (``_parse_tensordot_axes_to_matmul``, ``_parse_eq_to_batch_matmul``) and
    on the exact 7-tuple they return; re-check after cotengra upgrades.
    """
    # Imported by name at call time rather than at module import.
    contract_mod = importlib.import_module("cotengra.contract")
    contractions = contract_mod.extract_contractions(tree)
    size_dict = tree.size_dict
    ops = []
    counts = Counter()
    for op_index, (parent, left, right, tdot, arg, perm) in enumerate(contractions):
        if left is None and right is None:
            # No operands: counted as a preprocessing step, not a contraction.
            counts["preprocess"] += 1
            continue
        left_inds = tree.get_inds(left)
        right_inds = tree.get_inds(right)
        parent_inds = tree.get_inds(parent)
        left_shape = tuple(size_dict[ix] for ix in left_inds)
        right_shape = tuple(size_dict[ix] for ix in right_inds)
        # `tdot` selects which parser understands `arg`.
        if tdot:
            parsed = contract_mod._parse_tensordot_axes_to_matmul(
                arg,
                left_shape,
                right_shape,
            )
        else:
            parsed = contract_mod._parse_eq_to_batch_matmul(
                arg,
                left_shape,
                right_shape,
            )
        (
            _eq_a,
            _eq_b,
            new_shape_a,
            new_shape_b,
            _new_shape_ab,
            _perm_ab,
            pure_multiplication,
        ) = parsed
        matmul_shape = None
        matmul_flops = 0
        if pure_multiplication:
            kind = "mul"
        else:
            # Fall back to the operand shape when the parser reports no reshape.
            a_shape = tuple(new_shape_a or left_shape)
            b_shape = tuple(new_shape_b or right_shape)
            # Leading dimensions are batch; the last two are the matrix dims.
            batch = _broadcast_batch(a_shape[:-2], b_shape[:-2])
            m, k, n = int(a_shape[-2]), int(a_shape[-1]), int(b_shape[-1])
            kind = "mm" if batch == 1 else "bmm"
            matmul_shape = (batch, m, k, n)
            matmul_flops = batch * m * k * n
        tree_flops = int(tree.get_flops(parent))
        out_size = int(tree.get_size(parent))
        ops.append(
            {
                "index": op_index,
                "kind": kind,
                "matmul_shape": matmul_shape,
                "matmul_flops": matmul_flops,
                "tree_flops": tree_flops,
                "out_size": out_size,
                "left_shape": left_shape,
                "right_shape": right_shape,
                "left_rank": len(left_inds),
                "right_rank": len(right_inds),
                "out_rank": len(parent_inds),
                "perm": perm,
            }
        )
        counts[kind] += 1
    return contractions, ops, counts
 def _format_log(value, base):
    return "-inf" if value <= 0 else f"{math.log(value, base):.3f}"
 def main():
    """Print a flop and size report for one pickled contraction tree.

    The report has a summary line, per-slice and all-slice totals, the
    ``--top`` most expensive operations, and the ``--top`` matmul shapes
    ranked first by accumulated flops and then by occurrence count.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("tree", help="Pickle file containing one tree or {'trees': [...]}.")
    parser.add_argument("--index", type=int, default=0, help="Tree index in the file.")
    parser.add_argument("--top", type=int, default=20, help="Number of top ops to print.")
    parser.add_argument(
        "--dtype-bytes",
        type=int,
        default=8,
        help="Bytes per element for memory estimates, for example 8 for complex64.",
    )
    args = parser.parse_args()

    tree = _load_tree(args.tree, args.index)
    contractions, ops, counts = _analyze_tree(tree)
    # Trees without a ``multiplicity`` attribute count as a single slice.
    nslices = int(getattr(tree, "multiplicity", 1))

    per_slice_flops = 0
    per_slice_write = 0
    for op in ops:
        per_slice_flops += op["tree_flops"]
        per_slice_write += op["out_size"]
    max_out = max((op["out_size"] for op in ops), default=0)
    all_flops = per_slice_flops * nslices
    all_write = per_slice_write * nslices

    print(f"tree={args.tree} index={args.index}")
    print(
        "summary "
        f"slices={nslices} contractions={len(contractions)} "
        f"counts={dict(counts)}"
    )
    print(
        "per_slice "
        f"log10_flops={_format_log(per_slice_flops, 10)} "
        f"log10_write={_format_log(per_slice_write, 10)} "
        f"log2_max_output={_format_log(max_out, 2)} "
        f"max_output_gib={max_out * args.dtype_bytes / 1024**3:.6g}"
    )
    print(
        "all_slices "
        f"log10_flops={_format_log(all_flops, 10)} "
        f"log10_write={_format_log(all_write, 10)}"
    )

    print(f"\ntop_{args.top}_ops_by_flops")
    costliest = sorted(ops, key=lambda item: item["tree_flops"], reverse=True)
    for op in costliest[: args.top]:
        print(
            f"op={op['index']} kind={op['kind']} "
            f"flops={op['tree_flops']:.6e} out={op['out_size']:.6e} "
            f"matmul={op['matmul_shape']} "
            f"ranks=({op['left_rank']},{op['right_rank']}->{op['out_rank']}) "
            f"lhs={op['left_shape']} rhs={op['right_shape']}"
        )

    # Per matmul shape: [occurrences, accumulated flops, accumulated output size].
    by_shape = defaultdict(lambda: [0, 0, 0])
    for op in ops:
        shape = op["matmul_shape"]
        if shape is not None:
            totals = by_shape[shape]
            totals[0] += 1
            totals[1] += op["tree_flops"]
            totals[2] += op["out_size"]

    def print_shape_table(heading, field):
        # ``field`` is the position within the totals list to rank by.
        print(heading)
        ranked = sorted(
            by_shape.items(),
            key=lambda item: item[1][field],
            reverse=True,
        )
        for shape, (count, flops, out_size) in ranked[: args.top]:
            print(
                f"shape={shape} count={count} "
                f"flops={flops:.6e} output={out_size:.6e}"
            )

    print_shape_table(f"\ntop_{args.top}_matmul_shapes_by_flops", 1)
    print_shape_table(f"\ntop_{args.top}_matmul_shapes_by_count", 0)
 if __name__ == "__main__":
    main()
--- a/tools/manage_tn_dask_cluster.sh
+++ b/tools/manage_tn_dask_cluster.sh
@@ -1,223 +0,0 @@
 #!/usr/bin/env bash
 set -euo pipefail
 # Manage the dask cluster used by TN path search.
 #
 # Defaults target two servers:
 #   scheduler: 10.20.1.103:8786
 #   workers:   10.20.1.103, 10.20.6.101
 #
 # Usage:
 #   tools/manage_tn_dask_cluster.sh start
 #   tools/manage_tn_dask_cluster.sh status
 #   tools/manage_tn_dask_cluster.sh stop
 #
 # Common overrides:
 #   SCHEDULER_HOST=10.20.1.103
 #   WORKER_HOSTS="10.20.1.103 10.20.6.101"
 #   NWORKERS=48
 #   NTHREADS=1
 #   ROOT_DIR=/home/qibo/qibotn
 #   PYTHON_BIN=.venv/bin/python
 # Every setting below may be overridden from the environment; the value after
 # ":-" is used only when the variable is unset or empty.
 # Directory that remote commands cd into before launching anything.
 ROOT_DIR="${ROOT_DIR:-/home/qibo/qibotn}"
 # Interpreter used for the dask CLIs; a relative path is resolved after the
 # "cd $ROOT_DIR" performed by the start functions.
 PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
 SCHEDULER_HOST="${SCHEDULER_HOST:-10.20.1.103}"
 SCHEDULER_PORT="${SCHEDULER_PORT:-8786}"
 # Passed verbatim to --dashboard-address and appended to the host in the
 # dashboard URL printed by "start".
 DASHBOARD_ADDRESS="${DASHBOARD_ADDRESS:-:8787}"
 # Whitespace-separated host list; it is iterated unquoted on purpose.
 WORKER_HOSTS="${WORKER_HOSTS:-10.20.1.103 10.20.6.101}"
 # Forwarded to dask_worker as --nworkers / --nthreads / --memory-limit.
 NWORKERS="${NWORKERS:-84}"
 NTHREADS="${NTHREADS:-1}"
 # NOTE(review): 0 presumably disables dask's per-worker memory limit - confirm.
 MEMORY_LIMIT="${MEMORY_LIMIT:-0}"
 LOCAL_DIRECTORY="${LOCAL_DIRECTORY:-/tmp/qibotn-dask}"
 # Holds both the log files and the pid files of scheduler and workers.
 LOG_DIR="${LOG_DIR:-$ROOT_DIR/logs/dask}"
 SSH_BIN="${SSH_BIN:-ssh}"
 # Exported to the dask processes as DASK_DISTRIBUTED__* configuration
 # environment variables by the start functions.
 DASK_WORKER_TTL="${DASK_WORKER_TTL:-24 hours}"
 DASK_TICK_LIMIT="${DASK_TICK_LIMIT:-30 minutes}"
 DASK_LOST_WORKER_TIMEOUT="${DASK_LOST_WORKER_TIMEOUT:-30 minutes}"
 # Address workers connect to; also used to match worker processes on stop.
 SCHEDULER_ADDR="tcp://${SCHEDULER_HOST}:${SCHEDULER_PORT}"
 # Succeed (status 0) when HOST refers to the machine running this script.
 #
 # HOST is compared against the loopback names, the short and fully qualified
 # host names, and every address reported by "hostname -I".
 is_local_host() {
  local host="$1"
  # An empty host would otherwise match the blank line that the trailing space
  # of "hostname -I" turns into after tr.
  [[ -n "$host" ]] || return 1
  [[ "$host" == "localhost" || "$host" == "127.0.0.1" ]] && return 0
  [[ "$host" == "$(hostname)" ]] && return 0
  [[ "$host" == "$(hostname -f 2>/dev/null || true)" ]] && return 0
  # -F compares literally: the dots of an IPv4 address must not act as regex
  # wildcards.  "--" protects against a host that starts with a dash.
  hostname -I 2>/dev/null | tr ' ' '\n' | grep -qxF -- "$host"
 }
 # Run a shell snippet in a login bash on HOST.
 #
 # $1 is the target host; all remaining arguments are joined with spaces into
 # the command text.  Local hosts run it directly, others go through $SSH_BIN
 # with the command %q-quoted so it survives the remote shell's word splitting.
 run_on_host() {
  local target="$1"
  local script="${*:2}"
  if ! is_local_host "$target"; then
    "$SSH_BIN" "$target" "bash -lc $(printf '%q' "$script")"
    return
  fi
  bash -lc "$script"
 }
 # Start the dask scheduler on $SCHEDULER_HOST.
 #
 # The remote snippet is a no-op when the pid file already names a live process.
 # Otherwise it launches distributed.cli.dask_scheduler detached via setsid,
 # with output redirected to the log file, and records the pid.
 # Unescaped $VARS are expanded here, before the snippet is sent; \$-escaped
 # ones are expanded by the shell that runs the snippet.
 start_scheduler() {
  local host="$SCHEDULER_HOST"
  # Log and pid file names embed host and port of the scheduler.
  local log="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.log"
  local pid_file="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.pid"
  run_on_host "$host" "
    set -euo pipefail
    cd '$ROOT_DIR'
    mkdir -p '$LOG_DIR'
    if [[ -s '$pid_file' ]]; then
      pid=\$(cat '$pid_file')
      if kill -0 \"\$pid\" 2>/dev/null; then
        echo \"scheduler already running on $host pid=\$pid\"
        exit 0
      fi
    fi
    DASK_DISTRIBUTED__SCHEDULER__WORKER_TTL='$DASK_WORKER_TTL' \
    DASK_DISTRIBUTED__ADMIN__TICK__LIMIT='$DASK_TICK_LIMIT' \
    DASK_DISTRIBUTED__DEPLOY__LOST_WORKER_TIMEOUT='$DASK_LOST_WORKER_TIMEOUT' \
    setsid '$PYTHON_BIN' -m distributed.cli.dask_scheduler \
      --host '$SCHEDULER_HOST' \
      --port '$SCHEDULER_PORT' \
      --dashboard-address '$DASHBOARD_ADDRESS' \
      > '$log' 2>&1 < /dev/null &
    pid=\$!
    echo \"\$pid\" > '$pid_file'
    echo \"scheduler host=$host pid=\$pid addr=$SCHEDULER_ADDR log=$log\"
  "
 }
 # Start one dask_worker launcher on HOST ($1), connected to $SCHEDULER_ADDR.
 #
 # The remote snippet is a no-op when the host's worker pid file already names a
 # live process.  Otherwise it launches distributed.cli.dask_worker detached via
 # setsid with --nworkers/--nthreads/--memory-limit/--local-directory taken from
 # the configuration variables, and records the launcher pid.
 start_worker() {
  local host="$1"
  # One log file and one pid file per worker host.
  local log="$LOG_DIR/worker_${host}.log"
  local pid_file="$LOG_DIR/worker_${host}.pid"
  run_on_host "$host" "
    set -euo pipefail
    cd '$ROOT_DIR'
    mkdir -p '$LOG_DIR' '$LOCAL_DIRECTORY'
    if [[ -s '$pid_file' ]]; then
      pid=\$(cat '$pid_file')
      if kill -0 \"\$pid\" 2>/dev/null; then
        echo \"worker already running on $host pid=\$pid\"
        exit 0
      fi
    fi
    TCM_ENABLE=1 \
    DASK_DISTRIBUTED__SCHEDULER__WORKER_TTL='$DASK_WORKER_TTL' \
    DASK_DISTRIBUTED__ADMIN__TICK__LIMIT='$DASK_TICK_LIMIT' \
    DASK_DISTRIBUTED__DEPLOY__LOST_WORKER_TIMEOUT='$DASK_LOST_WORKER_TIMEOUT' \
    setsid '$PYTHON_BIN' -m distributed.cli.dask_worker \
      '$SCHEDULER_ADDR' \
      --host '$host' \
      --nworkers '$NWORKERS' \
      --nthreads '$NTHREADS' \
      --memory-limit '$MEMORY_LIMIT' \
      --local-directory '$LOCAL_DIRECTORY' \
      > '$log' 2>&1 < /dev/null &
    pid=\$!
    echo \"\$pid\" > '$pid_file'
    echo \"worker host=$host pid=\$pid scheduler=$SCHEDULER_ADDR log=$log\"
  "
 }
 # Stop the dask processes on HOST ($1); failures are ignored (set +e).
 #
 # First the pids recorded in the pid files are killed and the files removed;
 # the scheduler pid file is only honoured when HOST is $SCHEDULER_HOST.
 # Afterwards pkill sweeps up remaining processes by command line.  The "[d]"
 # bracket keeps the pattern from matching command lines that merely contain
 # the pattern text, such as the bash -lc wrapper running this snippet.
 # NOTE(review): the scheduler pkill is not restricted to $SCHEDULER_HOST, so it
 # also hits a scheduler on that port on a worker-only host - confirm intended.
 stop_host() {
  local host="$1"
  local scheduler_pid_file="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.pid"
  local worker_pid_file="$LOG_DIR/worker_${host}.pid"
  run_on_host "$host" "
    set +e
    for pid_file in '$worker_pid_file' '$scheduler_pid_file'; do
      [[ -f \"\$pid_file\" ]] || continue
      if [[ \"\$pid_file\" == '$scheduler_pid_file' && '$host' != '$SCHEDULER_HOST' ]]; then
        continue
      fi
      pid=\$(cat \"\$pid_file\")
      kill \"\$pid\" 2>/dev/null || true
      rm -f \"\$pid_file\"
    done
    pkill -f '[d]istributed.cli.dask_worker.*$SCHEDULER_ADDR'
    pkill -f '[d]istributed.cli.dask_scheduler.*--port $SCHEDULER_PORT'
    true
  "
 }
 # Print the state of the dask processes on HOST ($1).
 #
 # For each pid file that exists (the scheduler one only on $SCHEDULER_HOST) the
 # snippet prints a ps line when the pid is alive, or reports the file as stale.
 # It then lists every process whose command line matches distributed.cli.dask.
 status_host() {
  local host="$1"
  local scheduler_pid_file="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.pid"
  local worker_pid_file="$LOG_DIR/worker_${host}.pid"
  # Separator and header are printed locally, before the remote query runs.
  echo "--------------------------------------------------------------------------------"
  echo "host=$host"
  run_on_host "$host" "
    set +e
    for pid_file in '$worker_pid_file' '$scheduler_pid_file'; do
      [[ -f \"\$pid_file\" ]] || continue
      if [[ \"\$pid_file\" == '$scheduler_pid_file' && '$host' != '$SCHEDULER_HOST' ]]; then
        continue
      fi
      pid=\$(cat \"\$pid_file\")
      if kill -0 \"\$pid\" 2>/dev/null; then
        ps -p \"\$pid\" -o pid,ppid,stat,etime,cmd --no-headers
      else
        echo \"stale pid_file=\$pid_file pid=\$pid\"
      fi
    done
    pgrep -af '[d]istributed.cli.dask' || true
  "
 }
 # Command dispatch: the first argument selects the action; without one the
 # usage text is shown.
 case "${1:-help}" in
  start)
    start_scheduler
    # Short pause before the workers try to connect to the new scheduler.
    sleep 2
    for host in $WORKER_HOSTS; do
      start_worker "$host"
    done
    echo
    echo "Dask scheduler: $SCHEDULER_ADDR"
    # The URL is the host followed by DASHBOARD_ADDRESS as-is, so it is only
    # well formed when that value has the ":port" form of the default.
    echo "Dashboard: http://$SCHEDULER_HOST$DASHBOARD_ADDRESS"
    ;;
  stop)
    for host in $WORKER_HOSTS; do
      stop_host "$host"
    done
    # The scheduler host is stopped as well, even if it also ran a worker.
    stop_host "$SCHEDULER_HOST"
    ;;
  status)
    status_host "$SCHEDULER_HOST"
    for host in $WORKER_HOSTS; do
      # Already reported above when the scheduler host doubles as a worker.
      [[ "$host" == "$SCHEDULER_HOST" ]] && continue
      status_host "$host"
    done
    ;;
  restart)
    # Re-invoke this script so stop and start each run as a separate process.
    "$0" stop
    sleep 2
    "$0" start
    ;;
  help|*)
    # Usage text; exits with status 2 for "help" and unknown commands alike.
    cat <<EOF
 Usage: tools/manage_tn_dask_cluster.sh [start|stop|restart|status]
 Defaults:
  SCHEDULER_HOST=$SCHEDULER_HOST
  SCHEDULER_PORT=$SCHEDULER_PORT
  WORKER_HOSTS="$WORKER_HOSTS"
  NWORKERS=$NWORKERS
  NTHREADS=$NTHREADS
  ROOT_DIR=$ROOT_DIR
  PYTHON_BIN=$PYTHON_BIN
  DASK_WORKER_TTL="$DASK_WORKER_TTL"
  DASK_TICK_LIMIT=$DASK_TICK_LIMIT
  DASK_LOST_WORKER_TIMEOUT=$DASK_LOST_WORKER_TIMEOUT
 Search command after start:
  TCM_ENABLE=1 python -u tools/tn_contest_runner.py search \\
    --case main1 \\
    --dask-address $SCHEDULER_ADDR \\
    --torch-threads 48 \\
    --dtype complex64 \\
    --tn-search-repeats 2048 \\
    --tn-search-time 300
 EOF
    exit 2
    ;;
 esac
--- a/tools/mpi_torch_thread_probe.py
+++ b/tools/mpi_torch_thread_probe.py
@@ -1,182 +0,0 @@
 #!/usr/bin/env python
 """Probe MPI rank placement and whether torch CPU ops use multiple threads.
 Run this under mpirun/mpiexec to check:
 * which CPUs each rank is allowed to run on,
 * whether torch sees the requested intra-op thread count, and
 * whether a large CPU tensor op actually consumes more CPU time than wall time.
 The script is intentionally small and self-contained so it can be used to debug
 MPI launcher affinity and torch OpenMP behavior independently from the TN code
 path.
 """
 from __future__ import annotations
 import argparse
 import os
 import socket
 import time
 from pathlib import Path
 from mpi4py import MPI
 def _dtype_from_name(name):
    import torch
    mapping = {
        "float32": torch.float32,
        "float64": torch.float64,
        "complex64": torch.complex64,
        "complex128": torch.complex128,
    }
    return mapping[name]
 def _make_tensor(shape, dtype):
    import torch
    if dtype in (torch.complex64, torch.complex128):
        base = torch.float32 if dtype == torch.complex64 else torch.float64
        return torch.complex(
            torch.randn(shape, dtype=base),
            torch.randn(shape, dtype=base),
        )
    return torch.randn(shape, dtype=dtype)
 def _bench(label, fn, iters, warmup=2):
    for _ in range(warmup):
        fn()
    start_wall = time.perf_counter()
    start_cpu = time.process_time()
    checksum = 0.0
    for _ in range(iters):
        value = fn()
        checksum += float(value)
    wall = time.perf_counter() - start_wall
    cpu = time.process_time() - start_cpu
    ratio = cpu / wall if wall > 0 else float("inf")
    print(
        f"{label} wall={wall:.3f}s cpu={cpu:.3f}s cpu_over_wall={ratio:.2f} "
        f"checksum={checksum:.6e}",
        flush=True,
    )
 def _visible_numa_nodes():
    nodes = []
    for path in sorted(Path("/sys/devices/system/node").glob("node[0-9]*")):
        cpulist = path / "cpulist"
        if cpulist.exists():
            nodes.append(f"{path.name}:{cpulist.read_text(encoding='utf-8').strip()}")
    return ",".join(nodes) if nodes else "unknown"
 def _dtype_nbytes(name):
    return {
        "float32": 4,
        "float64": 8,
        "complex64": 8,
        "complex128": 16,
    }[name]
 def _format_gib(nbytes):
    return f"{nbytes / (1024 ** 3):.2f}GiB"
 def main():
    """Print per-rank placement diagnostics, then benchmark torch CPU ops.

    Every rank prints its host, pid, CPU affinity and torch/OpenMP thread
    settings.  Rank 0 additionally prints torch's parallel info and a memory
    estimate.  Unless ``--affinity-only`` is given, every rank then times a
    matmul and/or a tensordot of two ``n x n`` random matrices.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--threads", type=int, default=48)
    parser.add_argument("--n", type=int, default=4096)
    parser.add_argument("--iters", type=int, default=4)
    parser.add_argument("--dtype", choices=("float32", "float64", "complex64", "complex128"), default="float32")
    parser.add_argument("--op", choices=("matmul", "tensordot", "both"), default="both")
    parser.add_argument(
        "--affinity-only",
        action="store_true",
        help="Print MPI/torch placement diagnostics without allocating tensors.",
    )
    args = parser.parse_args()
    # setdefault keeps values already exported by the launcher.  These are set
    # before torch is imported - presumably so the threading runtimes read
    # them when they load (TODO confirm).
    os.environ.setdefault("OMP_NUM_THREADS", str(args.threads))
    os.environ.setdefault("MKL_NUM_THREADS", str(args.threads))
    os.environ.setdefault("OMP_PROC_BIND", "close")
    os.environ.setdefault("OMP_PLACES", "cores")
    import torch
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    torch.set_num_threads(args.threads)
    # Best effort: a failure to set the inter-op thread count is ignored.
    try:
        torch.set_num_interop_threads(1)
    except Exception:
        pass
    dtype = _dtype_from_name(args.dtype)
    # CPUs this process may run on, as reported by the scheduler affinity mask.
    affinity = sorted(os.sched_getaffinity(0))
    # Same information in the kernel's compact list form; stays empty when
    # /proc/self/status cannot be read or lacks the field.
    allowed_list = ""
    try:
        with open("/proc/self/status", encoding="utf-8") as f:
            for line in f:
                if line.startswith("Cpus_allowed_list:"):
                    allowed_list = line.split(":", 1)[1].strip()
                    break
    except OSError:
        pass
    print(
        f"rank={rank}/{size} host={socket.gethostname()} pid={os.getpid()} "
        f"affinity_len={len(affinity)} allowed={allowed_list} "
        f"torch_threads={torch.get_num_threads()} "
        f"torch_interop={torch.get_num_interop_threads()} "
        f"OMP_NUM_THREADS={os.environ.get('OMP_NUM_THREADS')} "
        f"MKL_NUM_THREADS={os.environ.get('MKL_NUM_THREADS')} "
        f"OMP_PROC_BIND={os.environ.get('OMP_PROC_BIND')} "
        f"OMP_PLACES={os.environ.get('OMP_PLACES')} "
        f"visible_numa={_visible_numa_nodes()}",
        flush=True,
    )
    if rank == 0:
        print(torch.__config__.parallel_info(), flush=True)
        input_bytes = args.n * args.n * _dtype_nbytes(args.dtype)
        # Two input matrices plus one product of the same size.
        min_live_bytes = 3 * input_bytes
        print(
            f"matrix_n={args.n} dtype={args.dtype} "
            f"one_matrix={_format_gib(input_bytes)} "
            f"approx_min_live_per_rank={_format_gib(min_live_bytes)} "
            f"approx_min_live_all_ranks={_format_gib(min_live_bytes * size)}",
            flush=True,
        )
    # All ranks wait here, so the diagnostics come before any benchmark starts.
    comm.Barrier()
    if args.affinity_only:
        return
    a = _make_tensor((args.n, args.n), dtype)
    b = _make_tensor((args.n, args.n), dtype)
    # Both workloads reduce the product to one real Python float so that
    # _bench can accumulate a checksum.
    def run_matmul():
        value = (a @ b).sum()
        return value.real.item() if value.is_complex() else value.item()
    def run_tensordot():
        value = torch.tensordot(a, b, dims=1)
        value = value.sum()
        return value.real.item() if value.is_complex() else value.item()
    if args.op in ("matmul", "both"):
        _bench("matmul", run_matmul, args.iters)
    if args.op in ("tensordot", "both"):
        _bench("tensordot", run_tensordot, args.iters)
 if __name__ == "__main__":
    main()
--- a/tools/mps_contest_runner.py
+++ b/tools/mps_contest_runner.py
@@ -1,313 +0,0 @@
 #!/usr/bin/env python
 """Contest-style multi-node Vidal/MPS expectation runner."""
 from __future__ import annotations
 import argparse
 import math
 import sys
 import time
 from dataclasses import dataclass
 from pathlib import Path
 import numpy as np
 from mpi4py import MPI
 from qibo import Circuit, gates, hamiltonians
 from qibo.symbols import X, Y, Z
 # Directory one level above the one containing this script.
 ROOT = Path(__file__).resolve().parents[1]
 SRC = ROOT / "src"
 # Put the in-repo src/ directory first on sys.path so the qibotn imports
 # below resolve against it when the script is run directly.
 if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))
 from qibotn.backends.vidal import VidalBackend  # noqa: E402
 from qibotn.expectation_runner import exact_for_observable  # noqa: E402
@dataclass(frozen=True)
 class CaseSpec:
    """Immutable preset describing one benchmark case.

    The values act as defaults: command-line options left unset are filled
    from the selected case by ``apply_case_defaults``.
    """
    # Circuit family name understood by build_circuit().
    circuit_kind: str
    # Observable names understood by observable(), evaluated one by one.
    observables: tuple[str, ...]
    nqubits: int
    nlayers: int
    # Maximum bond dimension handed to the backend; None is allowed.
    bond: int | None
    # Seed for the circuit angles and the dense random observables.
    seed: int
 # Built-in presets, keyed by the name accepted by the --case option.
 CASES = {
    "main1": CaseSpec(
        circuit_kind="reversed_cnot",
        observables=("ring_xz",),
        nqubits=128,
        nlayers=24,
        bond=512,
        seed=31001,
    ),
    "main2": CaseSpec(
        circuit_kind="rxx_rzz",
        observables=("open_zz", "range2_xx", "mixed_local"),
        nqubits=128,
        nlayers=32,
        bond=1024,
        seed=31002,
    ),
    # Largest preset: more qubits, layers and bond dimension than the others.
    "strong": CaseSpec(
        circuit_kind="scramble",
        observables=("ring_xz", "long_z_string", "dense3_spread"),
        nqubits=256,
        nlayers=48,
        bond=2048,
        seed=41001,
    ),
 }
 def optional_int(text):
    """Parse ``text`` as an int, mapping "no limit" keywords to ``None``.

    The strings "none", "null", "inf" and "unlimited" (any letter case) yield
    ``None``; everything else is passed to ``int()``.
    """
    if not isinstance(text, str):
        return int(text)
    no_limit = text.lower() in {"none", "null", "inf", "unlimited"}
    return None if no_limit else int(text)
 def optional_float(text):
    """Parse ``text`` as a float, mapping "no limit" keywords to ``None``.

    The strings "none", "null", "inf" and "unlimited" (any letter case) yield
    ``None`` - note that "inf" therefore never becomes ``float("inf")``.
    Everything else is passed to ``float()``.
    """
    if not isinstance(text, str):
        return float(text)
    no_limit = text.lower() in {"none", "null", "inf", "unlimited"}
    return None if no_limit else float(text)
 def format_optional(value, fmt="g"):
    """Format ``value`` with the format spec ``fmt``; ``None`` becomes "None"."""
    if value is None:
        return "None"
    return format(value, fmt)
 def set_torch_threads(nthreads):
    """Set torch's intra-op thread count, silently doing nothing on failure.

    Both a failing ``import torch`` and an error raised by
    ``torch.set_num_threads`` are swallowed: this is a best-effort setting.
    """
    try:
        import torch
    except Exception:
        return
    try:
        torch.set_num_threads(nthreads)
    except Exception:
        return
 def add_single_qubit_layer(circuit, nqubits, rng, include_rx=False):
    """Append a layer of random single-qubit rotations to ``circuit``.

    Every qubit receives RY then RZ (and RX last when ``include_rx`` is set),
    each with an angle drawn uniformly from [-pi, pi) in exactly that order,
    so the gate sequence is reproducible for a given ``rng`` state.
    """
    if include_rx:
        rotations = (gates.RY, gates.RZ, gates.RX)
    else:
        rotations = (gates.RY, gates.RZ)
    for qubit in range(nqubits):
        for rotation in rotations:
            angle = rng.uniform(-math.pi, math.pi)
            circuit.add(rotation(qubit, theta=angle))
 def build_circuit(kind, nqubits, nlayers, seed):
    """Build the seeded benchmark circuit of family ``kind``.

    Every layer is a random single-qubit layer followed by nearest-neighbour
    entanglers:

    * "reversed_cnot": CNOTs on the even pairs, then on the odd pairs; the
      control/target orientation of the two passes is opposite and swaps
      every layer.
    * "rxx_rzz": RXX then RZZ (angles uniform in [-0.9, 0.9)) on pairs whose
      starting qubit alternates between 0 and 1 from layer to layer.
    * "scramble": as "rxx_rzz" with angles in [-0.8, 0.8), plus a SWAP on
      each pair in every fifth layer.

    Raises ``ValueError`` for an unknown ``kind`` (detected when the first
    layer is built).
    """
    rng = np.random.default_rng(seed)
    circuit = Circuit(nqubits)
    for layer in range(nlayers):
        odd_layer = bool(layer % 2)
        if kind == "reversed_cnot":
            add_single_qubit_layer(circuit, nqubits, rng)
            # (first qubit of the pass, whether the upper qubit is the control)
            for first, upper_controls in ((0, odd_layer), (1, not odd_layer)):
                for qubit in range(first, nqubits - 1, 2):
                    if upper_controls:
                        circuit.add(gates.CNOT(qubit + 1, qubit))
                    else:
                        circuit.add(gates.CNOT(qubit, qubit + 1))
        elif kind in ("rxx_rzz", "scramble"):
            scrambling = kind == "scramble"
            bound = 0.8 if scrambling else 0.9
            add_single_qubit_layer(circuit, nqubits, rng, include_rx=True)
            for qubit in range(layer % 2, nqubits - 1, 2):
                circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-bound, bound)))
                circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-bound, bound)))
                if scrambling and layer % 5 == 4:
                    circuit.add(gates.SWAP(qubit, qubit + 1))
        else:
            raise ValueError(f"Unknown circuit kind {kind!r}.")
    return circuit
 def dense_observable(nqubits, qubits, seed, dim):
    """Build a seeded random Hermitian observable acting on ``qubits``.

    A complex Gaussian ``dim x dim`` matrix is symmetrised into a Hermitian
    one and scaled to unit ``np.linalg.norm``.  ``nqubits`` is accepted for
    signature compatibility but not used.

    Returns a dict with the ``"matrix"`` and the target ``"qubits"`` as a list.
    """
    del nqubits
    rng = np.random.default_rng(seed)
    shape = (dim, dim)
    real_part = rng.normal(size=shape)
    imag_part = rng.normal(size=shape)
    raw = real_part + 1j * imag_part
    hermitian = (raw + raw.conj().T) / 2.0
    return {
        "matrix": hermitian / np.linalg.norm(hermitian),
        "qubits": list(qubits),
    }
 def observable(kind, nqubits, seed):
    """Construct the observable called ``kind`` for an ``nqubits`` register.

    Pauli observables are returned as ``SymbolicHamiltonian`` objects; the
    "dense*" kinds return the dict produced by ``dense_observable``.  The
    reference sites ``q1``, ``q2``, ``q3`` sit at one quarter, one half and
    three quarters of the register and ``last`` is the final qubit.

    Raises ``ValueError`` for an unknown ``kind``.
    """
    q1, q2, q3 = nqubits // 4, nqubits // 2, (3 * nqubits) // 4
    last = nqubits - 1
    # Closed-form expressions; wrapped in lambdas so that only the requested
    # one is ever built.
    closed_forms = {
        "boundary_ZZ_q1": lambda: Z(q1 - 1) * Z(q1),
        "boundary_ZZ_q2": lambda: Z(q2 - 1) * Z(q2),
        "boundary_ZZ_q3": lambda: Z(q3 - 1) * Z(q3),
        "long_Z_5_sites": lambda: Z(0) * Z(q1) * Z(q2) * Z(q3) * Z(last),
        "mixed_XZYZX": lambda: X(0) * Z(q1) * Y(q2) * Z(q3) * X(last),
        "mixed_local": lambda: 0.25 * X(0) - 0.5 * Z(last) + 0.125 * X(q1) * Z(q2) * Y(q3),
        "complex_iZ0": lambda: 1.0j * Z(0),
    }
    if kind in closed_forms:
        return hamiltonians.SymbolicHamiltonian(form=closed_forms[kind]())
    # Sums over the register, accumulated term by term starting from 0.
    term_streams = {
        "ring_xz": lambda: (
            0.5 * X(qubit) * Z((qubit + 1) % nqubits)
            for qubit in range(nqubits)
        ),
        "open_zz": lambda: (
            (1.0 / max(1, nqubits - 1)) * Z(qubit) * Z(qubit + 1)
            for qubit in range(nqubits - 1)
        ),
        "range2_xx": lambda: (
            (1.0 / max(1, nqubits - 2)) * X(qubit) * X(qubit + 2)
            for qubit in range(nqubits - 2)
        ),
    }
    if kind in term_streams:
        form = 0
        for term in term_streams[kind]():
            form += term
        return hamiltonians.SymbolicHamiltonian(form=form)
    if kind == "dense2_mid":
        return dense_observable(nqubits, (q2 - 1, q2), seed + 101, 4)
    if kind == "dense3_spread":
        return dense_observable(nqubits, (q1, q2, q3), seed + 202, 8)
    raise ValueError(f"Unknown observable kind {kind!r}.")
 def selected_observables(args, case):
    """Pick the observable names to evaluate, by decreasing priority.

    1. ``args.observables`` when non-empty,
    2. the non-blank comma-separated entries of ``args.obs_filter``,
    3. the observables of ``case``.
    """
    explicit = args.observables
    if explicit:
        return tuple(explicit)
    if not args.obs_filter:
        return case.observables
    pieces = (piece.strip() for piece in args.obs_filter.split(","))
    return tuple(name for name in pieces if name)
 def apply_case_defaults(args):
    """Fill unset options on ``args`` from the preset chosen by ``args.case``.

    ``nqubits``, ``nlayers`` and ``seed`` are replaced when they are ``None``,
    ``bond`` when it still holds the "case-default" sentinel.
    ``args.observables`` is always overwritten with the final selection.
    Mutates ``args`` in place.
    """
    case = CASES[args.case]
    for field in ("nqubits", "nlayers", "seed"):
        if getattr(args, field) is None:
            setattr(args, field, getattr(case, field))
    if args.bond == "case-default":
        args.bond = case.bond
    args.observables = selected_observables(args, case)
 def run_case(args):
    """Run every selected observable of the case with the Vidal/MPS backend.

    Must be called on all ranks of ``MPI.COMM_WORLD``: each observable is
    preceded by a barrier.  Rank 0 prints a header and one result line per
    observable (value, errors against the exact reference when ``--exact`` is
    set, wall time, truncation errors and a status).  A failure inside the
    backend is reported in the status column with a NaN value instead of
    aborting the run.

    Raises:
        ValueError: when ``--exact`` is requested for more qubits than
            ``--exact-max-qubits``.
    """
    # Validate on every rank and before any collective call.  Raising on
    # rank 0 only would leave the other ranks waiting in comm.Barrier().
    if args.exact and args.nqubits > args.exact_max_qubits:
        raise ValueError(
            f"--exact is limited to {args.exact_max_qubits} qubits by default."
        )
    set_torch_threads(args.torch_threads)
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    case = CASES[args.case]
    circuit = build_circuit(case.circuit_kind, args.nqubits, args.nlayers, args.seed)
    if rank == 0:
        print("=" * 88, flush=True)
        print(
            "backend=vidal_mps "
            f"case={args.case} circuit={case.circuit_kind} ranks={size} "
            f"nqubits={args.nqubits} nlayers={args.nlayers} gates={len(circuit.queue)} "
            f"bond={format_optional(args.bond)} cut_ratio={format_optional(args.cut_ratio)} "
            f"torch_threads={args.torch_threads} seed={args.seed} "
            f"observables={','.join(args.observables)}",
            flush=True,
        )
        print("observable exact value abs_error rel_error seconds trunc_sum trunc_max status", flush=True)
    for obs_name in args.observables:
        obs = observable(obs_name, args.nqubits, args.seed)
        # The exact reference is only needed where the result line is printed.
        exact = None
        if args.exact and rank == 0:
            exact = exact_for_observable(circuit, obs, args.nqubits)
        # A fresh backend per observable.
        backend = VidalBackend()
        backend.configure_tn_simulation(
            max_bond_dimension=args.bond,
            cut_ratio=args.cut_ratio,
            tensor_module="torch",
            mpi_approach="CT",
            mpi_num_procs=size,
            fallback=False,
        )
        # Line the ranks up so the timing below starts together.
        comm.Barrier()
        start = time.perf_counter()
        try:
            value = backend.expectation(
                circuit,
                obs,
                preprocess=True,
                compile_circuit=False,
            )
            status = "ok"
        except Exception as exc:
            # Keep going with the remaining observables; report the exception
            # type and the first line of its message in the status column.
            value = np.nan
            status = type(exc).__name__ + ":" + str(exc).split("\n", 1)[0]
        seconds = time.perf_counter() - start
        if rank == 0:
            abs_error = float("nan") if exact is None else abs(value - exact)
            # The floor avoids dividing by zero for a vanishing exact value.
            rel_error = float("nan") if exact is None else abs_error / max(abs(exact), 1e-15)
            exact_text = "nan" if exact is None else f"{exact:.16e}"
            print(
                f"{obs_name} {exact_text} {value!r} "
                f"{abs_error:.6e} {rel_error:.6e} {seconds:.3f} "
                f"{backend.last_truncation_error:.6e} "
                f"{backend.last_max_truncation_error:.6e} {status}",
                flush=True,
            )
 def main():
    """Command-line entry point.

    Modes: "list" prints the built-in cases and exits; "run" executes the
    selected case; "validate" does the same with ``--exact`` forced on and the
    qubit count clamped to ``--exact-max-qubits``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("mode", choices=("run", "validate", "list"))
    parser.add_argument("--case", choices=sorted(CASES), default="main1")
    parser.add_argument("--observables", nargs="+")
    parser.add_argument("--obs-filter", default="")
    parser.add_argument("--nqubits", type=int)
    parser.add_argument("--nlayers", type=int)
    # Kept as a string here: "case-default" is a sentinel resolved later, and
    # "none"-like keywords are only parsed after the case defaults are applied.
    parser.add_argument("--bond", "--bonds", dest="bond", default="case-default")
    parser.add_argument("--cut-ratio", type=optional_float, default=1e-12)
    parser.add_argument("--seed", type=int)
    parser.add_argument("--torch-threads", type=int, default=8)
    parser.add_argument("--exact", action="store_true")
    parser.add_argument("--exact-max-qubits", type=int, default=24)
    args = parser.parse_args()
    if args.mode == "list":
        for name, case in CASES.items():
            print(
                f"{name}: circuit={case.circuit_kind} "
                f"observables={','.join(case.observables)} "
                f"nqubits={case.nqubits} nlayers={case.nlayers} "
                f"bond={case.bond} seed={case.seed}"
            )
        return
    # Order matters: the case defaults replace the sentinel first, so only a
    # user-supplied --bond is still a string when it is parsed below.
    apply_case_defaults(args)
    if isinstance(args.bond, str):
        args.bond = optional_int(args.bond)
    if args.mode == "validate":
        args.exact = True
        # Shrink the problem so the exact reference stays within the limit.
        args.nqubits = min(args.nqubits, args.exact_max_qubits)
    run_case(args)
 if __name__ == "__main__":
    main()
--- a/tools/profile_vidal_chrome.py
+++ b/tools/profile_vidal_chrome.py
@@ -1,72 +0,0 @@
 """Chrome trace profiler for the VidalBackend fast path."""
 from __future__ import annotations
 import argparse
 from pathlib import Path
 import torch
 from torch.profiler import ProfilerActivity, profile
 from qibotn.benchmark_cases import build_circuit, terms_to_dict, observable_terms
 from qibotn.expectation_runner import ExpectationConfig, run_cpu_expectation
 def main():
    """Profile one VidalBackend expectation run and save the results.

    Runs a "brickwall_cnot" circuit with the "ring_xz" observable under the
    torch CPU profiler, prints the expectation value with two operator tables
    (sorted by self and by total CPU time), and writes the tables plus a
    Chrome trace under ``profiles/``.  ``--profile-memory`` also switches on
    shape recording and stack capture.
    """
    parser = argparse.ArgumentParser()
    int_options = (
        ("--nqubits", 34),
        ("--nlayers", 20),
        ("--bond", 512),
        ("--seed", 42),
        ("--torch-threads", 32),
    )
    for flag, default in int_options:
        parser.add_argument(flag, type=int, default=default)
    parser.add_argument("--cut-ratio", type=float, default=1e-12)
    parser.add_argument("--profile-memory", action="store_true")
    parser.add_argument("--rows", type=int, default=60)
    args = parser.parse_args()
    torch.set_num_threads(args.torch_threads)
    # Both output files share a stem that encodes the run parameters.
    out_dir = Path("profiles")
    stem = f"vidal_n{args.nqubits}_l{args.nlayers}_b{args.bond}_t{args.torch_threads}"
    trace_path = out_dir / f"{stem}.json"
    table_path = out_dir / f"{stem}.txt"
    out_dir.mkdir(parents=True, exist_ok=True)
    circuit = build_circuit("brickwall_cnot", args.nqubits, args.nlayers, args.seed)
    observable = terms_to_dict(observable_terms("ring_xz", args.nqubits))
    config = ExpectationConfig(
        ansatz="mps",
        bond=args.bond,
        cut_ratio=args.cut_ratio,
        tensor_module="torch",
        torch_threads=args.torch_threads,
    )
    print(
        f"profile vidal nqubits={args.nqubits} nlayers={args.nlayers} "
        f"bond={args.bond} threads={args.torch_threads}"
    )
    detailed = args.profile_memory
    with profile(
        activities=[ProfilerActivity.CPU],
        record_shapes=detailed,
        profile_memory=detailed,
        with_stack=detailed,
    ) as prof:
        result = run_cpu_expectation(circuit, observable, config)
    # Sections are joined by a blank line: the value first, then one table
    # per sort key.
    sections = [f"expval={result.value:.16e}\n"]
    for sort_key in ("self_cpu_time_total", "cpu_time_total"):
        listing = prof.key_averages().table(sort_by=sort_key, row_limit=args.rows)
        sections.append(f"# sorted by {sort_key}\n{listing}\n")
    table = "\n".join(sections)
    print(table, end="")
    table_path.write_text(table, encoding="utf-8")
    prof.export_chrome_trace(str(trace_path))
    print(f"trace={trace_path}")
    print(f"table={table_path}")
 if __name__ == "__main__":
    main()
--- a/tools/qibojit_reference_expectation.py
+++ b/tools/qibojit_reference_expectation.py
@@ -1,109 +0,0 @@
 """Compute and cache a qibojit state-vector reference for the ring-XZ observable."""
 import argparse
 import json
 import math
 import time
 from pathlib import Path
 import numpy as np
 import qibo
 from qibo import Circuit, gates
 def build_circuit(nqubits, nlayers, seed):
    """Build the seeded brickwall reference circuit.

    Each layer applies RY then RZ to every qubit, with angles drawn uniformly
    from [-pi, pi), followed by CNOTs on the even pairs and then on the odd
    pairs, always controlled by the lower qubit.
    """
    rng = np.random.default_rng(seed)
    circuit = Circuit(nqubits)
    for _ in range(nlayers):
        for qubit in range(nqubits):
            for rotation in (gates.RY, gates.RZ):
                angle = rng.uniform(-math.pi, math.pi)
                circuit.add(rotation(qubit, theta=angle))
        for first in (0, 1):
            for control in range(first, nqubits - 1, 2):
                circuit.add(gates.CNOT(control, control + 1))
    return circuit
 def ring_xz_expectation(state, nqubits, chunk_size):
    """Evaluate ``0.5 * sum_i <X_i Z_(i+1 mod n)>`` on a flat state vector.

    Qubit 0 is the most significant bit of the basis index.  For every term,
    X is applied by XOR-ing the index with the qubit's bit mask and Z by a
    sign taken from the neighbour's bit.  The vector is processed in slices
    of ``chunk_size`` amplitudes so the temporary index arrays stay small.

    Returns the real part of the expectation as a Python float.
    """
    dim = state.size
    total = 0.0
    for qubit in range(nqubits):
        neighbour = (qubit + 1) % nqubits
        flip_mask = 1 << (nqubits - 1 - qubit)
        sign_shift = nqubits - 1 - neighbour
        partial = 0.0
        for lo in range(0, dim, chunk_size):
            hi = min(lo + chunk_size, dim)
            idx = np.arange(lo, hi, dtype=np.int64)
            # +1 where the neighbour's bit is 0, -1 where it is 1.
            signs = 1 - 2 * ((idx >> sign_shift) & 1)
            flipped = state[idx ^ flip_mask]
            partial += np.vdot(flipped, signs * state[lo:hi]).real
        total += 0.5 * partial
    return float(total)
 def default_output_path(nqubits, nlayers, seed):
    """Return the default cache file, under ``references/``, for these parameters."""
    filename = f"qibojit_ring_xz_n{nqubits}_l{nlayers}_seed{seed}.json"
    return Path("references", filename)
 def main():
    """Compute the ring-XZ reference with qibojit, or print the cached one.

    An existing output file is loaded and reported unless ``--force`` is
    given.  Otherwise the state-vector size is estimated first and the run is
    refused with ``MemoryError`` above ``--max-state-gb`` unless
    ``--allow-large`` is passed.  The result is written as sorted, indented
    JSON followed by a newline.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--nqubits", type=int, default=32)
    parser.add_argument("--nlayers", type=int, default=3)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--output")
    parser.add_argument("--force", action="store_true")
    parser.add_argument("--allow-large", action="store_true")
    parser.add_argument("--max-state-gb", type=float, default=32.0)
    parser.add_argument("--chunk-size", type=int, default=1 << 20)
    args = parser.parse_args()
    if args.output:
        output = Path(args.output)
    else:
        output = default_output_path(args.nqubits, args.nlayers, args.seed)
    if not args.force and output.exists():
        cached = json.loads(output.read_text(encoding="utf-8"))
        print(f"loaded {output}")
        print(f"expectation={float(cached['expectation']):.16e}")
        return
    # One complex128 amplitude per basis state.
    state_bytes = (2**args.nqubits) * np.dtype(np.complex128).itemsize
    state_gb = state_bytes / (1024**3)
    if not args.allow_large and state_gb > args.max_state_gb:
        raise MemoryError(
            f"Estimated state vector alone is {state_gb:.1f} GiB. "
            "Pass --allow-large after confirming the node has enough memory."
        )
    qibo.set_backend("qibojit")
    circuit = build_circuit(args.nqubits, args.nlayers, args.seed)
    # The timer covers the simulation and the expectation evaluation, not the
    # construction of the circuit.
    started = time.perf_counter()
    state = circuit().state(numpy=True).reshape(-1)
    expectation = ring_xz_expectation(state, args.nqubits, args.chunk_size)
    elapsed = time.perf_counter() - started
    record = {
        "backend": "qibojit",
        "observable": "0.5 * sum_i X_i Z_((i+1) mod n)",
        "nqubits": args.nqubits,
        "nlayers": args.nlayers,
        "seed": args.seed,
        "expectation": expectation,
        "seconds": elapsed,
        "state_vector_gib_estimate": state_gb,
    }
    output.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(record, indent=2, sort_keys=True) + "\n"
    output.write_text(payload, encoding="utf-8")
    print(f"saved {output}")
    print(f"expectation={expectation:.16e}")
    print(f"seconds={elapsed:.3f}")
 if __name__ == "__main__":
    main()
--- a/tools/qibotn_torch_mt_env.sh
+++ b/tools/qibotn_torch_mt_env.sh
@@ -1,22 +0,0 @@
 #!/usr/bin/env bash
 # Shared runtime setup for CPU torch TN/MPS runs.
 #
 # This makes AOCL BLIS use the multithreaded library when available, which is
 # required for complex64 tensordot/cgemm to actually use all cores on this host.
 # Path of the multithreaded BLIS library; an already set value wins.
 : "${QIBOTN_BLIS_MT:=/home/aocc/aocl/5.2.0/aocc/lib_LP64/libblis-mt.so.5}"
 # BLIS follows OMP_NUM_THREADS unless it was given its own thread count.
 : "${BLIS_NUM_THREADS:=${OMP_NUM_THREADS:-1}}"
 export BLIS_NUM_THREADS
 # Append the library to LD_PRELOAD once: only when the file exists and is not
 # already one of the colon-separated entries.
 if [[ -f "$QIBOTN_BLIS_MT" && ":${LD_PRELOAD:-}:" != *":$QIBOTN_BLIS_MT:"* ]]; then
  export LD_PRELOAD="${LD_PRELOAD:+$LD_PRELOAD:}$QIBOTN_BLIS_MT"
 fi
 # OpenMP thread placement defaults; values set by the caller are kept.
 : "${OMP_PROC_BIND:=close}"
 : "${OMP_PLACES:=cores}"
 export OMP_PROC_BIND OMP_PLACES
--- a/tools/run_cpu_large_cases.sh
+++ b/tools/run_cpu_large_cases.sh
@@ -1,128 +0,0 @@
 #!/usr/bin/env bash
 set -euo pipefail
 # Large CPU expectation benchmarks for two-server runs.
 #
 # Defaults assume two Intel Xeon Platinum 8558P servers with about 500 GiB RAM
 # each.  Override HOSTFILE, PYTHON_BIN, MPIEXEC, or the per-case knobs below as
 # needed.
 # Work from the parent of this script's directory so the relative defaults
 # (PYTHON_BIN, HOSTFILE, benchmark script) resolve.
 ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
 cd "$ROOT_DIR"
 # Launcher settings; each keeps a value already present in the environment.
 PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
 MPIEXEC="${MPIEXEC:-mpiexec}"
 HOSTFILE="${HOSTFILE:-hostfile}"
 # MPI rank count and --torch-threads value used by the MPS and TN cases.
 MPS_RANKS="${MPS_RANKS:-8}"
 MPS_THREADS="${MPS_THREADS:-12}"
 TN_RANKS="${TN_RANKS:-12}"
 TN_THREADS="${TN_THREADS:-8}"
 # Default OpenMP/MKL to one thread unless the caller already chose a value.
 export OMP_NUM_THREADS="${OMP_NUM_THREADS:-1}"
 export MKL_NUM_THREADS="${MKL_NUM_THREADS:-1}"
 # Shared BLIS/OpenMP environment setup.
 source "$ROOT_DIR/tools/qibotn_torch_mt_env.sh"
 # Launch "$PYTHON_BIN <args...>" through $MPIEXEC using $HOSTFILE.
 #   $1    number of MPI ranks
 #   $2..  arguments handed to the Python interpreter
 run_mpi() {
  "$MPIEXEC" -hostfile "$HOSTFILE" -n "$1" "$PYTHON_BIN" "${@:2}"
 }
 # Print a banner describing one benchmark case, then execute it.
 #   $1    banner title
 #   $2..  command (and its arguments) to run
 run_case() {
  local heading="$1"
  local rule="================================================================================"
  shift
  echo
  echo "$rule"
  echo "$heading"
  echo "$rule"
  echo "HOSTFILE=$HOSTFILE PYTHON_BIN=$PYTHON_BIN MPIEXEC=$MPIEXEC"
  echo "OMP_NUM_THREADS=$OMP_NUM_THREADS MKL_NUM_THREADS=$MKL_NUM_THREADS"
  # Echo the command line before running it so logs show what was launched.
  echo "$*"
  "$@"
 }
 # Dispatch on the first argument (default: help).  Each branch resolves its
 # scale knobs into variables first so the banner reports the values that are
 # actually run, including any environment overrides.
 case "${1:-help}" in
  smoke)
    case_nq="${MPS_SMOKE_NQ:-40}"
    case_layers="${MPS_SMOKE_LAYERS:-30}"
    case_bond="${MPS_SMOKE_BOND:-2048}"
    run_case "MPS MPI smoke: n=$case_nq layers=$case_layers bond=$case_bond" \
      run_mpi "$MPS_RANKS" benchmark_cpu_expectation.py \
        --mpi --mps \
        --nqubits "$case_nq" \
        --nlayers "$case_layers" \
        --bond "$case_bond" \
        --torch-threads "$MPS_THREADS" \
        --circuits brickwall_cnot reversed_cnot shifted_cz \
        --observables ring_xz open_zz range2_xx
    case_nq="${TN_SMOKE_NQ:-32}"
    case_layers="${TN_SMOKE_LAYERS:-16}"
    case_slices="${TN_SMOKE_SLICES:-12}"
    run_case "TN MPI smoke: n=$case_nq layers=$case_layers target_slices=$case_slices" \
      run_mpi "$TN_RANKS" benchmark_cpu_expectation.py \
        --mpi \
        --nqubits "$case_nq" \
        --nlayers "$case_layers" \
        --torch-threads "$TN_THREADS" \
        --circuits brickwall_cnot shifted_cz rxx_rzz \
        --observables ring_xz open_zz range2_xx \
        --tn-target-slices "$case_slices"
    ;;
  mps-long)
    case_nq="${MPS_LONG_NQ:-64}"
    case_layers="${MPS_LONG_LAYERS:-48}"
    case_bond="${MPS_LONG_BOND:-4096}"
    run_case "MPS MPI long: n=$case_nq layers=$case_layers bond=$case_bond" \
      run_mpi "$MPS_RANKS" benchmark_cpu_expectation.py \
        --mpi --mps \
        --nqubits "$case_nq" \
        --nlayers "$case_layers" \
        --bond "$case_bond" \
        --torch-threads "$MPS_THREADS" \
        --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \
        --observables ring_xz open_zz mixed_local range2_xx
    ;;
  mps-pressure)
    case_nq="${MPS_PRESSURE_NQ:-80}"
    case_layers="${MPS_PRESSURE_LAYERS:-64}"
    case_bond="${MPS_PRESSURE_BOND:-4096}"
    run_case "MPS MPI pressure: n=$case_nq layers=$case_layers bond=$case_bond" \
      run_mpi "$MPS_RANKS" benchmark_cpu_expectation.py \
        --mpi --mps \
        --nqubits "$case_nq" \
        --nlayers "$case_layers" \
        --bond "$case_bond" \
        --torch-threads "$MPS_THREADS" \
        --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz swap_scramble \
        --observables ring_xz open_zz mixed_local range2_xx long_z_string
    ;;
  tn-long)
    case_nq="${TN_LONG_NQ:-36}"
    case_layers="${TN_LONG_LAYERS:-20}"
    case_slices="${TN_LONG_SLICES:-24}"
    run_case "TN MPI long: n=$case_nq layers=$case_layers target_slices=$case_slices" \
      run_mpi "$TN_RANKS" benchmark_cpu_expectation.py \
        --mpi \
        --nqubits "$case_nq" \
        --nlayers "$case_layers" \
        --torch-threads "$TN_THREADS" \
        --circuits brickwall_cnot shifted_cz rxx_rzz \
        --observables ring_xz open_zz range2_xx \
        --tn-target-slices "$case_slices"
    ;;
  all)
    # Re-invoke this script per mode; mps-pressure is not part of "all".
    "$0" smoke
    "$0" mps-long
    "$0" tn-long
    ;;
  help|*)
    # Usage goes to stderr and the script exits 2, also for an explicit "help".
    cat >&2 <<'EOF'
 Usage: tools/run_cpu_large_cases.sh [smoke|mps-long|mps-pressure|tn-long|all]
 Common overrides:
  HOSTFILE=hostfile
  PYTHON_BIN=.venv/bin/python
  MPIEXEC=mpiexec
  MPS_RANKS=8 MPS_THREADS=12
  TN_RANKS=12 TN_THREADS=8
 Scale overrides:
  MPS_SMOKE_NQ=40 MPS_SMOKE_LAYERS=30 MPS_SMOKE_BOND=2048
  TN_SMOKE_NQ=32 TN_SMOKE_LAYERS=16 TN_SMOKE_SLICES=12
  MPS_LONG_NQ=64 MPS_LONG_LAYERS=48 MPS_LONG_BOND=4096
  MPS_PRESSURE_NQ=80 MPS_PRESSURE_LAYERS=64 MPS_PRESSURE_BOND=4096
  TN_LONG_NQ=36 TN_LONG_LAYERS=20 TN_LONG_SLICES=24
 EOF
    exit 2
    ;;
 esac
--- a/tools/run_cpu_single_cases.sh
+++ b/tools/run_cpu_single_cases.sh
@@ -1,149 +0,0 @@
 #!/usr/bin/env bash
 set -euo pipefail
 # Single-node CPU scale probes for expectation benchmarks.
 #
 # Intended for one 96-core / ~500 GiB RAM node.  The default "probe" mode runs
 # moderate MPS and TN cases first.  Larger modes are available after checking
 # runtime and memory from the probe output.
 # Work from the parent of this script's directory so relative defaults resolve.
 ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
 cd "$ROOT_DIR"
 # Launcher settings; each keeps a value already present in the environment.
 PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
 # Extra interpreter flags; expanded unquoted at the call sites, so the value
 # is word-split.
 PYTHON_FLAGS="${PYTHON_FLAGS:--u}"
 MPIEXEC="${MPIEXEC:-mpiexec}"
 # Program used by run_timed to wrap each command (invoked with -v).
 TIME_BIN="${TIME_BIN:-/usr/bin/time}"
 # MPI rank count and --torch-threads value used by the MPS and TN cases.
 MPS_RANKS="${MPS_RANKS:-8}"
 MPS_THREADS="${MPS_THREADS:-12}"
 TN_RANKS="${TN_RANKS:-8}"
 TN_THREADS="${TN_THREADS:-12}"
 # Default OpenMP/MKL to one thread unless the caller already chose a value.
 export OMP_NUM_THREADS="${OMP_NUM_THREADS:-1}"
 export MKL_NUM_THREADS="${MKL_NUM_THREADS:-1}"
 # Shared BLIS/OpenMP environment setup.
 source "$ROOT_DIR/tools/qibotn_torch_mt_env.sh"
 # Print a rough estimate of resident MPS memory, in total and per rank.
 #   $1  number of qubits (n)
 #   $2  bond dimension (chi)
 # The embedded Python program computes n * 2 * chi * chi * 16 bytes and divides
 # it by $MPS_RANKS for the per-rank figure.
 # NOTE(review): 16 is presumably the complex128 element size and 2 the physical
 # dimension of each site tensor -- confirm against the MPS implementation.
 estimate_mps_memory() {
  local nqubits="$1"
  local bond="$2"
  # The quoted heredoc delimiter keeps the shell from expanding anything inside
  # the program; the values are passed as argv instead.
  "$PYTHON_BIN" - "$nqubits" "$bond" "$MPS_RANKS" <<'PY'
 import sys
 n = int(sys.argv[1])
 chi = int(sys.argv[2])
 ranks = int(sys.argv[3])
 resident = n * 2 * chi * chi * 16
 per_rank = resident / ranks
 print(
    "MPS rough resident memory: "
    f"total={resident / 1024**3:.1f} GiB "
    f"per_rank={per_rank / 1024**3:.1f} GiB "
    "(temporary eig/SVD workspaces are additional)"
 )
 PY
 }
 # Echo a command between separator rules, then run it under "$TIME_BIN -v".
 #   $@  command and arguments to execute
 # If $TIME_BIN cannot be resolved to an executable, the command still runs
 # (without resource statistics) instead of aborting the script under `set -e`.
 run_timed() {
  local rule="--------------------------------------------------------------------------------"
  echo
  echo "$rule"
  echo "$*"
  echo "$rule"
  # `type -P` only reports real executables, never the shell's `time` keyword.
  if type -P "$TIME_BIN" >/dev/null 2>&1; then
    "$TIME_BIN" -v "$@"
  else
    echo "warning: $TIME_BIN not found; running without resource timing" >&2
    "$@"
  fi
 }
 # Print a banner and memory estimate for one MPS case, then run it timed.
 #   $1    banner label
 #   $2    number of qubits
 #   $3    number of layers
 #   $4    bond dimension
 #   $5..  extra arguments appended to the benchmark command line
 run_mps_case() {
  local title="$1" nq="$2" depth="$3" chi="$4"
  local rule="================================================================================"
  shift 4
  echo
  echo "$rule"
  echo "$title"
  echo "$rule"
  echo "PYTHON_BIN=$PYTHON_BIN MPIEXEC=$MPIEXEC"
  echo "MPS_RANKS=$MPS_RANKS MPS_THREADS=$MPS_THREADS"
  echo "OMP_NUM_THREADS=$OMP_NUM_THREADS MKL_NUM_THREADS=$MKL_NUM_THREADS"
  estimate_mps_memory "$nq" "$chi"
  # $PYTHON_FLAGS is intentionally unquoted so several flags can be supplied.
  local -a launch=(
    "$MPIEXEC" -n "$MPS_RANKS" "$PYTHON_BIN" $PYTHON_FLAGS benchmark_cpu_expectation.py
    --mpi --mps
    --nqubits "$nq"
    --nlayers "$depth"
    --bond "$chi"
    --torch-threads "$MPS_THREADS"
  )
  run_timed "${launch[@]}" "$@"
 }
 # Print a banner for one TN case, then run it timed.
 #   $1    banner label
 #   $2    number of qubits
 #   $3    number of layers
 #   $4..  extra arguments appended to the benchmark command line
 run_tn_case() {
  local title="$1" nq="$2" depth="$3"
  local rule="================================================================================"
  shift 3
  echo
  echo "$rule"
  echo "$title"
  echo "$rule"
  echo "PYTHON_BIN=$PYTHON_BIN MPIEXEC=$MPIEXEC"
  echo "TN_RANKS=$TN_RANKS TN_THREADS=$TN_THREADS"
  echo "OMP_NUM_THREADS=$OMP_NUM_THREADS MKL_NUM_THREADS=$MKL_NUM_THREADS"
  echo "TN memory is contraction-tree dependent; increase --tn-target-slices if RSS is high."
  # $PYTHON_FLAGS is intentionally unquoted so several flags can be supplied.
  local -a launch=(
    "$MPIEXEC" -n "$TN_RANKS" "$PYTHON_BIN" $PYTHON_FLAGS benchmark_cpu_expectation.py
    --mpi
    --nqubits "$nq"
    --nlayers "$depth"
    --torch-threads "$TN_THREADS"
  )
  run_timed "${launch[@]}" "$@"
 }
 # Dispatch on the first argument (default: help).  The positional values after
 # each label are nqubits, nlayers and (MPS only) bond; the remaining options are
 # forwarded to benchmark_cpu_expectation.py.
 case "${1:-help}" in
  probe)
    # One moderate MPS case followed by one moderate TN case.
    run_mps_case "MPS probe: n=40 layers=30 bond=2048" 40 30 2048 \
      --circuits brickwall_cnot \
      --observables ring_xz
    run_tn_case "TN probe: n=28 layers=12 target_slices=8" 28 12 \
      --circuits brickwall_cnot \
      --observables ring_xz \
      --tn-target-slices 8
    ;;
  mps-medium)
    run_mps_case "MPS medium: n=56 layers=40 bond=3072" 56 40 3072 \
      --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \
      --observables ring_xz open_zz mixed_local range2_xx
    ;;
  mps-long)
    run_mps_case "MPS long: n=64 layers=48 bond=4096" 64 48 4096 \
      --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \
      --observables ring_xz open_zz mixed_local range2_xx
    ;;
  tn-medium)
    run_tn_case "TN medium: n=32 layers=16 target_slices=16" 32 16 \
      --circuits brickwall_cnot shifted_cz rxx_rzz \
      --observables ring_xz open_zz range2_xx \
      --tn-target-slices 16
    ;;
  tn-long)
    run_tn_case "TN long: n=36 layers=20 target_slices=32" 36 20 \
      --circuits brickwall_cnot shifted_cz rxx_rzz \
      --observables ring_xz open_zz range2_xx \
      --tn-target-slices 32
    ;;
  help|*)
    # Usage goes to stderr and the script exits 2, also for an explicit "help".
    cat >&2 <<'EOF'
 Usage: tools/run_cpu_single_cases.sh [probe|mps-medium|mps-long|tn-medium|tn-long]
 Common overrides:
  PYTHON_BIN=.venv/bin/python
  MPIEXEC=mpiexec
  MPS_RANKS=8 MPS_THREADS=12
  TN_RANKS=8 TN_THREADS=12
  OMP_NUM_THREADS=1 MKL_NUM_THREADS=1
 EOF
    exit 2
    ;;
 esac
--- a/tools/run_tn_custom.py
+++ b/tools/run_tn_custom.py
@@ -1,243 +0,0 @@
 #!/usr/bin/env python
 """Run TN expectation for a user-provided circuit and observable.
 The case module should define:
    def build_circuit(nqubits, nlayers, seed): ...
    def build_observable(nqubits, seed): ...
 ``build_observable`` may return a Qibo SymbolicHamiltonian/form or the qibotn
 dict form:
    {"terms": [
        {"coefficient": 1.0, "operators": [("X", 0), ("Z", 1)]},
    ]}
 For a single repeated Pauli string, pass ``--pauli-pattern`` instead of
 defining ``build_observable``.
 """
 from __future__ import annotations
 import argparse
 import importlib.util
 import inspect
 import json
 import sys
 from pathlib import Path
 ROOT = Path(__file__).resolve().parents[1]
 SRC = ROOT / "src"
 if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))
 from qibotn.expectation_runner import (  # noqa: E402
    ExpectationConfig,
    exact_for_observable,
    run_cpu_expectation,
 )
 def optional_int(text):
    """Convert a command-line value to ``int``, or ``None`` for "no limit".

    The strings ``none``, ``null``, ``inf`` and ``unlimited`` (matched
    case-insensitively) return ``None``.  Any other value, string or not, is
    handed to ``int`` and its exceptions propagate.
    """
    no_limit_words = ("none", "null", "inf", "unlimited")
    if not isinstance(text, str):
        return int(text)
    return None if text.lower() in no_limit_words else int(text)
 def optional_float(text):
    """Convert a command-line value to ``float``, or ``None`` for "no limit".

    The strings ``none``, ``null``, ``inf`` and ``unlimited`` (matched
    case-insensitively) return ``None``; note that ``"inf"`` therefore never
    becomes ``float("inf")``.  Any other value is handed to ``float`` and its
    exceptions propagate.
    """
    no_limit_words = ("none", "null", "inf", "unlimited")
    if not isinstance(text, str):
        return float(text)
    return None if text.lower() in no_limit_words else float(text)
 def load_module(path):
    """Import the Python file at *path* and return the executed module.

    The module is named after the file stem.  It is executed but not
    registered in ``sys.modules``.

    Raises:
        RuntimeError: if no import spec or loader can be created for the file.
    """
    source = Path(path).resolve()
    spec = importlib.util.spec_from_file_location(source.stem, source)
    loader = None if spec is None else spec.loader
    if loader is None:
        raise RuntimeError(f"Cannot import case module from {source}.")
    case = importlib.util.module_from_spec(spec)
    loader.exec_module(case)
    return case
 def call_builder(fn, **kwargs):
    """Call *fn* with the subset of *kwargs* that its signature names.

    If *fn* declares a ``**kwargs`` parameter, every keyword is forwarded
    unchanged.  Otherwise keywords whose names are not parameters of *fn* are
    dropped, so a builder may declare only the arguments it needs.
    """
    params = inspect.signature(fn).parameters
    for param in params.values():
        if param.kind == inspect.Parameter.VAR_KEYWORD:
            return fn(**kwargs)
    return fn(**{key: val for key, val in kwargs.items() if key in params})
 def load_observable(args, module):
    """Return the observable for this run, taken from the first available source.

    Sources are tried in this order:

    1. ``args.pauli_pattern`` -> ``{"pauli_string_pattern": <pattern>}``
    2. ``args.observable_json`` -> the parsed content of that JSON file
    3. ``module.build_observable`` -> its result, called via ``call_builder``
       with ``nqubits``, ``nlayers`` and ``seed``
    4. ``module.OBSERVABLE`` -> the attribute value as is

    Raises:
        ValueError: if none of the sources is present.
    """
    if args.pauli_pattern:
        return {"pauli_string_pattern": args.pauli_pattern}
    if args.observable_json:
        # Read as UTF-8 explicitly so the result does not depend on the
        # platform's locale encoding.
        with Path(args.observable_json).open(encoding="utf-8") as f:
            return json.load(f)
    if hasattr(module, "build_observable"):
        return call_builder(
            module.build_observable,
            nqubits=args.nqubits,
            nlayers=args.nlayers,
            seed=args.seed,
        )
    if hasattr(module, "OBSERVABLE"):
        return module.OBSERVABLE
    raise ValueError(
        "No observable supplied. Define build_observable/OBSERVABLE in the case "
        "module, or pass --pauli-pattern / --observable-json."
    )
 def build_parallel_opts(args):
    """Translate the ``--tn-*`` / ``--dask-*`` CLI options into an options dict.

    The result always contains ``slicing_opts`` (``None`` when neither slicing
    target is set), ``search_workers`` (falling back to ``args.torch_threads``
    when ``args.tn_search_workers`` is falsy), ``max_repeats``, ``max_time``
    and ``print_stats``.  The remaining keys appear only when the matching
    option was supplied.
    """
    slicing = {
        key: value
        for key, value in (
            ("target_slices", args.tn_target_slices),
            ("target_size", args.tn_target_size),
        )
        if value is not None
    }
    opts = dict(
        slicing_opts=slicing if slicing else None,
        search_workers=args.tn_search_workers or args.torch_threads,
        max_repeats=args.tn_search_repeats,
        max_time=args.tn_search_time,
        print_stats=not args.no_tn_stats,
    )
    # (key, value, is_flag): flags are stored as True when set; other options
    # are stored whenever they are not None.
    optional = (
        ("search_backend", args.tn_search_backend, False),
        ("dask_address", args.dask_address, False),
        ("dask_close_workers", args.dask_close_workers, True),
        ("save_tree_path", args.tn_save_tree, False),
        ("load_tree_path", args.tn_load_tree, False),
        ("search_only", args.tn_search_only, True),
    )
    for key, value, is_flag in optional:
        if is_flag:
            if value:
                opts[key] = True
        elif value is not None:
            opts[key] = value
    return opts
 def main():
    """Run a CPU TN expectation for a user-supplied case module and print it.

    Steps: parse the command line, load the case module, build its circuit and
    observable, optionally compute an exact reference on rank 0, run
    ``run_cpu_expectation`` and print one result line plus one
    ``tn_term_summary`` line per entry of ``result.parallel_stats``.  Under
    ``--mpi`` only rank 0 prints.

    Raises:
        ValueError: if ``--exact`` is requested for more qubits than
            ``--exact-max-qubits``, or if the case module does not define
            ``build_circuit``.
    """
    parser = argparse.ArgumentParser(
        description="Run CPU TN expectation for a custom qibo circuit module."
    )
    parser.add_argument("case_module", help="Python file defining build_circuit.")
    parser.add_argument("--nqubits", type=int, required=True)
    parser.add_argument("--nlayers", type=int, default=0)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--mpi", action="store_true")
    parser.add_argument("--exact", action="store_true")
    parser.add_argument("--exact-max-qubits", type=int, default=24)
    parser.add_argument("--bond", "--bonds", dest="bond", type=optional_int, default=1024)
    parser.add_argument("--cut-ratio", type=optional_float, default=1e-12)
    parser.add_argument("--torch-threads", type=int, default=8)
    parser.add_argument("--quimb-backend", choices=("numpy", "torch"), default="torch")
    parser.add_argument("--dtype", choices=("complex128", "complex64"), default="complex128")
    parser.add_argument("--pauli-pattern")
    parser.add_argument("--observable-json")
    parser.add_argument("--tn-target-slices", type=int)
    parser.add_argument("--tn-target-size", type=int, default=2**32)
    parser.add_argument("--tn-search-workers", type=int)
    parser.add_argument("--tn-search-repeats", type=int, default=128)
    parser.add_argument("--tn-search-time", type=float, default=60.0)
    parser.add_argument("--tn-search-backend", choices=("processpool", "dask"))
    parser.add_argument("--dask-address")
    parser.add_argument("--dask-close-workers", action="store_true")
    parser.add_argument("--tn-save-tree")
    parser.add_argument("--tn-load-tree")
    parser.add_argument("--tn-search-only", action="store_true")
    parser.add_argument("--no-tn-stats", action="store_true")
    args = parser.parse_args()

    # Validate on every rank, before any work: if only rank 0 raised, the other
    # MPI ranks would continue into run_cpu_expectation and could wait forever
    # for a rank that has already exited.
    if args.exact and args.nqubits > args.exact_max_qubits:
        raise ValueError(
            f"--exact is limited to {args.exact_max_qubits} qubits by default."
        )

    rank = 0
    if args.mpi:
        # Imported lazily so serial runs do not need mpi4py.
        from mpi4py import MPI

        rank = MPI.COMM_WORLD.Get_rank()

    module = load_module(args.case_module)
    if not hasattr(module, "build_circuit"):
        raise ValueError("case_module must define build_circuit.")
    circuit = call_builder(
        module.build_circuit,
        nqubits=args.nqubits,
        nlayers=args.nlayers,
        seed=args.seed,
    )
    observable = load_observable(args, module)
    config = ExpectationConfig(
        ansatz="tn",
        mpi=args.mpi,
        bond=args.bond,
        cut_ratio=args.cut_ratio,
        tensor_module="torch",
        quimb_backend=args.quimb_backend,
        dtype=args.dtype,
        torch_threads=args.torch_threads,
        parallel_opts=build_parallel_opts(args),
    )

    if rank == 0:
        mode = "MPI" if args.mpi else "serial"
        print(
            f"backend=cpu ansatz=TN mode={mode} case={Path(args.case_module).name} "
            f"nqubits={args.nqubits} nlayers={args.nlayers} seed={args.seed} "
            f"quimb_backend={args.quimb_backend} dtype={args.dtype} "
            f"torch_threads={args.torch_threads}",
            flush=True,
        )
        print("observable exact value abs_error rel_error seconds", flush=True)

    # The exact reference is computed on rank 0 only.
    exact = None
    if args.exact and rank == 0:
        exact = exact_for_observable(circuit, observable, args.nqubits)

    result = run_cpu_expectation(circuit, observable, config)
    if args.mpi and result.rank != 0:
        return

    abs_error = float("nan") if exact is None else abs(result.value - exact)
    # The denominator floor avoids dividing by zero for a vanishing reference.
    rel_error = float("nan") if exact is None else abs_error / max(abs(exact), 1e-15)
    exact_text = "nan" if exact is None else f"{exact:.16e}"
    print(
        f"custom {exact_text} {result.value:.16e} "
        f"{abs_error:.6e} {rel_error:.6e} {result.seconds:.3f}",
        flush=True,
    )
    for stat in result.parallel_stats or ():
        # Tolerate a missing/None path_cost the same way every other field is
        # tolerated, instead of failing with KeyError after the run finished.
        cost = stat.get("path_cost") or {}
        search_stats = stat.get("search_stats", {})
        print(
            "tn_term_summary "
            f"term={stat.get('term_index', 0)} "
            f"search_seconds={stat.get('search_seconds', float('nan')):.3f} "
            f"contract_seconds={stat.get('contract_seconds', float('nan')):.3f} "
            f"completed_trials={search_stats.get('completed_trials', 'na')} "
            f"finite_trials={search_stats.get('finite_trials', 'na')} "
            f"failed_trials={search_stats.get('failed_trials', 'na')} "
            f"requested_trials={search_stats.get('requested_trials', 'na')} "
            f"best_score={search_stats.get('best_score', float('nan')):.6g} "
            f"slices={cost.get('slices')} "
            f"log10_flops={cost.get('log10_flops', float('nan')):.3f} "
            f"log10_write={cost.get('log10_write', float('nan')):.3f} "
            f"log2_size={cost.get('log2_size', float('nan')):.3f} "
            f"peak_memory_gib={cost.get('peak_memory_gib', float('nan')):.3g} "
            f"rank_slices={stat.get('rank_slices')}",
            flush=True,
        )
 if __name__ == "__main__":
    main()
--- a/tools/run_tn_dask_mpi_all.sh
+++ b/tools/run_tn_dask_mpi_all.sh
@@ -1,260 +0,0 @@
 #!/usr/bin/env bash
 set -euo pipefail
 # Two-stage TN run: a contraction-tree search that uses a Dask cluster, followed
 # by an MPI contraction.  Every setting below keeps a value already present in
 # the environment.
 # Work from the parent of this script's directory so relative paths resolve.
 ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
 cd "$ROOT_DIR"
 # Problem definition and search budget passed to tools/tn_contest_runner.py.
 CASE="${CASE:-main1}"
 OBSERVABLES="${OBSERVABLES:-long_z_string}"
 NQUBITS="${NQUBITS:-34}"
 NLAYERS="${NLAYERS:-20}"
 TORCH_THREADS="${TORCH_THREADS:-48}"
 SEARCH_REPEATS="${SEARCH_REPEATS:-2048}"
 SEARCH_TIME="${SEARCH_TIME:-300}"
 # 17179869184 = 2**34.
 TN_TARGET_SIZE="${TN_TARGET_SIZE:-17179869184}"
 TN_TARGET_SLICES="${TN_TARGET_SLICES:-}"
 PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
 DTYPE="${DTYPE:-complex64}"
 TREE_DIR="${TREE_DIR:-trees/contest_tn}"
 # Dask scheduler address and worker-wait behaviour.
 DASK_ADDRESS="${DASK_ADDRESS:-tcp://10.20.1.103:8786}"
 DASK_EXPECTED_WORKERS="${DASK_EXPECTED_WORKERS:-}"
 DASK_WAIT_FOR_WORKERS="${DASK_WAIT_FOR_WORKERS:-1}"
 DASK_WAIT_TIMEOUT="${DASK_WAIT_TIMEOUT:-600}"
 TN_DEBUG_TRIALS="${TN_DEBUG_TRIALS:-0}"
 # MPI launcher configuration.  MPIEXEC_FULL, when set, replaces the generated
 # launcher command line entirely.
 MPIEXEC="${MPIEXEC:-mpirun}"
 MPIEXEC_FULL="${MPIEXEC_FULL:-}"
 MPI_HOSTS="${MPI_HOSTS:-}"
 MPI_HOSTFILE="${MPI_HOSTFILE:-${HOSTFILE:-}}"
 MPI_RANKS="${MPI_RANKS:-}"
 MPI_PE="${MPI_PE:-$TORCH_THREADS}"
 MPI_MAP_BY="${MPI_MAP_BY:-ppr:1:numa:PE=$MPI_PE}"
 MPI_BIND_TO="${MPI_BIND_TO:-core}"
 MPI_REPORT_BINDINGS="${MPI_REPORT_BINDINGS:-0}"
 MPI_EXPORT_ENV="${MPI_EXPORT_ENV:-1}"
 TN_CONTRACT_ENV_CHECK="${TN_CONTRACT_ENV_CHECK:-1}"
 # Tree synchronisation to other hosts; SYNC_HOSTS falls back to WORKER_HOSTS.
 SYNC_TREES="${SYNC_TREES:-1}"
 SYNC_HOSTS="${SYNC_HOSTS:-${WORKER_HOSTS:-}}"
 SSH_BIN="${SSH_BIN:-ssh}"
 # Set to 1 once this script has started the Dask cluster; the exit handler
 # stops the cluster only in that case.
 DASK_CLUSTER_MANAGED="${DASK_CLUSTER_MANAGED:-0}"
 export TCM_ENABLE="${TCM_ENABLE:-1}"
 export OMP_NUM_THREADS="${OMP_NUM_THREADS:-$TORCH_THREADS}"
 export MKL_NUM_THREADS="${MKL_NUM_THREADS:-$TORCH_THREADS}"
 # Shared BLIS/OpenMP environment setup.
 source "$ROOT_DIR/tools/qibotn_torch_mt_env.sh"
 # Slicing arguments shared by the search and contract stages; the slice count
 # is passed only when TN_TARGET_SLICES is non-empty.
 tn_slice_args=(--tn-target-size "$TN_TARGET_SIZE")
 if [[ -n "$TN_TARGET_SLICES" ]]; then
  tn_slice_args+=(--tn-target-slices "$TN_TARGET_SLICES")
 fi
 # Exit/signal handler: stop the Dask cluster if this script started it, then
 # exit with the status that was current when the handler fired.
 cleanup_dask_cluster() {
  local status=$?
  # Disarm the traps first.  The `exit` below would otherwise fire the EXIT
  # trap again after a signal and run the cleanup a second time.
  trap - EXIT INT TERM HUP
  if [[ "$DASK_CLUSTER_MANAGED" == "1" ]]; then
    # Best effort: a failing stop must not mask the original exit status.
    set +e
    tools/manage_tn_dask_cluster.sh stop >/dev/null 2>&1 || true
  fi
  exit "$status"
 }
 trap cleanup_dask_cluster EXIT INT TERM HUP
 # Print the total slot count of a comma-separated "host[:slots]" list.
 #   $1  host list, e.g. "node-1:2,node-2:2"; a host without ":slots" counts 1
 # Empty items (as in "a,,b") are ignored.  A slot count that is not a
 # non-negative integer is reported on stderr and the function returns 1.
 sum_host_slots() {
  local hosts="$1"
  local total=0
  local item slots
  # Keep the scratch array local so callers' variables are not clobbered.
  local -a host_items=()
  if [[ -z "$hosts" ]]; then
    echo 0
    return 0
  fi
  IFS=',' read -r -a host_items <<< "$hosts"
  for item in "${host_items[@]}"; do
    [[ -n "$item" ]] || continue
    if [[ "$item" == *:* ]]; then
      slots="${item##*:}"
    else
      slots=1
    fi
    if [[ ! "$slots" =~ ^[0-9]+$ ]]; then
      echo "sum_host_slots: invalid slot count in '$item'" >&2
      return 1
    fi
    total=$((total + slots))
  done
  echo "$total"
 }
 # Print the number of entries in a space-separated host list.
 #   $1  host list, e.g. "node-1 node-2"
 # Runs of spaces never produce empty entries because space is IFS whitespace,
 # so the array length is the host count.
 count_hosts() {
  local hosts="$1"
  # Keep the scratch array local so callers' variables are not clobbered.
  local -a host_items=()
  IFS=' ' read -r -a host_items <<< "$hosts"
  echo "${#host_items[@]}"
 }
 # Block until the Dask scheduler at $DASK_ADDRESS reports the expected number
 # of workers, or until $DASK_WAIT_TIMEOUT seconds have passed.
 #
 # The expected count is $DASK_EXPECTED_WORKERS when set; otherwise it is
 # (hosts in $WORKER_HOSTS) * $NWORKERS.  Nothing is waited for when
 # DASK_WAIT_FOR_WORKERS is not "1" or no positive count can be derived.
 # A timeout is only reported; the function still returns success.
 wait_for_dask_workers() {
  [[ "$DASK_WAIT_FOR_WORKERS" == "1" ]] || return 0
  local expected="$DASK_EXPECTED_WORKERS"
  # WORKER_HOSTS and NWORKERS are not given defaults by this script, so they
  # are read with fallbacks to keep `set -u` from aborting the run.
  local worker_hosts="${WORKER_HOSTS:-}"
  # NOTE(review): 1 worker per host is assumed when NWORKERS is unset; since the
  # wait succeeds at count >= expected this acts as a lower bound -- confirm
  # against tools/manage_tn_dask_cluster.sh.
  local workers_per_host="${NWORKERS:-1}"
  if [[ -z "$expected" && -n "$worker_hosts" ]]; then
    expected=$(( $(count_hosts "$worker_hosts") * workers_per_host ))
  fi
  if [[ -z "$expected" || "$expected" -le 0 ]]; then
    return 0
  fi
  echo "Waiting for Dask workers: expected=$expected timeout=${DASK_WAIT_TIMEOUT}s"
  "$PYTHON_BIN" - "$DASK_ADDRESS" "$expected" "$DASK_WAIT_TIMEOUT" <<'PY'
 import sys
 import time
 from distributed import Client
 address, expected, timeout = sys.argv[1], int(sys.argv[2]), int(sys.argv[3])
 deadline = time.time() + timeout
 client = Client(address)
 try:
    while True:
        info = client.scheduler_info(n_workers=-1)
        workers = info.get("workers", {})
        count = len(workers)
        if count >= expected:
            print(f"dask_workers_ready count={count} expected={expected}", flush=True)
            break
        if time.time() >= deadline:
            print(
                f"dask_workers_wait_timeout count={count} expected={expected}",
                flush=True,
            )
            break
        time.sleep(2)
 finally:
    client.close()
 PY
 }
 # Append "-x NAME=value" pairs for the threading/BLIS environment to the global
 # array mpi_prefix.  Does nothing unless MPI_EXPORT_ENV is "1".
 # NOTE(review): -x is presumably the launcher's option for exporting a variable
 # to the ranks -- confirm for the MPI implementation in use.
 append_mpi_env_args() {
  [[ "$MPI_EXPORT_ENV" == "1" ]] || return 0
  local name
  # LD_PRELOAD may be unset, so it alone gets an empty fallback.
  mpi_prefix+=(-x "LD_PRELOAD=${LD_PRELOAD:-}")
  for name in BLIS_NUM_THREADS OMP_NUM_THREADS MKL_NUM_THREADS OMP_PROC_BIND OMP_PLACES; do
    mpi_prefix+=(-x "$name=${!name}")
  done
 }
 # Populate the global array mpi_prefix with the MPI launcher command line.
 #
 # If MPIEXEC_FULL is set it is word-split and used as the whole launcher
 # command, with only the environment export arguments appended.  Otherwise the
 # command is built from MPIEXEC, MPI_MAP_BY, MPI_BIND_TO and a rank count taken
 # from MPI_RANKS, else the slot total of MPI_HOSTS, else 2.  MPI_HOSTS takes
 # precedence over MPI_HOSTFILE for host selection.
 build_mpi_prefix() {
  if [[ -n "$MPIEXEC_FULL" ]]; then
    # shellcheck disable=SC2206
    mpi_prefix=($MPIEXEC_FULL)
    append_mpi_env_args
    return
  fi
  local ranks="$MPI_RANKS"
  if [[ -z "$ranks" && -n "$MPI_HOSTS" ]]; then
    ranks="$(sum_host_slots "$MPI_HOSTS")"
  fi
  # Last-resort default when neither MPI_RANKS nor MPI_HOSTS is given.
  if [[ -z "$ranks" ]]; then
    ranks=2
  fi
  mpi_prefix=(
    "$MPIEXEC"
    --map-by "$MPI_MAP_BY"
    --bind-to "$MPI_BIND_TO"
    -np "$ranks"
  )
  if [[ "$MPI_REPORT_BINDINGS" == "1" ]]; then
    mpi_prefix+=(--report-bindings)
  fi
  append_mpi_env_args
  if [[ -n "$MPI_HOSTS" ]]; then
    mpi_prefix+=(-host "$MPI_HOSTS")
  elif [[ -n "$MPI_HOSTFILE" ]]; then
    mpi_prefix+=(-hostfile "$MPI_HOSTFILE")
  fi
 }
 # Succeed when $1 names this machine: "localhost", "127.0.0.1", the short or
 # fully qualified hostname, or one of the addresses printed by `hostname -I`.
 is_local_host() {
  local host="$1"
  [[ "$host" == "localhost" || "$host" == "127.0.0.1" ]] && return 0
  [[ "$host" == "$(hostname)" ]] && return 0
  [[ "$host" == "$(hostname -f 2>/dev/null || true)" ]] && return 0
  # -F compares literally: as a regex the dots of an address such as 10.0.0.1
  # would match any character.  "--" guards against a value starting with "-".
  hostname -I 2>/dev/null | tr ' ' '\n' | grep -qxF -- "$host"
 }
 # Copy the contraction-tree directory to every non-local host in $SYNC_HOSTS.
 #
 # Does nothing unless SYNC_TREES is "1" and SYNC_HOSTS is non-empty.  A relative
 # TREE_DIR is resolved against ROOT_DIR on both sides.  rsync is used when it is
 # available locally; otherwise the *.pkl files are copied with scp.  All remote
 # access goes through $SSH_BIN.
 sync_trees_to_hosts() {
  [[ "$SYNC_TREES" == "1" ]] || return 0
  [[ -n "$SYNC_HOSTS" ]] || return 0
  local src_dir="$TREE_DIR"
  local dst_dir="$TREE_DIR"
  if [[ "$TREE_DIR" != /* ]]; then
    src_dir="$ROOT_DIR/$TREE_DIR"
    dst_dir="$ROOT_DIR/$TREE_DIR"
  fi
  local host
  local -a pkl_files
  # SYNC_HOSTS is intentionally unquoted: it is a whitespace-separated list.
  for host in $SYNC_HOSTS; do
    is_local_host "$host" && continue
    echo "Sync tree dir to $host:$dst_dir"
    # %q quotes the path for the remote shell.
    "$SSH_BIN" "$host" "mkdir -p $(printf '%q' "$dst_dir")"
    if command -v rsync >/dev/null 2>&1; then
      rsync -a -e "$SSH_BIN" "$src_dir/" "$host:$dst_dir/"
    else
      # An unmatched glob stays literal; check it so scp is not handed a
      # non-existent "*.pkl" path, which would abort the script under `set -e`.
      pkl_files=("$src_dir"/*.pkl)
      if [[ -e "${pkl_files[0]}" ]]; then
        scp -q -S "$SSH_BIN" "${pkl_files[@]}" "$host:$dst_dir/"
      else
        echo "No *.pkl trees in $src_dir; nothing to copy to $host" >&2
      fi
    fi
  done
 }
 # Stage 1: start the Dask cluster, wait for its workers, run the tree search
 # and copy the resulting trees to the other hosts.
 tools/manage_tn_dask_cluster.sh start
 # From here on the exit handler is responsible for stopping the cluster.
 DASK_CLUSTER_MANAGED=1
 wait_for_dask_workers
 echo "Search with dask: $DASK_ADDRESS"
 # $OBSERVABLES is intentionally unquoted so several names can be passed.
 search_args=(
  --case "$CASE"
  --nqubits "$NQUBITS"
  --nlayers "$NLAYERS"
  --observables $OBSERVABLES
  --tree-dir "$TREE_DIR"
  --dask-address "$DASK_ADDRESS"
  --torch-threads "$TORCH_THREADS"
  --dtype "$DTYPE"
  --tn-search-repeats "$SEARCH_REPEATS"
  --tn-search-time "$SEARCH_TIME"
  "${tn_slice_args[@]}"
 )
 # Optional search arguments, added only when configured.
 if [[ -n "$DASK_EXPECTED_WORKERS" ]]; then
  search_args+=(--dask-expected-workers "$DASK_EXPECTED_WORKERS")
 fi
 if [[ "$TN_DEBUG_TRIALS" == "1" ]]; then
  search_args+=(--tn-debug-trials)
 fi
 "$PYTHON_BIN" -u tools/tn_contest_runner.py search "${search_args[@]}"
 sync_trees_to_hosts
 # Stage 2: contract with MPI, reading the trees from $TREE_DIR.
 build_mpi_prefix
 echo "Contract with MPI: ${mpi_prefix[*]}"
 # Optional diagnostic: each rank prints its threading environment, torch thread
 # count and the libblis libraries mapped into the process (from
 # /proc/self/maps), to confirm the exported environment reached the ranks.
 if [[ "$TN_CONTRACT_ENV_CHECK" == "1" ]]; then
  "${mpi_prefix[@]}" "$PYTHON_BIN" -c "from mpi4py import MPI; import os; \
 import torch; \
 rank = MPI.COMM_WORLD.Get_rank(); \
 blis = []; \
 [blis.append(line.strip().split()[-1]) for line in open('/proc/self/maps') if 'libblis' in line and line.strip().split()[-1] not in blis]; \
 print('tn_contract_env ' + \
      f'rank={rank} ' + \
      f'LD_PRELOAD={os.environ.get(\"LD_PRELOAD\", \"\")} ' + \
      f'BLIS_NUM_THREADS={os.environ.get(\"BLIS_NUM_THREADS\", \"\")} ' + \
      f'OMP_NUM_THREADS={os.environ.get(\"OMP_NUM_THREADS\", \"\")} ' + \
      f'MKL_NUM_THREADS={os.environ.get(\"MKL_NUM_THREADS\", \"\")} ' + \
      f'OMP_PROC_BIND={os.environ.get(\"OMP_PROC_BIND\", \"\")} ' + \
      f'OMP_PLACES={os.environ.get(\"OMP_PLACES\", \"\")} ' + \
      f'torch_threads={torch.get_num_threads()} ' + \
      f'blis={\";\".join(blis) if blis else \"missing\"}', flush=True)"
 fi
 # $OBSERVABLES is intentionally unquoted so several names can be passed.
 "${mpi_prefix[@]}" "$PYTHON_BIN" -u tools/tn_contest_runner.py contract \
  --mpi \
  --case "$CASE" \
  --nqubits "$NQUBITS" \
  --nlayers "$NLAYERS" \
  --observables $OBSERVABLES \
  --tree-dir "$TREE_DIR" \
  --torch-threads "$TORCH_THREADS" \
  --dtype "$DTYPE" \
  "${tn_slice_args[@]}"
--- a/tools/run_vidal_mpi_contest_cases.sh
+++ b/tools/run_vidal_mpi_contest_cases.sh
@@ -1,414 +0,0 @@
 #!/usr/bin/env bash
 set -euo pipefail
 # Contest-style Vidal/MPI MPS cases.
 #
 # Usage:
 #   tools/run_vidal_mpi_contest_cases.sh main1
 #   tools/run_vidal_mpi_contest_cases.sh main2
 #   tools/run_vidal_mpi_contest_cases.sh strong
 #   tools/run_vidal_mpi_contest_cases.sh all
 #   tools/run_vidal_mpi_contest_cases.sh smoke    # small main1 sanity run
 #
 # Common overrides:
 #   PYTHON_BIN=.venv/bin/python
 #   MPIEXEC=mpirun
 #   MPI_HOSTS="node-1:2,node-2:2,node-3:2,node-0:2"
 #   MPI_RANKS=8
 #   MPI_PE=128
 #   MPI_MAP_BY=ppr:1:numa:PE=128
 #   MPI_BIND_TO=core
 #   MPIEXEC_FULL="mpirun --map-by ppr:1:numa:PE=128 --bind-to core -np 8 -host node-1:2,node-2:2,node-3:2,node-0:2"
 #   HOSTFILE=hostfile                 # optional; passed as -hostfile when MPI_HOSTS is unset
 #   RANKS=8                           # fallback if MPI_RANKS is not set
 #   TORCH_THREADS=8
 #   CUT_RATIO=1e-12
 #   OBS_FILTER="boundary_ZZ_q2 ring_xz dense3_spread complex_iZ0"
 #
 # Per-case overrides:
 #   MAIN1_NQ=128 MAIN1_LAYERS=50 MAIN1_BOND=1024 MAIN1_SEED=31001
 #   MAIN2_NQ=128 MAIN2_LAYERS=64 MAIN2_BOND=2048 MAIN2_SEED=31002
 #   STRONG_NQ=256 STRONG_LAYERS=64 STRONG_BOND=2048 STRONG_SEED=41001
 # Resolve the repository root (parent of this script's directory) and run
 # everything from there so relative paths such as .venv/bin/python resolve.
 ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
 cd "$ROOT_DIR"
 # Launcher configuration; every value can be overridden from the environment.
 PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
 MPIEXEC="${MPIEXEC:-mpirun}"
 # When non-empty, MPIEXEC_FULL replaces the launcher prefix built from the
 # MPI_* settings below (see build_mpi_prefix).
 MPIEXEC_FULL="${MPIEXEC_FULL:-}"
 MPI_HOSTS="${MPI_HOSTS:-}"
 MPI_HOSTFILE="${MPI_HOSTFILE:-${HOSTFILE:-}}"
 # MPI_RANKS must capture a user-supplied RANKS *before* RANKS receives its
 # default of 4; otherwise the host-slot sum in build_mpi_prefix would never
 # be consulted.
 MPI_RANKS="${MPI_RANKS:-${RANKS:-}}"
 RANKS="${RANKS:-4}"
 TORCH_THREADS="${TORCH_THREADS:-1}"
 # Processing elements per rank default to the torch thread count.
 MPI_PE="${MPI_PE:-$TORCH_THREADS}"
 MPI_MAP_BY="${MPI_MAP_BY:-ppr:1:numa:PE=$MPI_PE}"
 MPI_BIND_TO="${MPI_BIND_TO:-core}"
 MPI_REPORT_BINDINGS="${MPI_REPORT_BINDINGS:-0}"
 MPI_EXPORT_ENV="${MPI_EXPORT_ENV:-1}"
 CUT_RATIO="${CUT_RATIO:-1e-12}"
 OBS_FILTER="${OBS_FILTER:-}"
 # Thread-count variables follow TORCH_THREADS unless already set.
 export OMP_NUM_THREADS="${OMP_NUM_THREADS:-$TORCH_THREADS}"
 export MKL_NUM_THREADS="${MKL_NUM_THREADS:-$TORCH_THREADS}"
 # NOTE(review): presumably this defines BLIS_NUM_THREADS, OMP_PROC_BIND and
 # OMP_PLACES, which append_mpi_env_args expands under `set -u` - confirm.
 source "$ROOT_DIR/tools/qibotn_torch_mt_env.sh"
 # Temporary file that receives the embedded Python runner from the heredoc
 # below; it lives under $ROOT_DIR/.tmp and is removed when the script exits.
 RUNNER_DIR="$ROOT_DIR/.tmp"
 mkdir -p "$RUNNER_DIR"
 RUNNER="$(mktemp "$RUNNER_DIR/qibotn_vidal_contest.XXXXXX.py")"
 # Delete the temporary runner file; installed as the EXIT trap.
 cleanup() {
  rm -f "$RUNNER"
 }
 trap cleanup EXIT
 cat > "$RUNNER" <<'PY'
 from __future__ import annotations
 import argparse
 import math
 import time
 import numpy as np
 from mpi4py import MPI
 from qibo import Circuit, gates, hamiltonians
 from qibo.symbols import X, Y, Z
 from qibotn.backends.vidal import VidalBackend
 def set_torch_threads(nthreads):
    """Best-effort call to ``torch.set_num_threads(nthreads)``.

    Any ``Exception`` raised while importing torch or while applying the
    setting is swallowed, so the function always returns ``None``.
    """
    try:
        import torch
    except Exception:
        return
    try:
        torch.set_num_threads(nthreads)
    except Exception:
        return
 def build_circuit(kind, nqubits, nlayers, seed):
    """Build the seeded benchmark circuit of the requested ``kind``.

    Every layer applies RY and RZ (plus RX for ``"rxx_rzz"`` and
    ``"scramble"``) to each qubit with angles drawn from ``[-pi, pi)``,
    followed by the entangling pattern of the selected kind:

    * ``"reversed_cnot"``: CNOTs on even then odd neighbour pairs, with the
      control/target orientation alternating between layers.
    * ``"rxx_rzz"``: RXX then RZZ on a brickwall of neighbour pairs.
    * ``"scramble"``: as ``"rxx_rzz"`` with a narrower angle range and a SWAP
      on each pair whenever ``layer % 5 == 4``.

    Raises:
        ValueError: if ``kind`` is unknown (detected while building a layer).
    """
    rng = np.random.default_rng(seed)
    circuit = Circuit(nqubits)
    with_rx = kind in ("rxx_rzz", "scramble")

    def angle():
        # One draw per call; the call order below fixes the random stream.
        return rng.uniform(-math.pi, math.pi)

    for layer in range(nlayers):
        for q in range(nqubits):
            circuit.add(gates.RY(q, theta=angle()))
            circuit.add(gates.RZ(q, theta=angle()))
            if with_rx:
                circuit.add(gates.RX(q, theta=angle()))
        odd_layer = layer % 2 == 1
        if kind == "reversed_cnot":
            for q in range(0, nqubits - 1, 2):
                control, target = (q + 1, q) if odd_layer else (q, q + 1)
                circuit.add(gates.CNOT(control, target))
            for q in range(1, nqubits - 1, 2):
                control, target = (q, q + 1) if odd_layer else (q + 1, q)
                circuit.add(gates.CNOT(control, target))
        elif kind in ("rxx_rzz", "scramble"):
            scramble = kind == "scramble"
            span = 0.8 if scramble else 0.9
            for q in range(layer % 2, nqubits - 1, 2):
                circuit.add(gates.RXX(q, q + 1, theta=rng.uniform(-span, span)))
                circuit.add(gates.RZZ(q, q + 1, theta=rng.uniform(-span, span)))
                if scramble and layer % 5 == 4:
                    circuit.add(gates.SWAP(q, q + 1))
        else:
            raise ValueError(f"Unknown circuit kind {kind!r}.")
    return circuit
 def ring_xz(nqubits):
    """Return ``sum_q 0.5 * X(q) * Z((q + 1) % nqubits)`` as a SymbolicHamiltonian."""
    terms = (0.5 * X(q) * Z((q + 1) % nqubits) for q in range(nqubits))
    return hamiltonians.SymbolicHamiltonian(form=sum(terms))
 def open_zz(nqubits):
    """Return the open-chain ``Z(q) * Z(q + 1)`` sum, each term weighted by ``1 / (nqubits - 1)``."""
    # The weight stays inside the generator so it is only evaluated when at
    # least one neighbour pair exists.
    terms = ((1.0 / (nqubits - 1)) * Z(q) * Z(q + 1) for q in range(nqubits - 1))
    return hamiltonians.SymbolicHamiltonian(form=sum(terms))
 def range2_xx(nqubits):
    """Return the ``X(q) * X(q + 2)`` sum, each term weighted by ``1 / (nqubits - 2)``."""
    # The weight stays inside the generator so it is only evaluated when at
    # least one distance-2 pair exists.
    terms = ((1.0 / (nqubits - 2)) * X(q) * X(q + 2) for q in range(nqubits - 2))
    return hamiltonians.SymbolicHamiltonian(form=sum(terms))
 def dense_observable(nqubits, qubits, seed, dim):
    """Return a seeded random Hermitian ``dim x dim`` observable.

    The matrix is the Hermitian part of a complex Gaussian matrix, scaled to
    unit norm as measured by ``np.linalg.norm``. ``nqubits`` is accepted but
    not used.

    Returns:
        dict with keys ``"matrix"`` (the array) and ``"qubits"`` (a list copy
        of ``qubits``).
    """
    rng = np.random.default_rng(seed)
    shape = (dim, dim)
    # Real part is drawn first, then the imaginary part.
    real_part = rng.normal(size=shape)
    imag_part = rng.normal(size=shape)
    raw = real_part + 1j * imag_part
    hermitian = (raw + raw.conj().T) / 2.0
    hermitian = hermitian / np.linalg.norm(hermitian)
    return {"matrix": hermitian, "qubits": list(qubits)}
 def observables_for_case(nqubits, seed):
    """Return the ordered list of ``(name, observable)`` pairs for a case.

    Entries are either ``SymbolicHamiltonian`` objects or the dicts produced
    by ``dense_observable``. Probe sites are the quarter, half and
    three-quarter positions of the register plus its two ends.
    """
    quarter = nqubits // 4
    half = nqubits // 2
    three_quarters = (3 * nqubits) // 4
    end = nqubits - 1
    symbolic = hamiltonians.SymbolicHamiltonian

    def neighbour_zz(site):
        # Z Z correlator across the bond to the left of ``site``.
        return symbolic(form=Z(site - 1) * Z(site))

    return [
        ("boundary_ZZ_q1", neighbour_zz(quarter)),
        ("boundary_ZZ_q2", neighbour_zz(half)),
        ("boundary_ZZ_q3", neighbour_zz(three_quarters)),
        (
            "long_Z_5_sites",
            symbolic(form=Z(0) * Z(quarter) * Z(half) * Z(three_quarters) * Z(end)),
        ),
        (
            "mixed_XZYZX",
            symbolic(form=X(0) * Z(quarter) * Y(half) * Z(three_quarters) * X(end)),
        ),
        ("ring_xz", ring_xz(nqubits)),
        ("open_zz", open_zz(nqubits)),
        ("range2_xx", range2_xx(nqubits)),
        ("complex_iZ0", symbolic(form=1.0j * Z(0))),
        ("dense2_mid", dense_observable(nqubits, (half - 1, half), seed + 101, 4)),
        (
            "dense3_spread",
            dense_observable(nqubits, (quarter, half, three_quarters), seed + 202, 8),
        ),
    ]
 def run_case(args):
    """Evaluate every selected observable of one case and report on rank 0.

    A fresh ``VidalBackend`` is configured per observable. Each evaluation is
    timed after an MPI barrier; an exception raised by ``expectation`` is
    turned into a NaN value plus a one-line status instead of aborting the
    remaining observables.

    Raises:
        ValueError: if ``args.obs_filter`` is non-empty and matches nothing.
    """
    set_torch_threads(args.torch_threads)
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    is_root = rank == 0
    circuit = build_circuit(args.kind, args.nqubits, args.nlayers, args.seed)
    observables = observables_for_case(args.nqubits, args.seed)
    if args.obs_filter:
        wanted = set(args.obs_filter.split(","))
        observables = [entry for entry in observables if entry[0] in wanted]
        if not observables:
            raise ValueError(f"OBS_FILTER matched no observables: {args.obs_filter!r}")
    if is_root:
        print("=" * 88, flush=True)
        banner = (
            "case "
            f"label={args.label} kind={args.kind} ranks={size} "
            f"nqubits={args.nqubits} nlayers={args.nlayers} gates={len(circuit.queue)} "
            f"bond={args.bond} cut_ratio={args.cut_ratio:g} "
            f"torch_threads={args.torch_threads} seed={args.seed} "
            f"obs_filter={args.obs_filter or 'all'}"
        )
        print(banner, flush=True)
        print("observable value seconds trunc_sum trunc_max status", flush=True)
    for obs_name, observable in observables:
        backend = VidalBackend()
        backend.configure_tn_simulation(
            max_bond_dimension=args.bond,
            cut_ratio=args.cut_ratio,
            tensor_module="torch",
            mpi_approach="CT",
            mpi_num_procs=size,
            fallback=False,
        )
        # Align all ranks so the timing below starts together.
        comm.Barrier()
        status = "ok"
        started = time.perf_counter()
        try:
            value = backend.expectation(
                circuit,
                observable,
                preprocess=True,
                compile_circuit=False,
            )
        except Exception as exc:  # pragma: no cover - printed for manual runs
            value = np.nan
            status = type(exc).__name__ + ":" + str(exc).split("\n", 1)[0]
        elapsed = time.perf_counter() - started
        if not is_root:
            continue
        print(
            f"{obs_name} {value!r} {elapsed:.3f} "
            f"{backend.last_truncation_error:.6e} "
            f"{backend.last_max_truncation_error:.6e} {status}",
            flush=True,
        )
 def main():
    """Parse the command line of the embedded runner and execute the case."""
    parser = argparse.ArgumentParser()
    add = parser.add_argument
    add("--label", required=True)
    add("--kind", choices=("reversed_cnot", "rxx_rzz", "scramble"), required=True)
    for int_flag in ("--nqubits", "--nlayers", "--bond"):
        add(int_flag, type=int, required=True)
    add("--cut-ratio", type=float, required=True)
    for int_flag in ("--seed", "--torch-threads"):
        add(int_flag, type=int, required=True)
    add("--obs-filter", default="")
    namespace = parser.parse_args()
    run_case(namespace)
 if __name__ == "__main__":
    main()
 PY
 # Print the total number of slots in a comma-separated MPI host list.
 #   $1  host list such as "node-1:2,node-2:2,node-3"; an entry without a
 #       ":slots" suffix counts as one slot.
 # Returns 1 (with a message on stderr) when a slot count is not a plain
 # non-negative integer.
 sum_host_slots() {
  local hosts="$1"
  local total=0
  local item slots
  # Keep the scratch array local so it does not leak into the caller's scope.
  local -a host_items=()
  IFS=',' read -r -a host_items <<< "$hosts"
  for item in "${host_items[@]}"; do
    if [[ "$item" == *:* ]]; then
      slots="${item##*:}"
    else
      slots=1
    fi
    # Reject anything but digits before it reaches arithmetic expansion,
    # which would otherwise evaluate arbitrary expressions.
    if [[ ! "$slots" =~ ^[0-9]+$ ]]; then
      echo "Invalid slot count in MPI host entry: $item" >&2
      return 1
    fi
    # 10# forces base 10 so values such as "08" are not parsed as octal.
    total=$((total + 10#$slots))
  done
  echo "$total"
 }
 # Append "-x NAME=value" launcher arguments for the threading-related
 # environment variables to the global mpi_prefix array. Does nothing unless
 # MPI_EXPORT_ENV is "1".
 append_mpi_env_args() {
  if [[ "$MPI_EXPORT_ENV" != "1" ]]; then
    return 0
  fi
  # LD_PRELOAD may legitimately be unset, so it alone gets an empty default.
  local -a forwarded=(
    -x "LD_PRELOAD=${LD_PRELOAD:-}"
    -x "BLIS_NUM_THREADS=$BLIS_NUM_THREADS"
    -x "OMP_NUM_THREADS=$OMP_NUM_THREADS"
    -x "MKL_NUM_THREADS=$MKL_NUM_THREADS"
    -x "OMP_PROC_BIND=$OMP_PROC_BIND"
    -x "OMP_PLACES=$OMP_PLACES"
  )
  mpi_prefix+=("${forwarded[@]}")
 }
 # Populate the global mpi_prefix array with the MPI launcher command line.
 #
 # MPIEXEC_FULL, when set, is word-split and used verbatim (plus the exported
 # environment arguments). Otherwise the prefix is assembled from MPIEXEC and
 # the MPI_* settings; the rank count comes from MPI_RANKS, else the slot sum
 # of MPI_HOSTS, else RANKS.
 build_mpi_prefix() {
  if [[ -n "$MPIEXEC_FULL" ]]; then
    # Word splitting is intentional: MPIEXEC_FULL holds a whole command line.
    # shellcheck disable=SC2206
    mpi_prefix=($MPIEXEC_FULL)
    append_mpi_env_args
    return
  fi
  local nproc
  if [[ -n "$MPI_RANKS" ]]; then
    nproc="$MPI_RANKS"
  elif [[ -n "$MPI_HOSTS" ]]; then
    nproc="$(sum_host_slots "$MPI_HOSTS")"
  else
    nproc=""
  fi
  if [[ -z "$nproc" ]]; then
    nproc="$RANKS"
  fi
  mpi_prefix=("$MPIEXEC" --map-by "$MPI_MAP_BY" --bind-to "$MPI_BIND_TO" -np "$nproc")
  if [[ "$MPI_REPORT_BINDINGS" == "1" ]]; then
    mpi_prefix+=(--report-bindings)
  fi
  append_mpi_env_args
  # An explicit host list takes precedence over a hostfile.
  if [[ -n "$MPI_HOSTS" ]]; then
    mpi_prefix+=(-host "$MPI_HOSTS")
  elif [[ -n "$MPI_HOSTFILE" ]]; then
    mpi_prefix+=(-hostfile "$MPI_HOSTFILE")
  fi
 }
 build_mpi_prefix
 # Launch one contest case under the prepared MPI prefix.
 #   $1 label   $2 circuit kind   $3 qubits   $4 layers   $5 bond dim   $6 seed
 # CUT_RATIO, TORCH_THREADS and OBS_FILTER are taken from the environment;
 # OBS_FILTER is space-separated here and handed to the runner comma-separated.
 run_case() {
  local label="$1"
  local kind="$2"
  local nq="$3"
  local layers="$4"
  local bond="$5"
  local seed="$6"
  # Prefer the checked-in runner; when it is not present, fall back to the
  # copy this script generated at $RUNNER so that file is actually usable
  # instead of being written and then deleted unused.
  local runner="$ROOT_DIR/tools/vidal_mpi_contest_runner.py"
  if [[ ! -f "$runner" ]]; then
    runner="$RUNNER"
  fi
  echo
  echo "Running $label: kind=$kind nqubits=$nq layers=$layers bond=$bond seed=$seed"
  echo "MPI: ${mpi_prefix[*]}"
  "${mpi_prefix[@]}" "$PYTHON_BIN" -u "$runner" \
    --label "$label" \
    --kind "$kind" \
    --nqubits "$nq" \
    --nlayers "$layers" \
    --bond "$bond" \
    --cut-ratio "$CUT_RATIO" \
    --seed "$seed" \
    --torch-threads "$TORCH_THREADS" \
    --obs-filter "$(tr ' ' ',' <<< "$OBS_FILTER")"
 }
 # Dispatch on the first argument; with no argument the usage text is shown.
 # Every per-case parameter can be overridden through the matching
 # MAIN1_*/MAIN2_*/STRONG_* environment variable.
 case "${1:-help}" in
  main1)
    run_case \
      "main1-reversed-cnot" \
      "reversed_cnot" \
      "${MAIN1_NQ:-128}" \
      "${MAIN1_LAYERS:-50}" \
      "${MAIN1_BOND:-1024}" \
      "${MAIN1_SEED:-31001}"
    ;;
  main2)
    run_case \
      "main2-rxx-rzz" \
      "rxx_rzz" \
      "${MAIN2_NQ:-128}" \
      "${MAIN2_LAYERS:-64}" \
      "${MAIN2_BOND:-2048}" \
      "${MAIN2_SEED:-31002}"
    ;;
  strong)
    run_case \
      "strong-scramble" \
      "scramble" \
      "${STRONG_NQ:-256}" \
      "${STRONG_LAYERS:-64}" \
      "${STRONG_BOND:-2048}" \
      "${STRONG_SEED:-41001}"
    ;;
  # Each case is run by re-invoking this script; under `set -e` a failing
  # case stops the sequence.
  all)
    "$0" main1
    "$0" main2
    "$0" strong
    ;;
  # Reduced-size main1 run; the MAIN1_* values are passed to the child
  # invocation through its environment and can still be overridden.
  smoke)
    MAIN1_NQ="${MAIN1_NQ:-32}" \
    MAIN1_LAYERS="${MAIN1_LAYERS:-6}" \
    MAIN1_BOND="${MAIN1_BOND:-128}" \
    "$0" main1
    ;;
  # Usage goes to stderr and the exit status is 2 for both an explicit
  # "help" and any unrecognised subcommand.
  help|*)
    cat >&2 <<'EOF'
 Usage: tools/run_vidal_mpi_contest_cases.sh [main1|main2|strong|all|smoke]
 Cases:
  main1   128 qubits, 50 layers, reversed-CNOT brickwall, chi=1024
  main2   128 qubits, 64 layers, RXX/RZZ brickwall, chi=2048
  strong  256 qubits, 64 layers, RXX/RZZ + periodic SWAP scramble, chi=2048
  smoke   Small syntax/runtime check of main1
 Common overrides:
  PYTHON_BIN=.venv/bin/python
  MPIEXEC=mpiexec
  MPI_HOSTS="node-1:2,node-2:2,node-3:2,node-0:2"
  MPI_RANKS=8
  MPI_PE=128
  MPI_MAP_BY=ppr:1:numa:PE=128
  MPI_BIND_TO=core
  MPIEXEC_FULL="mpirun --map-by ppr:1:numa:PE=128 --bind-to core -np 8 -host node-1:2,node-2:2,node-3:2,node-0:2"
  HOSTFILE=hostfile
  RANKS=8
  TORCH_THREADS=8
  CUT_RATIO=1e-12
  OBS_FILTER="boundary_ZZ_q2 ring_xz dense3_spread complex_iZ0"
 Per-case overrides:
  MAIN1_NQ=128 MAIN1_LAYERS=50 MAIN1_BOND=1024 MAIN1_SEED=31001
  MAIN2_NQ=128 MAIN2_LAYERS=64 MAIN2_BOND=2048 MAIN2_SEED=31002
  STRONG_NQ=256 STRONG_LAYERS=64 STRONG_BOND=2048 STRONG_SEED=41001
 EOF
    exit 2
    ;;
 esac
--- a/tools/run_vidal_segment_mpi_scan.sh
+++ b/tools/run_vidal_segment_mpi_scan.sh
@@ -1,70 +0,0 @@
 #!/usr/bin/env bash
 set -euo pipefail
 # Benchmark parameters; every value can be overridden from the environment.
 NQ="${NQ:-34}"
 LAYERS="${LAYERS:-20}"
 BOND="${BOND:-512}"
 SEED="${SEED:-42}"
 # RANKS and THREADS are parallel space-separated lists: entry i of THREADS
 # is the torch thread count used for the run with entry i of RANKS.
 RANKS="${RANKS:-1 2 4}"
 THREADS="${THREADS:-32 32 16}"
 PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
 MPIEXEC="${MPIEXEC:-mpiexec}"
 CIRCUIT="${CIRCUIT:-brickwall_cnot}"
 OBSERVABLE="${OBSERVABLE:-ring_xz}"
 EXACT="${EXACT:-0}"
 # Run from the repository root (parent of this script's directory).
 ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
 cd "$ROOT_DIR"
 # Anything other than the "run" subcommand prints the usage to stderr:
 # exit status 0 for "help" (or no argument), 2 for an unknown argument.
 if [[ "${1:-help}" != "run" ]]; then
  cat >&2 <<'EOF'
 Usage: tools/run_vidal_segment_mpi_scan.sh run
 Overrides:
  NQ=34 LAYERS=20 BOND=512 SEED=42
  RANKS="1 2 4" THREADS="32 32 16"
  CIRCUIT=brickwall_cnot OBSERVABLE=ring_xz
  EXACT=1
  PYTHON_BIN=.venv/bin/python MPIEXEC=mpiexec
 EOF
  if [[ "${1:-help}" == "help" ]]; then
    exit 0
  fi
  exit 2
 fi
 # Split the two lists into arrays and insist they pair up one-to-one.
 read -r -a ranks <<< "$RANKS"
 read -r -a threads <<< "$THREADS"
 if [[ "${#ranks[@]}" != "${#threads[@]}" ]]; then
  echo "RANKS and THREADS must have the same number of entries." >&2
  exit 2
 fi
 # Arguments shared by every benchmark invocation.
 common=(
  --nqubits "$NQ"
  --nlayers "$LAYERS"
  --bond "$BOND"
  --seed "$SEED"
  --mps
  --circuits "$CIRCUIT"
  --observables "$OBSERVABLE"
 )
 if [[ "$EXACT" == "1" ]]; then
  common+=(--exact)
 fi
 # One benchmark run per (ranks, threads) pair: a single rank runs the
 # benchmark directly, anything else goes through the MPI launcher with --mpi.
 for idx in "${!ranks[@]}"; do
  nrank="${ranks[$idx]}"
  nthr="${threads[$idx]}"
  if [[ "$nrank" == "1" ]]; then
    echo "== Vidal serial ranks=1 torch_threads=$nthr =="
    "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
      "${common[@]}" --torch-threads "$nthr"
  else
    echo "== Vidal segmented MPI ranks=$nrank torch_threads=$nthr =="
    "$MPIEXEC" -n "$nrank" "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
      "${common[@]}" --torch-threads "$nthr" --mpi
  fi
 done
--- a/tools/slice_existing_tree.py
+++ b/tools/slice_existing_tree.py
@@ -1,59 +0,0 @@
 """Slice an existing saved cotengra tree without re-running path search."""
 from __future__ import annotations
 import argparse
 import pickle
 from pathlib import Path
 from qibotn.parallel import contraction_tree_costs
 def main():
    """Slice one tree of a saved pickle and write the updated payload.

    The input is either a dict carrying a ``"trees"`` entry or a bare tree /
    sequence of trees. The tree selected by ``--term`` is sliced with the
    requested ``target_slices``, ``max_repeats`` and ``seed``; all other
    trees are copied through unchanged. For dict payloads the ``"costs"``
    and ``"nterms"`` entries are recomputed from the new tree list.

    An out-of-range ``--term`` ends the program through ``parser.error``
    (usage message, exit status 2) instead of a bare ``IndexError``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("input", help="Input pickle saved by --tn-save-tree.")
    parser.add_argument("output", help="Output pickle path.")
    parser.add_argument("--term", type=int, default=0)
    parser.add_argument("--target-slices", type=int, default=2)
    parser.add_argument("--max-repeats", type=int, default=64)
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args()
    input_path = Path(args.input)
    output_path = Path(args.output)
    # NOTE(security): unpickling executes arbitrary code embedded in the file.
    # Only feed this tool pickles produced by a trusted path search.
    with input_path.open("rb") as f:
        payload = pickle.load(f)
    trees = payload["trees"] if isinstance(payload, dict) else payload
    if not isinstance(trees, (list, tuple)):
        trees = [trees]
    # Negative indices keep their usual Python meaning; anything outside
    # [-len, len) cannot address a saved tree.
    if not -len(trees) <= args.term < len(trees):
        parser.error(
            f"--term {args.term} is out of range for {len(trees)} saved tree(s)."
        )
    tree = trees[args.term]
    print("original", contraction_tree_costs(tree), flush=True)
    sliced = tree.slice(
        target_slices=args.target_slices,
        max_repeats=args.max_repeats,
        seed=args.seed,
    )
    print("sliced", contraction_tree_costs(sliced), flush=True)
    print(f"sliced_inds={sliced.sliced_inds}", flush=True)
    new_trees = list(trees)
    new_trees[args.term] = sliced
    if isinstance(payload, dict):
        # Copy so the loaded payload is not mutated, then refresh derived data.
        out_payload = dict(payload)
        out_payload["trees"] = new_trees
        out_payload["costs"] = [contraction_tree_costs(t) for t in new_trees]
        out_payload["nterms"] = len(new_trees)
    else:
        out_payload = new_trees
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open("wb") as f:
        pickle.dump(out_payload, f)
    print(f"saved {output_path}", flush=True)
 if __name__ == "__main__":
    main()
--- a/tools/tn_contest_runner.py
+++ b/tools/tn_contest_runner.py
@@ -1,443 +0,0 @@
 #!/usr/bin/env python
 """Contest-style CPU TN path search and contraction runner.
 This file is intentionally self-contained: define contest circuits and
 observables here, run path search once, then load the saved trees for repeated
 MPI contractions.
 """
 from __future__ import annotations
 import argparse
 import math
 import os
 import subprocess
 import sys
 from dataclasses import dataclass
 from pathlib import Path
 from urllib.parse import urlparse
 import numpy as np
 from qibo import Circuit, gates, hamiltonians
 from qibo.symbols import X, Y, Z
 ROOT = Path(__file__).resolve().parents[1]
 SRC = ROOT / "src"
 if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))
 from qibotn.expectation_runner import (  # noqa: E402
    ExpectationConfig,
    exact_for_observable,
    run_cpu_expectation,
 )
@dataclass(frozen=True)
 class CaseSpec:
    """Immutable defaults describing one named contest case."""
    # Circuit family name handed to build_circuit.
    circuit_kind: str
    # Observable names run when neither --observables nor --obs-filter is given.
    observables: tuple[str, ...]
    # Defaults applied by apply_case_defaults when the matching CLI option
    # is left unset.
    nqubits: int
    nlayers: int
    seed: int
    # Default for --tn-target-slices; None leaves the slice count unset.
    target_slices: int | None = None
 # Registry of contest cases, keyed by the name accepted by --case. Each entry
 # supplies the circuit family plus the default observables, size and seed.
 CASES = {
    "main1": CaseSpec(
        circuit_kind="rxx_rzz_chain",
        observables=("ring_xz",),
        nqubits=37,
        nlayers=20,
        seed=31001,
        target_slices=None,
    ),
    "main2": CaseSpec(
        circuit_kind="scramble_chain",
        observables=("open_zz", "range2_xx"),
        nqubits=36,
        nlayers=18,
        seed=31002,
        target_slices=None,
    ),
    "strong": CaseSpec(
        circuit_kind="reversed_cnot",
        observables=("ring_xz", "long_z_string"),
        nqubits=40,
        nlayers=24,
        seed=41001,
        target_slices=None,
    ),
 }
 def optional_int(text):
    """Convert ``text`` to ``int``, mapping "no limit" spellings to ``None``.

    The strings ``none``, ``null``, ``inf`` and ``unlimited`` (in any letter
    case) yield ``None``; every other value is passed to ``int``.
    """
    unlimited_words = ("none", "null", "inf", "unlimited")
    if isinstance(text, str):
        if text.lower() in unlimited_words:
            return None
    return int(text)
 def optional_float(text):
    """Convert ``text`` to ``float``, mapping "no limit" spellings to ``None``.

    The strings ``none``, ``null``, ``inf`` and ``unlimited`` (in any letter
    case) yield ``None``, so ``"inf"`` never becomes a float infinity here;
    every other value is passed to ``float``.
    """
    unlimited_words = ("none", "null", "inf", "unlimited")
    if isinstance(text, str):
        if text.lower() in unlimited_words:
            return None
    return float(text)
 def set_torch_threads(nthreads):
    """Best-effort call to ``torch.set_num_threads(nthreads)``.

    Any ``Exception`` raised while importing torch or while applying the
    setting is swallowed, so the function always returns ``None``.
    """
    try:
        import torch
    except Exception:
        return
    try:
        torch.set_num_threads(nthreads)
    except Exception:
        return
 def add_single_qubit_layer(circuit, nqubits, rng, include_rx=False):
    """Append random RY and RZ (and optionally RX) rotations to every qubit.

    Angles are drawn from ``rng.uniform(-pi, pi)`` qubit by qubit, in the
    order RY, RZ, RX.
    """
    rotations = [gates.RY, gates.RZ]
    if include_rx:
        rotations.append(gates.RX)
    for qubit in range(nqubits):
        for rotation in rotations:
            circuit.add(rotation(qubit, theta=rng.uniform(-math.pi, math.pi)))
 def build_circuit(kind, nqubits, nlayers, seed):
    """Define contest circuits here.

    Supported kinds:

    * ``"rxx_rzz_chain"``: RY/RZ/RX layer, then RXX and RZZ on a brickwall of
      neighbour pairs.
    * ``"scramble_chain"``: the same with a narrower angle range and a SWAP
      on each pair whenever ``layer % 5 == 4``.
    * ``"reversed_cnot"``: RY/RZ layer, then CNOTs on even and odd neighbour
      pairs whose control/target orientation alternates between layers.

    Raises:
        ValueError: if ``kind`` is unknown (detected while building a layer).
    """
    rng = np.random.default_rng(seed)
    circuit = Circuit(nqubits)
    for layer in range(nlayers):
        if kind in ("rxx_rzz_chain", "scramble_chain"):
            scramble = kind == "scramble_chain"
            span = 0.8 if scramble else 0.9
            add_single_qubit_layer(circuit, nqubits, rng, include_rx=True)
            for qubit in range(layer % 2, nqubits - 1, 2):
                pair = (qubit, qubit + 1)
                circuit.add(gates.RXX(*pair, theta=rng.uniform(-span, span)))
                circuit.add(gates.RZZ(*pair, theta=rng.uniform(-span, span)))
                if scramble and layer % 5 == 4:
                    circuit.add(gates.SWAP(*pair))
        elif kind == "reversed_cnot":
            add_single_qubit_layer(circuit, nqubits, rng)
            odd_layer = layer % 2 == 1
            for offset in (0, 1):
                # Even-offset pairs are reversed on odd layers, odd-offset
                # pairs on even layers.
                reverse = odd_layer if offset == 0 else not odd_layer
                for qubit in range(offset, nqubits - 1, 2):
                    if reverse:
                        circuit.add(gates.CNOT(qubit + 1, qubit))
                    else:
                        circuit.add(gates.CNOT(qubit, qubit + 1))
        else:
            raise ValueError(f"Unknown circuit kind {kind!r}.")
    return circuit
 def pauli_sum_observable(kind, nqubits, seed):
    """Define contest observables here.

    TN path currently expects Pauli products / SymbolicHamiltonian terms.
    Keep production contest observables Hermitian unless complex output is
    explicitly required by the scoring rule.

    Supported kinds: ``ring_xz``, ``open_zz``, ``range2_xx``,
    ``long_z_string`` and ``mixed_local``.

    Raises:
        ValueError: if ``kind`` is not one of the names above.
    """
    del seed  # not used by any observable defined below
    if kind == "ring_xz":
        ring = sum(0.5 * X(q) * Z((q + 1) % nqubits) for q in range(nqubits))
        return hamiltonians.SymbolicHamiltonian(form=ring)
    if kind == "open_zz":
        weight = 1.0 / max(1, nqubits - 1)
        chain = sum(weight * Z(q) * Z(q + 1) for q in range(nqubits - 1))
        return hamiltonians.SymbolicHamiltonian(form=chain)
    if kind == "range2_xx":
        weight = 1.0 / max(1, nqubits - 2)
        chain = sum(weight * X(q) * X(q + 2) for q in range(nqubits - 2))
        return hamiltonians.SymbolicHamiltonian(form=chain)
    if kind == "long_z_string":
        # One Z factor every ``step`` sites, multiplied into a single string.
        step = max(1, nqubits // 16)
        product = None
        for site in range(0, nqubits, step):
            factor = Z(site)
            product = factor if product is None else product * factor
        return hamiltonians.SymbolicHamiltonian(form=product)
    if kind == "mixed_local":
        quarter = nqubits // 4
        half = nqubits // 2
        three_quarters = (3 * nqubits) // 4
        mixed = (
            0.25 * X(0)
            - 0.5 * Z(nqubits - 1)
            + 0.125 * X(quarter) * Z(half) * Y(three_quarters)
        )
        return hamiltonians.SymbolicHamiltonian(form=mixed)
    raise ValueError(f"Unknown observable kind {kind!r}.")
 def tree_path(tree_dir, case_name, obs_name, nqubits, nlayers, target_slices):
    """Return the pickle path used for one (case, observable, size) tree.

    The file name ends in ``auto`` when ``target_slices`` is ``None`` and in
    ``s<target_slices>`` otherwise.
    """
    if target_slices is None:
        suffix = "auto"
    else:
        suffix = f"s{target_slices}"
    filename = f"{case_name}_{obs_name}_{nqubits}q{nlayers}l_{suffix}.pkl"
    return Path(tree_dir) / filename
 def build_parallel_opts(args, tree_file=None, search_only=False):
    """Translate parsed CLI options into the ``parallel_opts`` dict.

    Optional settings are only included when they were provided. With
    ``search_only`` the tree is saved to ``tree_file``; otherwise a given
    ``tree_file`` is loaded instead of searching.
    """
    slicing = {
        key: value
        for key, value in (
            ("target_slices", args.tn_target_slices),
            ("target_size", args.tn_target_size),
        )
        if value is not None
    }
    workers = args.tn_search_workers if args.tn_search_workers else args.torch_threads
    opts = {
        "slicing_opts": slicing if slicing else None,
        "search_workers": workers,
        "max_repeats": args.tn_search_repeats,
        "max_time": args.tn_search_time,
        "print_stats": False,
    }
    optional_values = (
        ("search_backend", args.tn_search_backend),
        ("dask_address", args.dask_address),
        ("dask_expected_workers", args.dask_expected_workers),
    )
    for key, value in optional_values:
        if value is not None:
            opts[key] = value
    boolean_flags = (
        ("dask_close_workers", args.dask_close_workers),
        ("debug_trials", args.tn_debug_trials),
    )
    for key, enabled in boolean_flags:
        if enabled:
            opts[key] = True
    if search_only:
        opts["search_only"] = True
        opts["save_tree_path"] = str(tree_file)
    elif tree_file is not None:
        opts["load_tree_path"] = str(tree_file)
    return opts
 def run_one(args, case_name, obs_name, mode):
    """Run one (case, observable) pair in ``mode`` and report on rank 0.

    ``mode`` is ``"search"`` (path search only, tree saved to disk) or
    ``"contract"`` (load the saved tree and contract). In contract mode the
    result line optionally includes the exact reference value and errors.
    Per-term ``tn_term_summary`` lines follow unless ``--no-tn-stats`` was
    given.

    Raises:
        FileNotFoundError: in contract mode when the tree file is missing.
        ValueError: when ``--exact`` is requested for more qubits than
            ``--exact-max-qubits`` allows.
    """
    case = CASES[case_name]
    circuit = build_circuit(case.circuit_kind, args.nqubits, args.nlayers, args.seed)
    observable = pauli_sum_observable(obs_name, args.nqubits, args.seed)
    path = tree_path(
        args.tree_dir,
        case_name,
        obs_name,
        args.nqubits,
        args.nlayers,
        args.tn_target_slices,
    )
    path.parent.mkdir(parents=True, exist_ok=True)
    rank = 0
    if args.mpi:
        from mpi4py import MPI
        rank = MPI.COMM_WORLD.Get_rank()
    if rank == 0:
        print("=" * 88, flush=True)
        print(
            f"mode={mode} case={case_name} circuit={case.circuit_kind} "
            f"observable={obs_name} nqubits={args.nqubits} nlayers={args.nlayers} "
            f"seed={args.seed} gates={len(circuit.queue)} tree={path}",
            flush=True,
        )
    if mode == "contract" and not path.exists():
        raise FileNotFoundError(f"Missing tree file: {path}. Run search first.")
    exact = None
    if args.exact and mode != "search":
        # Validate on every rank so an MPI job fails uniformly instead of
        # rank 0 raising while the other ranks enter the contraction.
        if args.nqubits > args.exact_max_qubits:
            raise ValueError(
                f"--exact is limited to {args.exact_max_qubits} qubits by default."
            )
        # The reference value itself is only needed where results are printed.
        if rank == 0:
            exact = exact_for_observable(circuit, observable, args.nqubits)
    config = ExpectationConfig(
        ansatz="tn",
        mpi=args.mpi,
        bond=args.bond,
        cut_ratio=args.cut_ratio,
        tensor_module="torch",
        quimb_backend=args.quimb_backend,
        dtype=args.dtype,
        torch_threads=args.torch_threads,
        parallel_opts=build_parallel_opts(
            args,
            tree_file=path,
            search_only=(mode == "search"),
        ),
    )
    result = run_cpu_expectation(circuit, observable, config)
    # Only the root rank reports.
    if args.mpi and result.rank != 0:
        return
    if mode == "search":
        print(f"searched observable={obs_name} tree={path}", flush=True)
    else:
        abs_error = float("nan") if exact is None else abs(result.value - exact)
        rel_error = float("nan") if exact is None else abs_error / max(abs(exact), 1e-15)
        exact_text = "nan" if exact is None else f"{exact:.16e}"
        print(
            f"result observable={obs_name} exact={exact_text} "
            f"value={result.value:.16e} abs_error={abs_error:.6e} "
            f"rel_error={rel_error:.6e} seconds={result.seconds:.3f}",
            flush=True,
        )
    # Honour --no-tn-stats, which was previously parsed but never consulted.
    if getattr(args, "no_tn_stats", False):
        return
    for stat in result.parallel_stats or ():
        cost = stat["path_cost"]
        search_stats = stat.get("search_stats", {})
        print(
            "tn_term_summary "
            f"observable={obs_name} "
            f"term={stat.get('term_index', 0)} "
            f"search_seconds={stat.get('search_seconds', float('nan')):.3f} "
            f"contract_seconds={stat.get('contract_seconds', float('nan')):.3f} "
            f"completed_trials={search_stats.get('completed_trials', 'na')} "
            f"finite_trials={search_stats.get('finite_trials', 'na')} "
            f"failed_trials={search_stats.get('failed_trials', 'na')} "
            f"requested_trials={search_stats.get('requested_trials', 'na')} "
            f"best_score={search_stats.get('best_score', float('nan')):.6g} "
            f"slices={cost.get('nslices')} "
            f"log10_flops={cost.get('log10_flops', float('nan')):.3f} "
            f"log10_write={cost.get('log10_write', float('nan')):.3f} "
            f"log2_size={cost.get('log2_size', float('nan')):.3f} "
            f"peak_memory_gib={cost.get('peak_memory_gib', float('nan')):.3g} "
            f"rank_slices={stat.get('rank_slices')}",
            flush=True,
        )
 def selected_observables(args, case):
    """Choose the observable names to run.

    Precedence: explicit ``args.observables``, then the comma-separated
    ``args.obs_filter`` (entries stripped, blanks dropped), then the case's
    own defaults.
    """
    if args.observables:
        return tuple(args.observables)
    if not args.obs_filter:
        return case.observables
    stripped = (piece.strip() for piece in args.obs_filter.split(","))
    return tuple(name for name in stripped if name)
 def apply_case_defaults(args):
    """Fill unset CLI options in place from the selected case's defaults.

    ``nqubits``, ``nlayers``, ``seed`` and ``tn_target_slices`` are replaced
    only when they are ``None``; ``observables`` is always resolved through
    ``selected_observables``.
    """
    case = CASES[args.case]
    fallbacks = (
        ("nqubits", case.nqubits),
        ("nlayers", case.nlayers),
        ("seed", case.seed),
        ("tn_target_slices", case.target_slices),
    )
    for attribute, default in fallbacks:
        if getattr(args, attribute) is None:
            setattr(args, attribute, default)
    args.observables = selected_observables(args, case)
 def stop_dask_cluster(args):
    """Best-effort shutdown of the external dask cluster after path search.

    Nothing happens when ``--keep-dask`` is set, the search backend is not
    dask, or no ``--dask-address`` was given. Under MPI only rank 0 acts.
    The scheduler host and port parsed from the address are offered to the
    stop script as ``SCHEDULER_HOST`` / ``SCHEDULER_PORT`` unless those
    variables are already set. Failures are logged, never raised, so a
    finished search is not lost to a cleanup problem.
    """
    if args.keep_dask or args.tn_search_backend != "dask" or not args.dask_address:
        return
    if args.mpi:
        from mpi4py import MPI
        if MPI.COMM_WORLD.Get_rank() != 0:
            return
    script = ROOT / "tools" / "manage_tn_dask_cluster.sh"
    if not script.exists():
        print(f"dask_stop_skipped reason=missing_script path={script}", flush=True)
        return
    env = os.environ.copy()
    parsed = urlparse(args.dask_address)
    if parsed.hostname:
        env.setdefault("SCHEDULER_HOST", parsed.hostname)
    if parsed.port:
        env.setdefault("SCHEDULER_PORT", str(parsed.port))
    print("dask_stop_after_search start", flush=True)
    try:
        completed = subprocess.run(
            [str(script), "stop"], cwd=str(ROOT), env=env, check=False
        )
    except OSError as exc:
        # e.g. the script is not executable; cleanup stays best-effort.
        print(
            f"dask_stop_after_search failed error={type(exc).__name__}:{exc}",
            flush=True,
        )
        return
    if completed.returncode != 0:
        print(
            f"dask_stop_after_search nonzero_exit returncode={completed.returncode}",
            flush=True,
        )
    print("dask_stop_after_search done", flush=True)
 def main():
    """Command-line entry point of the contest runner.

    ``list`` prints the registered cases. ``search`` and ``contract`` run the
    corresponding phase for every selected observable, ``all`` runs both in
    sequence, and ``validate`` does the same with ``--exact`` forced on and
    the qubit count capped at ``--exact-max-qubits``. After a search phase
    the external dask cluster is stopped (see ``stop_dask_cluster``).
    """
    parser = argparse.ArgumentParser()
    add = parser.add_argument
    add("mode", choices=("search", "contract", "all", "validate", "list"))
    add("--case", choices=sorted(CASES), default="main1")
    add("--observables", nargs="+")
    add("--obs-filter", default="")
    add("--tree-dir", default="trees/contest_tn")
    add("--nqubits", type=int)
    add("--nlayers", type=int)
    add("--seed", type=int)
    add("--mpi", action="store_true")
    add("--exact", action="store_true")
    add("--exact-max-qubits", type=int, default=24)
    add("--bond", "--bonds", dest="bond", type=optional_int, default=1024)
    add("--cut-ratio", type=optional_float, default=1e-12)
    add("--torch-threads", type=int, default=8)
    add("--quimb-backend", choices=("numpy", "torch"), default="torch")
    add("--dtype", choices=("complex128", "complex64"), default="complex64")
    add("--tn-target-slices", type=int)
    add("--tn-target-size", type=int, default=2**34)
    add("--tn-search-workers", type=int)
    add("--tn-search-repeats", type=int, default=2048)
    add("--tn-search-time", type=float, default=300.0)
    add(
        "--tn-search-backend",
        choices=("processpool", "dask"),
        default="dask",
        help=(
            "Path-search backend. Defaults to dask. Without --dask-address, "
            "non-MPI search starts a local dask cluster."
        ),
    )
    add("--dask-address")
    add("--dask-expected-workers", type=int)
    add("--dask-close-workers", action="store_true")
    add(
        "--keep-dask",
        action="store_true",
        help=(
            "Keep an external dask cluster running after search. By default, "
            "tools/manage_tn_dask_cluster.sh stop is called after search when "
            "--dask-address is used."
        ),
    )
    add(
        "--tn-debug-trials",
        action="store_true",
        help="Print dask worker summary and per-trial start/done logs.",
    )
    add("--no-tn-stats", action="store_true")
    args = parser.parse_args()
    if args.mode == "list":
        for name, case in CASES.items():
            summary = (
                f"{name}: circuit={case.circuit_kind} "
                f"observables={','.join(case.observables)} "
                f"nqubits={case.nqubits} nlayers={case.nlayers} "
                f"seed={case.seed} target_slices={case.target_slices}"
            )
            print(summary)
        return
    apply_case_defaults(args)
    set_torch_threads(args.torch_threads)
    if args.mode == "validate":
        # Validation compares against an exact reference, so keep the
        # register small enough for it.
        args.exact = True
        args.nqubits = min(args.nqubits, args.exact_max_qubits)
    if args.mode in ("all", "validate"):
        phases = ("search", "contract")
    else:
        phases = (args.mode,)
    for phase in phases:
        for obs_name in args.observables:
            run_one(args, args.case, obs_name, phase)
        if phase == "search":
            stop_dask_cluster(args)
 if __name__ == "__main__":
    main()
--- a/tools/torch_profile_tn_complex64.py
+++ b/tools/torch_profile_tn_complex64.py
@@ -1,114 +0,0 @@
 """Run the 34q/20L TN complex64 benchmark under torch.profiler briefly."""
 from __future__ import annotations
 import argparse
 import os
 import signal
 import sys
 from pathlib import Path
 from mpi4py import MPI
 def main():
    """Profile a fixed benchmark_cpu_expectation run under torch.profiler.

    Command-line options:
        --seconds: wall-clock budget before the benchmark is interrupted.
        --out-dir: directory receiving the per-rank trace/stacks/summary files.
        --torch-threads: value passed to ``torch.set_num_threads`` and forwarded
            to the benchmark's own ``--torch-threads`` option.

    Every MPI rank writes ``rank{rank}_trace.json``, ``rank{rank}_stacks.txt``
    and ``rank{rank}_summary.txt`` into the output directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--seconds", type=float, default=30.0)
    parser.add_argument("--out-dir", default="torch_profiles/tn_complex64")
    parser.add_argument("--torch-threads", type=int, default=48)
    args = parser.parse_args()
    # Work from the directory one level above this file's directory and put it
    # first on sys.path; presumably this is what makes the later
    # ``import benchmark_cpu_expectation`` resolvable -- confirm.
    repo_root = Path(__file__).resolve().parents[1]
    os.chdir(repo_root)
    sys.path.insert(0, str(repo_root))
    import torch
    from torch.profiler import ProfilerActivity, profile
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    out_dir = Path(args.out_dir)
    # Only rank 0 creates the output directory; the barrier holds the other
    # ranks until that has happened.
    if rank == 0:
        out_dir.mkdir(parents=True, exist_ok=True)
    comm.Barrier()
    torch.set_num_threads(args.torch_threads)
    def run_benchmark():
        """Run benchmark_cpu_expectation.main() with a hard-coded argv."""
        import benchmark_cpu_expectation
        # NOTE: sys.argv is overwritten here and never restored.
        sys.argv = [
            "benchmark_cpu_expectation.py",
            "--mpi",
            "--ansatz",
            "tn",
            "--nqubits",
            "34",
            "--nlayers",
            "20",
            "--circuits",
            "rxx_rzz",
            "--pauli-pattern",
            "XZ",
            "--tn-load-tree",
            "trees/rxx_rzz_34q20l_s4.pkl",
            "--quimb-backend",
            "torch",
            "--torch-threads",
            str(args.torch_threads),
            "--dtype",
            "complex64",
        ]
        benchmark_cpu_expectation.main()
    trace_path = out_dir / f"rank{rank}_trace.json"
    stacks_path = out_dir / f"rank{rank}_stacks.txt"
    summary_path = out_dir / f"rank{rank}_summary.txt"
    # CPU-only profiling with shapes, memory and Python stacks recorded.
    prof = profile(
        activities=[ProfilerActivity.CPU],
        record_shapes=True,
        profile_memory=True,
        with_stack=True,
    )
    class ProfileTimeout(Exception):
        """Raised from the SIGALRM handler to cut the benchmark short."""
        pass
    def alarm_handler(signum, frame):
        """Signal handler that turns SIGALRM into ProfileTimeout."""
        raise ProfileTimeout()
    # Arm a one-shot real-time timer: after args.seconds SIGALRM fires and the
    # handler raises ProfileTimeout.
    old_handler = signal.signal(signal.SIGALRM, alarm_handler)
    signal.setitimer(signal.ITIMER_REAL, args.seconds)
    try:
        with prof:
            # The timeout is caught inside the ``with`` so the profiler context
            # still exits normally.
            # NOTE(review): only run_benchmark() is guarded; a timeout raised
            # while entering or leaving the profiler context would propagate.
            try:
                run_benchmark()
            except ProfileTimeout:
                pass
    finally:
        # Always disarm the timer and restore the previous SIGALRM handler.
        signal.setitimer(signal.ITIMER_REAL, 0)
        signal.signal(signal.SIGALRM, old_handler)
    prof.export_chrome_trace(str(trace_path))
    try:
        prof.export_stacks(str(stacks_path), "self_cpu_time_total")
    except Exception as exc:  # pragma: no cover - diagnostic only
        # Stack export is treated as optional: record the failure in the stacks
        # file instead of aborting the remaining exports.
        stacks_path.write_text(f"export_stacks failed: {exc}\n", encoding="utf-8")
    # Text summary: 40 rows sorted by self CPU time, grouped by stack.
    summary = prof.key_averages(group_by_stack_n=5).table(
        sort_by="self_cpu_time_total",
        row_limit=40,
    )
    summary_path.write_text(summary, encoding="utf-8")
    print(
        f"torch_profile_done rank={rank}/{size} "
        f"trace={trace_path} summary={summary_path}",
        flush=True,
    )
 # Run the profiler CLI only when this file is executed as a script.
 if __name__ == "__main__":
    main()
--- a/tools/validate_vidal_mpi_correctness.py
+++ b/tools/validate_vidal_mpi_correctness.py
@@ -1,202 +0,0 @@
 """Correctness checks for the Vidal/TEBD MPS fast path.
 The cases here intentionally cover more than the benchmark ring-XZ observable:
 different nearest-neighbor gate orientations and several Pauli-sum observables.
 Run serially to compare qibojit/statevector vs Vidal, or under MPI to compare
 the segmented Vidal executor.
 """
 from __future__ import annotations
 import argparse
 import math
 import time
 import numpy as np
 import torch
 from qibo import Circuit, gates
 from qibotn.backends.vidal_mpi_segment import SegmentVidalMPIExecutor
 from qibotn.backends.vidal_tebd import VidalTEBDExecutor
 def build_circuit(kind, nqubits, nlayers, seed):
    """Build a layered qibo test circuit with seeded random rotation angles.

    Every layer applies RY and RZ to each qubit (plus RX for ``"rx_ry_cz"``),
    followed by a nearest-neighbour entangling pattern selected by ``kind``:

    * ``"brickwall"``: CNOT(q, q + 1) on even pairs, then on odd pairs.
    * ``"reversed_cnot"``: same pairs, but the CNOT arguments are swapped on
      even pairs in odd layers and on odd pairs in even layers.
    * ``"rx_ry_cz"``: CZ on pairs starting at ``layer % 2``.

    Raises:
        ValueError: for any other ``kind`` (raised while building the first
            layer, so never when ``nlayers`` is 0).
    """
    generator = np.random.default_rng(seed)

    def random_angle():
        # One draw per rotation gate, in gate order.
        return generator.uniform(-math.pi, math.pi)

    uses_cz = kind == "rx_ry_cz"
    alternates = kind == "reversed_cnot"
    model = Circuit(nqubits)
    for depth in range(nlayers):
        for site in range(nqubits):
            model.add(gates.RY(site, theta=random_angle()))
            model.add(gates.RZ(site, theta=random_angle()))
            if uses_cz:
                model.add(gates.RX(site, theta=random_angle()))
        if kind in ("brickwall", "reversed_cnot"):
            odd_layer = bool(depth % 2)
            for offset in (0, 1):
                # Even pairs are swapped on odd layers, odd pairs on even ones.
                swapped = alternates and (odd_layer != bool(offset))
                for left in range(offset, nqubits - 1, 2):
                    if swapped:
                        model.add(gates.CNOT(left + 1, left))
                    else:
                        model.add(gates.CNOT(left, left + 1))
        elif uses_cz:
            for left in range(depth % 2, nqubits - 1, 2):
                model.add(gates.CZ(left, left + 1))
        else:
            raise ValueError(f"Unknown circuit kind {kind!r}.")
    return model
 def observable_terms(kind, nqubits):
    """Return the Pauli-sum terms of a named observable.

    Each term is ``(coefficient, ((pauli_name, site), ...))``.

    Raises:
        ValueError: if ``kind`` is not ``"ring_xz"``, ``"open_zz"`` or
            ``"mixed_local"``.
    """
    if kind not in ("ring_xz", "open_zz", "mixed_local"):
        raise ValueError(f"Unknown observable kind {kind!r}.")
    if kind == "mixed_local":
        # Two single-site terms on the end qubits plus YY on every third bond.
        edge_terms = [(0.25, (("X", 0),)), (-0.5, (("Z", nqubits - 1),))]
        bond_terms = [
            (0.125, (("Y", left), ("Y", left + 1)))
            for left in range(0, nqubits - 1, 3)
        ]
        return edge_terms + bond_terms
    if kind == "open_zz":
        bonds = zip(range(nqubits - 1), range(1, nqubits))
        # The weight stays inside the comprehension so that an empty chain
        # never evaluates the division.
        return [
            (1.0 / (nqubits - 1), (("Z", left), ("Z", right)))
            for left, right in bonds
        ]
    # Remaining case: "ring_xz", periodic X_i Z_{i+1}.
    return [
        (0.5, (("X", left), ("Z", (left + 1) % nqubits)))
        for left in range(nqubits)
    ]
 def exact_pauli_sum(circuit, terms, nqubits):
    """Evaluate a Pauli-sum expectation value from the full statevector.

    ``circuit()`` is executed and ``.state(numpy=True)`` is flattened. Site
    ``s`` maps to bit ``nqubits - 1 - s`` of the basis-state index. Each term is
    ``(coeff, ((pauli_name, site), ...))`` with names X, Y, Z or I in any case.

    Returns:
        The real part of the summed expectation value as a ``float``.

    Raises:
        ValueError: if a Pauli name is not one of X, Y, Z, I.
    """
    amplitudes = circuit().state(numpy=True).reshape(-1)
    basis = np.arange(amplitudes.size, dtype=np.int64)
    total = 0.0 + 0.0j
    for weight, paulis in terms:
        # target[i] is the basis index P|i> lands on; factor[i] is its phase.
        target = basis.copy()
        factor = np.ones(amplitudes.size, dtype=np.complex128)
        for label, site in paulis:
            position = nqubits - 1 - site
            sign = 1 - 2 * ((basis >> position) & 1)
            pauli = label.upper()
            if pauli in ("X", "Y"):
                target ^= 1 << position
            if pauli == "Y":
                factor *= 1j * sign
            elif pauli == "Z":
                factor *= sign
            elif pauli not in ("X", "I"):
                raise ValueError(f"Unsupported Pauli {pauli!r}.")
        total += weight * np.vdot(amplitudes[target], factor * amplitudes)
    return float(total.real)
 def run_vidal(circuit, terms, nqubits, bond, tensor_module):
    """Run ``circuit`` on a VidalTEBDExecutor and return the Pauli-sum value.

    The executor is created with ``max_bond=bond`` and a fixed
    ``cut_ratio=1e-12``; the expectation value is converted to ``float``.
    """
    settings = {
        "nqubits": nqubits,
        "max_bond": bond,
        "cut_ratio": 1e-12,
        "tensor_module": tensor_module,
    }
    simulator = VidalTEBDExecutor(**settings)
    simulator.run_circuit(circuit)
    expectation = simulator.expectation_pauli_sum(terms)
    return float(expectation)
 def run_segment_mpi(circuit, terms, nqubits, bond, tensor_module, comm):
    """Run ``circuit`` on a SegmentVidalMPIExecutor over ``comm``.

    Uses ``max_bond=bond`` and a fixed ``cut_ratio=1e-12``. The result of
    ``expectation_pauli_sum_root`` is returned unchanged (presumably only
    meaningful on the root rank -- confirm against the executor).
    """
    settings = {
        "nqubits": nqubits,
        "max_bond": bond,
        "cut_ratio": 1e-12,
        "tensor_module": tensor_module,
        "comm": comm,
    }
    simulator = SegmentVidalMPIExecutor(**settings)
    simulator.run_circuit(circuit)
    root_value = simulator.expectation_pauli_sum_root(terms)
    return root_value
 def main():
    """Compare Vidal expectation values with exact statevector results.

    For every requested circuit kind and observable, the exact value is
    computed on rank 0 with ``exact_pauli_sum`` and compared with the Vidal
    result (the serial executor, or the segmented MPI executor with ``--mpi``).
    Rank 0 prints one line per pair: circuit, observable, exact value,
    Vidal value, absolute error and elapsed seconds.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--nqubits", type=int, default=16)
    parser.add_argument("--nlayers", type=int, default=6)
    parser.add_argument("--bond", "--bonds", dest="bond", type=int, default=512)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--tensor-module", choices=("torch", "numpy"), default="torch")
    parser.add_argument("--torch-threads", type=int, default=32)
    parser.add_argument("--mpi", action="store_true")
    parser.add_argument(
        "--circuits",
        nargs="+",
        default=("brickwall", "reversed_cnot", "rx_ry_cz"),
    )
    parser.add_argument(
        "--observables",
        nargs="+",
        default=("ring_xz", "open_zz", "mixed_local"),
    )
    args = parser.parse_args()
    torch.set_num_threads(args.torch_threads)
    # Serial defaults; replaced by the MPI world communicator with --mpi.
    comm = None
    rank = 0
    size = 1
    if args.mpi:
        # Imported lazily so serial runs do not need mpi4py.
        from mpi4py import MPI
        comm = MPI.COMM_WORLD
        rank = comm.Get_rank()
        size = comm.Get_size()
    if rank == 0:
        mode = f"vidal-segment-mpi/{size}" if args.mpi else "vidal"
        print(
            f"mode={mode} nqubits={args.nqubits} nlayers={args.nlayers} "
            f"bond={args.bond} tensor_module={args.tensor_module}"
        )
        print("circuit observable exact value abs_error seconds")
    for circuit_kind in args.circuits:
        circuit = build_circuit(circuit_kind, args.nqubits, args.nlayers, args.seed)
        # Placeholder only; reassigned from exact_values before it is printed.
        exact = None
        # The exact statevector reference is computed on rank 0 only and then
        # broadcast when a communicator is in use.
        if rank == 0:
            exact_values = {
                obs: exact_pauli_sum(
                    circuit, observable_terms(obs, args.nqubits), args.nqubits
                )
                for obs in args.observables
            }
        else:
            exact_values = None
        if comm is not None:
            exact_values = comm.bcast(exact_values, root=0)
        for obs_kind in args.observables:
            terms = observable_terms(obs_kind, args.nqubits)
            # The timer covers executor construction, the circuit run and the
            # expectation value.
            start = time.perf_counter()
            if args.mpi:
                value = run_segment_mpi(
                    circuit,
                    terms,
                    args.nqubits,
                    args.bond,
                    args.tensor_module,
                    comm,
                )
            else:
                value = run_vidal(
                    circuit, terms, args.nqubits, args.bond, args.tensor_module
                )
            # Every rank takes part in the run above; only rank 0 reports.
            if rank != 0:
                continue
            elapsed = time.perf_counter() - start
            exact = exact_values[obs_kind]
            print(
                f"{circuit_kind} {obs_kind} {exact:.16e} {value:.16e} "
                f"{abs(value - exact):.6e} {elapsed:.3f}"
            )
 # Run the correctness checks only when this file is executed as a script.
 if __name__ == "__main__":
    main()
--- a/tools/vidal_mpi_contest_runner.py
+++ b/tools/vidal_mpi_contest_runner.py
@@ -1,209 +0,0 @@
 from __future__ import annotations
 import argparse
 import math
 import time
 import numpy as np
 from mpi4py import MPI
 from qibo import Circuit, gates, hamiltonians
 from qibo.symbols import X, Y, Z
 from qibotn.backends.vidal import VidalBackend
 def optional_int(text):
    """Parse an integer, mapping none/null/inf/unlimited (any case) to None."""
    means_unset = isinstance(text, str) and text.lower() in {
        "none",
        "null",
        "inf",
        "unlimited",
    }
    return None if means_unset else int(text)
 def optional_float(text):
    """Parse a float, mapping none/null/inf/unlimited (any case) to None."""
    if not isinstance(text, str):
        return float(text)
    if text.lower() in {"none", "null", "inf", "unlimited"}:
        return None
    return float(text)
 def format_optional(value, fmt="g"):
    """Format ``value`` with format spec ``fmt``; ``None`` becomes "None"."""
    if value is None:
        return "None"
    return format(value, fmt)
 def set_torch_threads(nthreads):
    """Best-effort ``torch.set_num_threads(nthreads)``.

    Any failure (torch not importable, invalid value, ...) is deliberately
    ignored so callers can run without torch.
    """
    try:
        from torch import set_num_threads
    except Exception:
        return
    try:
        set_num_threads(nthreads)
    except Exception:
        return
 def build_circuit(kind, nqubits, nlayers, seed):
    """Build a layered qibo benchmark circuit with seeded random parameters.

    Every layer applies RY and RZ to each qubit (plus RX for ``"rxx_rzz"`` and
    ``"scramble"``), followed by a nearest-neighbour pattern:

    * ``"reversed_cnot"``: CNOTs on even then odd pairs; the arguments are
      swapped on even pairs in odd layers and on odd pairs in even layers.
    * ``"rxx_rzz"``: RXX then RZZ on pairs starting at ``layer % 2``, with
      angles drawn from [-0.9, 0.9).
    * ``"scramble"``: as ``"rxx_rzz"`` with angles from [-0.8, 0.8), plus a
      SWAP on each pair when ``layer % 5 == 4``.

    Raises:
        ValueError: for any other ``kind`` (raised while building the first
            layer, so never when ``nlayers`` is 0).
    """
    generator = np.random.default_rng(seed)

    def random_angle():
        # One draw per single-qubit rotation, in gate order.
        return generator.uniform(-math.pi, math.pi)

    has_rx = kind in ("rxx_rzz", "scramble")
    model = Circuit(nqubits)
    for depth in range(nlayers):
        for site in range(nqubits):
            model.add(gates.RY(site, theta=random_angle()))
            model.add(gates.RZ(site, theta=random_angle()))
            if has_rx:
                model.add(gates.RX(site, theta=random_angle()))
        if kind == "reversed_cnot":
            for offset in (0, 1):
                # Even pairs are swapped on odd layers, odd pairs on even ones.
                swapped = depth % 2 != offset
                for left in range(offset, nqubits - 1, 2):
                    if swapped:
                        model.add(gates.CNOT(left + 1, left))
                    else:
                        model.add(gates.CNOT(left, left + 1))
        elif has_rx:
            span = 0.9 if kind == "rxx_rzz" else 0.8
            add_swap = kind == "scramble" and depth % 5 == 4
            for left in range(depth % 2, nqubits - 1, 2):
                right = left + 1
                model.add(gates.RXX(left, right, theta=generator.uniform(-span, span)))
                model.add(gates.RZZ(left, right, theta=generator.uniform(-span, span)))
                if add_swap:
                    model.add(gates.SWAP(left, right))
        else:
            raise ValueError(f"Unknown circuit kind {kind!r}.")
    return model
 def ring_xz(nqubits):
    """Return the periodic sum of 0.5 * X(q) * Z(q + 1) as a SymbolicHamiltonian."""
    bonds = ((site, (site + 1) % nqubits) for site in range(nqubits))
    # sum() starts from the integer 0, like the explicit accumulator it replaces.
    expression = sum(0.5 * X(left) * Z(right) for left, right in bonds)
    return hamiltonians.SymbolicHamiltonian(form=expression)
 def open_zz(nqubits):
    """Return the open-chain average of Z(q) * Z(q + 1) as a SymbolicHamiltonian."""
    # The 1 / (nqubits - 1) weight is evaluated per term, so a chain without
    # bonds never performs the division.
    expression = sum(
        (1.0 / (nqubits - 1)) * Z(left) * Z(left + 1)
        for left in range(nqubits - 1)
    )
    return hamiltonians.SymbolicHamiltonian(form=expression)
 def range2_xx(nqubits):
    """Return the average of X(q) * X(q + 2) over the chain as a SymbolicHamiltonian."""
    # The 1 / (nqubits - 2) weight is evaluated per term, so an empty range
    # never performs the division.
    expression = sum(
        (1.0 / (nqubits - 2)) * X(left) * X(left + 2)
        for left in range(nqubits - 2)
    )
    return hamiltonians.SymbolicHamiltonian(form=expression)
 def dense_observable(nqubits, qubits, seed, dim):
    """Return a seeded random Hermitian ``dim x dim`` observable.

    The matrix is the Hermitian part of a complex Gaussian matrix, divided by
    its ``np.linalg.norm``. ``nqubits`` is accepted but not used.

    Returns:
        ``{"matrix": ndarray, "qubits": list(qubits)}``.
    """
    generator = np.random.default_rng(seed)
    shape = (dim, dim)
    # Draw the real part first, then the imaginary part.
    real_part = generator.normal(size=shape)
    imag_part = generator.normal(size=shape)
    gaussian = real_part + 1j * imag_part
    hermitian = (gaussian + gaussian.conj().T) / 2.0
    normalised = hermitian / np.linalg.norm(hermitian)
    return {"matrix": normalised, "qubits": list(qubits)}
 def observables_for_case(nqubits, seed):
    """Return the ordered ``(name, observable)`` list used by the runner.

    Symbolic observables are SymbolicHamiltonian instances; the two dense ones
    are the dicts produced by ``dense_observable`` with seeds ``seed + 101``
    and ``seed + 202``.
    """
    quarter = nqubits // 4
    half = nqubits // 2
    three_quarters = (3 * nqubits) // 4
    end = nqubits - 1

    def symbolic(form):
        return hamiltonians.SymbolicHamiltonian(form=form)

    # ZZ across the bond just left of each quarter point.
    cases = [
        (name, symbolic(Z(site - 1) * Z(site)))
        for name, site in (
            ("boundary_ZZ_q1", quarter),
            ("boundary_ZZ_q2", half),
            ("boundary_ZZ_q3", three_quarters),
        )
    ]
    cases.append(
        (
            "long_Z_5_sites",
            symbolic(Z(0) * Z(quarter) * Z(half) * Z(three_quarters) * Z(end)),
        )
    )
    cases.append(
        (
            "mixed_XZYZX",
            symbolic(X(0) * Z(quarter) * Y(half) * Z(three_quarters) * X(end)),
        )
    )
    cases.append(("ring_xz", ring_xz(nqubits)))
    cases.append(("open_zz", open_zz(nqubits)))
    cases.append(("range2_xx", range2_xx(nqubits)))
    cases.append(("complex_iZ0", symbolic(1.0j * Z(0))))
    cases.append(
        ("dense2_mid", dense_observable(nqubits, (half - 1, half), seed + 101, 4))
    )
    cases.append(
        (
            "dense3_spread",
            dense_observable(nqubits, (quarter, half, three_quarters), seed + 202, 8),
        )
    )
    return cases
 def run_case(args):
    """Run one benchmark case and print a result line per observable.

    Builds the circuit described by ``args``, optionally filters the
    observables by the comma-separated ``args.obs_filter``, then evaluates each
    observable with a fresh ``VidalBackend``. Rank 0 prints a header followed
    by value, wall time, truncation statistics and status for each observable.

    Raises:
        ValueError: if ``args.obs_filter`` is set but matches no observable.
    """
    set_torch_threads(args.torch_threads)
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
    circuit = build_circuit(args.kind, args.nqubits, args.nlayers, args.seed)
    observables = observables_for_case(args.nqubits, args.seed)
    if args.obs_filter:
        # Keep only the named observables, preserving their original order.
        wanted = set(args.obs_filter.split(","))
        observables = [(name, obs) for name, obs in observables if name in wanted]
        if not observables:
            raise ValueError(f"OBS_FILTER matched no observables: {args.obs_filter!r}")
    if rank == 0:
        print("=" * 88, flush=True)
        print(
            "case "
            f"label={args.label} kind={args.kind} ranks={size} "
            f"nqubits={args.nqubits} nlayers={args.nlayers} gates={len(circuit.queue)} "
            f"bond={format_optional(args.bond)} "
            f"cut_ratio={format_optional(args.cut_ratio)} "
            f"torch_threads={args.torch_threads} seed={args.seed} "
            f"obs_filter={args.obs_filter or 'all'}",
            flush=True,
        )
        print(
            "observable value seconds trunc_sum trunc_max status",
            flush=True,
        )
    for obs_name, observable in observables:
        # A new backend is configured for every observable.
        backend = VidalBackend()
        backend.configure_tn_simulation(
            max_bond_dimension=args.bond,
            cut_ratio=args.cut_ratio,
            tensor_module="torch",
            mpi_approach="CT",
            mpi_num_procs=size,
            fallback=False,
        )
        # Synchronise all ranks before starting the timer.
        comm.Barrier()
        start = time.perf_counter()
        try:
            value = backend.expectation(
                circuit,
                observable,
                preprocess=True,
                compile_circuit=False,
            )
            status = "ok"
        except Exception as exc:  # pragma: no cover - printed for manual runs
            # Report the failure in the status column (exception type plus the
            # first line of its message) and continue with the next observable.
            # NOTE(review): confirm that a failure on only some ranks cannot
            # leave the remaining ranks waiting in a collective call.
            value = np.nan
            status = type(exc).__name__ + ":" + str(exc).split("\n", 1)[0]
        seconds = time.perf_counter() - start
        if rank == 0:
            # NOTE(review): assumes last_truncation_error and
            # last_max_truncation_error are set even when expectation() raised
            # -- TODO confirm.
            print(
                f"{obs_name} {value!r} {seconds:.3f} "
                f"{backend.last_truncation_error:.6e} "
                f"{backend.last_max_truncation_error:.6e} {status}",
                flush=True,
            )
 def main():
    """Parse the command line (all case options are required) and run the case."""
    cli = argparse.ArgumentParser()
    cli.add_argument("--label", required=True)
    cli.add_argument(
        "--kind",
        choices=("reversed_cnot", "rxx_rzz", "scramble"),
        required=True,
    )
    # Remaining required options differ only in name and converter; they are
    # registered in the original order.
    typed_options = (
        ("--nqubits", int),
        ("--nlayers", int),
        ("--bond", optional_int),
        ("--cut-ratio", optional_float),
        ("--seed", int),
        ("--torch-threads", int),
    )
    for flag, converter in typed_options:
        cli.add_argument(flag, type=converter, required=True)
    cli.add_argument("--obs-filter", default="")
    parsed = cli.parse_args()
    run_case(parsed)
 # Run the contest case only when this file is executed as a script.
 if __name__ == "__main__":
    main()
--- a/trees/contest_tn/main1_long_z_string_34q20l_auto.pkl
+++ b/trees/contest_tn/main1_long_z_string_34q20l_auto.pkl