diff --git a/.venv/lib/python3.12/site-packages/quimb/tensor/circuit.py b/.venv/lib/python3.12/site-packages/quimb/tensor/circuit.py
index 1bbf75f..b75c52d 100644
--- a/.venv/lib/python3.12/site-packages/quimb/tensor/circuit.py
+++ b/.venv/lib/python3.12/site-packages/quimb/tensor/circuit.py
@@ -1573,6 +1573,23 @@ def _combine_1q_gate_run(gates, array_fn=None):
     return Gate.from_raw(G, gates[0].qubits)
 
 
+def _combine_2q_gate_run(gates, array_fn=None):
+    """Fuse a run of ``(index, gate)`` two qubit gates into a single gate."""
+
+    def as_matrix(gate):
+        # optionally convert the raw array, then view it as a 4x4 matrix
+        data = gate.array
+        if array_fn is not None:
+            data = array_fn(data)
+        return reshape(data, (4, 4))
+
+    run = tuple(gate for _, gate in gates)
+    G = as_matrix(run[0])
+    for gate in run[1:]:
+        G = as_matrix(gate) @ G  # later gates multiply from the left
+    return Gate.from_raw(reshape(G, (2, 2, 2, 2)), run[0].qubits)
+
+
 def _can_merge_1q_gate(gate):
     return (
         (gate.controls is None)
@@ -1583,48 +1600,96 @@ def _can_merge_1q_gate(gate):
     )
 
 
-def _iter_gates_with_merged_1q_runs(gates):
+def _can_merge_2q_gate(gate):
+    """Whether ``gate`` is an uncontrolled, constant two qubit gate."""
+    if gate.controls is not None or gate.special or gate.parametrize:
+        return False
+    # without explicit qubits there is nothing to key a run on
+    if gate.qubits is None:
+        return False
+    return len(gate.qubits) == 2
+
+
+def _iter_gates_with_merged_runs(gates, merge_1q=True, merge_2q=True):
     """Yield ``(gate_to_apply, gates_to_record)``, merging adjacent runs of
-    single qubit gates that are not interrupted by any operation touching the
-    same qubit.
+    local gates that are not interrupted by any operation touching the same
+    qubits.
     """
-    pending = {}
+    pending_1q = {}
+    pending_2q = {}
 
     def flush_qubit(q):
-        run = pending.pop(q, None)
+        run = pending_1q.pop(q, None)
         if run is None:
             return
         if len(run) == 1:
             return run[0][1], run
         return None, run
 
+    def flush_pair(pair):
+        # a lone gate is handed back unchanged, whereas a longer run is
+        # marked with ``None`` in the first slot of the returned tuple
+        run = pending_2q.pop(pair, None)
+        if run is None:
+            return None
+        return (run[0][1] if len(run) == 1 else None), run
+
+    def flush_touched(touched, keep_qubit=None, keep_pair=None):
+        """Flush every pending run overlapping ``touched``, skipping the run
+        keyed by ``keep_qubit`` or ``keep_pair``.
+        """
+        hit_qubits = [
+            q for q in pending_1q if q != keep_qubit and q in touched
+        ]
+        hit_pairs = [
+            pair
+            for pair in pending_2q
+            if pair != keep_pair and touched.intersection(pair)
+        ]
+        for flush, keys in ((flush_qubit, hit_qubits), (flush_pair, hit_pairs)):
+            for item in map(flush, keys):
+                if item is not None:
+                    yield item
+
     def flush_all():
-        for q in tuple(pending):
+        for q in tuple(pending_1q):
             item = flush_qubit(q)
             if item is not None:
                 yield item
+        for pair in tuple(pending_2q):
+            item = flush_pair(pair)
+            if item is not None:
+                yield item
 
     for i, gate in enumerate(gates):
-        if _can_merge_1q_gate(gate):
+        if merge_1q and _can_merge_1q_gate(gate):
             (q,) = gate.qubits
-            pending.setdefault(q, []).append((i, gate))
+            yield from flush_touched({q}, keep_qubit=q)
+            pending_1q.setdefault(q, []).append((i, gate))
+            continue
+
+        if merge_2q and _can_merge_2q_gate(gate):
+            pair = gate.qubits
+            yield from flush_touched(set(pair), keep_pair=pair)
+            pending_2q.setdefault(pair, []).append((i, gate))
             continue
 
         touched = set(gate.qubits or ())
         if gate.controls:
             touched.update(gate.controls)
 
-        for q in tuple(pending):
-            if q in touched:
-                item = flush_qubit(q)
-                if item is not None:
-                    yield item
+        yield from flush_touched(touched)
 
         yield gate, ((i, gate),)
 
     yield from flush_all()
 
 
+_iter_gates_with_merged_1q_runs = functools.partial(
+    _iter_gates_with_merged_runs, merge_1q=True, merge_2q=False
+)  # keeps the previous 1q-only iterator name importable
+
+
 # --------------------------- main circuit class ---------------------------- #
 
 
@@ -2103,6 +2168,24 @@ class Circuit:
 
         self._psi.gate_(G, gates[0][1].qubits, tags=tags, **opts)
 
+    def _apply_merged_2q_gate_run(self, gates, gate_number_offset=0, **gate_opts):
+        """Apply a run of ``(index, gate)`` two qubit gates as one fused gate."""
+        # the fused tensor carries the record tags of every gate in the run
+        tags = tags_to_oset(gate_opts.pop("tags", None))
+        for i, gate in gates:
+            tags |= self._gate_tags_for_record(
+                gate, gate_number=gate_number_offset + i
+            )
+
+        # per-call options take precedence over the circuit level defaults
+        opts = {**self.gate_opts, **gate_opts}
+
+        # the individual arrays are only converted up front in eager mode
+        array_fn = self._maybe_convert_gate_array if self.convert_eager else None
+        G = _combine_2q_gate_run(gates, array_fn=array_fn).array
+
+        self._psi.gate_(G, gates[0][1].qubits, tags=tags, **opts)
+
     def apply_gate(
         self,
         gate_id,
@@ -2178,11 +2261,14 @@ class Circuit:
             Supplied to :meth:`~quimb.tensor.circuit.Circuit.apply_gate`.
         """
         merge_1q = gate_opts.pop("merge_1q", "auto")
+        merge_2q = gate_opts.pop("merge_2q", "auto")
 
         if merge_1q == "auto":
             merge_1q = True
+        if merge_2q == "auto":
+            merge_2q = True
 
-        if merge_1q:
+        if merge_1q or merge_2q:
             gates = tuple(
                 gate if isinstance(gate, Gate) else parse_to_gate(gate)
                 for gate in gates
@@ -2195,15 +2281,22 @@ class Circuit:
                 pbar = _progbar(total=len(gates))
 
             gate_number_offset = len(self._gates)
-            for gate, gates_to_record in _iter_gates_with_merged_1q_runs(
-                gates
+            for gate, gates_to_record in _iter_gates_with_merged_runs(
+                gates, merge_1q=merge_1q, merge_2q=merge_2q
             ):
                 if gate is None:
-                    self._apply_merged_1q_gate_run(
-                        gates_to_record,
-                        gate_number_offset=gate_number_offset,
-                        **gate_opts,
-                    )
+                    if len(gates_to_record[0][1].qubits) == 1:
+                        self._apply_merged_1q_gate_run(
+                            gates_to_record,
+                            gate_number_offset=gate_number_offset,
+                            **gate_opts,
+                        )
+                    else:
+                        self._apply_merged_2q_gate_run(
+                            gates_to_record,
+                            gate_number_offset=gate_number_offset,
+                            **gate_opts,
+                        )
                 else:
                     self._apply_gate(
                         gate,
@@ -4892,11 +4985,16 @@ class CircuitMPS(Circuit):
 
     def apply_gates(self, gates, progbar=False, **gate_opts):
         merge_1q = gate_opts.pop("merge_1q", "auto")
+        merge_2q = gate_opts.pop("merge_2q", "auto")
 
         if merge_1q == "auto":
             merge_1q = True
+        if merge_2q == "auto":
+            # MPS truncation semantics are sensitive to when a 2q gate is
+            # materialized, so keep the default conservative here.
+            merge_2q = False
 
-        if merge_1q:
+        if merge_1q or merge_2q:
             gates = tuple(
                 gate if isinstance(gate, Gate) else parse_to_gate(gate)
                 for gate in gates
@@ -4913,15 +5011,22 @@ class CircuitMPS(Circuit):
                 )
 
             gate_number_offset = len(self._gates)
-            for gate, gates_to_record in _iter_gates_with_merged_1q_runs(
-                gates
+            for gate, gates_to_record in _iter_gates_with_merged_runs(
+                gates, merge_1q=merge_1q, merge_2q=merge_2q
             ):
                 if gate is None:
-                    self._apply_merged_1q_gate_run(
-                        gates_to_record,
-                        gate_number_offset=gate_number_offset,
-                        **gate_opts,
-                    )
+                    if len(gates_to_record[0][1].qubits) == 1:
+                        self._apply_merged_1q_gate_run(
+                            gates_to_record,
+                            gate_number_offset=gate_number_offset,
+                            **gate_opts,
+                        )
+                    else:
+                        self._apply_merged_2q_gate_run(
+                            gates_to_record,
+                            gate_number_offset=gate_number_offset,
+                            **gate_opts,
+                        )
                     gate_for_progress = gates_to_record[-1][1]
                 else:
                     self._apply_gate(
diff --git a/.venv/lib/python3.12/site-packages/quimb/tensor/tn1d/core.py b/.venv/lib/python3.12/site-packages/quimb/tensor/tn1d/core.py
index a27060f..43ca8d8 100644
--- a/.venv/lib/python3.12/site-packages/quimb/tensor/tn1d/core.py
+++ b/.venv/lib/python3.12/site-packages/quimb/tensor/tn1d/core.py
@@ -5050,8 +5050,6 @@ class TNLinearOperator1D(spla.LinearOperator):
 
         if self.is_conj:
             T = T.conj()
-        print(T)
-        assert(0)
         return T.to_dense(self.left_inds, self.right_inds)
 
     def toarray(self):
diff --git a/README.md b/README.md
index 150b8e8..440a9fa 100644
--- a/README.md
+++ b/README.md
@@ -28,15 +28,24 @@ Currently, the supported tensor network libraries are:
 
 ## CPU expectation benchmarks
 
-The current CPU expectation entrypoint is:
+Use the library APIs directly:
 
-```sh
-python -u benchmark_cpu_expectation.py --ansatz mps --nqubits 40 --nlayers 10 --bond 2048 --circuits brickwall_cnot --observables ring_xz
+```py
+import qibotn
+
+records = qibotn.run_cpu_benchmark_cases(
+    ansatz="mps",
+    nqubits=40,
+    nlayers=10,
+    bond=2048,
+    circuits=("brickwall_cnot",),
+    observables=("ring_xz",),
+)
 ```
 
-Use `--ansatz tn` for the generic TN path and `--mpi` under `mpiexec` for MPI runs.
-Reusable circuit and observable builders live in `src/qibotn/benchmark_cases.py`; execution logic lives in `src/qibotn/expectation_runner.py`.
-For Vidal/MPS 1D-chain scale tests, use `run_vidal_mps_cases.sh`.
+For the generic TN path, pass `ansatz="tn"`. Contest and custom runners are available as
+`qibotn.run_contest_tn_case`, `qibotn.run_custom_tn_expectation`,
+`qibotn.run_contest_mps_case`, and `qibotn.run_vidal_validation_cases`.
 
 ## Installation
 
diff --git a/benchmark_cpu_expectation.py b/benchmark_cpu_expectation.py
deleted file mode 100644
index 3d5897d..0000000
--- a/benchmark_cpu_expectation.py
+++ /dev/null
@@ -1,285 +0,0 @@
-"""CLI for CPU TN/MPS expectation benchmarks."""
-
-from __future__ import annotations
-
-import argparse
-import os
-import subprocess
-from pathlib import Path
-from urllib.parse import urlparse
-
-from qibotn.benchmark_cases import (
-    CIRCUITS,
-    OBSERVABLES,
-    build_circuit,
-    observable_terms,
-    parse_names,
-    terms_to_dict,
-)
-from qibotn.expectation_runner import (
-    ExpectationConfig,
-    exact_for_observable,
-    run_cpu_expectation,
-)
-
-
-def optional_int(text):
-    if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
-        return None
-    return int(text)
-
-
-def optional_float(text):
-    if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
-        return None
-    return float(text)
-
-
-def format_optional(value, fmt="g"):
-    return "None" if value is None else format(value, fmt)
-
-
-def should_stop_dask(args):
-    return (
-        not args.keep_dask
-        and args.tn_search_backend == "dask"
-        and args.dask_address is not None
-        and args.tn_load_tree is None
-    )
-
-
-def stop_dask_cluster(args, rank):
-    if rank != 0 or not should_stop_dask(args):
-        return
-    script = Path(__file__).resolve().parent / "tools" / "manage_tn_dask_cluster.sh"
-    if not script.exists():
-        print(f"dask_stop_skipped reason=missing_script path={script}", flush=True)
-        return
-
-    env = os.environ.copy()
-    parsed = urlparse(args.dask_address)
-    if parsed.hostname:
-        env.setdefault("SCHEDULER_HOST", parsed.hostname)
-    if parsed.port:
-        env.setdefault("SCHEDULER_PORT", str(parsed.port))
-
-    print("dask_stop_after_search start", flush=True)
-    subprocess.run([str(script), "stop"], cwd=str(script.parent.parent), env=env, check=False)
-    print("dask_stop_after_search done", flush=True)
-
-
-def build_parallel_opts(args):
-    slicing_opts = {}
-    if args.tn_target_slices is not None:
-        slicing_opts["target_slices"] = args.tn_target_slices
-    if args.tn_target_size is not None:
-        slicing_opts["target_size"] = args.tn_target_size
-
-    opts = {
-        "slicing_opts": slicing_opts or None,
-        "search_workers": args.tn_search_workers or args.torch_threads,
-        "max_repeats": args.tn_search_repeats,
-        "max_time": args.tn_search_time,
-        "print_stats": not args.no_tn_stats,
-    }
-    if args.tn_search_backend is not None:
-        opts["search_backend"] = args.tn_search_backend
-    if args.dask_address is not None:
-        opts["dask_address"] = args.dask_address
-    if args.tn_save_tree is not None:
-        opts["save_tree_path"] = args.tn_save_tree
-    if args.tn_load_tree is not None:
-        opts["load_tree_path"] = args.tn_load_tree
-    if args.tn_search_only:
-        opts["search_only"] = True
-    if args.tn_debug_trials:
-        opts["debug_trials"] = True
-    if args.tn_contract_implementation is not None:
-        opts["contract_implementation"] = args.tn_contract_implementation
-    if args.dask_close_workers:
-        opts["dask_close_workers"] = True
-    return opts
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--nqubits", type=int, default=40)
-    parser.add_argument("--nlayers", type=int, default=30)
-    parser.add_argument("--bond", "--bonds", dest="bond", type=optional_int, default=1024)
-    parser.add_argument("--cut-ratio", type=optional_float, default=1e-12)
-    parser.add_argument("--seed", type=int, default=42)
-    parser.add_argument("--torch-threads", type=int, default=8)
-    parser.add_argument("--quimb-backend", choices=("numpy", "torch"), default="torch")
-    parser.add_argument(
-        "--dtype",
-        choices=("complex128", "complex64"),
-        default="complex128",
-    )
-    parser.add_argument("--ansatz", choices=("tn", "mps"), default=None)
-    parser.add_argument("--mps", action="store_true")
-    parser.add_argument("--mpi", action="store_true")
-    parser.add_argument("--exact", action="store_true")
-    parser.add_argument("--exact-max-qubits", type=int, default=24)
-    parser.add_argument("--circuits", nargs="+", default=["brickwall_cnot"])
-    parser.add_argument("--observables", nargs="+", default=["ring_xz"])
-    parser.add_argument("--pauli-pattern")
-    parser.add_argument("--tn-target-slices", type=int)
-    parser.add_argument("--tn-target-size", type=int,default=2**32)
-    parser.add_argument("--tn-search-workers", type=int)
-    parser.add_argument("--tn-search-repeats", type=int, default=128)
-    parser.add_argument("--tn-search-time", type=float, default=60.0)
-    parser.add_argument(
-        "--no-tn-stats",
-        action="store_true",
-        help="Do not print per-term TN search/contraction diagnostics.",
-    )
-    parser.add_argument(
-        "--tn-search-backend",
-        choices=("processpool", "dask"),
-        default="dask",
-        help="Path-search backend. In MPI mode, dask search runs only on rank 0 and broadcasts the tree.",
-    )
-    parser.add_argument(
-        "--dask-address",
-        help="Dask scheduler address, for example tcp://host:8786. If omitted with dask search, a local cluster is created.",
-    )
-    parser.add_argument(
-        "--dask-close-workers",
-        action="store_true",
-        help="After dask path search, ask the scheduler to close all currently connected workers.",
-    )
-    parser.add_argument(
-        "--keep-dask",
-        action="store_true",
-        help=(
-            "Keep an external dask cluster running after search. By default, "
-            "tools/manage_tn_dask_cluster.sh stop is called after search when "
-            "--dask-address is used."
-        ),
-    )
-    parser.add_argument(
-        "--tn-save-tree",
-        help="Save searched cotengra contraction tree(s) to this pickle file.",
-    )
-    parser.add_argument(
-        "--tn-load-tree",
-        help="Load cotengra contraction tree(s) from this pickle file and skip path search.",
-    )
-    parser.add_argument(
-        "--tn-search-only",
-        action="store_true",
-        help="Only run path search and optional --tn-save-tree; skip contraction.",
-    )
-    parser.add_argument(
-        "--tn-debug-trials",
-        action="store_true",
-        help="Print dask worker summary and per-trial worker start/done logs.",
-    )
-    parser.add_argument(
-        "--tn-contract-implementation",
-        choices=("auto", "cotengra", "autoray", "cpp"),
-        help="cotengra contraction implementation for TN contraction.",
-    )
-    args = parser.parse_args()
-
-    ansatz = "mps" if args.mps else (args.ansatz or "tn")
-    circuits = parse_names(args.circuits, CIRCUITS, "circuits")
-    observables = [] if args.pauli_pattern else parse_names(
-        args.observables, OBSERVABLES, "observables"
-    )
-
-    rank = 0
-    if args.mpi:
-        from mpi4py import MPI
-
-        rank = MPI.COMM_WORLD.Get_rank()
-
-    config = ExpectationConfig(
-        ansatz=ansatz,
-        mpi=args.mpi,
-        bond=args.bond,
-        cut_ratio=args.cut_ratio,
-        tensor_module="torch",
-        quimb_backend=args.quimb_backend,
-        dtype=args.dtype,
-        torch_threads=args.torch_threads,
-        parallel_opts=build_parallel_opts(args),
-    )
-
-    if rank == 0:
-        mode = "MPI" if args.mpi else "serial"
-        print(
-            f"backend=cpu ansatz={ansatz.upper()} mode={mode} "
-            f"nqubits={args.nqubits} nlayers={args.nlayers} "
-            f"bond={format_optional(args.bond)} "
-            f"cut_ratio={format_optional(args.cut_ratio)} seed={args.seed} "
-            f"quimb_backend={args.quimb_backend} dtype={args.dtype} "
-            f"torch_threads={args.torch_threads} "
-            f"tn_search_backend={args.tn_search_backend}"
-        )
-        print("circuit observable exact value abs_error rel_error seconds")
-
-    try:
-        for circuit_kind in circuits:
-            circuit = build_circuit(circuit_kind, args.nqubits, args.nlayers, args.seed)
-            named_observables = (
-                [(f"pattern:{args.pauli_pattern}", {"pauli_string_pattern": args.pauli_pattern})]
-                if args.pauli_pattern
-                else [
-                    (obs_kind, terms_to_dict(observable_terms(obs_kind, args.nqubits)))
-                    for obs_kind in observables
-                ]
-            )
-
-            for obs_name, observable in named_observables:
-                exact = None
-                if args.exact and rank == 0:
-                    if args.nqubits > args.exact_max_qubits:
-                        raise ValueError(
-                            f"--exact is limited to {args.exact_max_qubits} qubits by default."
-                        )
-                    exact = exact_for_observable(circuit, observable, args.nqubits)
-
-                result = run_cpu_expectation(circuit, observable, config)
-                if args.mpi and result.rank != 0:
-                    continue
-
-                abs_error = float("nan") if exact is None else abs(result.value - exact)
-                rel_error = (
-                    float("nan")
-                    if exact is None
-                    else abs_error / max(abs(exact), 1e-15)
-                )
-                exact_text = "nan" if exact is None else f"{exact:.16e}"
-                print(
-                    f"{circuit_kind} {obs_name} {exact_text} {result.value:.16e} "
-                    f"{abs_error:.6e} {rel_error:.6e} {result.seconds:.3f}"
-                )
-                for stat in result.parallel_stats or ():
-                    cost = stat["path_cost"]
-                    search_stats = stat.get("search_stats", {})
-                    print(
-                        "tn_term_summary "
-                        f"term={stat.get('term_index', 0)} "
-                        f"search_seconds={stat.get('search_seconds', float('nan')):.3f} "
-                        f"contract_seconds={stat.get('contract_seconds', float('nan')):.3f} "
-                        f"completed_trials={search_stats.get('completed_trials', 'na')} "
-                        f"finite_trials={search_stats.get('finite_trials', 'na')} "
-                        f"failed_trials={search_stats.get('failed_trials', 'na')} "
-                        f"requested_trials={search_stats.get('requested_trials', 'na')} "
-                        f"best_score={search_stats.get('best_score', float('nan')):.6g} "
-                        f"slices={cost['nslices']} "
-                        f"log10_flops={cost['log10_flops']:.3f} "
-                        f"log10_write={cost['log10_write']:.3f} "
-                        f"log2_size={cost['log2_size']:.3f} "
-                        f"log10_combo={cost['log10_combo']:.3f} "
-                        f"peak_memory_gib={cost['peak_memory_gib']:.6g} "
-                        f"slicing_overhead={cost['slicing_overhead']:.6g} "
-                        f"rank_slices={stat.get('rank_slices', 'na')}"
-                    )
-    finally:
-        stop_dask_cluster(args, rank)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/docs/contest_runners.md b/docs/contest_runners.md
index 5298328..406b68e 100644
--- a/docs/contest_runners.md
+++ b/docs/contest_runners.md
@@ -1,88 +1,12 @@
-# TN
-```bash
-# search + contract，Open MPI 多节点：每节点 2 rank，每 rank 绑定 1 个 NUMA。
-# MPI_HOSTS 里每个节点写 :2，MPI_RANKS = 节点数 * 2。
-# 每个 rank 使用 MPI_PE 个 core；这台 2-NUMA AMD 节点用 MPI_PE=128。
+# Contest Runners
 
-NQUBITS=40 \
-TN_DEBUG_TRIALS=1 \
-SCHEDULER_HOST=10.20.1.100 \
-DASK_ADDRESS=tcp://10.20.1.100:8786 \
-WORKER_HOSTS="10.20.1.100 10.20.1.101 10.20.1.102 10.20.1.103" \
-CASE=main1 \
-OBSERVABLES=long_z_string \
-TORCH_THREADS=80 \
-MPI_PE=80 \
-MPI_MAP_BY=ppr:1:numa:PE=80 \
-MPI_BIND_TO=core \
-OMP_NUM_THREADS=80 \
-MKL_NUM_THREADS=80 \
-BLIS_NUM_THREADS=80 \
-MPI_HOSTS="node-0:2,node-1:2,node-2:2,node-3:2" \
-MPI_RANKS=8 \
-NWORKERS=96 \
-TN_TARGET_SIZE=17179869184 \
-tools/run_tn_dask_mpi_all.sh
+The reusable implementations live in `src/qibotn/backends/`.
 
-# 单独缩并contract计算
+- `qibotn.run_contest_tn_case`: quimb+torch TN search/contract cases.
+- `qibotn.run_contest_mps_case`: Vidal/MPS contest expectation cases.
+- `qibotn.run_vidal_mpi_contest_case`: direct Vidal MPI observable sweep.
+- `qibotn.run_custom_tn_expectation`: custom quimb+torch TN cases.
 
-mpirun --map-by ppr:1:numa:PE=80 --bind-to core --report-bindings \
-  -x LD_PRELOAD=/home/aocc/aocl/5.2.0/aocc/lib_LP64/libblis-mt.so.5 \
-  -x BLIS_NUM_THREADS=80 \
-  -x OMP_NUM_THREADS=80 \
-  -x MKL_NUM_THREADS=80 \
-  -x OMP_PROC_BIND=close \
-  -x OMP_PLACES=cores \
-  -np 8 \
-  -host node-0:2,node-1:2,node-2:2,node-3:2 \
-  .venv/bin/python -u tools/tn_contest_runner.py contract \
-  --mpi \
-  --case main1 \
-  --nqubits 34 \
-  --nlayers 20 \
-  --observables long_z_string \
-  --tree-dir trees/contest_tn \
-  --torch-threads 80 \
-  --dtype complex64
-```
-
-# MPS
-```
-cd /home/qibo/qibotn
-
-MPIEXEC=mpirun \
-MPI_HOSTS="node-2:4,node-3:4" \
-MPI_RANKS=8 \
-MPI_PE=48 \
-MPI_MAP_BY=ppr:2:numa:PE=48 \
-MPI_BIND_TO=core \
-MPI_REPORT_BINDINGS=1 \
-TORCH_THREADS=48 \
-OMP_NUM_THREADS=48 \
-MKL_NUM_THREADS=48 \
-BLIS_NUM_THREADS=48 \
-OBS_FILTER=ring_xz \
-MAIN1_NQ=128 \
-MAIN1_LAYERS=24 \
-MAIN1_BOND=1024 \
-tools/run_vidal_mpi_contest_cases.sh main1
-
-
-
-MPIEXEC=mpirun \
-MPI_HOSTS="node-2:4" \
-MPI_RANKS=4 \
-MPI_PE=48 \
-MPI_MAP_BY=ppr:2:numa:PE=48 \
-MPI_BIND_TO=core \
-MPI_REPORT_BINDINGS=1 \
-TORCH_THREADS=48 \
-OMP_NUM_THREADS=48 \
-MKL_NUM_THREADS=48 \
-BLIS_NUM_THREADS=48 \
-OBS_FILTER=ring_xz \
-MAIN1_NQ=128 \
-MAIN1_LAYERS=24 \
-MAIN1_BOND=1024 \
-tools/run_vidal_mpi_contest_cases.sh main1
-```
+`src/qibotn/backends/quimb.py` holds the TN helpers,
+`src/qibotn/backends/qmatchatea.py` holds the qmatchatea MPS helpers,
+and `src/qibotn/backends/vidal.py` holds the Vidal helpers.
diff --git a/docs/home.md b/docs/home.md
new file mode 100644
index 0000000..a6bb8c1
--- /dev/null
+++ b/docs/home.md
@@ -0,0 +1,26 @@
+# qibotn
+
+Core reusable code lives under `src/qibotn/`. Prefer importing from `qibotn`
+or `qibotn.backends.*`; benchmark and runner helpers have been folded into the
+package instead of being kept as standalone scripts.
+
+- `backends/quimb.py`: TN + torch helpers for quimb.
+- `backends/qmatchatea.py`: qmatchatea + torch MPS helpers.
+- `backends/vidal.py`: Vidal + torch helpers.
+- `contest_cases.py`: shared contest circuits, observables, and case specs.
+- `torch_utils.py`: shared torch array/thread helpers.
+
+Quimb TN reusable entrypoints include `build_quimb_backend_circuit`,
+`build_expectation_tn`, `run_quimb_torch_expectation`,
+`compare_quimb_gate_merge`, `compare_quimb_gate_merge_expectation`,
+`profile_quimb_torch_expectation`, and `time_quimb_contract_implementations`.
+
+Common public imports include `qibotn.cpu_expectation`,
+`qibotn.mps_expectation`, `qibotn.run_qmatchatea_expectation`,
+`qibotn.run_vidal_expectation`, `qibotn.build_contest_circuit`, and
+`qibotn.build_contest_observable`.
+
+Former script entrypoints are available as importable functions:
+`qibotn.run_cpu_benchmark_cases`, `qibotn.run_contest_tn_case`,
+`qibotn.run_custom_tn_expectation`, `qibotn.run_contest_mps_case`,
+`qibotn.run_vidal_mpi_contest_case`, and `qibotn.run_vidal_validation_cases`.
diff --git a/docs/xianchang.md b/docs/xianchang.md
deleted file mode 100644
index 57411cc..0000000
--- a/docs/xianchang.md
+++ /dev/null
@@ -1,42 +0,0 @@
-mpirun --map-by ppr:1:numa:PE=80 --bind-to core --report-bindings \
-  -x LD_PRELOAD=/home/aocc/aocl/5.2.0/aocc/lib_LP64/libblis-mt.so.5 \
-  -x BLIS_NUM_THREADS=80 \
-  -x OMP_NUM_THREADS=80 \
-  -x MKL_NUM_THREADS=80 \
-  -x OMP_PROC_BIND=close \
-  -x OMP_PLACES=cores \
-  -np 4 \
-  -host node-0:2,node-1:2,node-2:2,node-3:2 \
-  .venv/bin/python -u tools/tn_contest_runner.py contract \
-  --mpi \
-  --case main1 \
-  --nqubits 34 \
-  --nlayers 20 \
-  --observables long_z_string \
-  --tree-dir trees/contest_tn \
-  --torch-threads 80 \
-  --dtype complex64
-
-
-SEARCH_TIME=300  NQUBITS=40 TN_DEBUG_TRIALS=1 SCHEDULER_HOST=10.20.1.102 DASK_ADDRESS=tcp://10.20.1.102:8786 WORKER_HOSTS="10.20.1.102 10.20.1.103" CASE=main1 OBSERVABLES=long_z_string TORCH_THREADS=80 MPI_PE=80 MPI_MAP_BY=ppr:1:numa:PE=80 MPI_BIND_TO=core OMP_NUM_THREADS=80 MKL_NUM_THREADS=80 BLIS_NUM_THREADS=80 MPI_HOSTS="node-2:2,node-3:2" MPI_RANKS=4 NWORKERS=128  TN_TARGET_SIZE=17179869184 tools/run_tn_dask_mpi_all.sh
-
-
-NQUBITS=40 \
-TN_DEBUG_TRIALS=1 \
-SCHEDULER_HOST=10.20.1.102 \
-DASK_ADDRESS=tcp://10.20.1.102:8786 \
-WORKER_HOSTS="10.20.1.102 10.20.1.103" \
-CASE=main1 \
-OBSERVABLES=long_z_string \
-TORCH_THREADS=80 \
-MPI_PE=80 \
-MPI_MAP_BY=ppr:1:numa:PE=80 \
-MPI_BIND_TO=core \
-OMP_NUM_THREADS=80 \
-MKL_NUM_THREADS=80 \
-BLIS_NUM_THREADS=80 \
-MPI_HOSTS="node-2:2,node-3:2" \
-MPI_RANKS=4 \
-NWORKERS=96 \
-TN_TARGET_SIZE=17179869184 \
-tools/run_tn_dask_mpi_all.sh
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 7ac26d8..6d668fe 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -60,7 +60,7 @@ mpmath==1.3.0
 msgpack==1.1.2
 networkx==3.6.1
 numba==0.61.2
-numpy==2.0.1
+numpy @ file:///home/yx/numpy
 openqasm3==1.0.1
 opt_einsum==3.4.0
 optuna==4.8.0
@@ -93,7 +93,7 @@ python-multipart==0.0.26
 PyYAML==6.0.3
 qibo==0.3.2
 qibojit==0.1.15
--e git+https://git.nudt.space/jaunatisblue/qibotn.git@4c7a10d026d514897dcc501b507fa604fb4e52d4#egg=qibotn
+-e git+https://git.nudt.space/jaunatisblue/qibotn.git@eed42dcfa9739c609a58f7367fe403abf2e992a9#egg=qibotn
 qiskit==1.4.5
 qmatchatea==1.5.8
 qredtea==0.3.15
@@ -106,7 +106,7 @@ regex==2026.4.4
 requests==2.33.1
 rpds-py==0.30.0
 rustworkx==0.17.1
-scipy==1.17.1
+scipy @ file:///home/yx/scipy
 setuptools==70.2.0
 six==1.17.0
 sniffio==1.3.1
@@ -118,13 +118,15 @@ stack-data==0.6.3
 starlette==1.0.0
 stevedore==5.7.0
 symengine==0.13.0
-sympy==1.13.1
+sympy==1.14.0
 tabulate==0.9.0
 tblib==3.2.2
 texttable==1.7.0
 threadpoolctl==3.6.0
 toolz==1.1.0
-torch @ file:///home/qibo/qibotn/wheels/torch-2.10.0a0+a36e1d3-cp312-cp312-linux_x86_64.whl
+torch==2.11.0+cpu
+torchaudio==2.11.0+cpu
+torchvision==0.26.0+cpu
 tornado==6.5.5
 tqdm==4.67.3
 traitlets==5.14.3
@@ -135,4 +137,3 @@ uvicorn==0.46.0
 wcwidth==0.6.0
 webencodings==0.5.1
 zict==3.0.0
-
diff --git a/run_vidal_mps_cases.sh b/run_vidal_mps_cases.sh
deleted file mode 100755
index 93d0268..0000000
--- a/run_vidal_mps_cases.sh
+++ /dev/null
@@ -1,135 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-# Focused Vidal/MPS expectation test cases for 1D chain circuits.
-#
-# These cases intentionally avoid qmatchatea and generic TN paths.  They target
-# the current supported scope: one-qubit gates, adjacent two-qubit gates, and
-# Pauli-sum expectation values on a 1D chain.
-
-ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-cd "$ROOT_DIR"
-
-PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
-MPIEXEC="${MPIEXEC:-mpiexec}"
-HOSTFILE="${HOSTFILE:-hostfile}"
-
-THREADS="${THREADS:-32}"
-MPI_RANKS="${MPI_RANKS:-16}"
-MPI_THREADS="${MPI_THREADS:-12}"
-
-export OMP_NUM_THREADS="${OMP_NUM_THREADS:-1}"
-export MKL_NUM_THREADS="${MKL_NUM_THREADS:-1}"
-source "$ROOT_DIR/tools/qibotn_torch_mt_env.sh"
-
-run() {
-  echo
-  echo "--------------------------------------------------------------------------------"
-  echo "$*"
-  echo "--------------------------------------------------------------------------------"
-  "$@"
-}
-
-case "${1:-help}" in
-  smoke)
-    # Short correctness-oriented run.  Useful before starting long jobs.
-    run "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
-      --mps \
-      --nqubits 40 \
-      --nlayers 10 \
-      --bond 2048 \
-      --torch-threads "$THREADS" \
-      --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \
-      --observables ring_xz open_zz range2_xx long_z_string
-    ;;
-
-  convergence)
-    # Same circuit/observable, increasing bond.  Check value convergence.
-    for bond in ${BONDS:-4096 16384 65536}; do
-      run "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
-        --mps \
-        --nqubits "${NQ:-80}" \
-        --nlayers "${LAYERS:-16}" \
-        --bond "$bond" \
-        --torch-threads "$THREADS" \
-        --circuits "${CIRCUIT:-brickwall_cnot}" \
-        --observables "${OBSERVABLE:-ring_xz}"
-    done
-    ;;
-
-  single-long)
-    # Single long Vidal run.  On node-3, a similar n=40,l=30,bond=2048 case
-    # took about 9 minutes for one expectation.  This one is meant to be longer.
-    run "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
-      --mps \
-      --nqubits "${NQ:-80}" \
-      --nlayers "${LAYERS:-16}" \
-      --bond "${BOND:-65536}" \
-      --torch-threads "$THREADS" \
-      --circuits "${CIRCUIT:-brickwall_cnot}" \
-      --observables "${OBSERVABLE:-ring_xz}"
-    ;;
-
-  suite-long)
-    # Application-style multi-circuit, multi-observable MPS run.
-    # This is intentionally multi-term and should run much longer than single-long.
-    run "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
-      --mps \
-      --nqubits "${NQ:-80}" \
-      --nlayers "${LAYERS:-16}" \
-      --bond "${BOND:-65536}" \
-      --torch-threads "$THREADS" \
-      --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \
-      --observables ring_xz open_zz mixed_local range2_xx long_z_string
-    ;;
-
-  mpi-long)
-    # Multi-node Vidal segmented MPS run.  Uses HOSTFILE.
-    run "$MPIEXEC" -hostfile "$HOSTFILE" -n "$MPI_RANKS" "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
-      --mpi --mps \
-      --nqubits "${NQ:-80}" \
-      --nlayers "${LAYERS:-16}" \
-      --bond "${BOND:-65536}" \
-      --torch-threads "$MPI_THREADS" \
-      --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \
-      --observables ring_xz open_zz mixed_local range2_xx long_z_string
-    ;;
-
-  stress)
-    # Heavier entanglement.  Start only after single-long is stable.
-    run "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
-      --mps \
-      --nqubits "${NQ:-80}" \
-      --nlayers "${LAYERS:-18}" \
-      --bond "${BOND:-262144}" \
-      --torch-threads "${THREADS:-48}" \
-      --circuits "${CIRCUIT:-rxx_rzz}" \
-      --observables ring_xz open_zz range2_xx
-    ;;
-
-  help|*)
-    cat <<'EOF'
-Usage: ./run_vidal_mps_cases.sh [smoke|convergence|single-long|suite-long|mpi-long|stress]
-
-Common overrides:
-  PYTHON_BIN=.venv/bin/python
-  THREADS=32
-  OMP_NUM_THREADS=1 MKL_NUM_THREADS=1
-
-Single-node scale overrides:
-  NQ=80 LAYERS=16 BOND=65536
-  CIRCUIT=brickwall_cnot
-  OBSERVABLE=ring_xz
-  BONDS="4096 16384 65536"   # for convergence mode
-
-Multi-node overrides:
-  HOSTFILE=hostfile
-  MPI_RANKS=16 MPI_THREADS=12
-
-Recommended first runs:
-  ./run_vidal_mps_cases.sh smoke
-  ./run_vidal_mps_cases.sh convergence
-  ./run_vidal_mps_cases.sh single-long
-EOF
-    ;;
-esac
diff --git a/src/qibotn/__init__.py b/src/qibotn/__init__.py
index fb2c1f7..9a1ee8a 100644
--- a/src/qibotn/__init__.py
+++ b/src/qibotn/__init__.py
@@ -8,6 +8,108 @@ _LAZY_EXPORTS = {
     "cpu_expectation": ("qibotn.expectation_runner", "cpu_expectation"),
     "mps_expectation": ("qibotn.expectation_runner", "mps_expectation"),
     "cpu_runcard": ("qibotn.expectation_runner", "cpu_runcard"),
+    "ExpectationConfig": ("qibotn.expectation_runner", "ExpectationConfig"),
+    "exact_for_observable": ("qibotn.expectation_runner", "exact_for_observable"),
+    "run_cpu_expectation": ("qibotn.expectation_runner", "run_cpu_expectation"),
+    "cpu_benchmark_parallel_opts": (
+        "qibotn.expectation_runner",
+        "cpu_benchmark_parallel_opts",
+    ),
+    "run_cpu_benchmark_cases": (
+        "qibotn.expectation_runner",
+        "run_cpu_benchmark_cases",
+    ),
+    "build_benchmark_circuit": ("qibotn.benchmark_cases", "build_circuit"),
+    "benchmark_observable_terms": ("qibotn.benchmark_cases", "observable_terms"),
+    "exact_pauli_sum": ("qibotn.benchmark_cases", "exact_pauli_sum"),
+    "ring_xz_statevector_expectation": (
+        "qibotn.benchmark_cases",
+        "ring_xz_statevector_expectation",
+    ),
+    "terms_to_dict": ("qibotn.benchmark_cases", "terms_to_dict"),
+    "build_contest_circuit": ("qibotn.contest_cases", "build_contest_circuit"),
+    "build_contest_observable": (
+        "qibotn.contest_cases",
+        "build_contest_observable",
+    ),
+    "contest_cases": ("qibotn.contest_cases", "CASES"),
+    "analyze_contraction_tree": ("qibotn.parallel", "analyze_contraction_tree"),
+    "load_tree_payload": ("qibotn.parallel", "load_tree_payload"),
+    "save_tree_payload": ("qibotn.parallel", "save_tree_payload"),
+    "slice_tree_payload": ("qibotn.parallel", "slice_tree_payload"),
+    "make_qmatchatea_backend": (
+        "qibotn.backends.qmatchatea",
+        "make_qmatchatea_backend",
+    ),
+    "build_qmatchatea_backend": (
+        "qibotn.backends.qmatchatea",
+        "build_qmatchatea_backend",
+    ),
+    "benchmark_qmatchatea_svd_control": (
+        "qibotn.backends.qmatchatea",
+        "benchmark_qmatchatea_svd_control",
+    ),
+    "run_qmatchatea_expectation": (
+        "qibotn.backends.qmatchatea",
+        "run_qmatchatea_expectation",
+    ),
+    "exact_mps_expectation": (
+        "qibotn.backends.qmatchatea",
+        "exact_mps_expectation",
+    ),
+    "make_vidal_backend": ("qibotn.backends.vidal", "make_vidal_backend"),
+    "compare_vidal_backend_qmatchatea": (
+        "qibotn.backends.vidal",
+        "compare_vidal_backend_qmatchatea",
+    ),
+    "run_vidal_expectation": ("qibotn.backends.vidal", "run_vidal_expectation"),
+    "run_segmented_vidal_ring_xz": (
+        "qibotn.backends.vidal",
+        "run_segmented_vidal_ring_xz",
+    ),
+    "build_expectation_tn": ("qibotn.backends.quimb", "build_expectation_tn"),
+    "build_quimb_circuit_stats": (
+        "qibotn.backends.quimb",
+        "build_quimb_circuit_stats",
+    ),
+    "compare_quimb_gate_merge": (
+        "qibotn.backends.quimb",
+        "compare_quimb_gate_merge",
+    ),
+    "compare_quimb_gate_merge_expectation": (
+        "qibotn.backends.quimb",
+        "compare_quimb_gate_merge_expectation",
+    ),
+    "contract_tn": ("qibotn.backends.quimb", "contract_tn"),
+    "load_custom_case_module": ("qibotn.backends.quimb", "load_custom_case_module"),
+    "profile_quimb_torch_expectation": (
+        "qibotn.backends.quimb",
+        "profile_quimb_torch_expectation",
+    ),
+    "qibo_circuit_to_quimb_torch": (
+        "qibotn.backends.quimb",
+        "qibo_circuit_to_quimb_torch",
+    ),
+    "search_contraction_tree": ("qibotn.backends.quimb", "search_contraction_tree"),
+    "sorted_tree": ("qibotn.backends.quimb", "sorted_tree"),
+    "run_contest_tn_case": ("qibotn.backends.quimb", "run_contest_tn_case"),
+    "run_custom_tn_expectation": (
+        "qibotn.backends.quimb",
+        "run_custom_tn_expectation",
+    ),
+    "time_quimb_contract_implementations": (
+        "qibotn.backends.quimb",
+        "time_quimb_contract_implementations",
+    ),
+    "run_contest_mps_case": ("qibotn.backends.vidal", "run_contest_mps_case"),
+    "run_vidal_mpi_contest_case": (
+        "qibotn.backends.vidal",
+        "run_vidal_mpi_contest_case",
+    ),
+    "run_vidal_validation_cases": (
+        "qibotn.backends.vidal",
+        "run_vidal_validation_cases",
+    ),
     "pauli_pattern": ("qibotn.observables", "pauli_pattern"),
     "pauli_sum": ("qibotn.observables", "pauli_sum"),
 }
diff --git a/src/qibotn/backends/cpu.py b/src/qibotn/backends/cpu.py
index 91b0528..c724cd9 100644
--- a/src/qibotn/backends/cpu.py
+++ b/src/qibotn/backends/cpu.py
@@ -18,6 +18,7 @@ from qibotn.backends.vidal import (
     _unsupported_reason,
 )
 from qibotn.observables import check_observable
+from qibotn.torch_utils import arrays_to_backend, torch_cpu_array, torch_dtype
 
 
 def _as_bool_or_dict(value, name):
@@ -310,10 +311,12 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
     def _quimb_backend(self):
         import qibotn.backends.quimb as qmb
 
-        return qmb.BACKENDS[self.quimb_backend](
+        backend = qmb.BACKENDS[self.quimb_backend](
             quimb_backend=self.quimb_backend,
             contraction_optimizer=self.contraction_optimizer,
         )
+        backend.dtype = self.dtype
+        return backend
 
     def _bind_rank_to_numa_domain(self, rank):
         self.numa_domain = _bind_numa_node(rank)
@@ -375,6 +378,12 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
         dask_close_workers = bool(opts.get("dask_close_workers", False))
         print_stats = bool(opts.get("print_stats", False))
         debug_trials = bool(opts.get("debug_trials", False))
+        search_seed = int(opts.get("search_seed", 0))
+        merge_1q = opts.get("merge_1q", "auto")
+        merge_2q = opts.get("merge_2q", "auto")
+        sort_contract_indices = opts.get("sort_contract_indices", "auto")
+        if sort_contract_indices == "auto":
+            sort_contract_indices = self.quimb_backend == "torch"
         search_only = bool(opts.get("search_only", False))
         save_tree_path = opts.get("save_tree_path")
         load_tree_path = opts.get("load_tree_path")
@@ -382,6 +391,38 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
         saved_trees = []
         saved_costs = []
 
+        def term_stats(  # build one per-term stats dict; call sites append it to self.parallel_stats
+            term_index,
+            factors,
+            path_cost,
+            search_stats,
+            tree_slices,
+            slice_assignment,
+            rank_slices,
+            search_seconds,
+            contract_seconds,
+        ):
+            return {
+                "term_index": term_index,
+                "term_factors": tuple(factors),  # stored as a tuple regardless of the input container
+                "path_cost": path_cost,
+                "search_stats": search_stats,
+                "tree_slices": tree_slices,
+                "slice_assignment": slice_assignment,
+                "rank_slices": rank_slices,
+                "search_seconds": search_seconds,
+                "contract_seconds": contract_seconds,
+                "search_workers": search_workers,  # from here on: free variables, not parameters
+                "search_repeats": search_repeats,
+                "search_time": search_time,
+                "search_backend": search_backend or method,  # falls back to ``method`` when falsy
+                "search_seed": search_seed,
+                "merge_1q": merge_1q,
+                "merge_2q": merge_2q,
+                "dask_address": dask_address,
+                "numa_domain": getattr(self, "numa_domain", None),  # None if the attribute is unset
+            }
+
         if load_tree_path:
             with Path(load_tree_path).open("rb") as f:
                 payload = pickle.load(f)
@@ -396,6 +437,8 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
                 "max_bond": self.max_bond_dimension,
                 "cutoff": self.cut_ratio,
             },
+            merge_1q=merge_1q,
+            merge_2q=merge_2q,
         )
 
         total_value = 0.0 + 0.0j
@@ -415,6 +458,8 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
                 )
             else:
                 op, where = _pauli_term_to_dense_operator(factors)
+                if self.quimb_backend == "torch":
+                    op = torch_cpu_array(op, dtype=torch_dtype(self.dtype))
                 tn = qc.local_expectation(
                     op,
                     where,
@@ -455,10 +500,18 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
                     debug_trials=debug_trials,
                     dask_close_workers=dask_close_workers,
                     expected_workers=dask_expected_workers,
+                    search_seed=search_seed,
                 )
                 search_seconds = time.perf_counter() - search_start
             if tree is None:
                 raise RuntimeError("Failed to find a contraction tree for CPU TN MPI.")
+            if sort_contract_indices and hasattr(tree, "sort_contraction_indices"):
+                tree.sort_contraction_indices(
+                    priority=opts.get("sort_contract_indices_priority", "flops"),
+                    make_output_contig=True,
+                    make_contracted_contig=True,
+                    reset=True,
+                )
             if self.parallel_opts.get("contract_implementation") == "cpp":
                 from qibotn.torch_contractor import prepare_torch_cpp_contractor
 
@@ -490,23 +543,17 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
 
             if search_only:
                 self.parallel_stats.append(
-                    {
-                        "term_index": term_index,
-                        "term_factors": tuple(factors),
-                        "path_cost": path_cost,
-                        "search_stats": search_stats,
-                        "tree_slices": int(getattr(tree, "multiplicity", 1)),
-                        "slice_assignment": "search_only",
-                        "rank_slices": [],
-                        "search_seconds": search_seconds,
-                        "contract_seconds": 0.0,
-                        "search_workers": search_workers,
-                        "search_repeats": search_repeats,
-                        "search_time": search_time,
-                        "search_backend": search_backend or method,
-                        "dask_address": dask_address,
-                        "numa_domain": getattr(self, "numa_domain", None),
-                    }
+                    term_stats(
+                        term_index,
+                        factors,
+                        path_cost,
+                        search_stats,
+                        int(getattr(tree, "multiplicity", 1)),
+                        "search_only",
+                        [],
+                        search_seconds,
+                        0.0,
+                    )
                 )
                 continue
 
@@ -523,23 +570,17 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
                             flush=True,
                         )
                     self.parallel_stats.append(
-                        {
-                            "term_index": term_index,
-                            "term_factors": tuple(factors),
-                            "path_cost": path_cost,
-                            "search_stats": search_stats,
-                            "tree_slices": 1,
-                            "slice_assignment": "root",
-                            "rank_slices": [1] + [0] * (size - 1),
-                            "search_seconds": search_seconds,
-                            "contract_seconds": contract_seconds,
-                            "search_workers": search_workers,
-                            "search_repeats": search_repeats,
-                            "search_time": search_time,
-                            "search_backend": search_backend or method,
-                            "dask_address": dask_address,
-                            "numa_domain": getattr(self, "numa_domain", None),
-                        }
+                        term_stats(
+                            term_index,
+                            factors,
+                            path_cost,
+                            search_stats,
+                            1,
+                            "root",
+                            [1] + [0] * (size - 1),
+                            search_seconds,
+                            contract_seconds,
+                        )
                     )
                     total_value += coeff * complex(value)
                 continue
@@ -556,36 +597,31 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
                         flush=True,
                     )
                 self.parallel_stats.append(
-                    {
-                        "term_index": term_index,
-                        "term_factors": tuple(factors),
-                        "path_cost": path_cost,
-                        "search_stats": search_stats,
-                        "tree_slices": int(getattr(tree, "multiplicity", 1)),
-                        "slice_assignment": "local",
-                        "rank_slices": [int(getattr(tree, "multiplicity", 1))],
-                        "search_seconds": search_seconds,
-                        "contract_seconds": contract_seconds,
-                        "search_workers": search_workers,
-                        "search_repeats": search_repeats,
-                        "search_time": search_time,
-                        "search_backend": search_backend or method,
-                        "dask_address": dask_address,
-                        "numa_domain": getattr(self, "numa_domain", None),
-                    }
+                    term_stats(
+                        term_index,
+                        factors,
+                        path_cost,
+                        search_stats,
+                        int(getattr(tree, "multiplicity", 1)),
+                        "local",
+                        [int(getattr(tree, "multiplicity", 1))],
+                        search_seconds,
+                        contract_seconds,
+                    )
                 )
                 total_value += coeff * complex(np.asarray(value).reshape(-1)[0])
                 continue
 
             contract_start = time.perf_counter()
             arrays = self._term_arrays(tn, backend)
+            contract_implementation = self._contract_implementation(backend)
             value, stats = parallel_contract(
                 tree,
                 arrays,
                 method="mpi",
                 comm=comm,
                 return_stats=True,
-                implementation=self.parallel_opts.get("contract_implementation"),
+                implementation=contract_implementation,
             )
             contract_seconds = time.perf_counter() - contract_start
             gathered_stats = comm.gather(stats, root=0)
@@ -598,25 +634,17 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
                         flush=True,
                     )
                 self.parallel_stats.append(
-                    {
-                        "term_index": term_index,
-                        "term_factors": tuple(factors),
-                        "path_cost": path_cost,
-                        "search_stats": search_stats,
-                        "tree_slices": stats.nslices,
-                        "slice_assignment": stats.assignment,
-                        "rank_slices": [
-                            item.local_slices for item in gathered_stats
-                        ],
-                        "search_seconds": search_seconds,
-                        "contract_seconds": contract_seconds,
-                        "search_workers": search_workers,
-                        "search_repeats": search_repeats,
-                        "search_time": search_time,
-                        "search_backend": search_backend or method,
-                        "dask_address": dask_address,
-                        "numa_domain": getattr(self, "numa_domain", None),
-                    }
+                    term_stats(
+                        term_index,
+                        factors,
+                        path_cost,
+                        search_stats,
+                        stats.nslices,
+                        stats.assignment,
+                        [item.local_slices for item in gathered_stats],
+                        search_seconds,
+                        contract_seconds,
+                    )
                 )
                 total_value += coeff * complex(np.asarray(value).reshape(-1)[0])
 
@@ -644,18 +672,20 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
 
         return np.nan if rank != 0 else float(np.real(total_value))
 
+    def _contract_implementation(self, backend):
+        """Pick the contraction implementation, defaulting to "autoray" on torch."""
+        chosen = self.parallel_opts.get("contract_implementation")
+        fallback_to_autoray = chosen is None and backend.backend == "torch"
+        return "autoray" if fallback_to_autoray else chosen
+
     def _contract_term_unsliced(self, tn, tree, backend):
-        contract_implementation = self.parallel_opts.get("contract_implementation")
+        contract_implementation = self._contract_implementation(backend)
         if contract_implementation == "cpp":
             if backend.backend != "torch":
                 raise ValueError("contract_implementation='cpp' requires torch backend.")
-            from qibotn.backends.quimb import _torch_cpu_array, _torch_dtype
             from qibotn.torch_contractor import contract_tree_cpp
 
-            arrays = [
-                _torch_cpu_array(array, dtype=_torch_dtype(self.dtype))
-                for array in tn.arrays
-            ]
+            arrays = arrays_to_backend(tn.arrays, "torch", dtype=self.dtype)
             nslices = int(getattr(tree, "multiplicity", 1))
             if nslices > 1:
                 total = None
@@ -666,12 +696,10 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
             return contract_tree_cpp(tree, arrays)
 
         if backend.backend == "torch":
-            from qibotn.backends.quimb import _torch_cpu_array, _torch_dtype
-
             for tensor in tn.tensors:
-                tensor._data = _torch_cpu_array(
+                tensor._data = torch_cpu_array(
                     tensor._data,
-                    dtype=_torch_dtype(self.dtype),
+                    dtype=torch_dtype(self.dtype),
                 )
             return tn.contract(
                 all,
@@ -693,13 +721,9 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
         return None if user_slicing_opts is None else dict(user_slicing_opts)
 
     def _term_arrays(self, tn, backend):
-        if backend.backend == "torch":
-            from qibotn.backends.quimb import _torch_cpu_array, _torch_dtype
-
-            return [
-                _torch_cpu_array(array, dtype=_torch_dtype(self.dtype))
-                for array in tn.arrays
-            ]
-        from qibotn.backends.quimb import _numpy_dtype
-
-        return [backend.engine.asarray(array, dtype=_numpy_dtype(self.dtype)) for array in tn.arrays]
+        return arrays_to_backend(
+            tn.arrays,
+            backend.backend,
+            engine=backend.engine,
+            dtype=self.dtype,
+        )
diff --git a/src/qibotn/backends/cutensornet_helpers.py b/src/qibotn/backends/cutensornet_helpers.py
new file mode 100644
index 0000000..1ba4511
--- /dev/null
+++ b/src/qibotn/backends/cutensornet_helpers.py
@@ -0,0 +1,321 @@
+"""cuTensorNet circuit and MPS conversion helpers."""
+
+from __future__ import annotations
+
+import numpy as np
+
+try:
+    import cupy as cp
+    import cuquantum.bindings.cutensornet as cutn
+    from cuquantum.tensornet import contract, contract_path
+    from cuquantum.tensornet.experimental import contract_decompose
+except ImportError:  # pragma: no cover - exercised on CPU-only installations
+    cp = None
+    cutn = None
+    contract = None
+    contract_path = None
+    contract_decompose = None
+
+
+def _require_cupy():  # hand back the cupy module, or fail loudly when the GPU stack is absent
+    if cp is not None:
+        return cp
+    raise ImportError(
+        "The cuQuantum circuit converter requires cupy. "
+        "Install the GPU dependencies or use the CPU backend."
+    )
+
+
+def _require_cutensornet():  # guard: both cupy and the cutensornet bindings must have imported
+    if any(dep is None for dep in (cp, cutn)):
+        raise ImportError(
+            "The cuQuantum MPS converter requires cupy and cuquantum. "
+            "Install the GPU dependencies or use the CPU backend."
+        )
+
+
+def _require_tensornet_mps():  # guard: cupy, contract and contract_decompose must have imported
+    if any(dep is None for dep in (cp, contract, contract_decompose)):
+        raise ImportError(
+            "The cuQuantum MPS helpers require cupy and cuquantum. "
+            "Install the GPU dependencies or use the CPU backend."
+        )
+
+
+def _require_contract():  # guard: contract and contract_path must have imported
+    if any(dep is None for dep in (contract, contract_path)):
+        raise ImportError(
+            "The cuQuantum MPS contraction helper requires cuquantum. "
+            "Install the GPU dependencies or use the CPU backend."
+        )
+
+
+class QiboCircuitToEinsum:
+    """Convert a Qibo circuit to cuQuantum interleaved TN operands."""
+
+    def __init__(self, circuit, dtype="complex128"):  # dtype: attribute name looked up on cupy
+        self.backend = _require_cupy()
+        self.dtype = getattr(self.backend, dtype)
+        self.init_basis_map(self.backend, dtype)
+        self.init_intermediate_circuit(circuit)
+        self.circuit = circuit
+
+    def state_vector_operands(self):  # interleaved [operand, modes, ...] + final frontier modes
+        input_bitstring = "0" * len(self.active_qubits)
+        input_operands = self._get_bitstring_tensors(input_bitstring)
+        mode_labels, qubits_frontier, next_frontier = self._init_mode_labels_from_qubits(
+            self.active_qubits
+        )
+        gate_mode_labels, gate_operands = self._parse_gates_to_mode_labels_operands(
+            self.gate_tensors, qubits_frontier, next_frontier
+        )
+        operands = input_operands + gate_operands
+        mode_labels += gate_mode_labels
+        out_list = list(qubits_frontier.values())  # output modes: last label of every qubit
+        operand_exp_interleave = [x for y in zip(operands, mode_labels) for x in y]
+        operand_exp_interleave.append(out_list)
+        return operand_exp_interleave
+
+    def _init_mode_labels_from_qubits(self, qubits):  # one starting mode label per qubit
+        nqubits = len(qubits)
+        frontier_dict = {q: i for i, q in enumerate(qubits)}
+        mode_labels = [[i] for i in range(nqubits)]
+        return mode_labels, frontier_dict, nqubits  # nqubits doubles as the next free label
+
+    def _get_bitstring_tensors(self, bitstring):  # maps each "0"/"1" char to its basis vector
+        return [self.basis_map[ibit] for ibit in bitstring]
+
+    def _parse_gates_to_mode_labels_operands(self, gates, qubits_frontier, next_frontier):
+        mode_labels = []
+        operands = []
+        for tensor, gate_qubits in gates:
+            operands.append(tensor)
+            input_mode_labels = []
+            output_mode_labels = []
+            for qubit in gate_qubits:
+                input_mode_labels.append(qubits_frontier[qubit])
+                output_mode_labels.append(next_frontier)
+                qubits_frontier[qubit] = next_frontier  # mutates the caller's frontier dict
+                next_frontier += 1  # local int only; callers must recompute their own counter
+            mode_labels.append(output_mode_labels + input_mode_labels)
+        return mode_labels, operands
+
+    def op_shape_from_qubits(self, nqubits):  # e.g. 2 qubits -> (2, 2, 2, 2)
+        return (2, 2) * nqubits
+
+    def init_intermediate_circuit(self, circuit):  # fills self.gate_tensors / self.active_qubits
+        self.gate_tensors = []
+        gates_qubits = []
+        for gate in circuit.queue:
+            gate_qubits = gate.control_qubits + gate.target_qubits
+            gates_qubits.extend(gate_qubits)
+            required_shape = self.op_shape_from_qubits(len(gate_qubits))
+            self.gate_tensors.append(
+                (
+                    self.backend.asarray(gate.matrix(), dtype=self.dtype).reshape(
+                        required_shape
+                    ),
+                    gate_qubits,
+                )
+            )
+        self.active_qubits = np.unique(gates_qubits)
+
+    def init_basis_map(self, backend, dtype):  # basis vectors for the "0" and "1" bit characters
+        asarray = backend.asarray
+        self.basis_map = {
+            "0": asarray([1, 0], dtype=dtype),
+            "1": asarray([0, 1], dtype=dtype),
+        }
+
+    def init_inverse_circuit(self, circuit):  # fills self.gate_tensors_inverse for ``circuit``
+        self.gate_tensors_inverse = []
+        gates_qubits_inverse = []
+        for gate in circuit.queue:
+            gate_qubits = gate.control_qubits + gate.target_qubits
+            gates_qubits_inverse.extend(gate_qubits)
+            required_shape = self.op_shape_from_qubits(len(gate_qubits))
+            tensor = self.backend.asarray(gate.matrix(), dtype=self.dtype)  # same dtype as forward
+            tensor = tensor.reshape(required_shape)
+            self.gate_tensors_inverse.append((tensor, gate_qubits))
+        self.active_qubits_inverse = np.unique(gates_qubits_inverse)
+
+    def get_pauli_gates(self, pauli_map, dtype="complex128", backend=None):
+        if backend is None:  # default to cupy; raises ImportError when it is unavailable
+            backend = _require_cupy()
+        asarray = backend.asarray
+        operand_map = {
+            "I": asarray([[1, 0], [0, 1]], dtype=dtype),
+            "X": asarray([[0, 1], [1, 0]], dtype=dtype),
+            "Y": asarray([[0, -1j], [1j, 0]], dtype=dtype),
+            "Z": asarray([[1, 0], [0, -1]], dtype=dtype),
+        }
+        gates = []
+        for qubit, pauli_char in pauli_map.items():  # pauli_map: {qubit: "I"/"X"/"Y"/"Z"}
+            operand = operand_map.get(pauli_char)
+            if operand is None:
+                raise ValueError("pauli string character must be one of I/X/Y/Z")
+            gates.append((operand, (qubit,)))
+        return gates  # same (tensor, qubits) layout as self.gate_tensors
+
+    def expectation_operands(self, ham_gates):  # closed network: circuit, ham_gates, inverse
+        input_bitstring = "0" * self.circuit.nqubits
+        input_operands = self._get_bitstring_tensors(input_bitstring)
+        mode_labels, qubits_frontier, next_frontier = self._init_mode_labels_from_qubits(
+            range(self.circuit.nqubits)
+        )
+        gate_mode_labels, gate_operands = self._parse_gates_to_mode_labels_operands(
+            self.gate_tensors, qubits_frontier, next_frontier
+        )
+        operands = input_operands + gate_operands
+        mode_labels += gate_mode_labels
+
+        self.init_inverse_circuit(self.circuit.invert())
+        next_frontier = max(qubits_frontier.values()) + 1  # helper advanced only its local copy
+        gates_inverse = ham_gates + self.gate_tensors_inverse
+        gate_mode_labels_inverse, gate_operands_inverse = (
+            self._parse_gates_to_mode_labels_operands(
+                gates_inverse, qubits_frontier, next_frontier
+            )
+        )
+        mode_labels = (
+            mode_labels
+            + gate_mode_labels_inverse
+            + [[qubits_frontier[ix]] for ix in range(self.circuit.nqubits)]
+        )
+        operands = operands + gate_operands_inverse + operands[: self.circuit.nqubits]
+        operand_exp_interleave = [x for y in zip(operands, mode_labels) for x in y]
+        operand_exp_interleave.append([])  # empty output mode list: result is a scalar
+        return operand_exp_interleave
+
+
+def initial_mps(num_qubits, dtype):  # list of num_qubits site tensors, each [1, 0] shaped (1, 2, 1)
+    _require_tensornet_mps()
+    state_tensor = cp.asarray([1, 0], dtype=dtype).reshape(1, 2, 1)
+    return [state_tensor] * num_qubits  # NOTE: every entry aliases the same array object
+
+
+def mps_site_right_swap(mps_tensors, i, **kwargs):
+    """Exchange the physical indices of sites ``i`` and ``i + 1`` in place."""
+    _require_tensornet_mps()
+    pair = slice(i, i + 2)
+    algorithm, options = kwargs.get("algorithm"), kwargs.get("options")
+    new_left, _, new_right = contract_decompose(
+        "ipj,jqk->iqj,jpk", *mps_tensors[pair], algorithm=algorithm, options=options
+    )
+    mps_tensors[pair] = (new_left, new_right)
+    return mps_tensors
+
+
+def apply_mps_gate(mps_tensors, gate, qubits, **kwargs):  # updates mps_tensors in place
+    _require_tensornet_mps()
+    n_qubits = len(qubits)
+    if n_qubits == 1:  # single site: contract the gate onto that site's middle (physical) index
+        site = qubits[0]
+        mps_tensors[site] = contract(
+            "ipj,qp->iqj",
+            mps_tensors[site],
+            gate,
+            options=kwargs.get("options", None),
+        )
+    elif n_qubits == 2:
+        left, right = qubits
+        if left > right:  # normalise to left < right; the transpose swaps axes 0<->1 and 2<->3
+            return apply_mps_gate(
+                mps_tensors, gate.transpose(1, 0, 3, 2), (right, left), **kwargs
+            )
+        if left + 1 == right:  # neighbouring sites: contract the gate in, then split in two again
+            a_tensor, _, b_tensor = contract_decompose(
+                "ipj,jqk,rspq->irj,jsk",
+                *mps_tensors[left : left + 2],
+                gate,
+                algorithm=kwargs.get("algorithm", None),
+                options=kwargs.get("options", None),
+            )
+            mps_tensors[left : left + 2] = (a_tensor, b_tensor)
+        else:  # distant sites: swap ``left`` one step right, recurse, then undo the swap
+            mps_site_right_swap(mps_tensors, left, **kwargs)
+            apply_mps_gate(mps_tensors, gate, (left + 1, right), **kwargs)
+            mps_site_right_swap(mps_tensors, left, **kwargs)
+    else:  # gates on zero or more than two qubits are rejected
+        raise NotImplementedError("Only one- and two-qubit gates supported")
+
+
+class QiboCircuitToMPS:
+    """Build cuTensorNet MPS tensors by applying a Qibo circuit gate by gate."""
+
+    def __init__(self, circ_qibo, gate_algo, dtype="complex128", rand_seed=0):
+        _require_cutensornet()
+        # Seed both the NumPy and the CuPy global generators.
+        np.random.seed(rand_seed)
+        cp.random.seed(rand_seed)
+        self.num_qubits = circ_qibo.nqubits
+        self.handle = cutn.create()
+        self.dtype = dtype
+        self.mps_tensors = initial_mps(self.num_qubits, dtype=dtype)
+        converter = QiboCircuitToEinsum(circ_qibo, dtype=dtype)
+        for tensor, sites in converter.gate_tensors:
+            opts = {"handle": self.handle}
+            apply_mps_gate(
+                self.mps_tensors, tensor, sites, algorithm=gate_algo, options=opts
+            )
+
+    def __del__(self):
+        # ``handle`` may be unset if __init__ raised before creating it.
+        handle = getattr(self, "handle", None)
+        if cutn is None or handle is None:
+            return
+        cutn.destroy(handle)
+
+
+class MPSContractionHelper:
+    """Assemble interleaved MPS contractions for norms, states and expectations."""
+
+    def __init__(self, num_qubits):
+        self.num_qubits = num_qubits
+        sites = range(num_qubits)
+        ket_shift = 2 * num_qubits + 1
+        # Bra and ket tensors of one site share the physical mode ``2 * k + 1``.
+        self.bra_modes = [(2 * k, 2 * k + 1, 2 * k + 2) for k in sites]
+        self.ket_modes = [(k + ket_shift, 2 * k + 1, k + 1 + ket_shift) for k in sites]
+
+    def contract_norm(self, mps_tensors, options=None):
+        """Return the real part of the tensors contracted with their conjugates."""
+        operands = []
+        for site, tensor in enumerate(mps_tensors):
+            operands += [tensor, self.bra_modes[site]]
+            operands += [tensor.conj(), self.ket_modes[site]]
+        operands.append([])
+        return self._contract(operands, options=options).real
+
+    def contract_state_vector(self, mps_tensors, options=None):
+        """Contract the chain, keeping the middle mode of every site as output."""
+        operands = []
+        for site, tensor in enumerate(mps_tensors):
+            operands += [tensor, self.bra_modes[site]]
+        operands.append(tuple(modes[1] for modes in self.bra_modes))
+        return self._contract(operands, options=options)
+
+    def contract_expectation(
+        self, mps_tensors, operator, qubits, options=None, normalize=False
+    ):
+        """Contract ``operator`` between the tensors and their conjugates."""
+        fresh_mode = 3 * self.num_qubits + 2
+        operator_modes = [None] * len(qubits) + [self.bra_modes[q][1] for q in qubits]
+        targets = list(qubits)
+        operands = []
+        for site, tensor in enumerate(mps_tensors):
+            ket_modes = self.ket_modes[site]
+            if site in targets:
+                # Route this site's conjugate leg through the operator via a new mode.
+                operator_modes[targets.index(site)] = fresh_mode
+                ket_modes = (ket_modes[0], fresh_mode, ket_modes[2])
+                fresh_mode += 1
+            operands += [tensor, self.bra_modes[site], tensor.conj(), ket_modes]
+        operands += [operator, tuple(operator_modes), []]
+        norm = self.contract_norm(mps_tensors, options=options) if normalize else 1
+        return self._contract(operands, options=options) / norm
+
+    def _contract(self, interleaved_inputs, options=None):
+        _require_contract()
+        optimize = {"path": contract_path(*interleaved_inputs, options=options)[0]}
+        return contract(*interleaved_inputs, options=options, optimize=optimize)
diff --git a/src/qibotn/backends/qmatchatea.py b/src/qibotn/backends/qmatchatea.py
index 41381dc..b76424f 100644
--- a/src/qibotn/backends/qmatchatea.py
+++ b/src/qibotn/backends/qmatchatea.py
@@ -1,6 +1,9 @@
 """Implementation of Quantum Matcha Tea backend."""
 
+from __future__ import annotations
+
 import re
+import time
 from dataclasses import dataclass
 
 import numpy as np
@@ -12,6 +15,7 @@ from qibo.config import raise_error
 from qmatchatea.utils import MPISettings
 
 from qibotn.backends.abstract import QibotnBackend
+from qibotn.benchmark_cases import exact_pauli_sum
 from qibotn.observables import check_observable
 from qibotn.result import TensorNetworkResult
 
@@ -364,3 +368,207 @@ class QMatchaTeaBackend(QibotnBackend, NumpyBackend):
             use_itpo=False,
         )
         return obs_sum
+
+
+@dataclass(frozen=True)
+class QMatchaTeaExpectationResult:  # immutable record returned by run_qmatchatea_expectation
+    value: float  # real part of the computed expectation value
+    seconds: float  # wall-clock time of the backend.expectation call
+    backend: object  # backend instance that produced the value
+
+
+@dataclass(frozen=True)
+class QMatchaTeaBuildResult:  # immutable record returned by build_qmatchatea_backend
+    backend: object  # the configured backend instance
+    build_seconds: float  # wall-clock time spent creating and configuring it
+
+
+@dataclass(frozen=True)
+class QMatchaTeaSvdControlResult:  # outcome of one benchmark_qmatchatea_svd_control run
+    ctrl: str  # svd_ctrl value that was benchmarked
+    contract_singvals: str  # contract_singvals option forwarded to split_svd
+    status: str  # "ok", or "error" when split_svd raised
+    median_ms: float  # median split time in milliseconds; NaN when nothing was timed
+    min_ms: float  # fastest split time in milliseconds; NaN when nothing was timed
+    rel_error: float | None  # relative reconstruction error from the final repeat; None if it was not reached
+    kept: int | None  # element count of the returned singular values on the final repeat; None if not reached
+    error: str  # repr() of the caught exception, empty string otherwise
+
+
+def make_qmatchatea_backend(  # create a QMatchaTeaBackend and configure it; every option is keyword-only
+    *,
+    bond=10,  # forwarded as max_bond_dimension
+    cut_ratio=1e-9,
+    tensor_module="torch",
+    svd_control="E!",
+    compile_circuit=True,
+    track_memory=False,
+    mpi_approach="SR",
+    mpi_num_procs=1,
+    mpi_where_barriers=-1,
+    mpi_isometrization=-1,
+):
+    backend = QMatchaTeaBackend()
+    backend.configure_tn_simulation(
+        ansatz="MPS",  # the ansatz is fixed here; only the remaining options are caller-configurable
+        max_bond_dimension=bond,
+        cut_ratio=cut_ratio,
+        svd_control=svd_control,
+        tensor_module=tensor_module,
+        compile_circuit=compile_circuit,
+        track_memory=track_memory,
+        mpi_approach=mpi_approach,
+        mpi_num_procs=mpi_num_procs,
+        mpi_where_barriers=mpi_where_barriers,
+        mpi_isometrization=mpi_isometrization,
+    )
+    return backend  # returns the configured backend instance
+
+
+def build_qmatchatea_backend(  # timed wrapper around make_qmatchatea_backend with the same keyword-only options
+    *,
+    bond=10,
+    cut_ratio=1e-9,
+    tensor_module="torch",
+    svd_control="E!",
+    compile_circuit=True,
+    track_memory=False,
+    mpi_approach="SR",
+    mpi_num_procs=1,
+    mpi_where_barriers=-1,
+    mpi_isometrization=-1,
+):
+    start = time.perf_counter()  # the timer covers both construction and configuration
+    backend = make_qmatchatea_backend(
+        bond=bond,
+        cut_ratio=cut_ratio,
+        tensor_module=tensor_module,
+        svd_control=svd_control,
+        compile_circuit=compile_circuit,
+        track_memory=track_memory,
+        mpi_approach=mpi_approach,
+        mpi_num_procs=mpi_num_procs,
+        mpi_where_barriers=mpi_where_barriers,
+        mpi_isometrization=mpi_isometrization,
+    )
+    return QMatchaTeaBuildResult(backend=backend, build_seconds=time.perf_counter() - start)
+
+
+def exact_mps_expectation(circuit, observable, nqubits):
+    """Evaluate a ``{"terms": ...}`` dict via ``exact_pauli_sum``, else via the checked observable."""
+    if not (isinstance(observable, dict) and "terms" in observable):
+        hamiltonian = check_observable(observable, nqubits)
+        state = circuit().state(numpy=True)
+        return float(hamiltonian.expectation_from_state(state).real)
+
+    terms = []
+    for term in observable["terms"]:
+        coefficient = term["coefficient"]
+        operators = tuple((name, site) for name, site in term["operators"])
+        terms.append((coefficient, operators))
+    return exact_pauli_sum(circuit, terms, nqubits)
+
+
+def run_qmatchatea_expectation(  # build a fresh backend, then time a single expectation evaluation on it
+    circuit,
+    observable,
+    *,
+    bond=10,
+    cut_ratio=1e-9,
+    tensor_module="torch",
+    svd_control="E!",
+    compile_circuit=True,  # used twice: when configuring the backend and in the expectation call
+    preprocess=True,  # only forwarded to backend.expectation
+    track_memory=False,
+    mpi_approach="SR",
+    mpi_num_procs=1,
+    mpi_where_barriers=-1,
+    mpi_isometrization=-1,
+):
+    built = build_qmatchatea_backend(
+        bond=bond,
+        cut_ratio=cut_ratio,
+        tensor_module=tensor_module,
+        svd_control=svd_control,
+        compile_circuit=compile_circuit,
+        track_memory=track_memory,
+        mpi_approach=mpi_approach,
+        mpi_num_procs=mpi_num_procs,
+        mpi_where_barriers=mpi_where_barriers,
+        mpi_isometrization=mpi_isometrization,
+    )
+    start = time.perf_counter()  # started after the build, so ``seconds`` excludes backend construction
+    value = built.backend.expectation(
+        circuit,
+        observable,
+        preprocess=preprocess,
+        compile_circuit=compile_circuit,
+    )
+    return QMatchaTeaExpectationResult(
+        value=float(np.real(value)),  # only the real part is kept
+        seconds=time.perf_counter() - start,
+        backend=built.backend,
+    )
+
+
+def benchmark_qmatchatea_svd_control(matrix, *, ctrl, max_bond, contract_singvals, repeats):  # time split_svd on ``matrix`` for one svd_ctrl setting
+    import gc
+    import statistics
+
+    import torch
+
+    from qredtea.torchapi import QteaTorchTensor
+
+    conv = qmatchatea.QCConvergenceParameters(
+        max_bond_dimension=max_bond,
+        cut_ratio=0.0,  # pinned to zero for every benchmark run
+        svd_ctrl=ctrl,
+    )
+    qtensor = QteaTorchTensor.from_elem_array(matrix, dtype=matrix.dtype, device="cpu")  # the tensor is created on the CPU
+
+    times = []  # per-repeat durations in seconds
+    rel_error = None
+    kept = None
+    status = "ok"
+    error = ""
+
+    for i in range(repeats):
+        gc.collect()  # collect before the timer starts so it is not part of the measurement
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+        t0 = time.perf_counter()
+        try:
+            left, right, singvals, _ = qtensor.split_svd(
+                [0],
+                [1],
+                contract_singvals=contract_singvals,
+                conv_params=conv,
+            )
+        except Exception as exc:  # noqa: BLE001
+            status = "error"
+            error = repr(exc)
+            break  # stop repeating; timings gathered so far are still reported
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+        times.append(time.perf_counter() - t0)
+
+        if i == repeats - 1:  # accuracy is measured once, on the final repeat, outside the timed region
+            left_matrix = left.elem.reshape(matrix.shape[0], -1)
+            right_matrix = right.elem.reshape(-1, matrix.shape[1])
+            recon = left_matrix @ right_matrix
+            rel_error = (
+                torch.linalg.vector_norm(matrix - recon)  # NOTE(review): requires ``matrix`` to work with torch ops - presumably a torch tensor
+                / torch.linalg.vector_norm(matrix)
+            ).item()
+            kept = int(singvals.numel())
+
+    return QMatchaTeaSvdControlResult(
+        ctrl=ctrl,
+        contract_singvals=contract_singvals,
+        status=status,
+        median_ms=float("nan") if not times else statistics.median(times) * 1000,  # seconds -> milliseconds
+        min_ms=float("nan") if not times else min(times) * 1000,
+        rel_error=rel_error,
+        kept=kept,
+        error=error,
+    )
diff --git a/src/qibotn/backends/quimb.py b/src/qibotn/backends/quimb.py
index 3d49b00..3d20cbb 100644
--- a/src/qibotn/backends/quimb.py
+++ b/src/qibotn/backends/quimb.py
@@ -1,6 +1,14 @@
+import copy
+import importlib.util
+import inspect
+import json
+import time
 from collections import Counter
+from dataclasses import dataclass
+from pathlib import Path
 from typing import Optional
 
+import numpy as np
 import quimb as qu
 import quimb.tensor as qtn
 from qibo.config import raise_error
@@ -8,7 +16,39 @@ from qibo.gates.abstract import ParametrizedGate
 from qibo.models import Circuit
 
 from qibotn.backends.abstract import QibotnBackend
+from qibotn.observables import extract_gates_and_qubits
+from qibotn.parallel import contraction_tree_costs, parallel_path_search
 from qibotn.result import TensorNetworkResult
+from qibotn.torch_utils import (
+    arrays_to_backend as _arrays_to_backend,
+    numpy_dtype as _numpy_dtype,
+    torch_cpu_array as _torch_cpu_array,
+    torch_dtype as _torch_dtype,
+)
+
+
+def _real_scalar(x):  # real part of ``x`` converted to a built-in float
+    return float(x.real)
+
+
+def torch_contract_implementation(backend="torch", implementation=None):
+    """Return ``implementation`` when given, else "autoray" for torch and None otherwise."""
+    fallback = "autoray" if backend == "torch" else None
+    return fallback if implementation is None else implementation
+
+
+def _quimb_should_parametrize(gate):
+    """Tell whether ``gate`` is trainable, parametrized and has a non-plain-number parameter."""
+    if not (isinstance(gate, ParametrizedGate) and getattr(gate, "trainable", True)):
+        return False
+
+    def is_plain_number(param):
+        if isinstance(param, (int, float, complex, np.number)):
+            return True
+        return isinstance(param, np.ndarray) and param.ndim == 0
+
+    return not all(is_plain_number(p) for p in getattr(gate, "parameters", ()))
+
 
 GATE_MAP = {
     "h": "H",
@@ -20,6 +60,9 @@ GATE_MAP = {
     "rx": "RX",
     "ry": "RY",
     "rz": "RZ",
+    "rxx": "RXX",
+    "ryy": "RYY",
+    "rzz": "RZZ",
     "u3": "U3",
     "cx": "CX",
     "cnot": "CNOT",
@@ -40,50 +83,6 @@ GATE_MAP = {
 PAULI_DENSE_MAX_QUBITS = 8
 
 
-def _torch_cpu_array(data, dtype=None):
-    """Convert array-like data to a contiguous CPU torch tensor."""
-    import numpy as np
-    import torch
-
-    if isinstance(data, torch.Tensor):
-        x = data
-    else:
-        array = np.asarray(data)
-        if any(stride < 0 for stride in array.strides):
-            array = np.ascontiguousarray(array)
-        x = torch.from_numpy(array)
-
-    if x.device.type != "cpu":
-        x = x.cpu()
-    if dtype is not None and x.dtype != dtype:
-        x = x.to(dtype)
-    if not x.is_contiguous():
-        x = x.contiguous()
-    return x
-
-
-def _torch_dtype(dtype):
-    import torch
-
-    if dtype in ("complex64", "single"):
-        return torch.complex64
-    return torch.complex128
-
-
-def _numpy_dtype(dtype):
-    import numpy as np
-
-    if dtype in ("complex64", "single"):
-        return np.complex64
-    return np.complex128
-
-
-def _arrays_to_backend(arrays, backend, engine, dtype="complex128"):
-    if backend == "torch":
-        return [_torch_cpu_array(array, dtype=_torch_dtype(dtype)) for array in arrays]
-    return [engine.asarray(array, dtype=_numpy_dtype(dtype)) for array in arrays]
-
-
 def _pauli_term_to_dense_operator(factors):
     op = None
     where = []
@@ -101,45 +100,54 @@ def pauli_product_expectation_tn(
     simplify_atol=1e-12,
     simplify_equalize_norms=True,
 ):
-    """Build the scalar TN for ``<psi|P|psi>`` without dense Pauli strings."""
+    """Build the scalar TN for ``<psi|P|psi>`` without dense Pauli strings.
+
+    Use quimb's reverse-lightcone reduced-density TN for the Pauli support,
+    then attach one 2x2 operator tensor per acted-on site. This keeps long
+    Pauli products sparse without adding identity tensors outside the support.
+    """
     import numpy as np
+    from autoray import infer_backend
 
     op_by_site = {
         int(qubit): qu.pauli(str(gate_name).lower())
         for qubit, gate_name in factors
         if str(gate_name).upper() != "I"
     }
-    ket = quimb_circuit.get_psi_simplified(
-        seq=simplify_sequence,
-        atol=simplify_atol,
-        equalize_norms=simplify_equalize_norms,
-    )
-    bra = ket.conj().reindex(
-        {
-            quimb_circuit.ket_site_ind(qubit): quimb_circuit.bra_site_ind(qubit)
-            for qubit in range(quimb_circuit.N)
-        }
-    )
+    if not op_by_site:
+        return qtn.TensorNetwork(
+            [qtn.Tensor(data=np.asarray(1.0 + 0.0j), inds=())]
+        )
 
-    tn = bra | ket
-    identity = np.eye(2, dtype=complex)
-    for qubit in range(quimb_circuit.N):
-        data = op_by_site.get(qubit, identity)
-        tn |= qtn.Tensor(
-            data=data,
+    where = tuple(sorted(op_by_site))
+    fs_opts = {
+        "seq": simplify_sequence,
+        "atol": simplify_atol,
+        "equalize_norms": simplify_equalize_norms,
+    }
+    rho = quimb_circuit.get_rdm_lightcone_simplified(
+        where=where,
+        **fs_opts,
+    )
+    rho_backend = infer_backend(rho.tensors[0].data) if rho.tensors else "numpy"
+    for qubit in where:
+        op = op_by_site[qubit]
+        if rho_backend == "torch":
+            dtype = getattr(quimb_circuit, "dtype", None) or "complex128"
+            op = _torch_cpu_array(op, dtype=_torch_dtype(dtype))
+        rho |= qtn.Tensor(
+            data=op,
             inds=(
                 quimb_circuit.bra_site_ind(qubit),
                 quimb_circuit.ket_site_ind(qubit),
             ),
         )
 
-    tn.full_simplify_(
+    rho.full_simplify_(
         output_inds=(),
-        seq=simplify_sequence,
-        atol=simplify_atol,
-        equalize_norms=simplify_equalize_norms,
+        **fs_opts,
     )
-    return tn
+    return rho
 
 
 def pauli_product_expectation(
@@ -156,7 +164,13 @@ def pauli_product_expectation(
         simplify_sequence=simplify_sequence,
         simplify_atol=simplify_atol,
     )
-    return tn.contract(all, output_inds=(), optimize=optimize, backend=backend)
+    return tn.contract(
+        all,
+        output_inds=(),
+        optimize=optimize,
+        backend=backend,
+        implementation=torch_contract_implementation(backend),
+    )
 
 
 def __init__(self, quimb_backend="torch", contraction_optimizer="auto-hq"):
@@ -287,8 +301,14 @@ def execute_circuit(
     elif initial_state is not None:
         raise_error(ValueError, "Initial state not None supported only for MPS ansatz.")
 
+    gate_opts = {
+        "max_bond": self.max_bond_dimension,
+        "cutoff": self.svd_cutoff,
+    }
     circ_quimb = self.circuit_ansatz.from_openqasm2_str(
-        circuit.to_qasm(), psi0=initial_state, gate_opts={"max_bond": self.max_bond_dimension, "cutoff": self.svd_cutoff}
+        circuit.to_qasm(),
+        psi0=initial_state,
+        gate_opts=gate_opts,
     )
 
     if nshots:
@@ -390,7 +410,7 @@ def exp_value_observable_symbolic(
 
         expectation_value = expectation_value + coeff * exp_values
 
-    return self.real(expectation_value)
+    return _real_scalar(expectation_value)
 
 
 def _qibo_circuit_to_quimb(
@@ -414,7 +434,19 @@ def _qibo_circuit_to_quimb(
         The converted circuit.
     """
     nqubits = qibo_circ.nqubits
+    merge_1q = circuit_kwargs.pop("merge_1q", "auto")
+    merge_2q = circuit_kwargs.pop("merge_2q", "auto")
+    if self.backend == "torch":
+        circuit_kwargs.setdefault("to_backend", _torch_cpu_array)
+        circuit_kwargs.setdefault("convert_eager", True)
+        circuit_kwargs.setdefault("dtype", getattr(self, "dtype", "complex128"))
     circ = quimb_circuit_type(nqubits, **circuit_kwargs)
+    pending_gates = []
+
+    def flush_pending_gates():
+        if pending_gates:
+            circ.apply_gates(pending_gates, merge_1q=merge_1q, merge_2q=merge_2q)
+            pending_gates.clear()
 
     for gate in qibo_circ.queue:
         gate_name = getattr(gate, "name", None)
@@ -424,34 +456,39 @@ def _qibo_circuit_to_quimb(
         if gate_name == "cu1":
             theta = gate.parameters[0]
             c, t = gate.qubits
-            circ.apply_gate("RZ", theta / 2, c)
-            circ.apply_gate("RZ", theta / 2, t)
-            circ.apply_gate("CNOT", c, t)
-            circ.apply_gate("RZ", -theta / 2, t)
-            circ.apply_gate("CNOT", c, t)
+            pending_gates.extend(
+                (
+                    ("RZ", theta / 2, c),
+                    ("RZ", theta / 2, t),
+                    ("CNOT", c, t),
+                    ("RZ", -theta / 2, t),
+                    ("CNOT", c, t),
+                )
+            )
             continue
         if quimb_gate_name is None:
             if hasattr(gate, "matrix"):
-                circ.apply_gate_raw(gate.matrix(), getattr(gate, "qubits", ()))
+                pending_gates.append((gate.matrix(), *getattr(gate, "qubits", ())))
                 continue
             raise_error(ValueError, f"Gate {gate_name} not supported in Quimb backend.")
 
         params = getattr(gate, "parameters", ())
         qubits = getattr(gate, "qubits", ())
 
-        is_parametrized = isinstance(gate, ParametrizedGate) and getattr(
-            gate, "trainable", True
-        )
+        is_parametrized = _quimb_should_parametrize(gate)
         if is_parametrized:
-            circ.apply_gate(
-                quimb_gate_name, *params, *qubits, parametrized=is_parametrized
-            )
-        else:
+            flush_pending_gates()
             circ.apply_gate(
                 quimb_gate_name,
                 *params,
                 *qubits,
+                parametrize=True,
             )
+            continue
+
+        pending_gates.append((quimb_gate_name, *params, *qubits))
+
+    flush_pending_gates()
     return circ
 
 
@@ -509,7 +546,6 @@ def expectation(self, circuit, observable, parallel=None, parallel_opts=None):
 
     if parallel is None:
         # Use original implementation
-        from qibotn.observables import extract_gates_and_qubits
         all_terms = extract_gates_and_qubits(observable)
 
         qc = self._qibo_circuit_to_quimb(
@@ -532,7 +568,8 @@ def expectation(self, circuit, observable, parallel=None, parallel_opts=None):
             else:
                 op, where = _pauli_term_to_dense_operator(factors)
                 val = qc.local_expectation(
-                    op, where,
+                    op,
+                    where,
                     backend=self.backend,
                     optimize=self.contractions_optimizer,
                     simplify_sequence="ADCRS",
@@ -540,7 +577,7 @@ def expectation(self, circuit, observable, parallel=None, parallel_opts=None):
                 )
             exp_val += coeff * val
 
-        return self.real(exp_val)
+        return _real_scalar(exp_val)
 
     else:
         # Use parallel implementation
@@ -549,10 +586,11 @@ def expectation(self, circuit, observable, parallel=None, parallel_opts=None):
 
 def _expectation_parallel(self, circuit, observable, method, opts):
     """Parallel expectation value computation."""
-    from qibotn.observables import extract_gates_and_qubits
-    from qibotn.parallel import parallel_path_search, parallel_contract
     import torch
 
+    from qibotn.observables import extract_gates_and_qubits
+    from qibotn.parallel import parallel_contract, parallel_path_search
+
     try:
         from mpi4py import MPI
         comm = MPI.COMM_WORLD if method == 'mpi' else None
@@ -568,11 +606,16 @@ def _expectation_parallel(self, circuit, observable, method, opts):
     torch_threads = opts.get('torch_threads', None)
     slicing_opts = opts.get('slicing_opts', None)
     trial_timeout = opts.get('trial_timeout', None)
+    search_seed = opts.get('search_seed', 0)
+    merge_1q = opts.get("merge_1q", "auto")
+    merge_2q = opts.get("merge_2q", "auto")
 
     qc = self._qibo_circuit_to_quimb(
         circuit,
         quimb_circuit_type=self.circuit_ansatz,
         gate_opts={"max_bond": self.max_bond_dimension, "cutoff": self.svd_cutoff},
+        merge_1q=merge_1q,
+        merge_2q=merge_2q,
     )
 
     all_terms = extract_gates_and_qubits(observable)
@@ -599,6 +642,7 @@ def _expectation_parallel(self, circuit, observable, method, opts):
             n_workers=search_workers,
             slicing_opts=slicing_opts,
             trial_timeout=trial_timeout,
+            search_seed=search_seed,
         )
 
         if tree is None:
@@ -611,7 +655,8 @@ def _expectation_parallel(self, circuit, observable, method, opts):
             if self.backend == "torch":
                 for tensor in tn.tensors:
                     tensor._data = _torch_cpu_array(
-                        tensor._data, dtype=torch.complex128
+                        tensor._data,
+                        dtype=_torch_dtype(getattr(self, "dtype", "complex128")),
                     )
                 val = complex(
                     tn.contract(
@@ -619,6 +664,7 @@ def _expectation_parallel(self, circuit, observable, method, opts):
                         output_inds=(),
                         optimize=tree,
                         backend="torch",
+                        implementation=torch_contract_implementation(self.backend),
                     )
                 )
             else:
@@ -637,10 +683,10 @@ def _expectation_parallel(self, circuit, observable, method, opts):
         all_exp = comm.gather(my_exp, root=0)
         if rank == 0:
             total_exp = sum(all_exp)
-            return self.real(total_exp)
+            return _real_scalar(total_exp)
         return 0.0
 
-    return self.real(my_exp)
+    return _real_scalar(my_exp)
 
 
 CLASSES_ROOTS = {"numpy": "Numpy", "torch": "PyTorch", "jax": "Jax"}
@@ -701,3 +747,876 @@ def __getattr__(name):
         return BACKENDS[name]
     except KeyError:
         raise AttributeError(f"module {__name__!r} has no attribute {name!r}") from None
+
+
+@dataclass(frozen=True)
+class CircuitBuildResult:  # immutable record returned by build_quimb_backend_circuit
+    quimb_circuit: object  # circuit object returned by the backend's _qibo_circuit_to_quimb
+    build_seconds: float  # wall-clock time of backend configuration plus circuit conversion
+
+
+@dataclass(frozen=True)
+class ExpectationTN:  # immutable record returned by build_expectation_tn for one observable term
+    coeff: complex  # coefficient of the selected term
+    factors: tuple  # factors of the selected term, stored as a tuple
+    tn: object  # tensor network produced by pauli_term_expectation_tn
+    quimb_circuit: object  # circuit the network was derived from
+    build_seconds: float  # time spent building the circuit
+    tn_seconds: float  # time spent deriving the expectation network
+
+
+@dataclass(frozen=True)
+class TreeSearchResult:  # immutable record returned by search_contraction_tree
+    tree: object  # value returned by parallel_path_search
+    seconds: float  # wall-clock time of search, optional index sort and cost evaluation
+    costs: dict  # value returned by contraction_tree_costs for the tree
+    stats: dict  # the tree's ``qibotn_search_stats`` attribute, or {} when missing or falsy
+
+
+@dataclass(frozen=True)
+class QuimbTorchRunResult:  # immutable record returned by run_quimb_torch_expectation
+    built: ExpectationTN  # the term's network and build timings
+    search: TreeSearchResult  # the contraction tree search outcome
+    value: complex  # term coefficient multiplied by the contracted network value
+    contract_seconds: float  # wall-clock time measured around the contraction
+
+
+@dataclass(frozen=True)
+class QuimbCircuitStats:  # immutable record returned by quimb_circuit_stats
+    build_seconds: float  # build time supplied by the caller
+    num_gates: int  # the circuit's ``num_gates`` attribute, 0 when absent
+    num_tensors: int  # number of entries in ``psi.tensor_map``
+    num_indices: int  # number of entries in ``psi.ind_map``
+
+
+@dataclass(frozen=True)
+class QuimbTNProfile:  # flat summary produced by profile_quimb_torch_expectation
+    value: complex  # value of the underlying run result
+    build_seconds: float  # circuit build time
+    expectation_tn_seconds: float  # time spent deriving the expectation network
+    search_seconds: float  # contraction tree search time
+    contract_seconds: float  # contraction time
+    circuit_num_gates: int  # the circuit's ``num_gates``
+    circuit_num_tensors: int  # number of entries in the circuit's ``psi.tensor_map``
+    tn_num_tensors: int  # number of entries in the expectation network's ``tensor_map``
+    tn_num_indices: int  # number of entries in the expectation network's ``ind_map``
+    tn_outer_indices: int  # number of the expectation network's outer indices
+    search_costs: dict  # costs recorded by the tree search
+    search_stats: dict  # statistics recorded by the tree search
+
+
+@dataclass(frozen=True)
+class QuimbContractTiming:  # immutable timing record; NOTE(review): its producer is outside this view
+    implementation: Optional[str]  # Optional[...] (already imported here) stays importable on Python < 3.10, unlike an eager ``str | None``
+    sort_indices: bool
+    value: complex
+    best_seconds: float
+    mean_seconds: float
+
+
+@dataclass(frozen=True)
+class QuimbGateMergeComparison:  # immutable record returned by compare_quimb_gate_merge
+    merge_stats: QuimbCircuitStats  # stats of the build with merge_1q and merge_2q set to True
+    nomerge_stats: QuimbCircuitStats  # stats of the build with both set to False
+    tensor_reduction: float  # unmerged tensor count divided by merged tensor count
+    build_speedup: float  # unmerged build time divided by merged build time
+
+
+@dataclass(frozen=True)
+class QuimbGateMergeExpectationComparison:  # NOTE(review): built outside this view, presumably by compare_quimb_gate_merge_expectation
+    merge: QuimbTorchRunResult  # presumably the run with gate merging enabled - TODO confirm
+    nomerge: QuimbTorchRunResult  # presumably the run with gate merging disabled - TODO confirm
+    value_diff: float
+    total_speedup: float
+    build_speedup: float
+    tensor_reduction: float
+
+
+def make_quimb_backend(  # instantiate the BACKENDS entry for ``quimb_backend`` and tag it with ``dtype``
+    *,
+    quimb_backend="torch",
+    contraction_optimizer="auto-hq",
+    dtype="complex128",
+):
+    backend = BACKENDS[quimb_backend](  # looked up by name in the module-level BACKENDS mapping
+        quimb_backend=quimb_backend,
+        contraction_optimizer=contraction_optimizer,
+    )
+    backend.dtype = dtype  # set as a plain attribute afterwards; the constructor call above does not receive it
+    return backend
+
+
+def torch_quimb_backend(dtype="complex128", contraction_optimizer="auto-hq"):  # make_quimb_backend with the backend fixed to "torch"
+    return make_quimb_backend(
+        quimb_backend="torch",
+        contraction_optimizer=contraction_optimizer,
+        dtype=dtype,
+    )
+
+
+def build_quimb_backend_circuit(  # convert a qibo circuit to a quimb circuit and time the conversion
+    circuit,
+    *,
+    quimb_backend="torch",
+    ansatz="tn",
+    dtype="complex128",
+    max_bond=None,
+    cutoff=1e-12,
+    merge_1q="auto",
+    merge_2q="auto",
+    contraction_optimizer="auto-hq",
+):
+    backend = make_quimb_backend(
+        quimb_backend=quimb_backend,
+        contraction_optimizer=contraction_optimizer,
+        dtype=dtype,
+    )
+    start = time.perf_counter()  # started after backend creation, so only configure + convert are timed
+    backend.configure_tn_simulation(
+        ansatz="mps" if ansatz == "mps" else None,  # any value other than exactly "mps" is passed on as None
+        max_bond_dimension=max_bond,
+        svd_cutoff=cutoff,
+    )
+    qc = backend._qibo_circuit_to_quimb(
+        circuit,
+        quimb_circuit_type=backend.circuit_ansatz,
+        gate_opts={"max_bond": max_bond, "cutoff": cutoff},
+        dtype=dtype,
+        merge_1q=merge_1q,
+        merge_2q=merge_2q,
+    )
+    return CircuitBuildResult(qc, time.perf_counter() - start)
+
+
+def quimb_circuit_stats(quimb_circuit, build_seconds=0.0):  # summarise gate, tensor and index counts of a quimb circuit
+    return QuimbCircuitStats(
+        build_seconds=float(build_seconds),
+        num_gates=int(getattr(quimb_circuit, "num_gates", 0)),  # falls back to 0 when the attribute is missing
+        num_tensors=len(quimb_circuit.psi.tensor_map),
+        num_indices=len(quimb_circuit.psi.ind_map),
+    )
+
+
+def build_quimb_circuit_stats(circuit, **kwargs):  # build the circuit (kwargs go to build_quimb_backend_circuit) and return its stats
+    built = build_quimb_backend_circuit(circuit, **kwargs)
+    return quimb_circuit_stats(built.quimb_circuit, built.build_seconds)
+
+
+def compare_quimb_gate_merge(circuit, **kwargs):  # build the circuit with gate merging on and off and compare the stats
+    merge_kwargs = dict(kwargs)  # copies, so the caller's kwargs are not mutated
+    nomerge_kwargs = dict(kwargs)
+    merge_kwargs.update({"merge_1q": True, "merge_2q": True})  # overrides any merge flags the caller passed
+    nomerge_kwargs.update({"merge_1q": False, "merge_2q": False})
+    merge_stats = build_quimb_circuit_stats(circuit, **merge_kwargs)
+    nomerge_stats = build_quimb_circuit_stats(circuit, **nomerge_kwargs)
+    tensor_reduction = (
+        float(nomerge_stats.num_tensors) / max(float(merge_stats.num_tensors), 1.0)  # denominator clamped to avoid dividing by zero
+    )
+    build_speedup = (
+        float(nomerge_stats.build_seconds) / max(float(merge_stats.build_seconds), 1e-15)  # same clamp for a zero build time
+    )
+    return QuimbGateMergeComparison(
+        merge_stats=merge_stats,
+        nomerge_stats=nomerge_stats,
+        tensor_reduction=tensor_reduction,
+        build_speedup=build_speedup,
+    )
+
+
+def build_quimb_torch_circuit(  # build_quimb_backend_circuit with the backend fixed to "torch"
+    circuit,
+    *,
+    ansatz="tn",
+    dtype="complex128",
+    max_bond=None,
+    cutoff=1e-12,
+    merge_1q="auto",
+    merge_2q="auto",
+    contraction_optimizer="auto-hq",
+):
+    return build_quimb_backend_circuit(  # returns the CircuitBuildResult unchanged
+        circuit,
+        quimb_backend="torch",
+        ansatz=ansatz,
+        dtype=dtype,
+        max_bond=max_bond,
+        cutoff=cutoff,
+        merge_1q=merge_1q,
+        merge_2q=merge_2q,
+        contraction_optimizer=contraction_optimizer,
+    )
+
+
+def qibo_circuit_to_quimb_torch(  # like build_quimb_torch_circuit, but returns only the circuit and drops the timing
+    circuit,
+    *,
+    ansatz="tn",
+    dtype="complex128",
+    max_bond=None,
+    cutoff=1e-12,
+    merge_1q="auto",
+    merge_2q="auto",
+    contraction_optimizer="auto-hq",
+):
+    return build_quimb_torch_circuit(
+        circuit,
+        ansatz=ansatz,
+        dtype=dtype,
+        max_bond=max_bond,
+        cutoff=cutoff,
+        merge_1q=merge_1q,
+        merge_2q=merge_2q,
+        contraction_optimizer=contraction_optimizer,
+    ).quimb_circuit
+
+
+def pauli_term_expectation_tn(  # build the expectation tensor network for one observable term, with torch CPU data
+    quimb_circuit,
+    factors,
+    *,
+    dtype="complex128",
+    simplify_sequence="ADCRS",
+    simplify_atol=1e-12,
+):
+    if len(factors) > PAULI_DENSE_MAX_QUBITS:  # too many factors for a dense operator: use the per-site product network
+        tn = pauli_product_expectation_tn(
+            quimb_circuit,
+            factors,
+            simplify_sequence=simplify_sequence,
+            simplify_atol=simplify_atol,
+        )
+    else:
+        op, where = _pauli_term_to_dense_operator(factors)
+        op = _torch_cpu_array(op, dtype=_torch_dtype(dtype))
+        tn = quimb_circuit.local_expectation(
+            op,
+            where,
+            rehearse="tn",  # the result is used as a tensor network below, not as a number
+            simplify_sequence=simplify_sequence,
+            simplify_atol=simplify_atol,
+        )
+    ensure_torch_tn(tn, dtype=dtype)  # converts every tensor's data in place, for both branches
+    return tn
+
+
+def build_expectation_tn(  # build the torch circuit and the expectation network for a single observable term
+    circuit,
+    observable,
+    *,
+    term_index=0,  # position of the term within extract_gates_and_qubits(observable)
+    ansatz="tn",
+    dtype="complex128",
+    max_bond=None,
+    cutoff=1e-12,
+    merge_1q="auto",
+    merge_2q="auto",
+    contraction_optimizer="auto-hq",
+):
+    terms = extract_gates_and_qubits(observable)
+    coeff, factors = terms[term_index]  # only this one term is processed
+    built = build_quimb_torch_circuit(
+        circuit,
+        ansatz=ansatz,
+        dtype=dtype,
+        max_bond=max_bond,
+        cutoff=cutoff,
+        merge_1q=merge_1q,
+        merge_2q=merge_2q,
+        contraction_optimizer=contraction_optimizer,
+    )
+    start = time.perf_counter()  # times only the network derivation; the circuit build is timed separately
+    tn = pauli_term_expectation_tn(built.quimb_circuit, factors, dtype=dtype)
+    return ExpectationTN(
+        coeff=coeff,
+        factors=tuple(factors),
+        tn=tn,
+        quimb_circuit=built.quimb_circuit,
+        build_seconds=built.build_seconds,
+        tn_seconds=time.perf_counter() - start,
+    )
+
+
+def ensure_torch_tn(tn, dtype="complex128"):  # convert every tensor's data through _torch_cpu_array, in place
+    target_dtype = _torch_dtype(dtype)
+    for tensor in tn.tensors:
+        tensor._data = _torch_cpu_array(tensor._data, dtype=target_dtype)  # writes the private ``_data`` slot directly
+    return tn  # the same network object that was passed in
+
+
+def term_arrays(tn, dtype="complex128"):  # list of ``tn.arrays`` passed through _torch_cpu_array; ``tn`` itself is not modified here
+    return [_torch_cpu_array(array, dtype=_torch_dtype(dtype)) for array in tn.arrays]
+
+
+def search_contraction_tree(  # run parallel_path_search on ``tn`` and collect the tree, its costs and its stats
+    tn,
+    *,
+    method="processpool",
+    total_repeats=128,
+    max_time=60,
+    n_workers=4,
+    slicing_opts=None,
+    trial_timeout=None,
+    search_backend=None,
+    dask_address=None,
+    debug_trials=False,
+    dask_close_workers=False,
+    expected_workers=None,
+    search_seed=0,
+    sort_indices=False,  # when true, sort the tree's contraction indices after the search
+    sort_priority="flops",
+    dtype="complex128",  # only used to pick the element size for the cost estimate
+):
+    start = time.perf_counter()
+    tree = parallel_path_search(
+        tn,
+        tn.outer_inds(),  # the network's outer indices are given as the output indices
+        method=method,
+        total_repeats=total_repeats,
+        max_time=max_time,
+        n_workers=n_workers,
+        slicing_opts=slicing_opts,
+        trial_timeout=trial_timeout,
+        search_backend=search_backend,
+        dask_address=dask_address,
+        debug_trials=debug_trials,
+        dask_close_workers=dask_close_workers,
+        expected_workers=expected_workers,
+        search_seed=search_seed,
+    )
+    if sort_indices and hasattr(tree, "sort_contraction_indices"):  # skipped silently for trees without the method
+        tree.sort_contraction_indices(
+            priority=sort_priority,
+            make_output_contig=True,
+            make_contracted_contig=True,
+            reset=True,
+        )
+    costs = contraction_tree_costs(  # NOTE(review): other callers check for a None tree; confirm this helper accepts one
+        tree,
+        dtype_bytes=8 if dtype in ("complex64", "single", np.complex64) else 16,  # 8 bytes for complex64, 16 otherwise
+    )
+    return TreeSearchResult(
+        tree=tree,
+        seconds=time.perf_counter() - start,  # covers search, optional sort and cost evaluation
+        costs=costs,
+        stats=getattr(tree, "qibotn_search_stats", {}) or {},  # {} when the attribute is missing or falsy
+    )
+
+
+def sorted_tree(tree, enabled=True, priority="flops"):
+    """Return a deep copy of ``tree``, index-sorted when ``enabled`` and supported."""
+    clone = copy.deepcopy(tree)
+    if not enabled or not hasattr(clone, "sort_contraction_indices"):
+        return clone  # sorting disabled, or this tree type has no sorter
+    sort_kwargs = {"priority": priority, "make_output_contig": True}
+    sort_kwargs.update(make_contracted_contig=True, reset=True)
+    # the sorter runs on the copy; the caller's tree is never passed to it
+    clone.sort_contraction_indices(**sort_kwargs)
+    return clone
+
+
+def contract_tn(  # fully contract ``tn`` along ``tree`` down to no output indices
+    tn,
+    tree,
+    *,
+    dtype="complex128",
+    backend="torch",
+    implementation=None,  # None lets torch_contract_implementation choose the default
+):
+    if backend == "torch":
+        ensure_torch_tn(tn, dtype=dtype)  # mutates ``tn``: tensor data is converted in place
+    return tn.contract(
+        all,
+        output_inds=(),
+        optimize=tree,
+        backend=backend,
+        implementation=torch_contract_implementation(backend, implementation),
+    )
+
+
+def run_quimb_backend_expectation(  # returns ``(value, seconds)`` for backend.expectation on a freshly configured backend
+    circuit,
+    observable,
+    *,
+    quimb_backend="torch",
+    ansatz="tn",
+    dtype="complex128",
+    max_bond=None,
+    cutoff=1e-12,
+    contraction_optimizer="auto-hq",
+):
+    backend = make_quimb_backend(
+        quimb_backend=quimb_backend,
+        contraction_optimizer=contraction_optimizer,
+        dtype=dtype,
+    )
+    backend.configure_tn_simulation(
+        ansatz="mps" if ansatz == "mps" else None,  # any value other than exactly "mps" is passed on as None
+        max_bond_dimension=max_bond,
+        svd_cutoff=cutoff,
+    )
+    start = time.perf_counter()  # backend creation and configuration are not timed
+    value = backend.expectation(circuit, observable)
+    return value, time.perf_counter() - start
+
+
+def run_quimb_torch_expectation(  # build, search and contract one observable term, returning each stage's result and timing
+    circuit,
+    observable,
+    *,
+    term_index=0,  # only this term of the observable is evaluated
+    ansatz="tn",
+    dtype="complex128",
+    max_bond=None,
+    cutoff=1e-12,
+    merge_1q="auto",
+    merge_2q="auto",
+    contraction_optimizer="auto-hq",
+    search_method="processpool",  # forwarded to search_contraction_tree as ``method``
+    total_repeats=128,
+    max_time=60,
+    n_workers=4,
+    slicing_opts=None,
+    trial_timeout=None,
+    search_backend=None,
+    dask_address=None,
+    debug_trials=False,
+    dask_close_workers=False,
+    expected_workers=None,
+    search_seed=0,
+    sort_indices=False,
+    sort_priority="flops",
+    contract_backend="torch",  # forwarded to contract_tn as ``backend``
+    contract_implementation=None,  # forwarded to contract_tn as ``implementation``
+):
+    built = build_expectation_tn(
+        circuit,
+        observable,
+        term_index=term_index,
+        ansatz=ansatz,
+        dtype=dtype,
+        max_bond=max_bond,
+        cutoff=cutoff,
+        merge_1q=merge_1q,
+        merge_2q=merge_2q,
+        contraction_optimizer=contraction_optimizer,
+    )
+    search = search_contraction_tree(
+        built.tn,
+        method=search_method,
+        total_repeats=total_repeats,
+        max_time=max_time,
+        n_workers=n_workers,
+        slicing_opts=slicing_opts,
+        trial_timeout=trial_timeout,
+        search_backend=search_backend,
+        dask_address=dask_address,
+        debug_trials=debug_trials,
+        dask_close_workers=dask_close_workers,
+        expected_workers=expected_workers,
+        search_seed=search_seed,
+        sort_indices=sort_indices,
+        sort_priority=sort_priority,
+        dtype=dtype,
+    )
+    start = time.perf_counter()  # times the contraction stage only
+    value = contract_tn(
+        built.tn,
+        search.tree,
+        dtype=dtype,
+        backend=contract_backend,
+        implementation=contract_implementation,
+    )
+    return QuimbTorchRunResult(
+        built=built,
+        search=search,
+        value=built.coeff * complex(value),  # scale the contracted value by the term's coefficient
+        contract_seconds=time.perf_counter() - start,
+    )
+
+
+def profile_quimb_torch_expectation(circuit, observable, **kwargs):
+    result = run_quimb_torch_expectation(circuit, observable, **kwargs)
+    return QuimbTNProfile(
+        value=result.value,
+        build_seconds=result.built.build_seconds,
+        expectation_tn_seconds=result.built.tn_seconds,
+        search_seconds=result.search.seconds,
+        contract_seconds=result.contract_seconds,
+        circuit_num_gates=int(result.built.quimb_circuit.num_gates),
+        circuit_num_tensors=len(result.built.quimb_circuit.psi.tensor_map),
+        tn_num_tensors=len(result.built.tn.tensor_map),
+        tn_num_indices=len(result.built.tn.ind_map),
+        tn_outer_indices=len(result.built.tn.outer_inds()),
+        search_costs=result.search.costs,
+        search_stats=result.search.stats,
+    )
+
+
+def compare_quimb_gate_merge_expectation(circuit, observable, **kwargs):
+    """Run the quimb+torch expectation pipeline with gate merging on and off.
+
+    Each variant builds its own tensor network and contraction tree.  Trees are
+    structure-specific, so callers should compare the returned ``merge`` and
+    ``nomerge`` results rather than reusing a tree between variants.
+    """
+    merge_kwargs = dict(kwargs)
+    nomerge_kwargs = dict(kwargs)
+    merge_kwargs.update({"merge_1q": True, "merge_2q": True})
+    nomerge_kwargs.update({"merge_1q": False, "merge_2q": False})
+    nomerge = run_quimb_torch_expectation(circuit, observable, **nomerge_kwargs)
+    merge = run_quimb_torch_expectation(circuit, observable, **merge_kwargs)
+
+    merge_total = (
+        merge.built.build_seconds + merge.built.tn_seconds + merge.search.seconds
+        + merge.contract_seconds
+    )
+    nomerge_total = (
+        nomerge.built.build_seconds + nomerge.built.tn_seconds
+        + nomerge.search.seconds + nomerge.contract_seconds
+    )
+    return QuimbGateMergeExpectationComparison(
+        merge=merge,
+        nomerge=nomerge,
+        value_diff=abs(merge.value - nomerge.value),
+        total_speedup=nomerge_total / max(merge_total, 1e-15),
+        build_speedup=nomerge.built.build_seconds / max(merge.built.build_seconds, 1e-15),
+        tensor_reduction=(
+            len(nomerge.built.quimb_circuit.psi.tensor_map)
+            / max(len(merge.built.quimb_circuit.psi.tensor_map), 1)
+        ),
+    )
+
+
+def time_quimb_contract_implementations(
+    expectation_tn,
+    tree,
+    *,
+    dtype="complex128",
+    implementations=("autoray", "cotengra"),
+    sort_options=(False, True),
+    repeats=3,
+):
+    timings = []
+    for sort_indices in sort_options:
+        work_tree = sorted_tree(tree, sort_indices)
+        for implementation in implementations:
+            value = None
+            samples = []
+            for _ in range(repeats):
+                start = time.perf_counter()
+                value = contract_tn(
+                    expectation_tn,
+                    work_tree,
+                    dtype=dtype,
+                    implementation=implementation,
+                )
+                samples.append(time.perf_counter() - start)
+            timings.append(
+                QuimbContractTiming(
+                    implementation=implementation,
+                    sort_indices=bool(sort_indices),
+                    value=complex(value),
+                    best_seconds=min(samples),
+                    mean_seconds=sum(samples) / len(samples),
+                )
+            )
+    return tuple(timings)
+
+
+def quimb_torch_parallel_opts(
+    *,
+    target_slices=None,
+    target_size=None,
+    search_workers=None,
+    torch_threads=1,
+    search_repeats=128,
+    search_time=60.0,
+    search_seed=0,
+    merge_gates=True,
+    search_backend="processpool",
+    dask_address=None,
+    dask_expected_workers=None,
+    dask_close_workers=False,
+    debug_trials=False,
+    search_only=False,
+    save_tree_path=None,
+    load_tree_path=None,
+    print_stats=False,
+):
+    slicing_opts = {}
+    if target_slices is not None:
+        slicing_opts["target_slices"] = target_slices
+    if target_size is not None:
+        slicing_opts["target_size"] = target_size
+
+    opts = {
+        "slicing_opts": slicing_opts or None,
+        "search_workers": search_workers or torch_threads,
+        "max_repeats": search_repeats,
+        "max_time": search_time,
+        "search_seed": search_seed,
+        "merge_1q": merge_gates,
+        "merge_2q": merge_gates,
+        "print_stats": print_stats,
+    }
+    if search_backend is not None:
+        opts["search_backend"] = search_backend
+    if dask_address is not None:
+        opts["dask_address"] = dask_address
+    if dask_expected_workers is not None:
+        opts["dask_expected_workers"] = dask_expected_workers
+    if dask_close_workers:
+        opts["dask_close_workers"] = True
+    if debug_trials:
+        opts["debug_trials"] = True
+    if search_only:
+        opts["search_only"] = True
+        opts["save_tree_path"] = save_tree_path
+    elif load_tree_path is not None:
+        opts["load_tree_path"] = load_tree_path
+    return opts
+
+
+def load_custom_case_module(path):
+    """Load a user-provided Python module with circuit/observable builders."""
+    path = Path(path).resolve()
+    spec = importlib.util.spec_from_file_location(path.stem, path)
+    if spec is None or spec.loader is None:
+        raise RuntimeError(f"Cannot import case module from {path}.")
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+def _call_builder(fn, **kwargs):
+    sig = inspect.signature(fn)
+    if any(p.kind == p.VAR_KEYWORD for p in sig.parameters.values()):
+        return fn(**kwargs)
+    return fn(**{name: value for name, value in kwargs.items() if name in sig.parameters})
+
+
+def load_custom_observable(
+    module,
+    *,
+    nqubits,
+    nlayers=0,
+    seed=42,
+    pauli_pattern=None,
+    observable_json=None,
+):
+    """Load an observable from a custom module, JSON file, or Pauli pattern."""
+    if pauli_pattern:
+        return {"pauli_string_pattern": pauli_pattern}
+    if observable_json:
+        with Path(observable_json).open(encoding="utf-8") as f:
+            return json.load(f)
+    if hasattr(module, "build_observable"):
+        return _call_builder(
+            module.build_observable,
+            nqubits=nqubits,
+            nlayers=nlayers,
+            seed=seed,
+        )
+    if hasattr(module, "OBSERVABLE"):
+        return module.OBSERVABLE
+    raise ValueError(
+        "No observable supplied. Define build_observable/OBSERVABLE in the case "
+        "module, or pass pauli_pattern / observable_json."
+    )
+
+
+def run_custom_tn_expectation(
+    case_module,
+    *,
+    nqubits,
+    nlayers=0,
+    seed=42,
+    observable=None,
+    pauli_pattern=None,
+    observable_json=None,
+    mpi=False,
+    exact=False,
+    exact_max_qubits=24,
+    bond=1024,
+    cut_ratio=1e-12,
+    torch_threads=8,
+    quimb_backend="torch",
+    dtype="complex128",
+    parallel_opts=None,
+):
+    """Run a quimb+torch TN expectation for a custom circuit module; returns None on non-root MPI ranks."""
+    from qibotn.expectation_runner import (
+        ExpectationConfig,
+        exact_for_observable,
+        run_cpu_expectation,
+    )
+
+    module = load_custom_case_module(case_module)
+    if not hasattr(module, "build_circuit"):
+        raise ValueError("case_module must define build_circuit.")
+
+    circuit = _call_builder(
+        module.build_circuit,
+        nqubits=nqubits,
+        nlayers=nlayers,
+        seed=seed,
+    )
+    if observable is None:
+        observable = load_custom_observable(
+            module,
+            nqubits=nqubits,
+            nlayers=nlayers,
+            seed=seed,
+            pauli_pattern=pauli_pattern,
+            observable_json=observable_json,
+        )
+
+    rank = 0
+    if mpi:
+        from mpi4py import MPI
+
+        rank = MPI.COMM_WORLD.Get_rank()
+
+    exact_value = None
+    if exact and nqubits > exact_max_qubits:  # validate on every rank so MPI peers fail together
+        raise ValueError(f"exact reference is limited to {exact_max_qubits} qubits.")
+    if exact and rank == 0:
+        exact_value = exact_for_observable(circuit, observable, nqubits)
+
+    config = ExpectationConfig(
+        ansatz="tn",
+        mpi=mpi,
+        bond=bond,
+        cut_ratio=cut_ratio,
+        tensor_module="torch",
+        quimb_backend=quimb_backend,
+        dtype=dtype,
+        torch_threads=torch_threads,
+        parallel_opts=parallel_opts or {},
+    )
+    result = run_cpu_expectation(circuit, observable, config)
+    if mpi and result.rank != 0:
+        return None
+    return {
+        "circuit": circuit,
+        "observable": observable,
+        "exact": exact_value,
+        "result": result,
+        "abs_error": None if exact_value is None else abs(result.value - exact_value),
+        "rel_error": (
+            None
+            if exact_value is None
+            else abs(result.value - exact_value) / max(abs(exact_value), 1e-15)
+        ),
+    }
+
+
+def run_contest_tn_case(
+    case_name,
+    obs_name,
+    *,
+    mode="contract",
+    tree_dir="trees/contest_tn",
+    nqubits=None,
+    nlayers=None,
+    seed=None,
+    mpi=False,
+    exact=False,
+    exact_max_qubits=24,
+    bond=1024,
+    cut_ratio=1e-12,
+    torch_threads=8,
+    quimb_backend="torch",
+    dtype="complex64",
+    target_slices=None,
+    target_size=2**34,
+    search_workers=None,
+    search_repeats=2048,
+    search_time=300.0,
+    search_seed=0,
+    merge_gates=True,
+    search_backend="dask",
+    dask_address=None,
+    dask_expected_workers=None,
+    dask_close_workers=False,
+    debug_trials=False,
+):
+    """Run one shared contest-style quimb+torch TN search/contract case; returns None on non-root MPI ranks."""
+    from qibotn.contest_cases import CASES, build_contest_circuit, build_contest_observable, tree_path
+    from qibotn.expectation_runner import (
+        ExpectationConfig,
+        exact_for_observable,
+        run_cpu_expectation,
+    )
+
+    case = CASES[case_name]
+    nqubits = case.nqubits if nqubits is None else nqubits
+    nlayers = case.nlayers if nlayers is None else nlayers
+    seed = case.seed if seed is None else seed
+    target_slices = case.target_slices if target_slices is None else target_slices
+
+    circuit = build_contest_circuit(case.circuit_kind, nqubits, nlayers, seed)
+    observable = build_contest_observable(obs_name, nqubits, seed)
+    path = tree_path(tree_dir, case_name, obs_name, nqubits, nlayers, target_slices, merge_gates)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    if mode == "contract" and not path.exists():
+        raise FileNotFoundError(f"Missing tree file: {path}. Run search first.")
+
+    rank = 0
+    if mpi:
+        from mpi4py import MPI
+
+        rank = MPI.COMM_WORLD.Get_rank()
+
+    exact_value = None
+    if exact and mode != "search" and nqubits > exact_max_qubits:  # every rank, so MPI peers fail together
+        raise ValueError(f"exact reference is limited to {exact_max_qubits} qubits.")
+    if exact and rank == 0 and mode != "search":
+        exact_value = exact_for_observable(circuit, observable, nqubits)
+
+    config = ExpectationConfig(
+        ansatz="tn",
+        mpi=mpi,
+        bond=bond,
+        cut_ratio=cut_ratio,
+        tensor_module="torch",
+        quimb_backend=quimb_backend,
+        dtype=dtype,
+        torch_threads=torch_threads,
+        parallel_opts=quimb_torch_parallel_opts(
+            target_slices=target_slices,
+            target_size=target_size,
+            search_workers=search_workers,
+            torch_threads=torch_threads,
+            search_repeats=search_repeats,
+            search_time=search_time,
+            search_seed=search_seed,
+            merge_gates=merge_gates,
+            search_backend=search_backend,
+            dask_address=dask_address,
+            dask_expected_workers=dask_expected_workers,
+            dask_close_workers=dask_close_workers,
+            debug_trials=debug_trials,
+            search_only=(mode == "search"),
+            save_tree_path=str(path),
+            load_tree_path=str(path),
+            print_stats=False,
+        ),
+    )
+    result = run_cpu_expectation(circuit, observable, config)
+    if mpi and result.rank != 0:
+        return None
+    return {
+        "case": case,
+        "tree_path": path,
+        "circuit": circuit,
+        "observable": observable,
+        "exact": exact_value,
+        "result": result,
+        "abs_error": None if exact_value is None else abs(result.value - exact_value),
+        "rel_error": (
+            None
+            if exact_value is None
+            else abs(result.value - exact_value) / max(abs(exact_value), 1e-15)
+        ),
+    }
diff --git a/src/qibotn/backends/vidal.py b/src/qibotn/backends/vidal.py
index 8fbd4d5..70b57b8 100644
--- a/src/qibotn/backends/vidal.py
+++ b/src/qibotn/backends/vidal.py
@@ -9,6 +9,7 @@ usable while the fast path is expanded.
 from __future__ import annotations
 
 import re
+import time
 from dataclasses import dataclass
 
 import numpy as np
@@ -475,3 +476,511 @@ class VidalBackend(QibotnBackend, NumpyBackend):
             return_array=return_array,
             **prob_kwargs,
         )
+
+
+@dataclass(frozen=True)
+class VidalExpectationResult:
+    value: float
+    seconds: float
+    backend: object
+
+
+@dataclass(frozen=True)
+class VidalBackendComparisonResult:
+    circuit: object
+    observable: object
+    exact: float | None
+    qmatchatea: VidalExpectationResult | None
+    vidal: VidalExpectationResult
+    qmatchatea_error: float | None
+    vidal_error: float | None
+
+
+@dataclass(frozen=True)
+class VidalProfileResult:
+    value: float
+    trace_path: object
+    table_path: object
+    table: str
+
+
+def make_vidal_backend(
+    *,
+    bond=10,
+    cut_ratio=1e-9,
+    tensor_module="torch",
+    compile_circuit=False,
+    mpi_approach="SR",
+    mpi_num_procs=1,
+    mpi_where_barriers=-1,
+    mpi_isometrization=-1,
+    mpi_term_batch_size=None,
+    fallback=True,
+):
+    backend = VidalBackend()
+    backend.configure_tn_simulation(
+        max_bond_dimension=bond,
+        cut_ratio=cut_ratio,
+        tensor_module=tensor_module,
+        compile_circuit=compile_circuit,
+        mpi_approach=mpi_approach,
+        mpi_num_procs=mpi_num_procs,
+        mpi_where_barriers=mpi_where_barriers,
+        mpi_isometrization=mpi_isometrization,
+        mpi_term_batch_size=mpi_term_batch_size,
+        fallback=fallback,
+    )
+    return backend
+
+
+def run_vidal_expectation(
+    circuit,
+    observable,
+    *,
+    bond=10,
+    cut_ratio=1e-9,
+    tensor_module="torch",
+    compile_circuit=False,
+    preprocess=True,
+    mpi_approach="SR",
+    mpi_num_procs=1,
+    mpi_where_barriers=-1,
+    mpi_isometrization=-1,
+    mpi_term_batch_size=None,
+    fallback=True,
+):
+    backend = make_vidal_backend(
+        bond=bond,
+        cut_ratio=cut_ratio,
+        tensor_module=tensor_module,
+        compile_circuit=compile_circuit,
+        mpi_approach=mpi_approach,
+        mpi_num_procs=mpi_num_procs,
+        mpi_where_barriers=mpi_where_barriers,
+        mpi_isometrization=mpi_isometrization,
+        mpi_term_batch_size=mpi_term_batch_size,
+        fallback=fallback,
+    )
+    start = time.perf_counter()
+    value = backend.expectation(
+        circuit,
+        observable,
+        preprocess=preprocess,
+        compile_circuit=compile_circuit,
+    )
+    return VidalExpectationResult(
+        value=float(np.real(value)),
+        seconds=time.perf_counter() - start,
+        backend=backend,
+    )
+
+
+def run_segmented_vidal_ring_xz(
+    circuit,
+    *,
+    max_bond=10,
+    cut_ratio=1e-9,
+    tensor_module="torch",
+    comm,
+):
+    from qibotn.backends.vidal_mpi_segment import run_segment_vidal_mpi_ring_xz
+
+    start = time.perf_counter()
+    value, timings = run_segment_vidal_mpi_ring_xz(
+        circuit,
+        max_bond=max_bond,
+        cut_ratio=cut_ratio,
+        tensor_module=tensor_module,
+        comm=comm,
+    )
+    return VidalExpectationResult(
+        value=float(np.real(value)),
+        seconds=time.perf_counter() - start,
+        backend=timings,
+    )
+
+
+def compare_vidal_backend_qmatchatea(
+    circuit,
+    observable,
+    *,
+    bond=512,
+    cut_ratio=1e-12,
+    tensor_module="torch",
+    exact=None,
+    skip_qmatchatea=False,
+    qmatchatea_compile_circuit=True,
+    qmatchatea_svd_control="E!",
+    vidal_compile_circuit=True,
+    vidal_fallback=True,
+):
+    qmatchatea_result = None
+    if not skip_qmatchatea:
+        qmatchatea_backend = QMatchaTeaBackend()
+        qmatchatea_backend.configure_tn_simulation(
+            ansatz="MPS",
+            max_bond_dimension=bond,
+            cut_ratio=cut_ratio,
+            svd_control=qmatchatea_svd_control,
+            tensor_module=tensor_module,
+            compile_circuit=qmatchatea_compile_circuit,
+            track_memory=False,
+        )
+        start = time.perf_counter()
+        qmatchatea_value = qmatchatea_backend.expectation(
+            circuit,
+            observable,
+            preprocess=False,
+            compile_circuit=qmatchatea_compile_circuit,
+        )
+        qmatchatea_result = VidalExpectationResult(
+            value=float(np.real(qmatchatea_value)),
+            seconds=time.perf_counter() - start,
+            backend=qmatchatea_backend,
+        )
+
+    vidal_backend = VidalBackend()
+    vidal_backend.configure_tn_simulation(
+        ansatz="MPS",
+        max_bond_dimension=bond,
+        cut_ratio=cut_ratio,
+        tensor_module=tensor_module,
+        compile_circuit=vidal_compile_circuit,
+        fallback=vidal_fallback,
+    )
+    start = time.perf_counter()
+    vidal_value = vidal_backend.expectation(
+        circuit,
+        observable,
+        preprocess=False,
+        compile_circuit=vidal_compile_circuit,
+    )
+    vidal_result = VidalExpectationResult(
+        value=float(np.real(vidal_value)),
+        seconds=time.perf_counter() - start,
+        backend=vidal_backend,
+    )
+
+    qmatchatea_error = None
+    vidal_error = None
+    if exact is not None:
+        if qmatchatea_result is not None:
+            qmatchatea_error = abs(qmatchatea_result.value - exact)
+        vidal_error = abs(vidal_result.value - exact)
+
+    return VidalBackendComparisonResult(
+        circuit=circuit,
+        observable=observable,
+        exact=exact,
+        qmatchatea=qmatchatea_result,
+        vidal=vidal_result,
+        qmatchatea_error=qmatchatea_error,
+        vidal_error=vidal_error,
+    )
+
+
+def profile_vidal_expectation(
+    circuit,
+    observable,
+    *,
+    bond=512,
+    cut_ratio=1e-12,
+    torch_threads=32,
+    trace_path,
+    table_path,
+    profile_memory=False,
+    rows=60,
+):
+    import torch
+    from torch.profiler import ProfilerActivity, profile
+
+    from qibotn.expectation_runner import ExpectationConfig, run_cpu_expectation
+
+    torch.set_num_threads(torch_threads)
+    config = ExpectationConfig(
+        ansatz="mps",
+        bond=bond,
+        cut_ratio=cut_ratio,
+        tensor_module="torch",
+        torch_threads=torch_threads,
+    )
+
+    with profile(
+        activities=[ProfilerActivity.CPU],
+        record_shapes=profile_memory,
+        profile_memory=profile_memory,
+        with_stack=profile_memory,
+    ) as prof:
+        result = run_cpu_expectation(circuit, observable, config)
+
+    table = (
+        f"expval={result.value:.16e}\n\n"
+        f"# sorted by self_cpu_time_total\n"
+        f"{prof.key_averages().table(sort_by='self_cpu_time_total', row_limit=rows)}\n\n"
+        f"# sorted by cpu_time_total\n"
+        f"{prof.key_averages().table(sort_by='cpu_time_total', row_limit=rows)}\n"
+    )
+    table_path.parent.mkdir(parents=True, exist_ok=True)
+    table_path.write_text(table, encoding="utf-8")
+    prof.export_chrome_trace(str(trace_path))
+    return VidalProfileResult(
+        value=result.value,
+        trace_path=trace_path,
+        table_path=table_path,
+        table=table,
+    )
+
+
+CONTEST_MPS_BONDS = {"main1": 512, "main2": 1024, "strong": 2048}
+CONTEST_VIDAL_OBSERVABLES = (
+    "boundary_ZZ_q1",
+    "boundary_ZZ_q2",
+    "boundary_ZZ_q3",
+    "long_Z_5_sites",
+    "mixed_XZYZX",
+    "ring_xz",
+    "open_zz",
+    "range2_xx",
+    "complex_iZ0",
+    "dense2_mid",
+    "dense3_spread",
+)
+
+
+def run_contest_mps_case(
+    case_name="main1",
+    *,
+    observables=None,
+    obs_filter="",
+    nqubits=None,
+    nlayers=None,
+    bond="case-default",
+    cut_ratio=1e-12,
+    seed=None,
+    torch_threads=8,
+    exact=False,
+    exact_max_qubits=24,
+):
+    """Run a shared contest-style Vidal/MPS expectation case; only rank 0 collects records."""
+    from qibotn.contest_cases import CASES, build_contest_circuit, build_contest_observable
+    from qibotn.expectation_runner import exact_for_observable
+    from qibotn.torch_utils import set_torch_threads
+
+    from mpi4py import MPI
+
+    set_torch_threads(torch_threads)
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    size = comm.Get_size()
+    case = CASES[case_name]
+    nqubits = case.nqubits if nqubits is None else nqubits
+    nlayers = case.nlayers if nlayers is None else nlayers
+    seed = case.seed if seed is None else seed
+    if bond == "case-default":
+        bond = CONTEST_MPS_BONDS.get(case_name, 1024)
+    if observables is None:
+        observables = tuple(x.strip() for x in obs_filter.split(",") if x.strip()) or case.observables
+
+    circuit = build_contest_circuit(case.circuit_kind, nqubits, nlayers, seed)
+    records = []
+    for obs_name in observables:
+        observable = build_contest_observable(obs_name, nqubits, seed)
+        exact_value = None
+        if exact and nqubits > exact_max_qubits:  # every rank raises, so none waits alone at the Barrier
+            raise ValueError(f"exact reference is limited to {exact_max_qubits} qubits.")
+        if exact and rank == 0:
+            exact_value = exact_for_observable(circuit, observable, nqubits)
+
+        backend = VidalBackend()
+        backend.configure_tn_simulation(
+            max_bond_dimension=bond,
+            cut_ratio=cut_ratio,
+            tensor_module="torch",
+            mpi_approach="CT",
+            mpi_num_procs=size,
+            fallback=False,
+        )
+
+        comm.Barrier()
+        start = time.perf_counter()
+        value = backend.expectation(
+            circuit,
+            observable,
+            preprocess=True,
+            compile_circuit=False,
+        )
+        seconds = time.perf_counter() - start
+        if rank == 0:
+            records.append(
+                {
+                    "case": case,
+                    "observable": obs_name,
+                    "value": value,
+                    "seconds": seconds,
+                    "exact": exact_value,
+                    "abs_error": None if exact_value is None else abs(value - exact_value),
+                    "rel_error": (
+                        None
+                        if exact_value is None
+                        else abs(value - exact_value) / max(abs(exact_value), 1e-15)
+                    ),
+                    "truncation_error": backend.last_truncation_error,
+                    "max_truncation_error": backend.last_max_truncation_error,
+                }
+            )
+    return records
+
+
+def run_vidal_mpi_contest_case(
+    *,
+    label,
+    kind,
+    nqubits,
+    nlayers,
+    bond,
+    cut_ratio,
+    seed,
+    torch_threads,
+    obs_filter="",
+):
+    """Run the direct Vidal MPI contest observable sweep; only rank 0 collects records."""
+    from qibotn.contest_cases import build_contest_circuit, build_contest_observable
+    from qibotn.torch_utils import set_torch_threads
+
+    from mpi4py import MPI
+
+    del label
+    set_torch_threads(torch_threads)
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    size = comm.Get_size()
+    circuit = build_contest_circuit(kind, nqubits, nlayers, seed)
+    names = CONTEST_VIDAL_OBSERVABLES
+    if obs_filter:
+        wanted = {name.strip() for name in obs_filter.split(",")}  # tolerate "a, b" spacing
+        names = tuple(name for name in names if name in wanted)
+        if not names:
+            raise ValueError(f"obs_filter matched no observables: {obs_filter!r}")
+
+    records = []
+    for obs_name in names:
+        observable = build_contest_observable(obs_name, nqubits, seed)
+        backend = VidalBackend()
+        backend.configure_tn_simulation(
+            max_bond_dimension=bond,
+            cut_ratio=cut_ratio,
+            tensor_module="torch",
+            mpi_approach="CT",
+            mpi_num_procs=size,
+            fallback=False,
+        )
+        comm.Barrier()
+        start = time.perf_counter()
+        value = backend.expectation(
+            circuit,
+            observable,
+            preprocess=True,
+            compile_circuit=False,
+        )
+        seconds = time.perf_counter() - start
+        if rank == 0:
+            records.append(
+                {
+                    "observable": obs_name,
+                    "value": value,
+                    "seconds": seconds,
+                    "truncation_error": backend.last_truncation_error,
+                    "max_truncation_error": backend.last_max_truncation_error,
+                }
+            )
+    return records
+
+
+def build_vidal_validation_circuit(kind, nqubits, nlayers, seed):
+    """Build the circuit family used by Vidal correctness checks."""
+    from qibotn.benchmark_cases import build_circuit
+
+    aliases = {"brickwall": "brickwall_cnot"}
+    return build_circuit(aliases.get(kind, kind), nqubits, nlayers, seed)
+
+
+def run_vidal_validation_cases(
+    *,
+    nqubits=16,
+    nlayers=6,
+    bond=512,
+    seed=42,
+    tensor_module="torch",
+    torch_threads=32,
+    mpi=False,
+    circuits=("brickwall", "reversed_cnot", "rx_ry_cz"),
+    observables=("ring_xz", "open_zz", "mixed_local"),
+):
+    """Run Vidal/TEBD correctness checks against dense statevector references."""
+    from qibotn.benchmark_cases import exact_pauli_sum, observable_terms
+    from qibotn.backends.vidal_tebd import VidalTEBDExecutor
+    from qibotn.torch_utils import set_torch_threads
+
+    set_torch_threads(torch_threads)
+    comm = None
+    rank = 0
+    if mpi:
+        from mpi4py import MPI
+
+        from qibotn.backends.vidal_mpi_segment import SegmentVidalMPIExecutor
+
+        comm = MPI.COMM_WORLD
+        rank = comm.Get_rank()
+    else:
+        SegmentVidalMPIExecutor = None
+
+    records = []
+    for circuit_kind in circuits:
+        circuit = build_vidal_validation_circuit(circuit_kind, nqubits, nlayers, seed)
+        if rank == 0:
+            exact_values = {
+                obs: exact_pauli_sum(circuit, observable_terms(obs, nqubits), nqubits)
+                for obs in observables
+            }
+        else:
+            exact_values = None
+        if comm is not None:
+            exact_values = comm.bcast(exact_values, root=0)
+
+        for obs_kind in observables:
+            terms = observable_terms(obs_kind, nqubits)
+            start = time.perf_counter()
+            if mpi:
+                executor = SegmentVidalMPIExecutor(
+                    nqubits=nqubits,
+                    max_bond=bond,
+                    cut_ratio=1e-12,
+                    tensor_module=tensor_module,
+                    comm=comm,
+                )
+                executor.run_circuit(circuit)
+                value = executor.expectation_pauli_sum_root(terms)
+            else:
+                executor = VidalTEBDExecutor(
+                    nqubits=nqubits,
+                    max_bond=bond,
+                    cut_ratio=1e-12,
+                    tensor_module=tensor_module,
+                )
+                executor.run_circuit(circuit)
+                value = float(executor.expectation_pauli_sum(terms))
+            if rank != 0:
+                continue
+            seconds = time.perf_counter() - start
+            exact = exact_values[obs_kind]
+            records.append(
+                {
+                    "circuit": circuit_kind,
+                    "observable": obs_kind,
+                    "exact": exact,
+                    "value": value,
+                    "abs_error": abs(value - exact),
+                    "seconds": seconds,
+                }
+            )
+    return records
diff --git a/src/qibotn/benchmark_cases.py b/src/qibotn/benchmark_cases.py
index cee08dc..e3c3d25 100644
--- a/src/qibotn/benchmark_cases.py
+++ b/src/qibotn/benchmark_cases.py
@@ -12,6 +12,7 @@ CIRCUITS = (
     "brickwall_cnot",
     "reversed_cnot",
     "shifted_cz",
+    "rx_ry_cz",
     "rxx_rzz",
     "swap_scramble",
     "ghz_ladder",
@@ -49,14 +50,14 @@ def build_circuit(kind, nqubits, nlayers, seed):
         for qubit in range(nqubits):
             circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi)))
             circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi)))
-            if kind in ("rxx_rzz", "swap_scramble"):
+            if kind in ("rx_ry_cz", "rxx_rzz", "swap_scramble"):
                 circuit.add(gates.RX(qubit, theta=rng.uniform(-math.pi, math.pi)))
 
         if kind == "brickwall_cnot":
             add_brickwall(circuit, nqubits, gates.CNOT, layer, reverse=False)
         elif kind == "reversed_cnot":
             add_brickwall(circuit, nqubits, gates.CNOT, layer, reverse=True)
-        elif kind == "shifted_cz":
+        elif kind in ("shifted_cz", "rx_ry_cz"):
             for qubit in range(layer % 2, nqubits - 1, 2):
                 circuit.add(gates.CZ(qubit, qubit + 1))
         elif kind == "rxx_rzz":
@@ -149,3 +150,22 @@ def exact_pauli_sum(circuit, terms, nqubits):
                 raise ValueError(f"Unsupported Pauli {name!r}.")
         value += coeff * np.vdot(state[flipped], phase * state)
     return float(value.real)
+
+
+def ring_xz_statevector_expectation(state, nqubits, chunk_size=1 << 20):
+    """Compute ``0.5 * sum_i X_i Z_(i+1)`` from a dense state vector."""
+    state = np.asarray(state).reshape(-1)
+    value = 0.0
+    for qubit in range(nqubits):
+        next_qubit = (qubit + 1) % nqubits
+        x_flip = 1 << (nqubits - 1 - qubit)
+        z_shift = nqubits - 1 - next_qubit
+        term = 0.0
+        for start in range(0, state.size, chunk_size):
+            stop = min(start + chunk_size, state.size)
+            indices = np.arange(start, stop, dtype=np.int64)
+            z_bit = (indices >> z_shift) & 1
+            z_phase = 1 - 2 * z_bit
+            term += np.vdot(state[indices ^ x_flip], z_phase * state[start:stop]).real
+        value += 0.5 * term
+    return float(value)
diff --git a/src/qibotn/circuit_convertor.py b/src/qibotn/circuit_convertor.py
deleted file mode 100644
index 900cdf7..0000000
--- a/src/qibotn/circuit_convertor.py
+++ /dev/null
@@ -1,263 +0,0 @@
-import numpy as np
-
-try:
-    import cupy as cp
-except ImportError:  # pragma: no cover - exercised on CPU-only installations
-    cp = None
-
-
-def _require_cupy():
-    if cp is None:
-        raise ImportError(
-            "The cuQuantum circuit converter requires cupy. "
-            "Install the GPU dependencies or use the CPU backend."
-        )
-    return cp
-
-# Reference: https://github.com/NVIDIA/cuQuantum/tree/main/python/samples/cutensornet/circuit_converter
-
-
-class QiboCircuitToEinsum:
-    """Convert a circuit to a Tensor Network (TN) representation.
-
-    The circuit is first processed to an intermediate form by grouping each gate matrix
-    with its corresponding qubit it is acting on to a list. It is then converted to an
-    equivalent TN expression through the class function state_vector_operands()
-    following the Einstein summation convention in the interleave format.
-
-    See document for detail of the format: https://docs.nvidia.com/cuda/cuquantum/python/api/generated/cuquantum.contract.html
-
-    The output is to be used by cuQuantum's contract() for computation of the
-    state vectors of the circuit.
-    """
-
-    def __init__(self, circuit, dtype="complex128"):
-        self.backend = _require_cupy()
-        self.dtype = getattr(self.backend, dtype)
-        self.init_basis_map(self.backend, dtype)
-        self.init_intermediate_circuit(circuit)
-        self.circuit = circuit
-
-    def state_vector_operands(self):
-        """Create the operands for dense vector computation in the interleave
-        format.
-
-        Returns:
-            Operands for the contraction in the interleave format.
-        """
-        input_bitstring = "0" * len(self.active_qubits)
-
-        input_operands = self._get_bitstring_tensors(input_bitstring)
-
-        (
-            mode_labels,
-            qubits_frontier,
-            next_frontier,
-        ) = self._init_mode_labels_from_qubits(self.active_qubits)
-
-        gate_mode_labels, gate_operands = self._parse_gates_to_mode_labels_operands(
-            self.gate_tensors, qubits_frontier, next_frontier
-        )
-
-        operands = input_operands + gate_operands
-        mode_labels += gate_mode_labels
-
-        out_list = []
-        for key in qubits_frontier:
-            out_list.append(qubits_frontier[key])
-
-        operand_exp_interleave = [x for y in zip(operands, mode_labels) for x in y]
-        operand_exp_interleave.append(out_list)
-        return operand_exp_interleave
-
-    def _init_mode_labels_from_qubits(self, qubits):
-        n = len(qubits)
-        frontier_dict = {q: i for i, q in enumerate(qubits)}
-        mode_labels = [[i] for i in range(n)]
-        return mode_labels, frontier_dict, n
-
-    def _get_bitstring_tensors(self, bitstring):
-        return [self.basis_map[ibit] for ibit in bitstring]
-
-    def _parse_gates_to_mode_labels_operands(
-        self, gates, qubits_frontier, next_frontier
-    ):
-        mode_labels = []
-        operands = []
-
-        for tensor, gate_qubits in gates:
-            operands.append(tensor)
-            input_mode_labels = []
-            output_mode_labels = []
-            for q in gate_qubits:
-                input_mode_labels.append(qubits_frontier[q])
-                output_mode_labels.append(next_frontier)
-                qubits_frontier[q] = next_frontier
-                next_frontier += 1
-            mode_labels.append(output_mode_labels + input_mode_labels)
-        return mode_labels, operands
-
-    def op_shape_from_qubits(self, nqubits):
-        """Modify tensor to cuQuantum shape.
-
-        Parameters:
-            nqubits (int): The number of qubits in quantum circuit.
-
-        Returns:
-            (qubit_states,input_output) * nqubits
-        """
-        return (2, 2) * nqubits
-
-    def init_intermediate_circuit(self, circuit):
-        """Initialize the intermediate circuit representation.
-
-        This method initializes the intermediate circuit representation by extracting gate matrices and qubit IDs
-        from the given quantum circuit.
-
-        Parameters:
-            circuit (object): The quantum circuit object.
-        """
-        self.gate_tensors = []
-        gates_qubits = []
-
-        for gate in circuit.queue:
-            gate_qubits = gate.control_qubits + gate.target_qubits
-            gates_qubits.extend(gate_qubits)
-
-            # self.gate_tensors is to extract into a list the gate matrix together with the qubit id that it is acting on
-            # https://github.com/NVIDIA/cuQuantum/blob/6b6339358f859ea930907b79854b90b2db71ab92/python/cuquantum/cutensornet/_internal/circuit_parser_utils_cirq.py#L32
-            required_shape = self.op_shape_from_qubits(len(gate_qubits))
-            self.gate_tensors.append(
-                (
-                    self.backend.asarray(gate.matrix(), dtype=self.dtype).reshape(
-                        required_shape
-                    ),
-                    gate_qubits,
-                )
-            )
-
-        # self.active_qubits is to identify qubits with at least 1 gate acting on it in the whole circuit.
-        self.active_qubits = np.unique(gates_qubits)
-
-    def init_basis_map(self, backend, dtype):
-        """Initialize the basis map for the quantum circuit.
-
-        This method initializes a basis map for the quantum circuit, which maps binary
-        strings representing qubit states to their corresponding quantum state vectors.
-
-        Parameters:
-            backend (object): The backend object providing the array conversion method.
-            dtype (object): The data type for the quantum state vectors.
-        """
-        asarray = backend.asarray
-        state_0 = asarray([1, 0], dtype=dtype)
-        state_1 = asarray([0, 1], dtype=dtype)
-
-        self.basis_map = {"0": state_0, "1": state_1}
-
-    def init_inverse_circuit(self, circuit):
-        """Initialize the inverse circuit representation.
-
-        This method initializes the inverse circuit representation by extracting gate matrices and qubit IDs
-        from the given quantum circuit.
-
-        Parameters:
-            circuit (object): The quantum circuit object.
-        """
-        self.gate_tensors_inverse = []
-        gates_qubits_inverse = []
-
-        for gate in circuit.queue:
-            gate_qubits = gate.control_qubits + gate.target_qubits
-            gates_qubits_inverse.extend(gate_qubits)
-
-            # self.gate_tensors is to extract into a list the gate matrix together with the qubit id that it is acting on
-            # https://github.com/NVIDIA/cuQuantum/blob/6b6339358f859ea930907b79854b90b2db71ab92/python/cuquantum/cutensornet/_internal/circuit_parser_utils_cirq.py#L32
-            required_shape = self.op_shape_from_qubits(len(gate_qubits))
-            self.gate_tensors_inverse.append(
-                (
-                    self.backend.asarray(gate.matrix()).reshape(required_shape),
-                    gate_qubits,
-                )
-            )
-
-        # self.active_qubits is to identify qubits with at least 1 gate acting on it in the whole circuit.
-        self.active_qubits_inverse = np.unique(gates_qubits_inverse)
-
-    def get_pauli_gates(self, pauli_map, dtype="complex128", backend=None):
-        """Populate the gates for all pauli operators.
-
-        Parameters:
-            pauli_map: A dictionary mapping qubits to pauli operators.
-            dtype: Data type for the tensor operands.
-            backend: The package the tensor operands belong to.
-
-        Returns:
-            A sequence of pauli gates.
-        """
-        if backend is None:
-            backend = _require_cupy()
-        asarray = backend.asarray
-        pauli_i = asarray([[1, 0], [0, 1]], dtype=dtype)
-        pauli_x = asarray([[0, 1], [1, 0]], dtype=dtype)
-        pauli_y = asarray([[0, -1j], [1j, 0]], dtype=dtype)
-        pauli_z = asarray([[1, 0], [0, -1]], dtype=dtype)
-
-        operand_map = {"I": pauli_i, "X": pauli_x, "Y": pauli_y, "Z": pauli_z}
-        gates = []
-        for qubit, pauli_char in pauli_map.items():
-            operand = operand_map.get(pauli_char)
-            if operand is None:
-                raise ValueError("pauli string character must be one of I/X/Y/Z")
-            gates.append((operand, (qubit,)))
-        return gates
-
-    def expectation_operands(self, ham_gates):
-        """Create the operands for pauli string expectation computation in the
-        interleave format.
-
-        Parameters:
-            ham_gates: A list of gates derived from Qibo hamiltonian object.
-
-        Returns:
-            Operands for the contraction in the interleave format.
-        """
-        input_bitstring = "0" * self.circuit.nqubits
-
-        input_operands = self._get_bitstring_tensors(input_bitstring)
-
-        (
-            mode_labels,
-            qubits_frontier,
-            next_frontier,
-        ) = self._init_mode_labels_from_qubits(range(self.circuit.nqubits))
-
-        gate_mode_labels, gate_operands = self._parse_gates_to_mode_labels_operands(
-            self.gate_tensors, qubits_frontier, next_frontier
-        )
-
-        operands = input_operands + gate_operands
-        mode_labels += gate_mode_labels
-
-        self.init_inverse_circuit(self.circuit.invert())
-
-        next_frontier = max(qubits_frontier.values()) + 1
-
-        gates_inverse = ham_gates + self.gate_tensors_inverse
-
-        (
-            gate_mode_labels_inverse,
-            gate_operands_inverse,
-        ) = self._parse_gates_to_mode_labels_operands(
-            gates_inverse, qubits_frontier, next_frontier
-        )
-        mode_labels = (
-            mode_labels
-            + gate_mode_labels_inverse
-            + [[qubits_frontier[ix]] for ix in range(self.circuit.nqubits)]
-        )
-        operands = operands + gate_operands_inverse + operands[: self.circuit.nqubits]
-
-        operand_exp_interleave = [x for y in zip(operands, mode_labels) for x in y]
-
-        return operand_exp_interleave
diff --git a/src/qibotn/circuit_to_mps.py b/src/qibotn/circuit_to_mps.py
deleted file mode 100644
index 48cf55d..0000000
--- a/src/qibotn/circuit_to_mps.py
+++ /dev/null
@@ -1,63 +0,0 @@
-import numpy as np
-
-from qibotn.circuit_convertor import QiboCircuitToEinsum
-from qibotn.mps_utils import apply_gate, initial
-
-try:
-    import cupy as cp
-    import cuquantum.bindings.cutensornet as cutn
-except ImportError:  # pragma: no cover - exercised on CPU-only installations
-    cp = None
-    cutn = None
-
-
-def _require_cuquantum():
-    if cp is None or cutn is None:
-        raise ImportError(
-            "The cuQuantum MPS converter requires cupy and cuquantum. "
-            "Install the GPU dependencies or use the CPU backend."
-        )
-
-
-class QiboCircuitToMPS:
-    """A helper class to convert Qibo circuit to MPS.
-
-    Parameters:
-        circ_qibo: The quantum circuit object.
-        gate_algo(dict): Dictionary for SVD and QR settings.
-        datatype (str): Either single ("complex64") or double (complex128) precision.
-        rand_seed(int): Seed for random number generator.
-    """
-
-    def __init__(
-        self,
-        circ_qibo,
-        gate_algo,
-        dtype="complex128",
-        rand_seed=0,
-    ):
-        _require_cuquantum()
-        np.random.seed(rand_seed)
-        cp.random.seed(rand_seed)
-
-        self.num_qubits = circ_qibo.nqubits
-        self.handle = cutn.create()
-        self.dtype = dtype
-        self.mps_tensors = initial(self.num_qubits, dtype=dtype)
-        circuitconvertor = QiboCircuitToEinsum(circ_qibo, dtype=dtype)
-
-        for gate, qubits in circuitconvertor.gate_tensors:
-            # mapping from qubits to qubit indices
-            # apply the gate in-place
-            apply_gate(
-                self.mps_tensors,
-                gate,
-                qubits,
-                algorithm=gate_algo,
-                options={"handle": self.handle},
-            )
-
-    def __del__(self):
-        handle = getattr(self, "handle", None)
-        if cutn is not None and handle is not None:
-            cutn.destroy(handle)
diff --git a/src/qibotn/contest_cases.py b/src/qibotn/contest_cases.py
new file mode 100644
index 0000000..dfb7962
--- /dev/null
+++ b/src/qibotn/contest_cases.py
@@ -0,0 +1,241 @@
+"""Shared contest-style circuits and observables for qibotn tools."""
+
+from __future__ import annotations
+
+import math
+from dataclasses import dataclass
+from pathlib import Path
+
+import numpy as np
+from qibo import Circuit, gates, hamiltonians
+from qibo.symbols import X, Y, Z
+from qibotn.backends.quimb import quimb_torch_parallel_opts
+
+
+@dataclass(frozen=True)
+class CaseSpec:
+    circuit_kind: str
+    observables: tuple[str, ...]
+    nqubits: int
+    nlayers: int
+    seed: int
+    target_slices: int | None = None
+
+
+CASES = {
+    "main1": CaseSpec(
+        circuit_kind="rxx_rzz_chain",
+        observables=("ring_xz",),
+        nqubits=37,
+        nlayers=20,
+        seed=31001,
+        target_slices=None,
+    ),
+    "main2": CaseSpec(
+        circuit_kind="scramble_chain",
+        observables=("open_zz", "range2_xx"),
+        nqubits=36,
+        nlayers=18,
+        seed=31002,
+        target_slices=None,
+    ),
+    "strong": CaseSpec(
+        circuit_kind="reversed_cnot",
+        observables=("ring_xz", "long_z_string"),
+        nqubits=40,
+        nlayers=24,
+        seed=41001,
+        target_slices=None,
+    ),
+}
+
+
+def _add_single_qubit_layer(circuit, nqubits, rng, include_rx=False):
+    for qubit in range(nqubits):
+        circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi)))
+        circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi)))
+        if include_rx:
+            circuit.add(gates.RX(qubit, theta=rng.uniform(-math.pi, math.pi)))
+
+
+def _add_brickwall(circuit, nqubits, gate, layer, reverse=False):
+    for qubit in range(0, nqubits - 1, 2):
+        if reverse and layer % 2:
+            circuit.add(gate(qubit + 1, qubit))
+        else:
+            circuit.add(gate(qubit, qubit + 1))
+    for qubit in range(1, nqubits - 1, 2):
+        if reverse and not layer % 2:
+            circuit.add(gate(qubit + 1, qubit))
+        else:
+            circuit.add(gate(qubit, qubit + 1))
+
+
+def build_contest_circuit(kind, nqubits, nlayers, seed):
+    """Build one of the contest-style benchmark circuits."""
+    rng = np.random.default_rng(seed)
+    circuit = Circuit(nqubits)
+
+    if kind == "ghz_ladder":
+        circuit.add(gates.H(0))
+        for qubit in range(nqubits - 1):
+            circuit.add(gates.CNOT(qubit, qubit + 1))
+        return circuit
+
+    for layer in range(nlayers):
+        if kind in {"brickwall_cnot", "reversed_cnot", "shifted_cz"}:
+            _add_single_qubit_layer(circuit, nqubits, rng)
+        elif kind in {"rxx_rzz", "swap_scramble"}:
+            _add_single_qubit_layer(circuit, nqubits, rng, include_rx=True)
+        elif kind in {"rxx_rzz_chain", "scramble_chain", "scramble"}:
+            _add_single_qubit_layer(circuit, nqubits, rng, include_rx=True)
+        else:
+            raise ValueError(f"Unknown circuit kind {kind!r}.")
+
+        if kind == "brickwall_cnot":
+            _add_brickwall(circuit, nqubits, gates.CNOT, layer, reverse=False)
+        elif kind == "reversed_cnot":
+            _add_brickwall(circuit, nqubits, gates.CNOT, layer, reverse=True)
+        elif kind == "shifted_cz":
+            for qubit in range(layer % 2, nqubits - 1, 2):
+                circuit.add(gates.CZ(qubit, qubit + 1))
+        elif kind == "rxx_rzz":
+            for qubit in range(layer % 2, nqubits - 1, 2):
+                circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.7, 0.7)))
+                circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.7, 0.7)))
+        elif kind == "swap_scramble":
+            for qubit in range(layer % 2, nqubits - 1, 2):
+                circuit.add(gates.CZ(qubit, qubit + 1))
+                if layer % 4 == 3:
+                    circuit.add(gates.SWAP(qubit, qubit + 1))
+        elif kind == "rxx_rzz_chain":
+            for qubit in range(layer % 2, nqubits - 1, 2):
+                circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.9, 0.9)))
+                circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.9, 0.9)))
+        elif kind == "scramble_chain":
+            for qubit in range(layer % 2, nqubits - 1, 2):
+                circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.8, 0.8)))
+                circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.8, 0.8)))
+                if layer % 5 == 4:
+                    circuit.add(gates.SWAP(qubit, qubit + 1))
+        elif kind == "scramble":
+            for qubit in range(layer % 2, nqubits - 1, 2):
+                circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.8, 0.8)))
+                circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.8, 0.8)))
+                if layer % 5 == 4:
+                    circuit.add(gates.SWAP(qubit, qubit + 1))
+
+    return circuit
+
+
+def _dense_observable(nqubits, qubits, seed, dim):
+    del nqubits
+    rng = np.random.default_rng(seed)
+    raw = rng.normal(size=(dim, dim)) + 1j * rng.normal(size=(dim, dim))
+    matrix = (raw + raw.conj().T) / 2.0
+    matrix = matrix / np.linalg.norm(matrix)
+    return {"matrix": matrix, "qubits": list(qubits)}
+
+
+def build_contest_observable(kind, nqubits, seed=0):
+    """Build one of the shared contest observables."""
+    q1 = nqubits // 4
+    q2 = nqubits // 2
+    q3 = (3 * nqubits) // 4
+    last = nqubits - 1
+
+    if kind == "ring_xz":
+        form = 0
+        for qubit in range(nqubits):
+            form += 0.5 * X(qubit) * Z((qubit + 1) % nqubits)
+        return hamiltonians.SymbolicHamiltonian(form=form)
+    if kind == "open_zz":
+        form = 0
+        for qubit in range(nqubits - 1):
+            form += (1.0 / max(1, nqubits - 1)) * Z(qubit) * Z(qubit + 1)
+        return hamiltonians.SymbolicHamiltonian(form=form)
+    if kind == "range2_xx":
+        form = 0
+        for qubit in range(nqubits - 2):
+            form += (1.0 / max(1, nqubits - 2)) * X(qubit) * X(qubit + 2)
+        return hamiltonians.SymbolicHamiltonian(form=form)
+    if kind == "mixed_local":
+        form = 0.25 * X(0) - 0.5 * Z(last) + 0.125 * X(q1) * Z(q2) * Y(q3)
+        return hamiltonians.SymbolicHamiltonian(form=form)
+    if kind == "long_z_string":
+        stride = max(1, nqubits // 16)
+        form = None
+        for qubit in range(0, nqubits, stride):
+            form = Z(qubit) if form is None else form * Z(qubit)
+        return hamiltonians.SymbolicHamiltonian(form=form)
+    if kind == "boundary_ZZ_q1":
+        return hamiltonians.SymbolicHamiltonian(form=Z(q1 - 1) * Z(q1))
+    if kind == "boundary_ZZ_q2":
+        return hamiltonians.SymbolicHamiltonian(form=Z(q2 - 1) * Z(q2))
+    if kind == "boundary_ZZ_q3":
+        return hamiltonians.SymbolicHamiltonian(form=Z(q3 - 1) * Z(q3))
+    if kind == "long_Z_5_sites":
+        return hamiltonians.SymbolicHamiltonian(
+            form=Z(0) * Z(q1) * Z(q2) * Z(q3) * Z(last)
+        )
+    if kind == "mixed_XZYZX":
+        return hamiltonians.SymbolicHamiltonian(form=X(0) * Z(q1) * Y(q2) * Z(q3) * X(last))
+    if kind == "complex_iZ0":
+        return hamiltonians.SymbolicHamiltonian(form=1.0j * Z(0))
+    if kind == "dense2_mid":
+        return _dense_observable(nqubits, (q2 - 1, q2), seed + 101, 4)
+    if kind == "dense3_spread":
+        return _dense_observable(nqubits, (q1, q2, q3), seed + 202, 8)
+    raise ValueError(f"Unknown observable kind {kind!r}.")
+
+
+def tree_path(tree_dir, case_name, obs_name, nqubits, nlayers, target_slices, merge_gates=True):
+    slice_label = "auto" if target_slices is None else f"s{target_slices}"
+    merge_label = "merge" if merge_gates else "nomerge"
+    return (
+        Path(tree_dir)
+        / f"{case_name}_{obs_name}_{nqubits}q{nlayers}l_{slice_label}_{merge_label}.pkl"
+    )
+
+
+def selected_observables(args, case):
+    if args.observables:
+        return tuple(args.observables)
+    if args.obs_filter:
+        return tuple(x.strip() for x in args.obs_filter.split(",") if x.strip())
+    return case.observables
+
+
+def apply_case_defaults(args):
+    case = CASES[args.case]
+    if args.nqubits is None:
+        args.nqubits = case.nqubits
+    if args.nlayers is None:
+        args.nlayers = case.nlayers
+    if args.seed is None:
+        args.seed = case.seed
+    if args.tn_target_slices is None:
+        args.tn_target_slices = case.target_slices
+    args.observables = selected_observables(args, case)
+
+
+def build_parallel_opts(args, tree_file=None, search_only=False):
+    return quimb_torch_parallel_opts(
+        target_slices=args.tn_target_slices,
+        target_size=args.tn_target_size,
+        search_workers=args.tn_search_workers,
+        torch_threads=args.torch_threads,
+        search_repeats=args.tn_search_repeats,
+        search_time=args.tn_search_time,
+        search_seed=args.tn_search_seed,
+        merge_gates=args.merge_gates,
+        search_backend=args.tn_search_backend,
+        dask_address=args.dask_address,
+        dask_expected_workers=args.dask_expected_workers,
+        dask_close_workers=args.dask_close_workers,
+        debug_trials=args.tn_debug_trials,
+        search_only=search_only,
+        save_tree_path=str(tree_file) if tree_file is not None else None,
+        load_tree_path=str(tree_file) if tree_file is not None else None,
+        print_stats=False,
+    )
diff --git a/src/qibotn/eval.py b/src/qibotn/eval.py
index f2fbf71..144e1f8 100644
--- a/src/qibotn/eval.py
+++ b/src/qibotn/eval.py
@@ -1,8 +1,10 @@
 from mpi4py import MPI
 
-from qibotn.circuit_convertor import QiboCircuitToEinsum
-from qibotn.circuit_to_mps import QiboCircuitToMPS
-from qibotn.mps_contraction_helper import MPSContractionHelper
+from qibotn.backends.cutensornet_helpers import (
+    MPSContractionHelper,
+    QiboCircuitToEinsum,
+    QiboCircuitToMPS,
+)
 from qibotn.observables import (
     build_observable,
     check_observable,
diff --git a/src/qibotn/expectation_runner.py b/src/qibotn/expectation_runner.py
index 9592974..59ef1b7 100644
--- a/src/qibotn/expectation_runner.py
+++ b/src/qibotn/expectation_runner.py
@@ -8,7 +8,15 @@ from dataclasses import dataclass
 import numpy as np
 from qibo.backends import construct_backend
 
-from qibotn.benchmark_cases import exact_pauli_sum
+from qibotn.benchmark_cases import (
+    CIRCUITS,
+    OBSERVABLES,
+    build_circuit,
+    exact_pauli_sum,
+    observable_terms,
+    parse_names,
+    terms_to_dict,
+)
 from qibotn.observables import check_observable
 
 
@@ -77,6 +85,18 @@ class ExpectationResult:
     parallel_stats: list | None = None
 
 
+@dataclass
+class BenchmarkExpectationRecord:
+    circuit: str
+    observable: str
+    value: float
+    seconds: float
+    exact: float | None = None
+    abs_error: float | None = None
+    rel_error: float | None = None
+    parallel_stats: list | None = None
+
+
 def _config_from_kwargs(**kwargs):
     fields = ExpectationConfig.__dataclass_fields__
     config_kwargs = {name: kwargs.pop(name) for name in list(kwargs) if name in fields}
@@ -155,3 +175,148 @@ def mps_expectation(circuit, observable=None, *, return_result=False, **kwargs):
         return_result=return_result,
         **kwargs,
     )
+
+
+def cpu_benchmark_parallel_opts(
+    *,
+    target_slices=None,
+    target_size=2**32,
+    search_workers=None,
+    torch_threads=8,
+    search_repeats=128,
+    search_time=60.0,
+    search_backend="dask",
+    dask_address=None,
+    dask_close_workers=False,
+    save_tree_path=None,
+    load_tree_path=None,
+    search_only=False,
+    debug_trials=False,
+    contract_implementation=None,
+    print_stats=True,
+):
+    """Build parallel TN options for the CPU expectation backend."""
+    slicing_opts = {}
+    if target_slices is not None:
+        slicing_opts["target_slices"] = target_slices
+    if target_size is not None:
+        slicing_opts["target_size"] = target_size
+
+    opts = {
+        "slicing_opts": slicing_opts or None,
+        "search_workers": search_workers or torch_threads,
+        "max_repeats": search_repeats,
+        "max_time": search_time,
+        "print_stats": print_stats,
+    }
+    if search_backend is not None:
+        opts["search_backend"] = search_backend
+    if dask_address is not None:
+        opts["dask_address"] = dask_address
+    if save_tree_path is not None:
+        opts["save_tree_path"] = save_tree_path
+    if load_tree_path is not None:
+        opts["load_tree_path"] = load_tree_path
+    if search_only:
+        opts["search_only"] = True
+    if debug_trials:
+        opts["debug_trials"] = True
+    if contract_implementation is not None:
+        opts["contract_implementation"] = contract_implementation
+    if dask_close_workers:
+        opts["dask_close_workers"] = True
+    return opts
+
+
+def run_cpu_benchmark_cases(
+    *,
+    nqubits=40,
+    nlayers=30,
+    bond=1024,
+    cut_ratio=1e-12,
+    seed=42,
+    torch_threads=8,
+    quimb_backend="torch",
+    dtype="complex128",
+    ansatz="tn",
+    mpi=False,
+    exact=False,
+    exact_max_qubits=24,
+    circuits=("brickwall_cnot",),
+    observables=("ring_xz",),
+    pauli_pattern=None,
+    parallel_opts=None,
+):
+    """Run the reusable CPU TN/MPS benchmark cases.
+
+    This is the importable library entrypoint for reusable CPU benchmark cases.
+    """
+    selected_circuits = parse_names(list(circuits), CIRCUITS, "circuits")
+    selected_observables = (
+        []
+        if pauli_pattern
+        else parse_names(list(observables), OBSERVABLES, "observables")
+    )
+
+    rank = 0
+    if mpi:
+        from mpi4py import MPI
+
+        rank = MPI.COMM_WORLD.Get_rank()
+
+    config = ExpectationConfig(
+        ansatz=ansatz,
+        mpi=mpi,
+        bond=bond,
+        cut_ratio=cut_ratio,
+        tensor_module="torch",
+        quimb_backend=quimb_backend,
+        dtype=dtype,
+        torch_threads=torch_threads,
+        parallel_opts=parallel_opts or {},
+    )
+
+    records = []
+    for circuit_kind in selected_circuits:
+        circuit = build_circuit(circuit_kind, nqubits, nlayers, seed)
+        named_observables = (
+            [(f"pattern:{pauli_pattern}", {"pauli_string_pattern": pauli_pattern})]
+            if pauli_pattern
+            else [
+                (obs_kind, terms_to_dict(observable_terms(obs_kind, nqubits)))
+                for obs_kind in selected_observables
+            ]
+        )
+
+        for obs_name, observable in named_observables:
+            exact_value = None
+            if exact and rank == 0:
+                if nqubits > exact_max_qubits:
+                    raise ValueError(
+                        f"exact reference is limited to {exact_max_qubits} qubits."
+                    )
+                exact_value = exact_for_observable(circuit, observable, nqubits)
+
+            result = run_cpu_expectation(circuit, observable, config)
+            if mpi and result.rank != 0:
+                continue
+
+            abs_error = None if exact_value is None else abs(result.value - exact_value)
+            rel_error = (
+                None
+                if exact_value is None
+                else abs_error / max(abs(exact_value), 1e-15)
+            )
+            records.append(
+                BenchmarkExpectationRecord(
+                    circuit=circuit_kind,
+                    observable=obs_name,
+                    value=result.value,
+                    seconds=result.seconds,
+                    exact=exact_value,
+                    abs_error=abs_error,
+                    rel_error=rel_error,
+                    parallel_stats=result.parallel_stats,
+                )
+            )
+    return records
diff --git a/src/qibotn/mps_contraction_helper.py b/src/qibotn/mps_contraction_helper.py
deleted file mode 100644
index b44cfb2..0000000
--- a/src/qibotn/mps_contraction_helper.py
+++ /dev/null
@@ -1,131 +0,0 @@
-try:
-    from cuquantum.tensornet import contract, contract_path
-except ImportError:  # pragma: no cover - exercised on CPU-only installations
-    contract = None
-    contract_path = None
-
-
-def _require_cuquantum():
-    if contract is None or contract_path is None:
-        raise ImportError(
-            "The cuQuantum MPS contraction helper requires cuquantum. "
-            "Install the GPU dependencies or use the CPU backend."
-        )
-
-# Reference: https://github.com/NVIDIA/cuQuantum/blob/main/python/samples/cutensornet/tn_algorithms/mps_algorithms.ipynb
-
-
-class MPSContractionHelper:
-    """A helper class to compute various quantities for a given MPS.
-
-    Interleaved format is used to construct the input args for `cuquantum.contract`.
-
-    Reference: https://github.com/NVIDIA/cuQuantum/blob/main/python/samples/cutensornet/tn_algorithms/mps_algorithms.ipynb
-
-    The following compute quantities are supported:
-
-        - the norm of the MPS.
-        - the equivalent state vector from the MPS.
-        - the expectation value for a given operator.
-        - the equivalent state vector after multiplying an MPO to an MPS.
-
-    Parameters:
-        num_qubits: The number of qubits for the MPS.
-    """
-
-    def __init__(self, num_qubits):
-        self.num_qubits = num_qubits
-        self.bra_modes = [(2 * i, 2 * i + 1, 2 * i + 2) for i in range(num_qubits)]
-        offset = 2 * num_qubits + 1
-        self.ket_modes = [
-            (i + offset, 2 * i + 1, i + 1 + offset) for i in range(num_qubits)
-        ]
-
-    def contract_norm(self, mps_tensors, options=None):
-        """Contract the corresponding tensor network to form the norm of the
-        MPS.
-
-        Parameters:
-            mps_tensors: A list of rank-3 ndarray-like tensor objects.
-                The indices of the ith tensor are expected to be bonding index to the i-1 tensor,
-                the physical mode, and then the bonding index to the i+1th tensor.
-            options: Specify the contract and decompose options.
-
-        Returns:
-            The norm of the MPS.
-        """
-        interleaved_inputs = []
-        for i, o in enumerate(mps_tensors):
-            interleaved_inputs.extend(
-                [o, self.bra_modes[i], o.conj(), self.ket_modes[i]]
-            )
-        interleaved_inputs.append([])  # output
-        return self._contract(interleaved_inputs, options=options).real
-
-    def contract_state_vector(self, mps_tensors, options=None):
-        """Contract the corresponding tensor network to form the state vector
-        representation of the MPS.
-
-        Parameters:
-            mps_tensors: A list of rank-3 ndarray-like tensor objects.
-                The indices of the ith tensor are expected to be bonding index to the i-1 tensor,
-                the physical mode, and then the bonding index to the i+1th tensor.
-            options: Specify the contract and decompose options.
-
-        Returns:
-            An ndarray-like object as the state vector.
-        """
-        interleaved_inputs = []
-        for i, o in enumerate(mps_tensors):
-            interleaved_inputs.extend([o, self.bra_modes[i]])
-        output_modes = tuple([bra_modes[1] for bra_modes in self.bra_modes])
-        interleaved_inputs.append(output_modes)  # output
-        return self._contract(interleaved_inputs, options=options)
-
-    def contract_expectation(
-        self, mps_tensors, operator, qubits, options=None, normalize=False
-    ):
-        """Contract the corresponding tensor network to form the expectation of
-        the MPS.
-
-        Parameters:
-            mps_tensors: A list of rank-3 ndarray-like tensor objects.
-                The indices of the ith tensor are expected to be bonding index to the i-1 tensor,
-                the physical mode, and then the bonding index to the i+1th tensor.
-            operator: A ndarray-like tensor object.
-                The modes of the operator are expected to be output qubits followed by input qubits, e.g,
-                ``A, B, a, b`` where `a, b` denotes the inputs and `A, B'` denotes the outputs.
-            qubits: A sequence of integers specifying the qubits that the operator is acting on.
-            options: Specify the contract and decompose options.
-            normalize: Whether to scale the expectation value by the normalization factor.
-
-        Returns:
-            An ndarray-like object as the state vector.
-        """
-
-        interleaved_inputs = []
-        extra_mode = 3 * self.num_qubits + 2
-        operator_modes = [None] * len(qubits) + [self.bra_modes[q][1] for q in qubits]
-        qubits = list(qubits)
-        for i, o in enumerate(mps_tensors):
-            interleaved_inputs.extend([o, self.bra_modes[i]])
-            k_modes = self.ket_modes[i]
-            if i in qubits:
-                k_modes = (k_modes[0], extra_mode, k_modes[2])
-                q = qubits.index(i)
-                operator_modes[q] = extra_mode  # output modes
-                extra_mode += 1
-            interleaved_inputs.extend([o.conj(), k_modes])
-        interleaved_inputs.extend([operator, tuple(operator_modes)])
-        interleaved_inputs.append([])  # output
-        if normalize:
-            norm = self.contract_norm(mps_tensors, options=options)
-        else:
-            norm = 1
-        return self._contract(interleaved_inputs, options=options) / norm
-
-    def _contract(self, interleaved_inputs, options=None):
-        _require_cuquantum()
-        path = contract_path(*interleaved_inputs, options=options)[0]
-
-        return contract(*interleaved_inputs, options=options, optimize={"path": path})
diff --git a/src/qibotn/mps_utils.py b/src/qibotn/mps_utils.py
deleted file mode 100644
index ff3d010..0000000
--- a/src/qibotn/mps_utils.py
+++ /dev/null
@@ -1,111 +0,0 @@
-try:
-    import cupy as cp
-    from cuquantum.tensornet import contract
-    from cuquantum.tensornet.experimental import contract_decompose
-except ImportError:  # pragma: no cover - exercised on CPU-only installations
-    cp = None
-    contract = None
-    contract_decompose = None
-
-
-def _require_cuquantum():
-    if cp is None or contract is None or contract_decompose is None:
-        raise ImportError(
-            "The cuQuantum MPS helpers require cupy and cuquantum. "
-            "Install the GPU dependencies or use the CPU backend."
-        )
-
-
-def initial(num_qubits, dtype):
-    r"""Generate the MPS with an initial state of :math:`\ket{00...00}`
-
-    Parameters:
-        num_qubits: Number of qubits in the Quantum Circuit.
-        dtype: Either single ("complex64") or double (complex128) precision.
-
-    Returns:
-        The initial MPS tensors.
-    """
-    _require_cuquantum()
-    state_tensor = cp.asarray([1, 0], dtype=dtype).reshape(1, 2, 1)
-    mps_tensors = [state_tensor] * num_qubits
-    return mps_tensors
-
-
-def mps_site_right_swap(mps_tensors, i, **kwargs):
-    """Perform the swap operation between the ith and i+1th MPS tensors.
-
-    Parameters:
-        mps_tensors: Tensors representing MPS
-        i (int): index of the tensor to swap
-
-    Returns:
-        The updated MPS tensors.
-    """
-    _require_cuquantum()
-    # contraction followed by QR decomposition
-    a, _, b = contract_decompose(
-        "ipj,jqk->iqj,jpk",
-        *mps_tensors[i : i + 2],
-        algorithm=kwargs.get("algorithm", None),
-        options=kwargs.get("options", None),
-    )
-    mps_tensors[i : i + 2] = (a, b)
-    return mps_tensors
-
-
-def apply_gate(mps_tensors, gate, qubits, **kwargs):
-    """Apply the gate operand to the MPS tensors in-place.
-
-    # Reference: https://github.com/NVIDIA/cuQuantum/blob/main/python/samples/cutensornet/tn_algorithms/mps_algorithms.ipynb
-
-    Parameters:
-        mps_tensors: A list of rank-3 ndarray-like tensor objects.
-            The indices of the ith tensor are expected to be the bonding index to the i-1 tensor,
-            the physical mode, and then the bonding index to the i+1th tensor.
-        gate: A ndarray-like tensor object representing the gate operand.
-            The modes of the gate is expected to be output qubits followed by input qubits, e.g,
-            ``A, B, a, b`` where ``a, b`` denotes the inputs and ``A, B`` denotes the outputs.
-        qubits: A sequence of integers denoting the qubits that the gate is applied onto.
-        algorithm: The contract and decompose algorithm to use for gate application.
-            Can be either a `dict` or a `ContractDecomposeAlgorithm`.
-        options: Specify the contract and decompose options.
-
-    Returns:
-        The updated MPS tensors.
-    """
-
-    _require_cuquantum()
-    n_qubits = len(qubits)
-    if n_qubits == 1:
-        # single-qubit gate
-        i = qubits[0]
-        mps_tensors[i] = contract(
-            "ipj,qp->iqj", mps_tensors[i], gate, options=kwargs.get("options", None)
-        )  # in-place update
-    elif n_qubits == 2:
-        # two-qubit gate
-        i, j = qubits
-        if i > j:
-            # swap qubits order
-            return apply_gate(mps_tensors, gate.transpose(1, 0, 3, 2), (j, i), **kwargs)
-        elif i + 1 == j:
-            # two adjacent qubits
-            a, _, b = contract_decompose(
-                "ipj,jqk,rspq->irj,jsk",
-                *mps_tensors[i : i + 2],
-                gate,
-                algorithm=kwargs.get("algorithm", None),
-                options=kwargs.get("options", None),
-            )
-            mps_tensors[i : i + 2] = (a, b)  # in-place update
-        else:
-            # non-adjacent two-qubit gate
-            # step 1: swap i with i+1
-            mps_site_right_swap(mps_tensors, i, **kwargs)
-            # step 2: apply gate to (i+1, j) pair. This amounts to a recursive swap until the two qubits are adjacent
-            apply_gate(mps_tensors, gate, (i + 1, j), **kwargs)
-            # step 3: swap back i and i+1
-            mps_site_right_swap(mps_tensors, i, **kwargs)
-    else:
-        raise NotImplementedError("Only one- and two-qubit gates supported")
diff --git a/src/qibotn/observables.py b/src/qibotn/observables.py
index 7f3c242..b90a2da 100644
--- a/src/qibotn/observables.py
+++ b/src/qibotn/observables.py
@@ -35,7 +35,17 @@ def check_observable(observable, circuit_nqubit):
     if isinstance(observable, dict):
         return create_hamiltonian_from_dict(observable, circuit_nqubit)
     if isinstance(observable, hamiltonians.SymbolicHamiltonian):
-        return observable
+        if observable.nqubits == circuit_nqubit:
+            return observable
+        if observable.nqubits > circuit_nqubit:
+            raise ValueError(
+                "Observable has more qubits than the circuit: "
+                f"{observable.nqubits} > {circuit_nqubit}."
+            )
+        return hamiltonians.SymbolicHamiltonian(
+            form=observable.form,
+            nqubits=circuit_nqubit,
+        )
     try:
         return hamiltonians.SymbolicHamiltonian(form=observable)
     except Exception as exc:
diff --git a/src/qibotn/parallel.py b/src/qibotn/parallel.py
index 0fd577c..d7746b1 100644
--- a/src/qibotn/parallel.py
+++ b/src/qibotn/parallel.py
@@ -1,12 +1,16 @@
 """Parallel path search and contraction utilities for tensor networks."""
+import importlib
 import os
 import pickle
 import signal
 import time
-from math import log2, log10
-import numpy as np
-from dataclasses import dataclass
+from collections import Counter, defaultdict
 from concurrent.futures import ProcessPoolExecutor, TimeoutError, as_completed
+from dataclasses import dataclass
+from math import log2, log10
+from pathlib import Path
+
+import numpy as np
 
 try:
     from mpi4py import MPI
@@ -40,6 +44,12 @@ def _optimizer_search_stats(opt):
     }
 
 
+def _tree_search_stats(tree):
+    """Return the search stats stored on *tree*, or ``{}`` when there are none."""
+    stats = getattr(tree, "qibotn_search_stats", None)  # ``tree`` may be None
+    return stats if stats else {}
+
+
 def _attach_search_stats(tree, opt):
     try:
         tree.qibotn_search_stats = _optimizer_search_stats(opt)
@@ -48,6 +58,47 @@ def _attach_search_stats(tree, opt):
     return tree
 
 
+def _search_seed_kwargs(optlib, seed):
+    """Return the seed keyword arguments to forward for the given *optlib*."""
+    if optlib is None:
+        return {"sampler_opts": {"seed": seed}}
+    # Only the "random" optlib is given a top-level seed; others get nothing.
+    return {"seed": seed} if optlib == "random" else {}
+
+
+def _fallback_greedy_tree(tn, output_inds, slicing_opts=None, error=None):  # greedy tree used when a search produced none
+    import cotengra as ctg
+
+    tree = tn.contraction_tree(
+        output_inds=output_inds,
+        optimize=ctg.GreedyOptimizer(),
+    )
+    if slicing_opts:  # only "target_size" / "target_slices" are read; other keys are ignored
+        target_size = slicing_opts.get("target_size")
+        target_slices = slicing_opts.get("target_slices")
+        if target_size is not None:  # target_size wins when both are given
+            tree.slice_(target_size=target_size)
+        elif target_slices is not None:
+            tree.slice_(target_slices=target_slices)
+    try:  # attaching stats is best-effort; a failure must not lose the tree
+        tree.qibotn_search_stats = {
+            "completed_trials": 0,
+            "finite_trials": 0,
+            "failed_trials": 0,
+            "requested_trials": 0,
+            "trial_seconds_sum": 0.0,
+            "best_score": float("nan"),
+            "best_flops": float("nan"),
+            "best_write": float("nan"),
+            "best_size": float("nan"),
+            "fallback": "greedy",
+            "fallback_error": repr(error) if error is not None else None,
+        }
+    except Exception:
+        pass
+    return tree
+
+
 def _dask_worker_slots(client):
     info = client.scheduler_info(n_workers=-1)
     workers = info.get("workers", {})
@@ -218,13 +269,18 @@ def _search_chunk(
     slicing_opts,
     optlib=None,
 ):
-    import random, cotengra as ctg
+    import random
 
+    import cotengra as ctg
+
+    seed = int(seed)
     random.seed(seed)
+    np.random.seed(seed % (2**32))
     tn = pickle.loads(tn_bytes)
     kwargs = {}
     if optlib is not None:
         kwargs["optlib"] = optlib
+    kwargs.update(_search_seed_kwargs(optlib, seed))
     opt = ctg.HyperOptimizer(
         methods=SEARCH_METHODS,
         max_repeats=repeats,
@@ -266,7 +322,15 @@ def _kill_pool(pool):
     pool.shutdown(wait=False)
 
 
-def _serial_search(tn_bytes, output_inds, repeats, seed, max_time, slicing_opts=None, trial_timeout=None):
+def _serial_search(
+    tn_bytes,
+    output_inds,
+    repeats,
+    seed,
+    max_time,
+    slicing_opts=None,
+    trial_timeout=None,
+):
     import time
 
     if trial_timeout is None:
@@ -287,7 +351,13 @@ def _serial_search(tn_bytes, output_inds, repeats, seed, max_time, slicing_opts=
             break
         timeout = min(trial_timeout, deadline - time.time())
         pool = ProcessPoolExecutor(max_workers=1)
-        fut = pool.submit(_run_single_trial, tn_bytes, output_inds, seed * 10000 + i, slicing_opts)
+        fut = pool.submit(
+            _run_single_trial,
+            tn_bytes,
+            output_inds,
+            seed * 10000 + i,
+            slicing_opts,
+        )
         try:
             cost, tree = fut.result(timeout=timeout)
             if cost < best_cost:
@@ -304,15 +374,30 @@ def _split_repeats(total_repeats, n_workers):
     n_workers = max(1, int(n_workers))
     total_repeats = max(1, int(total_repeats))
     chunk, extra = divmod(total_repeats, n_workers)
-    return [chunk + (1 if i < extra else 0) for i in range(n_workers) if chunk + (1 if i < extra else 0) > 0]
+    return [
+        chunk + (1 if i < extra else 0)
+        for i in range(n_workers)
+        if chunk + (1 if i < extra else 0) > 0
+    ]
 
 
-def _processpool_search(tn, output_inds, total_repeats, n_workers, max_time, slicing_opts=None, trial_timeout=None):
+def _processpool_search(
+    tn,
+    output_inds,
+    total_repeats,
+    n_workers,
+    max_time,
+    slicing_opts=None,
+    trial_timeout=None,
+    search_seed=0,
+):
     tn_bytes = pickle.dumps(tn)
     repeat_chunks = _split_repeats(total_repeats, n_workers)
     pool = ProcessPoolExecutor(max_workers=len(repeat_chunks))
     futures = []
-    for seed, repeats in enumerate(repeat_chunks):
+    errors = []
+    for worker_id, repeats in enumerate(repeat_chunks):
+        seed = int(search_seed) + worker_id
         futures.append(
             pool.submit(
                 _serial_search,
@@ -334,14 +419,34 @@ def _processpool_search(tn, output_inds, total_repeats, n_workers, max_time, sli
                 cost, tree = fut.result()
                 if cost < best_cost:
                     best_cost, best_tree = cost, tree
-            except Exception:
-                pass
+            except Exception as exc:
+                errors.append(repr(exc))
     except TimeoutError:
-        pass
+        errors.append("TimeoutError()")
     finally:
         for fut in futures:
             fut.cancel()
         _kill_pool(pool)
+    if best_tree is None:
+        if errors:
+            print(
+                "qibotn_search_failed "
+                f"backend=processpool errors={errors[:3]} "
+                f"num_errors={len(errors)} fallback=greedy",
+                flush=True,
+            )
+        else:
+            print(
+                "qibotn_search_failed "
+                "backend=processpool errors=[] fallback=greedy",
+                flush=True,
+            )
+        return _fallback_greedy_tree(
+            tn,
+            output_inds,
+            slicing_opts=slicing_opts,
+            error=errors[:3],
+        )
     return best_tree
 
 
@@ -357,6 +462,7 @@ def _dask_search(
     debug_trials=False,
     close_workers=False,
     expected_workers=None,
+    search_seed=0,
 ):
     """Run one centralized cotengra hyper-optimizer over a dask pool.
 
@@ -371,8 +477,14 @@ def _dask_search(
             "`pip install distributed` or the package extra that provides it."
         ) from exc
 
+    import random
+
     import cotengra as ctg
 
+    search_seed = int(search_seed)
+    random.seed(search_seed)
+    np.random.seed(search_seed % (2**32))
+
     _patch_cotengra_dask_as_completed()
     _patch_cotengra_dask_submit(debug_trials=debug_trials)
 
@@ -400,6 +512,7 @@ def _dask_search(
     kwargs = {}
     if optlib is not None:
         kwargs["optlib"] = optlib
+    kwargs.update(_search_seed_kwargs(optlib, search_seed))
 
     retire_workers = []
     try:
@@ -470,10 +583,12 @@ def _mpi_search(
     dask_address=None,
     debug_trials=False,
     dask_close_workers=False,
+    search_seed=0,
 ):
     comm = MPI.COMM_WORLD
     rank, size = comm.Get_rank(), comm.Get_size()
     search_backend = search_backend or "processpool"
+    search_seed = int(search_seed)
 
     if search_backend == "dask":
         if not dask_address:
@@ -496,6 +611,7 @@ def _mpi_search(
                     n_workers=n_workers,
                     debug_trials=debug_trials,
                     close_workers=dask_close_workers,
+                    search_seed=search_seed,
                 )
                 payload = ("ok", tree)
             except Exception as exc:
@@ -518,6 +634,7 @@ def _mpi_search(
         max_time,
         slicing_opts,
         trial_timeout,
+        search_seed=search_seed + rank * max(1, n_workers or 1),
     )
     local_cost = local_tree.combo_cost(factor=256) if local_tree else float("inf")
 
@@ -531,11 +648,22 @@ def _mpi_search(
     return comm.bcast(best_tree, root=0)
 
 
-def parallel_path_search(tn, output_inds, method='processpool', total_repeats=1024,
-                         max_time=300, n_workers=48, slicing_opts=None,
-                         trial_timeout=None, search_backend=None,
-                         dask_address=None, debug_trials=False,
-                         dask_close_workers=False, expected_workers=None):
+def parallel_path_search(
+    tn,
+    output_inds,
+    method="processpool",
+    total_repeats=1024,
+    max_time=300,
+    n_workers=48,
+    slicing_opts=None,
+    trial_timeout=None,
+    search_backend=None,
+    dask_address=None,
+    debug_trials=False,
+    dask_close_workers=False,
+    expected_workers=None,
+    search_seed=0,
+):
     """Parallel contraction path search.
 
     Args:
@@ -546,11 +674,32 @@ def parallel_path_search(tn, output_inds, method='processpool', total_repeats=10
         slicing_opts: cotengra slicing options for memory control
         trial_timeout: Per-trial timeout (seconds); kills and skips hung trials
     """
-    if method == 'serial':
+    if method == "serial":
         tn_bytes = pickle.dumps(tn)
-        _, tree = _serial_search(tn_bytes, output_inds, total_repeats, 0, max_time, slicing_opts, trial_timeout)
+        try:
+            _, tree = _serial_search(
+                tn_bytes,
+                output_inds,
+                total_repeats,
+                search_seed,
+                max_time,
+                slicing_opts,
+                trial_timeout,
+            )
+        except Exception as exc:
+            print(
+                "qibotn_search_failed "
+                f"backend=serial error={exc!r} fallback=greedy",
+                flush=True,
+            )
+            return _fallback_greedy_tree(
+                tn,
+                output_inds,
+                slicing_opts=slicing_opts,
+                error=exc,
+            )
         return tree
-    elif method == 'mpi':
+    if method == "mpi":
         if not _HAVE_MPI:
             raise ImportError("mpi4py not available")
         return _mpi_search(
@@ -565,10 +714,20 @@ def parallel_path_search(tn, output_inds, method='processpool', total_repeats=10
             dask_address=dask_address,
             debug_trials=debug_trials,
             dask_close_workers=dask_close_workers,
+            search_seed=search_seed,
         )
-    elif method == 'processpool':
-        return _processpool_search(tn, output_inds, total_repeats, n_workers, max_time, slicing_opts, trial_timeout)
-    elif method == 'dask':
+    if method == "processpool":
+        return _processpool_search(
+            tn,
+            output_inds,
+            total_repeats,
+            n_workers,
+            max_time,
+            slicing_opts,
+            trial_timeout,
+            search_seed=search_seed,
+        )
+    if method == "dask":
         return _dask_search(
             tn,
             output_inds,
@@ -580,9 +739,9 @@ def parallel_path_search(tn, output_inds, method='processpool', total_repeats=10
             debug_trials=debug_trials,
             close_workers=dask_close_workers,
             expected_workers=expected_workers,
+            search_seed=search_seed,
         )
-    else:
-        raise ValueError(f"Unknown method: {method}")
+    raise ValueError(f"Unknown method: {method}")
 
 
 def contraction_tree_costs(tree, dtype_bytes=16, combo_factor=256):
@@ -615,6 +774,171 @@ def contraction_tree_costs(tree, dtype_bytes=16, combo_factor=256):
     }
 
 
+def load_tree_payload(path, index=0):  # returns (payload, trees[index])
+    with Path(path).open("rb") as f:
+        payload = pickle.load(f)  # NOTE: unpickling can execute code; only load trusted files
+    trees = payload["trees"] if isinstance(payload, dict) else payload  # dict payloads keep the trees under "trees"
+    if not isinstance(trees, (list, tuple)):
+        trees = [trees]  # a bare tree is treated as a one-element list
+    return payload, trees[index]
+
+
+def save_tree_payload(path, payload):
+    """Pickle *payload* to *path*, creating missing parent directories first."""
+    Path(path).parent.mkdir(parents=True, exist_ok=True)
+    with Path(path).open("wb") as stream:
+        pickle.dump(payload, stream)
+
+
+def slice_tree_payload(path, output_path, *, term=0, target_slices=2, max_repeats=64, seed=42):
+    payload, tree = load_tree_payload(path, index=term)  # ``tree`` is the ``term``-th stored tree
+    original_costs = contraction_tree_costs(tree)
+    sliced_tree = tree.slice(  # presumably returns a new tree and leaves ``tree`` intact - TODO confirm
+        target_slices=target_slices,
+        max_repeats=max_repeats,
+        seed=seed,
+    )
+    sliced_costs = contraction_tree_costs(sliced_tree)
+
+    if isinstance(payload, dict):
+        out_payload = dict(payload)  # shallow copy: the loaded payload dict itself is not modified
+        trees = payload["trees"] if isinstance(payload["trees"], (list, tuple)) else [payload["trees"]]
+        new_trees = list(trees)
+        new_trees[term] = sliced_tree
+        out_payload["trees"] = new_trees
+        out_payload["costs"] = [contraction_tree_costs(t) for t in new_trees]  # recomputed for every tree, not just ``term``
+        out_payload["nterms"] = len(new_trees)
+    else:
+        trees = payload if isinstance(payload, (list, tuple)) else [payload]
+        new_trees = list(trees)
+        new_trees[term] = sliced_tree
+        out_payload = new_trees  # non-dict payloads are always written back as a list
+
+    save_tree_payload(output_path, out_payload)
+    return TreePayloadSliceResult(
+        payload=payload,
+        tree=tree,
+        sliced_tree=sliced_tree,
+        original_costs=original_costs,
+        sliced_costs=sliced_costs,
+    )
+
+
+def _prod(values):
+    """Return the product of ``int(v)`` over *values* (1 when empty)."""
+    from math import prod
+
+    return prod(int(item) for item in values)
+
+
+def _broadcast_batch(a_batch, b_batch):
+    """Return the element count of two batch shapes broadcast against each other."""
+    if a_batch == b_batch or not b_batch:
+        return _prod(a_batch)
+    if not a_batch:
+        return _prod(b_batch)
+    # Left-pad the shorter shape with 1s, then take the larger extent per axis.
+    rank = max(len(a_batch), len(b_batch))
+    lhs = (1,) * (rank - len(a_batch)) + tuple(a_batch)
+    rhs = (1,) * (rank - len(b_batch)) + tuple(b_batch)
+    return _prod(max(x, y) for x, y in zip(lhs, rhs))
+
+
+def analyze_contraction_tree(tree):  # per-contraction breakdown of ``tree`` as a TreeInspectionResult
+    contract_mod = importlib.import_module("cotengra.contract")  # NOTE: underscore-prefixed cotengra helpers are used below
+    contractions = contract_mod.extract_contractions(tree)
+    size_dict = tree.size_dict
+    ops = []
+    counts = Counter()
+
+    for op_index, (parent, left, right, tdot, arg, perm) in enumerate(contractions):
+        if left is None and right is None:  # no operands: only counted, so recorded op indices can have gaps
+            counts["preprocess"] += 1
+            continue
+
+        left_inds = tree.get_inds(left)
+        right_inds = tree.get_inds(right)
+        parent_inds = tree.get_inds(parent)
+        left_shape = tuple(size_dict[ix] for ix in left_inds)
+        right_shape = tuple(size_dict[ix] for ix in right_inds)
+
+        if tdot:
+            parsed = contract_mod._parse_tensordot_axes_to_matmul(
+                arg,
+                left_shape,
+                right_shape,
+            )
+        else:
+            parsed = contract_mod._parse_eq_to_batch_matmul(
+                arg,
+                left_shape,
+                right_shape,
+            )
+
+        (
+            _eq_a,
+            _eq_b,
+            new_shape_a,
+            new_shape_b,
+            _new_shape_ab,
+            _perm_ab,
+            pure_multiplication,
+        ) = parsed
+
+        matmul_shape = None
+        matmul_flops = 0
+        if pure_multiplication:
+            kind = "mul"  # keeps matmul_shape=None and matmul_flops=0
+        else:
+            a_shape = tuple(new_shape_a or left_shape)  # falls back to the raw operand shape when no reshape is given
+            b_shape = tuple(new_shape_b or right_shape)
+            batch = _broadcast_batch(a_shape[:-2], b_shape[:-2])  # everything before the last two axes is batch
+            m, k, n = int(a_shape[-2]), int(a_shape[-1]), int(b_shape[-1])
+            kind = "mm" if batch == 1 else "bmm"
+            matmul_shape = (batch, m, k, n)
+            matmul_flops = batch * m * k * n
+
+        tree_flops = int(tree.get_flops(parent))
+        out_size = int(tree.get_size(parent))
+        ops.append(
+            ContractionOpInfo(
+                index=op_index,
+                kind=kind,
+                matmul_shape=matmul_shape,
+                matmul_flops=matmul_flops,
+                tree_flops=tree_flops,
+                out_size=out_size,
+                left_shape=left_shape,
+                right_shape=right_shape,
+                left_rank=len(left_inds),
+                right_rank=len(right_inds),
+                out_rank=len(parent_inds),
+                perm=perm,
+            )
+        )
+        counts[kind] += 1
+
+    nslices = int(getattr(tree, "multiplicity", 1))  # 1 when the tree has no ``multiplicity`` attribute
+    per_slice_flops = sum(op.tree_flops for op in ops)
+    per_slice_write = sum(op.out_size for op in ops)
+    max_out = max((op.out_size for op in ops), default=0)
+    dtype_bytes = 16  # fixed element size; presumably complex128 - TODO confirm
+    return TreeInspectionResult(
+        tree=tree,
+        contractions=tuple(contractions),
+        operations=tuple(ops),
+        counts=dict(counts),
+        nslices=nslices,
+        per_slice_flops=per_slice_flops,
+        per_slice_write=per_slice_write,
+        max_output_size=max_out,
+        all_slice_flops=per_slice_flops * nslices,
+        all_slice_write=per_slice_write * nslices,
+        dtype_bytes=dtype_bytes,
+        max_output_gib=max_out * dtype_bytes / 1024**3,
+    )
+
+
 @dataclass(frozen=True)
 class SlicePlan:
     """Slice ownership for one MPI rank."""
@@ -637,6 +961,49 @@ class SlicedContractStats:
     assignment: str
 
 
+@dataclass(frozen=True)
+class TreePayloadSliceResult:
+    """Result of slicing one tree stored in a tree payload."""
+
+    payload: object  # payload as loaded from the input file, not the rewritten one
+    tree: object  # the selected tree before slicing
+    sliced_tree: object  # what ``tree.slice(...)`` returned
+    original_costs: dict  # contraction_tree_costs(tree)
+    sliced_costs: dict  # contraction_tree_costs(sliced_tree)
+
+
+@dataclass(frozen=True)
+class ContractionOpInfo:  # one pairwise contraction as recorded by analyze_contraction_tree
+    index: int  # position in the extracted contraction list (skipped entries leave gaps)
+    kind: str  # "mul", "mm" or "bmm"
+    matmul_shape: tuple | None  # (batch, m, k, n); None for "mul"
+    matmul_flops: int  # batch * m * k * n; 0 for "mul"
+    tree_flops: int  # int(tree.get_flops(parent))
+    out_size: int  # int(tree.get_size(parent))
+    left_shape: tuple  # index sizes of the left operand
+    right_shape: tuple  # index sizes of the right operand
+    left_rank: int  # number of indices on the left operand
+    right_rank: int  # number of indices on the right operand
+    out_rank: int  # number of indices on the result
+    perm: object  # passed through unchanged from the extracted contraction
+
+
+@dataclass(frozen=True)
+class TreeInspectionResult:  # aggregate returned by analyze_contraction_tree
+    tree: object  # the inspected tree
+    contractions: tuple  # raw extracted contraction entries
+    operations: tuple  # ContractionOpInfo records (entries without operands excluded)
+    counts: dict  # occurrences per kind, plus "preprocess"
+    nslices: int  # tree.multiplicity, or 1 when absent
+    per_slice_flops: int  # sum of tree_flops over operations
+    per_slice_write: int  # sum of out_size over operations
+    max_output_size: int  # largest out_size, 0 when there are no operations
+    all_slice_flops: int  # per_slice_flops * nslices
+    all_slice_write: int  # per_slice_write * nslices
+    dtype_bytes: int  # element size used for max_output_gib
+    max_output_gib: float  # max_output_size * dtype_bytes / 1024**3
+
+
 def mpi_slice_plan(nslices, rank, size, assignment="block"):
     """Return the contraction slice ids assigned to one MPI rank.
 
diff --git a/src/qibotn/torch_utils.py b/src/qibotn/torch_utils.py
new file mode 100644
index 0000000..98cd19c
--- /dev/null
+++ b/src/qibotn/torch_utils.py
@@ -0,0 +1,90 @@
+"""Shared torch helpers for qibotn CPU tensor-network code."""
+
+from __future__ import annotations
+
+import numpy as np
+
+
+def torch_dtype(dtype):
+    """Map a qibotn dtype name to a torch complex dtype; torch dtypes pass through."""
+    import torch
+
+    if isinstance(dtype, torch.dtype):
+        return dtype  # e.g. torch.complex64 must not be widened to complex128
+    return torch.complex64 if dtype in ("complex64", "single", np.complex64) else torch.complex128
+
+
+def numpy_dtype(dtype):
+    """Map a qibotn dtype name to ``np.complex64`` or (by default) ``np.complex128``."""
+    single_names = ("complex64", "single", np.complex64)
+    is_single = dtype in single_names
+    return np.complex64 if is_single else np.complex128
+
+
+def torch_cpu_array(data, dtype=None):
+    """Convert array-like data to a contiguous CPU torch tensor.
+
+    Negative-stride and read-only numpy inputs are copied before being handed
+    to ``torch.from_numpy``. ``dtype`` may be ``None`` (keep the input dtype), a
+    ``torch.dtype``, or any name / numpy dtype that ``torch_dtype`` understands.
+    """
+    import torch
+
+    if isinstance(data, torch.Tensor):
+        tensor = data
+    else:
+        array = np.asarray(data)
+        if any(stride < 0 for stride in array.strides):
+            array = np.ascontiguousarray(array)
+        elif not array.flags.writeable:
+            array = array.copy()
+        tensor = torch.from_numpy(array)
+
+    if tensor.device.type != "cpu":
+        tensor = tensor.cpu()
+    # Names and numpy dtypes go through torch_dtype; ``.to`` only takes torch dtypes.
+    if dtype is not None and not isinstance(dtype, torch.dtype):
+        dtype = torch_dtype(dtype)
+    if dtype is not None and tensor.dtype != dtype:
+        tensor = tensor.to(dtype)
+    return tensor.contiguous()  # no copy when the tensor is already contiguous
+
+
+def arrays_to_torch(arrays, dtype="complex128"):
+    """Return a list of CPU torch tensors, one per item of *arrays*."""
+    resolved = torch_dtype(dtype)  # resolved once, then reused for every item
+    return [torch_cpu_array(item, dtype=resolved) for item in arrays]
+
+
+def arrays_to_numpy(arrays, dtype="complex128"):
+    """Return a list of numpy arrays cast to the complex dtype named by *dtype*."""
+    resolved = numpy_dtype(dtype)  # resolved once, then reused for every item
+    return [np.asarray(item, dtype=resolved) for item in arrays]
+
+
+def arrays_to_backend(arrays, backend, engine=None, dtype="complex128"):
+    """Convert *arrays* for the torch backend, a given *engine*, or plain numpy."""
+    if backend == "torch":
+        return arrays_to_torch(arrays, dtype=dtype)
+    if engine is None:
+        return arrays_to_numpy(arrays, dtype=dtype)
+    return [engine.asarray(item, dtype=numpy_dtype(dtype)) for item in arrays]
+
+
+def set_torch_threads(nthreads=None, interop_threads=None):
+    """Set torch CPU thread counts and return the active intra-op thread count."""
+    import torch
+
+    if nthreads is not None:
+        torch.set_num_threads(max(1, int(nthreads)))  # clamped to at least one thread
+    if interop_threads is not None:
+        try:
+            torch.set_num_interop_threads(max(1, int(interop_threads)))
+        except RuntimeError:  # presumably torch refusing a late change - TODO confirm
+            pass  # best-effort: the current inter-op setting is kept
+    return torch.get_num_threads()
+
+
+def is_torch_array(value):
+    """Return whether *value*'s class is defined in a module named ``torch*`` (torch is not imported)."""
+    return type(value).__module__.startswith("torch")  # prefix test only: no isinstance check against torch.Tensor
diff --git a/tests/test_cpu_backend.py b/tests/test_cpu_backend.py
index e5ea781..5041869 100644
--- a/tests/test_cpu_backend.py
+++ b/tests/test_cpu_backend.py
@@ -10,6 +10,11 @@ from qibotn.benchmark_cases import (
     exact_pauli_sum,
 )
 from qibotn import cpu_expectation, mps_expectation, pauli_pattern, pauli_sum
+from qibotn.backends.quimb import (
+    build_expectation_tn,
+    contract_tn,
+    search_contraction_tree,
+)
 
 
 def build_circuit(nqubits=6):
@@ -61,6 +66,31 @@ def test_public_cpu_expectation_api_matches_statevector():
     assert math.isclose(value, exact, abs_tol=1e-12)
 
 
+def test_public_quimb_torch_pipeline_matches_statevector():  # build -> search -> contract, compared with a reference value
+    circuit = build_circuit(nqubits=4)
+    observable = hamiltonians.SymbolicHamiltonian(form=X(0) * Z(1))
+    exact = exact_pauli_sum(circuit, [(1.0, (("X", 0), ("Z", 1)))], 4)  # same X(0)*Z(1) term, given as explicit Pauli terms
+
+    built = build_expectation_tn(
+        circuit,
+        observable,
+        dtype="complex128",
+        merge_1q=True,
+        merge_2q=True,
+    )
+    search = search_contraction_tree(
+        built.tn,
+        method="serial",
+        total_repeats=1,
+        max_time=30,
+        n_workers=1,
+        search_seed=0,
+    )
+    value = built.coeff * complex(contract_tn(built.tn, search.tree))
+
+    assert math.isclose(value.real, exact, abs_tol=1e-12)  # only the real part is compared
+
+
 def test_public_mps_expectation_api_accepts_pauli_pattern():
     circuit = build_circuit()
     exact_hamiltonian = hamiltonians.SymbolicHamiltonian(
diff --git a/tools/README.md b/tools/README.md
deleted file mode 100644
index 284a712..0000000
--- a/tools/README.md
+++ /dev/null
@@ -1,19 +0,0 @@
-# Tools
-
-Auxiliary scripts for profiling, legacy comparisons, and scale probes.
-
-The main CPU expectation entrypoint is `../benchmark_cpu_expectation.py`.
-For the current Vidal/MPS 1D-chain tests, prefer `../run_vidal_mps_cases.sh`.
-
-Files here are intentionally secondary:
-
-- `compare_vidal_backend_qmatchatea.py`: diagnostic comparison against QMatchaTea.
-- `profile_vidal_chrome.py`: PyTorch CPU profiler for the Vidal path.
-- `run_cpu_single_cases.sh`: single-node scale probes.
-- `run_cpu_large_cases.sh`: two-node MPI scale probes.
-- `run_vidal_segment_mpi_scan.sh`: rank/thread scaling scan for Vidal segmented MPI.
-- `baseline_mps_expectation.py`: legacy MPS comparison CLI kept for old commands.
-- `benchmark_tn_mpi.py`, `benchmark_search.py`, `benchmark_slice.py`, `benchmark_contract_sliced.py`, `check_tree.py`: old TN path-search/slicing experiments.
-- `qibojit_reference_expectation.py`: state-vector reference helper.
-- `validate_vidal_mpi_correctness.py`: focused Vidal MPI correctness helper.
-- `mpi_torch_thread_probe.py`: MPI + torch OpenMP affinity and threading probe.
diff --git a/tools/baseline_mps_expectation.py b/tools/baseline_mps_expectation.py
deleted file mode 100644
index ef12ae3..0000000
--- a/tools/baseline_mps_expectation.py
+++ /dev/null
@@ -1,201 +0,0 @@
-"""MPS expectation benchmark for qmatchatea and Vidal backends."""
-
-import argparse
-import json
-import logging
-import os
-import socket
-import time
-
-import numpy as np
-
-from qibotn.benchmark_cases import (
-    build_circuit as build_benchmark_circuit,
-    exact_pauli_sum,
-    observable_terms,
-    terms_to_dict,
-)
-from qibotn.backends.qmatchatea import QMatchaTeaBackend
-from qibotn.backends.vidal_tebd import run_vidal_ring_xz
-
-
-def optional_int(text):
-    if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
-        return None
-    return int(text)
-
-
-def optional_float(text):
-    if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
-        return None
-    return float(text)
-
-
-def format_optional(value, fmt="g"):
-    return "None" if value is None else format(value, fmt)
-
-
-def build_circuit(nqubits, nlayers, seed):
-    return build_benchmark_circuit("brickwall_cnot", nqubits, nlayers, seed)
-
-
-def build_observable(nqubits):
-    return terms_to_dict(observable_terms("ring_xz", nqubits))
-
-
-def exact_expectation(circuit, nqubits):
-    return exact_pauli_sum(circuit, observable_terms("ring_xz", nqubits), nqubits)
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--nqubits", type=int, default=40)
-    parser.add_argument("--nlayers", type=int, default=30)
-    parser.add_argument("--bond", "--bonds", dest="bond", type=optional_int, default=512)
-    parser.add_argument("--cut-ratio", type=optional_float, default=1e-12)
-    parser.add_argument("--seed", type=int, default=42)
-    parser.add_argument("--tensor-module", choices=("numpy", "torch"), default="torch")
-    parser.add_argument("--torch-threads", type=int, default=32)
-    parser.add_argument(
-        "--executor",
-        choices=("qmatchatea", "vidal", "vidal-mpi"),
-        default="qmatchatea",
-    )
-    parser.add_argument("--mpi-ct", action="store_true")
-    parser.add_argument("--mpi-barriers", type=int, default=-1)
-    parser.add_argument("--mpi-isometrization", type=int, default=-1)
-    parser.add_argument("--exact", action="store_true")
-    parser.add_argument("--exact-max-qubits", type=int, default=24)
-    parser.add_argument("--reference-file")
-    parser.add_argument(
-        "--mpi-rank-map",
-        action="store_true",
-        help="Print MPI rank, host, pid, and torch thread placement metadata.",
-    )
-    args = parser.parse_args()
-    logging.getLogger("qibo.config").setLevel(logging.ERROR)
-    logging.getLogger("qtealeaves").setLevel(logging.ERROR)
-    import torch
-
-    torch.set_num_threads(args.torch_threads)
-    rank = 0
-    size = 1
-    if args.mpi_ct:
-        from mpi4py import MPI
-
-        rank = MPI.COMM_WORLD.Get_rank()
-        size = MPI.COMM_WORLD.Get_size()
-        if args.mpi_rank_map:
-            rank_info = {
-                "rank": rank,
-                "size": size,
-                "host": socket.gethostname(),
-                "pid": os.getpid(),
-                "torch_threads": args.torch_threads,
-                "omp_num_threads": os.environ.get("OMP_NUM_THREADS", ""),
-                "mkl_num_threads": os.environ.get("MKL_NUM_THREADS", ""),
-            }
-            rank_infos = MPI.COMM_WORLD.gather(rank_info, root=0)
-            if rank == 0:
-                print("mpi_rank_map")
-                for item in sorted(rank_infos, key=lambda row: row["rank"]):
-                    print(
-                        "rank={rank} size={size} host={host} pid={pid} "
-                        "torch_threads={torch_threads} "
-                        "OMP_NUM_THREADS={omp_num_threads} "
-                        "MKL_NUM_THREADS={mkl_num_threads}".format(**item)
-                    )
-
-    circuit = build_circuit(args.nqubits, args.nlayers, args.seed)
-    observable = build_observable(args.nqubits)
-    exact = None
-    if args.reference_file:
-        with open(args.reference_file, "r", encoding="utf-8") as f:
-            exact = float(json.load(f)["expectation"])
-    elif args.exact:
-        if args.nqubits > args.exact_max_qubits:
-            raise ValueError(
-                f"--exact is limited to {args.exact_max_qubits} qubits by default."
-            )
-        exact = exact_expectation(circuit, args.nqubits)
-
-    if rank == 0:
-        if args.mpi_ct and args.executor in ("vidal", "vidal-mpi"):
-            mpi_label = f"VidalSegment/{size}"
-        else:
-            mpi_label = f"MPIMPS/{size}" if args.mpi_ct else "SR"
-        print(
-            f"nqubits={args.nqubits} nlayers={args.nlayers} "
-            f"bond={format_optional(args.bond)} "
-            f"cut_ratio={format_optional(args.cut_ratio)} seed={args.seed} "
-            f"tensor_module={args.tensor_module} svd_control=E! "
-            f"compile_circuit=True mpi={mpi_label} executor={args.executor}"
-        )
-        if exact is not None:
-            print(f"exact={exact:.16e}")
-        print("expval abs_error rel_error seconds")
-
-    start = time.perf_counter()
-    timings = None
-    if args.executor in ("vidal", "vidal-mpi"):
-        if args.executor == "vidal-mpi" and not args.mpi_ct:
-            raise ValueError("--executor vidal-mpi requires --mpi-ct.")
-        if args.mpi_ct:
-            from qibotn.backends.vidal_mpi_segment import run_segment_vidal_mpi_ring_xz
-
-            value, timings = run_segment_vidal_mpi_ring_xz(
-                circuit,
-                max_bond=args.bond,
-                cut_ratio=args.cut_ratio,
-                tensor_module=args.tensor_module,
-                comm=MPI.COMM_WORLD,
-            )
-        else:
-            value = run_vidal_ring_xz(
-                circuit,
-                max_bond=args.bond,
-                cut_ratio=args.cut_ratio,
-                tensor_module=args.tensor_module,
-            )
-    else:
-        backend = QMatchaTeaBackend()
-        backend.configure_tn_simulation(
-            ansatz="MPS",
-            max_bond_dimension=args.bond,
-            cut_ratio=args.cut_ratio,
-            svd_control="E!",
-            tensor_module=args.tensor_module,
-            compile_circuit=True,
-            track_memory=False,
-            mpi_approach="CT" if args.mpi_ct else "SR",
-            mpi_num_procs=size,
-            mpi_where_barriers=args.mpi_barriers if args.mpi_ct else -1,
-            mpi_isometrization=args.mpi_isometrization,
-        )
-        value = backend.expectation(
-            circuit,
-            observable,
-            preprocess=False,
-            compile_circuit=True,
-        )
-    max_timings = None
-    if timings:
-        max_timings = {
-            key: MPI.COMM_WORLD.reduce(local_value, op=MPI.MAX, root=0)
-            for key, local_value in timings.items()
-        }
-    if rank != 0:
-        return
-    value = float(np.real(value))
-    elapsed = time.perf_counter() - start
-    abs_error = float("nan") if exact is None else abs(value - exact)
-    rel_error = float("nan") if exact is None else abs_error / max(abs(exact), 1e-15)
-    print(f"{value:.16e} {abs_error:.6e} {rel_error:.6e} {elapsed:.3f}")
-    if max_timings:
-        print("timing_section max_seconds")
-        for key, max_value in max_timings.items():
-            print(f"{key} {max_value:.6f}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tools/benchmark_contract_sliced.py b/tools/benchmark_contract_sliced.py
deleted file mode 100644
index a089546..0000000
--- a/tools/benchmark_contract_sliced.py
+++ /dev/null
@@ -1,56 +0,0 @@
-"""MPI parallel sliced contraction using pre-sliced tree."""
-import time, pickle, os
-import numpy as np
-from mpi4py import MPI
-
-NQUBITS, NLAYERS, NCORES = 25, 10, 48
-
-comm = MPI.COMM_WORLD
-rank, size = comm.Get_rank(), comm.Get_size()
-
-os.environ['OMP_NUM_THREADS'] = str(NCORES)
-os.environ['MKL_NUM_THREADS'] = str(NCORES)
-
-import torch
-import qibo, quimb as qu
-from qibotn.observables import build_random_circuit
-
-torch.set_num_threads(NCORES)
-
-circuit = build_random_circuit(NQUBITS, NLAYERS)
-qibo.set_backend("qibotn", platform="quimb")
-backend = qibo.get_backend()
-backend.configure_tn_simulation(ansatz="tn")
-qc = backend._qibo_circuit_to_quimb(circuit, backend.circuit_ansatz)
-tn = qc.local_expectation(qu.pauli('x') & qu.pauli('z'), (0, 1), rehearse='tn')
-
-if rank == 0:
-    with open(f"data/tree_q{NQUBITS}_l{NLAYERS}_sliced.pkl", 'rb') as f:
-        tree = pickle.load(f)
-else:
-    tree = None
-tree = comm.bcast(tree, root=0)
-
-arrays = [torch.from_numpy(np.asarray(t._data)) for t in tn.tensors]
-n_slices = tree.multiplicity
-
-if rank == 0:
-    print(f"Slices: {n_slices}, Ranks: {size}, "
-          f"Peak: {tree.max_size() * 16 / 1e9:.2f} GB, "
-          f"Threads/rank: {NCORES}, Backend: torch")
-
-t0 = time.time()
-result = None
-for i in range(rank, n_slices, size):
-    val = tree.contract_slice(arrays, i, backend='torch')
-    val_np = val.cpu().numpy().reshape(-1)
-    result = val_np if result is None else result + val_np
-
-if result is None:
-    result = np.zeros(1, dtype=np.complex128)
-
-total = np.zeros_like(result) if rank == 0 else None
-comm.Reduce(result, total, root=0)
-
-if rank == 0:
-    print(f"Contract: {time.time() - t0:.4f}s  Expectation: {0.5 * total[0].real:.10f}")
diff --git a/tools/benchmark_qredtea_svd_controls.py b/tools/benchmark_qredtea_svd_controls.py
deleted file mode 100644
index 4111c48..0000000
--- a/tools/benchmark_qredtea_svd_controls.py
+++ /dev/null
@@ -1,157 +0,0 @@
-#!/usr/bin/env python
-"""Benchmark qredtea/qtealeaves SVD control modes.
-
-This isolates the tensor split used by MPS updates: a rank-2 tensor is split
-with singular values contracted either left or right, then reconstructed to
-measure numerical error and timing.
-"""
-
-from __future__ import annotations
-
-import argparse
-import gc
-import statistics
-import time
-
-import torch
-
-import qmatchatea
-from qredtea.torchapi import QteaTorchTensor
-
-
-def _dtype(name: str):
-    return {
-        "complex64": torch.complex64,
-        "complex128": torch.complex128,
-        "float64": torch.float64,
-        "float32": torch.float32,
-    }[name]
-
-
-def _random_matrix(shape, dtype, seed):
-    gen = torch.Generator(device="cpu")
-    gen.manual_seed(seed)
-    if dtype.is_complex:
-        real_dtype = torch.float32 if dtype == torch.complex64 else torch.float64
-        real = torch.randn(shape, dtype=real_dtype, generator=gen)
-        imag = torch.randn(shape, dtype=real_dtype, generator=gen)
-        return torch.complex(real, imag).to(dtype)
-    return torch.randn(shape, dtype=dtype, generator=gen)
-
-
-def _sync():
-    if torch.cuda.is_available():
-        torch.cuda.synchronize()
-
-
-def run_one(matrix, ctrl, max_bond, contract_singvals, repeats):
-    conv = qmatchatea.QCConvergenceParameters(
-        max_bond_dimension=max_bond,
-        cut_ratio=0.0,
-        svd_ctrl=ctrl,
-    )
-    qtensor = QteaTorchTensor.from_elem_array(matrix, dtype=matrix.dtype, device="cpu")
-
-    times = []
-    rel_error = None
-    kept = None
-    status = "ok"
-    error = ""
-
-    for i in range(repeats):
-        gc.collect()
-        _sync()
-        t0 = time.perf_counter()
-        try:
-            left, right, singvals, _ = qtensor.split_svd(
-                [0],
-                [1],
-                contract_singvals=contract_singvals,
-                conv_params=conv,
-            )
-        except Exception as exc:  # noqa: BLE001 - benchmark should keep going
-            status = "error"
-            error = repr(exc)
-            break
-        _sync()
-        times.append(time.perf_counter() - t0)
-
-        if i == repeats - 1:
-            left_matrix = left.elem.reshape(matrix.shape[0], -1)
-            right_matrix = right.elem.reshape(-1, matrix.shape[1])
-            recon = left_matrix @ right_matrix
-            rel_error = (
-                torch.linalg.vector_norm(matrix - recon)
-                / torch.linalg.vector_norm(matrix)
-            ).item()
-            kept = int(singvals.numel())
-
-    return {
-        "ctrl": ctrl,
-        "contract_singvals": contract_singvals,
-        "status": status,
-        "median_ms": float("nan") if not times else statistics.median(times) * 1000,
-        "min_ms": float("nan") if not times else min(times) * 1000,
-        "rel_error": rel_error,
-        "kept": kept,
-        "error": error,
-    }
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--shapes", nargs="+", default=("256x1024", "1024x256", "512x512"))
-    parser.add_argument("--max-bond", type=int, default=128)
-    parser.add_argument("--dtype", choices=("complex64", "complex128", "float32", "float64"), default="complex128")
-    parser.add_argument("--threads", type=int, default=8)
-    parser.add_argument("--repeats", type=int, default=3)
-    parser.add_argument(
-        "--controls",
-        nargs="+",
-        default=("A", "D", "V", "R", "E", "E!", "X", "X!"),
-    )
-    args = parser.parse_args()
-
-    torch.set_num_threads(args.threads)
-    dtype = _dtype(args.dtype)
-
-    print(
-        "svd_benchmark "
-        f"dtype={args.dtype} threads={torch.get_num_threads()} "
-        f"max_bond={args.max_bond} repeats={args.repeats}",
-        flush=True,
-    )
-    print(
-        "columns shape contract ctrl status median_ms min_ms kept rel_error error",
-        flush=True,
-    )
-
-    for shape_text in args.shapes:
-        m_text, n_text = shape_text.lower().split("x", 1)
-        shape = (int(m_text), int(n_text))
-        matrix = _random_matrix(shape, dtype, seed=sum(shape))
-        for contract_singvals in ("L", "R"):
-            for ctrl in args.controls:
-                result = run_one(
-                    matrix,
-                    ctrl=ctrl,
-                    max_bond=args.max_bond,
-                    contract_singvals=contract_singvals,
-                    repeats=args.repeats,
-                )
-                print(
-                    f"row shape={shape_text} "
-                    f"contract={contract_singvals} "
-                    f"ctrl={ctrl} "
-                    f"status={result['status']} "
-                    f"median_ms={result['median_ms']:.3f} "
-                    f"min_ms={result['min_ms']:.3f} "
-                    f"kept={result['kept']} "
-                    f"rel_error={result['rel_error']} "
-                    f"error={result['error']}",
-                    flush=True,
-                )
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tools/benchmark_search.py b/tools/benchmark_search.py
deleted file mode 100644
index f0bc464..0000000
--- a/tools/benchmark_search.py
+++ /dev/null
@@ -1,34 +0,0 @@
-"""Search contraction path and save."""
-import time, os, pickle
-from qibotn.parallel import parallel_path_search
-from qibotn.observables import build_random_circuit
-import qibo, quimb as qu
-
-from mpi4py import MPI
-
-NQUBITS, NLAYERS, WORKERS = 20, 10, 96
-
-comm = MPI.COMM_WORLD
-rank, size = comm.Get_rank(), comm.Get_size()
-method = 'mpi' if size > 1 else 'processpool'
-
-circuit = build_random_circuit(NQUBITS, NLAYERS)
-qibo.set_backend("qibotn", platform="quimb")
-backend = qibo.get_backend()
-backend.configure_tn_simulation(ansatz="tn")
-qc = backend._qibo_circuit_to_quimb(circuit, backend.circuit_ansatz)
-tn = qc.local_expectation(qu.pauli('x') & qu.pauli('z'), (0, 1), rehearse='tn')
-
-if rank == 0:
-    print(f"Searching {NQUBITS}q {NLAYERS}l, method={method}, ranks={size}, workers/rank={WORKERS}...")
-t0 = time.time()
-tree = parallel_path_search(tn, tn.outer_inds(), method=method,
-    total_repeats=1024, max_time=300, n_workers=WORKERS,trial_timeout=60)
-t_search = time.time() - t0
-
-if rank == 0:
-    os.makedirs('data', exist_ok=True)
-    path = f"data/tree_q{NQUBITS}_l{NLAYERS}.pkl"
-    with open(path, 'wb') as f:
-        pickle.dump(tree, f)
-    print(f"Search: {t_search:.2f}s  Peak: {tree.max_size() * 16 / 1e9:.2f} GB  Saved: {path}")
diff --git a/tools/benchmark_slice.py b/tools/benchmark_slice.py
deleted file mode 100644
index b398857..0000000
--- a/tools/benchmark_slice.py
+++ /dev/null
@@ -1,16 +0,0 @@
-"""Slice saved tree and save."""
-import pickle
-
-NQUBITS, NLAYERS = 25, 10
-
-with open(f"data/tree_q{NQUBITS}_l{NLAYERS}.pkl", 'rb') as f:
-    tree = pickle.load(f)
-
-print(f"Original peak: {tree.max_size() * 16 / 1e9:.2f} GB")
-
-tree_sliced = tree.slice_and_reconfigure(target_size=2**28)
-
-with open(f"data/tree_q{NQUBITS}_l{NLAYERS}_sliced.pkl", 'wb') as f:
-    pickle.dump(tree_sliced, f)
-
-print(f"Sliced peak: {tree_sliced.max_size() * 16 / 1e9:.2f} GB  Slices: {tree_sliced.multiplicity}")
diff --git a/tools/benchmark_tn_mpi.py b/tools/benchmark_tn_mpi.py
deleted file mode 100644
index 8dc80d1..0000000
--- a/tools/benchmark_tn_mpi.py
+++ /dev/null
@@ -1,378 +0,0 @@
-"""MPI-parallel TN benchmark: path search + contraction via MPI."""
-import json
-import pickle
-import time
-import argparse
-import numpy as np
-import cotengra as ctg
-import qibo
-from qibo import Circuit, gates
-from mpi4py import MPI
-from concurrent.futures import ProcessPoolExecutor, as_completed
-from qibotn.observables import check_observable, extract_gates_and_qubits
-
-
-def _load_observable(observable_file=None, observable_json=None):
-    if observable_file:
-        with open(observable_file, "r", encoding="utf8") as f:
-            return json.load(f)
-    if observable_json:
-        return json.loads(observable_json)
-    return None
-
-
-def _term_to_quimb_operator(term):
-    """Convert one extracted Hamiltonian term to a quimb operator."""
-    import quimb as qu
-
-    coeff = complex(term[0][2]) if term else 1.0
-    op = None
-    where = []
-
-    for qubit, gate_name, _ in term:
-        qubit = int(qubit)
-        gate_name = str(gate_name).upper()
-        if gate_name == "I":
-            continue
-        where.append(qubit)
-        op = qu.pauli(gate_name.lower()) if op is None else op & qu.pauli(gate_name.lower())
-
-    return complex(coeff), op, tuple(where)
-
-
-def _run_serial_search(tn_bytes, output_inds, repeats, seed, num_slices, n_ranks, max_time):
-    import pickle, cotengra as ctg, random
-    random.seed(seed)
-    tn = pickle.loads(tn_bytes)
-    opt = ctg.HyperOptimizer(
-        methods=['kahypar', 'kahypar-agglom', 'spinglass'],
-        max_repeats=repeats,
-        parallel=False,
-        minimize='combo-256',
-        max_time=max_time,
-        optlib="random",
-        slicing_opts={'target_size': 2**29, 'allow_outer': True},
-        progbar=False,
-    )
-    tree = tn.contraction_tree(optimize=opt, output_inds=output_inds)
-    return tree.combo_cost(factor=256), tree
-
-
-def parallel_search(tn, output_inds, total_repeats, n_workers, num_slices, n_ranks,
-                    timeout):
-    import pickle, os, signal
-    from concurrent.futures import ProcessPoolExecutor, as_completed
-    tn_bytes = pickle.dumps(tn)
-    if n_workers <= 1:
-        return _run_serial_search(
-            tn_bytes, output_inds, total_repeats, 0, num_slices, n_ranks, timeout
-        )[1]
-    repeats_per = max(1, total_repeats // n_workers)
-    best_cost, best_tree = float('inf'), None
-
-    pool = ProcessPoolExecutor(max_workers=n_workers)
-    futures = [
-        pool.submit(_run_serial_search, tn_bytes, output_inds,
-                    repeats_per, seed, num_slices, n_ranks, timeout)
-        for seed in range(n_workers)
-    ]
-    try:
-        for fut in as_completed(futures, timeout=timeout + 5):
-            try:
-                cost, tree = fut.result()
-                if cost < best_cost:
-                    best_cost, best_tree = cost, tree
-            except Exception as e:
-                print(f"  [worker failed] {e}")
-    except TimeoutError:
-        pass
-    finally:
-        for fut in futures:
-            fut.cancel()
-        for pid in list(pool._processes.keys()):
-            try:
-                os.kill(pid, signal.SIGKILL)
-            except ProcessLookupError:
-                pass
-        pool.shutdown(wait=False)
-
-    return best_tree
-
-
-def make_circuit(circuit_type, nqubits, nlayers=1):
-    c = Circuit(nqubits)
-    if circuit_type == "qft":
-        from qibo.models import QFT
-        return QFT(nqubits)
-    elif circuit_type == "variational":
-        for layer in range(nlayers):
-            for q in range(nqubits):
-                c.add(gates.RY(q, theta=np.random.uniform(0, 2 * np.pi)))
-            offset = layer % 2
-            for q in range(offset, nqubits - 1, 2):
-                c.add(gates.CZ(q, q + 1))
-    elif circuit_type == "ghz":
-        c.add(gates.H(0))
-        for q in range(nqubits - 1):
-            c.add(gates.CNOT(q, q + 1))
-    elif circuit_type == "brickwork":
-        for q in range(nqubits):
-            c.add(gates.H(q))
-        for layer in range(nlayers):
-            offset = layer % 2
-            for q in range(offset, nqubits - 1, 2):
-                c.add(gates.CNOT(q, q + 1))
-                c.add(gates.RZ(q, theta=np.random.uniform(0, 2 * np.pi)))
-                c.add(gates.RZ(q + 1, theta=np.random.uniform(0, 2 * np.pi)))
-    else:
-        raise ValueError(f"Unknown circuit: {circuit_type}")
-    return c
-
-
-def _contract_mpi(tree, arrays, comm, root=0):
-    rank = comm.Get_rank()
-    size = comm.Get_size()
-    is_torch = type(arrays[0]).__module__.startswith("torch")
-
-    result_np = None
-    for i in range(rank, tree.multiplicity, size):
-        x = tree.contract_slice(arrays, i)
-        x_np = np.asfortranarray(x.detach().cpu().numpy() if is_torch else np.asarray(x))
-        result_np = x_np if result_np is None else result_np + x_np
-
-    if result_np is None:
-        result_np = np.zeros(1, dtype=np.complex128)
-
-    result = np.zeros_like(result_np) if rank == root else None
-    comm.Reduce(result_np, result, root=root)
-
-    if rank == root:
-        import torch
-        return torch.from_numpy(np.asarray(result)) if is_torch else result
-    return None
-
-
-def run_mpi(circuit, nqubits, num_slices, total_repeats=1024,
-            load_path=None, save_path=None):
-    """Each MPI rank runs serial path search over total_repeats/size trials,
-    rank 0 picks the global best, then all ranks contract in parallel."""
-    comm = MPI.COMM_WORLD
-    rank = comm.Get_rank()
-    size = comm.Get_size()
-
-    qibo.set_backend("qibotn", platform="quimb")
-    b = qibo.get_backend()
-    b.configure_tn_simulation(ansatz="tn")
-
-    import torch
-    qc = b._qibo_circuit_to_quimb(circuit, quimb_circuit_type=b.circuit_ansatz,
-                                   gate_opts={"max_bond": None, "cutoff": 1e-10})
-    qc.to_backend = lambda x: torch.from_numpy(x).to(torch.complex128)
-
-    # --- path search: each rank serial, gather best to rank 0 ---
-    if load_path:
-        if rank == 0:
-            with open(load_path, "rb") as f:
-                saved = pickle.load(f)
-            tree, psi, t_search = saved["tree"], saved["psi"], 0.0
-            print(f"  [path loaded]  {load_path}")
-        else:
-            tree = psi = None
-            t_search = 0.0
-    else:
-        rank_repeats = max(1, total_repeats // size)
-        t0 = time.time()
-        # get TN object first (no contraction), then run parallel search
-        psi_tn = qc.to_dense(rehearse="tn")
-        local_tree = parallel_search(
-            psi_tn, psi_tn.outer_inds(), rank_repeats, n_workers=48,
-            num_slices=num_slices, n_ranks=size, timeout=600,
-        )
-        t_search = time.time() - t0
-        local_psi = psi_tn
-
-        all_results = comm.gather((local_tree.combo_cost(factor=256), local_tree, local_psi), root=0)
-        if rank == 0:
-            _, tree, psi = min(all_results, key=lambda x: x[0])
-            print(f"  [path search]  {t_search:.3f}s  "
-                  f"flops~2^{tree.contraction_cost(log=2):.2f}  "
-                  f"size~2^{tree.contraction_width():.2f}  "
-                  f"slices={tree.multiplicity}")
-            if save_path:
-                with open(save_path, "wb") as f:
-                    pickle.dump({"tree": tree, "psi": psi}, f)
-                print(f"  [path saved]   {save_path}")
-        else:
-            tree = psi = None
-
-        if save_path:
-            t_search = comm.bcast(t_search, root=0)
-            return None, t_search
-
-    tree = comm.bcast(tree, root=0)
-    psi = comm.bcast(psi, root=0)
-    t_search = comm.bcast(t_search, root=0)
-
-    # --- contraction: all ranks work in parallel ---
-    import torch
-    torch.set_num_threads(max(1, 96 // size))
-    arrays = [torch.from_numpy(np.asarray(a)).to(torch.complex128) for a in psi.arrays]
-    t0 = time.time()
-    sv = _contract_mpi(tree, arrays, comm, root=0)
-    t_contract = time.time() - t0
-
-    if rank == 0:
-        print(f"  [contraction]  {t_contract:.3f}s")
-        return np.array(sv).reshape(-1), t_search + t_contract
-    return None, t_search + t_contract
-
-
-def run_mpi_expval(
-    circuit,
-    nqubits,
-    observable=None,
-    total_repeats=1024,
-    search_workers=1,
-    search_timeout=300,
-):
-    """Compute a Hamiltonian expectation value directly from TN via MPI.
-    MPI parallelizes over Hamiltonian terms; ProcessPool optionally helps
-    path search for each term."""
-    import torch
-    comm = MPI.COMM_WORLD
-    rank = comm.Get_rank()
-    size = comm.Get_size()
-
-    qibo.set_backend("qibotn", platform="quimb")
-    b = qibo.get_backend()
-    b.configure_tn_simulation(ansatz="tn")
-
-    observable = check_observable(observable, nqubits)
-    ham_gate_map = extract_gates_and_qubits(observable)
-
-    qc = b._qibo_circuit_to_quimb(circuit, quimb_circuit_type=b.circuit_ansatz,
-                                   gate_opts={"max_bond": None, "cutoff": 1e-10})
-
-    my_terms = ham_gate_map[rank::size]
-    torch.set_num_threads(max(1, 96 // size))
-    t0 = time.time()
-
-    my_exp = 0.0 + 0.0j
-    for term in my_terms:
-        coeff, op, where = _term_to_quimb_operator(term)
-        if op is None:
-            my_exp += coeff
-            continue
-        tn = qc.local_expectation_tn(op, where=where)
-        if len(tn.outer_inds()) == 0:
-            val = complex(tn.contract())
-        else:
-            tree = parallel_search(
-                tn,
-                tn.outer_inds(),
-                total_repeats,
-                n_workers=search_workers,
-                num_slices=1,
-                n_ranks=size,
-                timeout=search_timeout,
-            )
-            if tree is None:
-                raise RuntimeError("Failed to find a contraction tree for expectation TN.")
-            arrays = [torch.from_numpy(np.asarray(a)).to(torch.complex128) for a in tn.arrays]
-            acc = sum(tree.contract_slice(arrays, i) for i in range(tree.multiplicity))
-            val = complex(acc.item() if hasattr(acc, 'item') else acc)
-        my_exp += coeff * val
-
-    t_total = time.time() - t0
-
-    all_results = comm.gather(my_exp, root=0)
-    if rank == 0:
-        total_exp = sum(all_results)
-        print(f"\n[TN expval]  time={t_total:.4f}s  expval={total_exp.real:.12f}")
-        return np.real_if_close(total_exp), t_total
-    return None, t_total
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--nqubits", type=int, default=30)
-    parser.add_argument("--circuit", type=str, default="qft",
-                        choices=["qft", "variational", "ghz", "brickwork"])
-    parser.add_argument("--nlayers", type=int, default=3)
-    parser.add_argument("--num-slices", type=int, default=1)
-    parser.add_argument("--total-repeats", type=int, default=1024)
-    parser.add_argument("--search-workers", type=int, default=1)
-    parser.add_argument("--search-timeout", type=int, default=300)
-    parser.add_argument("--observable-file", type=str, default=None)
-    parser.add_argument("--observable-json", type=str, default=None)
-    parser.add_argument("--save-path", type=str, default=None)
-    parser.add_argument("--load-path", type=str, default=None)
-    parser.add_argument("--no-compare", action="store_true")
-    parser.add_argument("--mode", type=str, default="sv", choices=["sv", "expval"])
-    args = parser.parse_args()
-
-    comm = MPI.COMM_WORLD
-    rank = comm.Get_rank()
-
-    if rank == 0:
-        print(f"Circuit: {args.circuit}, nqubits={args.nqubits}, "
-              f"nlayers={args.nlayers}, ranks={comm.Get_size()}")
-
-    np.random.seed(42)
-    circuit = make_circuit(args.circuit, args.nqubits, args.nlayers)
-    observable = _load_observable(args.observable_file, args.observable_json)
-
-    if args.mode == "expval":
-        try:
-            expval, t_total = run_mpi_expval(
-                circuit,
-                args.nqubits,
-                observable=observable,
-                total_repeats=args.total_repeats,
-                search_workers=args.search_workers,
-                search_timeout=args.search_timeout,
-            )
-        except Exception as e:
-            if rank == 0:
-                print(f"[FAILED] {e}")
-            raise
-        if rank == 0:
-            np.save(f"data/expval_tn_{args.circuit}{args.nqubits}.npy", np.asarray(expval))
-            if not args.no_compare:
-                print("No built-in reference comparison for arbitrary observables.")
-        return
-
-    try:
-        sv, t_total = run_mpi(circuit, args.nqubits, args.num_slices,
-                              total_repeats=args.total_repeats,
-                              load_path=args.load_path, save_path=args.save_path)
-    except Exception as e:
-        if rank == 0:
-            print(f"[FAILED] {e}")
-        raise
-
-    if rank == 0 and sv is not None:
-        print(f"\n[quimb TN MPI]  time={t_total:.4f}s  shape={sv.shape}")
-        np.save(f"data/sv_tn_{args.circuit}{args.nqubits}_mpi.npy", sv)
-
-        if not args.no_compare:
-            from qibotn.bak.benchmark_tn import run_qibojit
-            import gc
-            np.random.seed(42)
-            circuit_ref = make_circuit(args.circuit, args.nqubits, args.nlayers)
-            sv_ref, t_ref = run_qibojit(circuit_ref)
-            np.save(f"data/sv_qibojit_{args.circuit}{args.nqubits}.npy", sv_ref)
-            print(f"[qibojit]       time={t_ref:.4f}s")
-            # free memory before loading via mmap for expval comparison
-            del sv, sv_ref
-            gc.collect()
-            from compare_jit_tn_quimb import check_results
-            ref_path = f"data/sv_qibojit_{args.circuit}{args.nqubits}.npy"
-            tn_path  = f"data/sv_tn_{args.circuit}{args.nqubits}_mpi.npy"
-            check_results(ref_path, tn_path, args.nqubits)
-            if t_total > 0:
-                print(f"Speedup  : {t_ref/t_total:.2f}x")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tools/check_tree.py b/tools/check_tree.py
deleted file mode 100644
index 935f952..0000000
--- a/tools/check_tree.py
+++ /dev/null
@@ -1,25 +0,0 @@
-"""Check contraction tree statistics."""
-import pickle, sys
-
-path = sys.argv[1] if len(sys.argv) > 1 else "data/tree_q25_l10.pkl"
-with open(path, 'rb') as f:
-    tree = pickle.load(f)
-
-# Intel 8558P: 96 cores, 2.1GHz, AVX-512 (16 FP64/cycle), FMA x2
-# complex128 multiply-add = 6 real FLOPs
-CORES = 96
-FREQ = 2.1e9
-AVX512_FP64 = 16
-TFLOPS = CORES * FREQ * AVX512_FP64 * 2 / 1e12  # ~6.45 TFLOPS real FP64
-COMPLEX_FLOPS = TFLOPS / 6  # complex128 effective
-
-flops = tree.total_flops()
-slices = tree.multiplicity
-est_seconds = flops * slices / (COMPLEX_FLOPS * 1e12) 
-
-print(f"File: {path}")
-print(f"Peak memory (GB): {tree.max_size() * 16 / 1e9:.2f}")
-print(f"Total FLOPs: {flops:.2e}  x{slices} slices = {flops*slices:.2e}")
-print(f"Contraction width: {tree.contraction_width()}")
-print(f"Multiplicity (slices): {slices}")
-print(f"Estimated time (96 cores): {est_seconds:.1f}s  ({est_seconds/3600:.2f}h)")
diff --git a/tools/compare_vidal_backend_qmatchatea.py b/tools/compare_vidal_backend_qmatchatea.py
deleted file mode 100644
index b5050cf..0000000
--- a/tools/compare_vidal_backend_qmatchatea.py
+++ /dev/null
@@ -1,137 +0,0 @@
-"""Compare QMatchaTeaBackend with the VidalBackend fast path."""
-
-from __future__ import annotations
-
-import argparse
-import json
-import math
-import time
-
-import numpy as np
-import torch
-from qibo import Circuit, gates, hamiltonians
-from qibo.symbols import X, Y, Z
-
-from qibotn.backends.qmatchatea import QMatchaTeaBackend
-from qibotn.backends.vidal import VidalBackend
-
-
-def build_circuit(nqubits, nlayers, seed, kind):
-    rng = np.random.default_rng(seed)
-    circuit = Circuit(nqubits)
-    for layer in range(nlayers):
-        for q in range(nqubits):
-            circuit.add(gates.RY(q, theta=rng.uniform(-math.pi, math.pi)))
-            circuit.add(gates.RZ(q, theta=rng.uniform(-math.pi, math.pi)))
-        if kind == "brickwall":
-            for q in range(0, nqubits - 1, 2):
-                circuit.add(gates.CNOT(q, q + 1))
-            for q in range(1, nqubits - 1, 2):
-                circuit.add(gates.CNOT(q, q + 1))
-        elif kind == "shifted-cz":
-            for q in range(layer % 2, nqubits - 1, 2):
-                circuit.add(gates.CZ(q, q + 1))
-        elif kind == "reversed-cnot":
-            for q in range(0, nqubits - 1, 2):
-                circuit.add(gates.CNOT(q + 1, q))
-            for q in range(1, nqubits - 1, 2):
-                circuit.add(gates.CNOT(q, q + 1))
-        else:
-            raise ValueError(f"Unknown circuit kind {kind!r}.")
-    return circuit
-
-
-def build_observable(nqubits, kind):
-    form = 0
-    if kind == "ring-xz":
-        for q in range(nqubits):
-            form += 0.5 * X(q) * Z((q + 1) % nqubits)
-    elif kind == "open-zz":
-        for q in range(nqubits - 1):
-            form += Z(q) * Z(q + 1) / (nqubits - 1)
-    elif kind == "mixed":
-        form += 0.25 * X(0) - 0.5 * Z(nqubits - 1)
-        for q in range(0, nqubits - 1, 3):
-            form += 0.125 * Y(q) * Y(q + 1)
-    else:
-        raise ValueError(f"Unknown observable kind {kind!r}.")
-    return hamiltonians.SymbolicHamiltonian(form=form)
-
-
-def run_backend(backend, circuit, observable):
-    start = time.perf_counter()
-    value = backend.expectation(circuit, observable, preprocess=False, compile_circuit=True)
-    return float(np.real(value)), time.perf_counter() - start
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--nqubits", type=int, default=34)
-    parser.add_argument("--nlayers", type=int, default=20)
-    parser.add_argument("--bond", "--bonds", dest="bond", type=int, default=512)
-    parser.add_argument("--seed", type=int, default=42)
-    parser.add_argument("--tensor-module", choices=("torch", "numpy"), default="torch")
-    parser.add_argument("--torch-threads", type=int, default=32)
-    parser.add_argument(
-        "--circuit-kind",
-        choices=("brickwall", "shifted-cz", "reversed-cnot"),
-        default="brickwall",
-    )
-    parser.add_argument(
-        "--observable-kind",
-        choices=("ring-xz", "open-zz", "mixed"),
-        default="ring-xz",
-    )
-    parser.add_argument("--reference-file")
-    parser.add_argument("--skip-qmatchatea", action="store_true")
-    args = parser.parse_args()
-
-    torch.set_num_threads(args.torch_threads)
-    circuit = build_circuit(args.nqubits, args.nlayers, args.seed, args.circuit_kind)
-    observable = build_observable(args.nqubits, args.observable_kind)
-
-    exact = None
-    if args.reference_file:
-        with open(args.reference_file, "r", encoding="utf-8") as f:
-            exact = float(json.load(f)["expectation"])
-
-    print(
-        f"nqubits={args.nqubits} nlayers={args.nlayers} bond={args.bond} "
-        f"circuit={args.circuit_kind} observable={args.observable_kind} "
-        f"tensor_module={args.tensor_module} torch_threads={args.torch_threads}"
-    )
-    if exact is not None:
-        print(f"exact={exact:.16e}")
-    print("backend value abs_error seconds")
-
-    if not args.skip_qmatchatea:
-        qmt = QMatchaTeaBackend()
-        qmt.configure_tn_simulation(
-            ansatz="MPS",
-            max_bond_dimension=args.bond,
-            cut_ratio=1e-12,
-            svd_control="E!",
-            tensor_module=args.tensor_module,
-            compile_circuit=True,
-            track_memory=False,
-        )
-        value, seconds = run_backend(qmt, circuit, observable)
-        error = float("nan") if exact is None else abs(value - exact)
-        print(f"qmatchatea {value:.16e} {error:.6e} {seconds:.3f}")
-
-    vidal = VidalBackend()
-    vidal.configure_tn_simulation(
-        ansatz="MPS",
-        max_bond_dimension=args.bond,
-        cut_ratio=1e-12,
-        tensor_module=args.tensor_module,
-        compile_circuit=True,
-        fallback=True,
-    )
-    value, seconds = run_backend(vidal, circuit, observable)
-    error = float("nan") if exact is None else abs(value - exact)
-    print(f"vidal {value:.16e} {error:.6e} {seconds:.3f}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tools/example_tn_case.py b/tools/example_tn_case.py
deleted file mode 100644
index c35f057..0000000
--- a/tools/example_tn_case.py
+++ /dev/null
@@ -1,33 +0,0 @@
-"""Example custom case for tools/run_tn_custom.py."""
-
-from __future__ import annotations
-
-import math
-
-import numpy as np
-from qibo import Circuit, gates
-
-
-def build_circuit(nqubits, nlayers, seed):
-    rng = np.random.default_rng(seed)
-    circuit = Circuit(nqubits)
-    for layer in range(nlayers):
-        for qubit in range(nqubits):
-            circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi)))
-            circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi)))
-        for qubit in range(layer % 2, nqubits - 1, 2):
-            circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.7, 0.7)))
-            circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.7, 0.7)))
-    return circuit
-
-
-def build_observable(nqubits, seed):
-    return {
-        "terms": [
-            {
-                "coefficient": 1.0 / max(1, nqubits - 1),
-                "operators": [("Z", site), ("Z", site + 1)],
-            }
-            for site in range(nqubits - 1)
-        ]
-    }
diff --git a/tools/inspect_contraction_tree.py b/tools/inspect_contraction_tree.py
deleted file mode 100644
index a6422ba..0000000
--- a/tools/inspect_contraction_tree.py
+++ /dev/null
@@ -1,208 +0,0 @@
-"""Inspect cotengra contraction trees for dominant torch matmul shapes."""
-
-from __future__ import annotations
-
-import argparse
-import importlib
-import math
-import pickle
-from collections import Counter, defaultdict
-from pathlib import Path
-
-
-def _prod(values):
-    out = 1
-    for value in values:
-        out *= int(value)
-    return out
-
-
-def _broadcast_batch(a_batch, b_batch):
-    if a_batch == b_batch:
-        return _prod(a_batch)
-    if not a_batch:
-        return _prod(b_batch)
-    if not b_batch:
-        return _prod(a_batch)
-
-    ndim = max(len(a_batch), len(b_batch))
-    a_batch = (1,) * (ndim - len(a_batch)) + tuple(a_batch)
-    b_batch = (1,) * (ndim - len(b_batch)) + tuple(b_batch)
-    return _prod(max(a, b) for a, b in zip(a_batch, b_batch))
-
-
-def _load_tree(path, index):
-    with Path(path).open("rb") as f:
-        payload = pickle.load(f)
-    trees = payload["trees"] if isinstance(payload, dict) else payload
-    if not isinstance(trees, (list, tuple)):
-        trees = [trees]
-    return trees[index]
-
-
-def _analyze_tree(tree):
-    contract_mod = importlib.import_module("cotengra.contract")
-    contractions = contract_mod.extract_contractions(tree)
-    size_dict = tree.size_dict
-    ops = []
-    counts = Counter()
-
-    for op_index, (parent, left, right, tdot, arg, perm) in enumerate(contractions):
-        if left is None and right is None:
-            counts["preprocess"] += 1
-            continue
-
-        left_inds = tree.get_inds(left)
-        right_inds = tree.get_inds(right)
-        parent_inds = tree.get_inds(parent)
-        left_shape = tuple(size_dict[ix] for ix in left_inds)
-        right_shape = tuple(size_dict[ix] for ix in right_inds)
-
-        if tdot:
-            parsed = contract_mod._parse_tensordot_axes_to_matmul(
-                arg,
-                left_shape,
-                right_shape,
-            )
-        else:
-            parsed = contract_mod._parse_eq_to_batch_matmul(
-                arg,
-                left_shape,
-                right_shape,
-            )
-
-        (
-            _eq_a,
-            _eq_b,
-            new_shape_a,
-            new_shape_b,
-            _new_shape_ab,
-            _perm_ab,
-            pure_multiplication,
-        ) = parsed
-
-        matmul_shape = None
-        matmul_flops = 0
-        if pure_multiplication:
-            kind = "mul"
-        else:
-            a_shape = tuple(new_shape_a or left_shape)
-            b_shape = tuple(new_shape_b or right_shape)
-            batch = _broadcast_batch(a_shape[:-2], b_shape[:-2])
-            m, k, n = int(a_shape[-2]), int(a_shape[-1]), int(b_shape[-1])
-            kind = "mm" if batch == 1 else "bmm"
-            matmul_shape = (batch, m, k, n)
-            matmul_flops = batch * m * k * n
-
-        tree_flops = int(tree.get_flops(parent))
-        out_size = int(tree.get_size(parent))
-        ops.append(
-            {
-                "index": op_index,
-                "kind": kind,
-                "matmul_shape": matmul_shape,
-                "matmul_flops": matmul_flops,
-                "tree_flops": tree_flops,
-                "out_size": out_size,
-                "left_shape": left_shape,
-                "right_shape": right_shape,
-                "left_rank": len(left_inds),
-                "right_rank": len(right_inds),
-                "out_rank": len(parent_inds),
-                "perm": perm,
-            }
-        )
-        counts[kind] += 1
-
-    return contractions, ops, counts
-
-
-def _format_log(value, base):
-    return "-inf" if value <= 0 else f"{math.log(value, base):.3f}"
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("tree", help="Pickle file containing one tree or {'trees': [...]}.")
-    parser.add_argument("--index", type=int, default=0, help="Tree index in the file.")
-    parser.add_argument("--top", type=int, default=20, help="Number of top ops to print.")
-    parser.add_argument(
-        "--dtype-bytes",
-        type=int,
-        default=8,
-        help="Bytes per element for memory estimates, for example 8 for complex64.",
-    )
-    args = parser.parse_args()
-
-    tree = _load_tree(args.tree, args.index)
-    contractions, ops, counts = _analyze_tree(tree)
-    nslices = int(getattr(tree, "multiplicity", 1))
-    per_slice_flops = sum(op["tree_flops"] for op in ops)
-    per_slice_write = sum(op["out_size"] for op in ops)
-    max_out = max((op["out_size"] for op in ops), default=0)
-    all_flops = per_slice_flops * nslices
-    all_write = per_slice_write * nslices
-
-    print(f"tree={args.tree} index={args.index}")
-    print(
-        "summary "
-        f"slices={nslices} contractions={len(contractions)} "
-        f"counts={dict(counts)}"
-    )
-    print(
-        "per_slice "
-        f"log10_flops={_format_log(per_slice_flops, 10)} "
-        f"log10_write={_format_log(per_slice_write, 10)} "
-        f"log2_max_output={_format_log(max_out, 2)} "
-        f"max_output_gib={max_out * args.dtype_bytes / 1024**3:.6g}"
-    )
-    print(
-        "all_slices "
-        f"log10_flops={_format_log(all_flops, 10)} "
-        f"log10_write={_format_log(all_write, 10)}"
-    )
-
-    print(f"\ntop_{args.top}_ops_by_flops")
-    for op in sorted(ops, key=lambda item: item["tree_flops"], reverse=True)[: args.top]:
-        print(
-            f"op={op['index']} kind={op['kind']} "
-            f"flops={op['tree_flops']:.6e} out={op['out_size']:.6e} "
-            f"matmul={op['matmul_shape']} "
-            f"ranks=({op['left_rank']},{op['right_rank']}->{op['out_rank']}) "
-            f"lhs={op['left_shape']} rhs={op['right_shape']}"
-        )
-
-    by_shape = defaultdict(lambda: [0, 0, 0])
-    for op in ops:
-        shape = op["matmul_shape"]
-        if shape is None:
-            continue
-        by_shape[shape][0] += 1
-        by_shape[shape][1] += op["tree_flops"]
-        by_shape[shape][2] += op["out_size"]
-
-    print(f"\ntop_{args.top}_matmul_shapes_by_flops")
-    for shape, (count, flops, out_size) in sorted(
-        by_shape.items(),
-        key=lambda item: item[1][1],
-        reverse=True,
-    )[: args.top]:
-        print(
-            f"shape={shape} count={count} "
-            f"flops={flops:.6e} output={out_size:.6e}"
-        )
-
-    print(f"\ntop_{args.top}_matmul_shapes_by_count")
-    for shape, (count, flops, out_size) in sorted(
-        by_shape.items(),
-        key=lambda item: item[1][0],
-        reverse=True,
-    )[: args.top]:
-        print(
-            f"shape={shape} count={count} "
-            f"flops={flops:.6e} output={out_size:.6e}"
-        )
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tools/manage_tn_dask_cluster.sh b/tools/manage_tn_dask_cluster.sh
deleted file mode 100755
index 20c4e01..0000000
--- a/tools/manage_tn_dask_cluster.sh
+++ /dev/null
@@ -1,223 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-# Manage the dask cluster used by TN path search.
-#
-# Defaults target two servers:
-#   scheduler: 10.20.1.103:8786
-#   workers:   10.20.1.103, 10.20.6.101
-#
-# Usage:
-#   tools/manage_tn_dask_cluster.sh start
-#   tools/manage_tn_dask_cluster.sh status
-#   tools/manage_tn_dask_cluster.sh stop
-#
-# Common overrides:
-#   SCHEDULER_HOST=10.20.1.103
-#   WORKER_HOSTS="10.20.1.103 10.20.6.101"
-#   NWORKERS=48
-#   NTHREADS=1
-#   ROOT_DIR=/home/qibo/qibotn
-#   PYTHON_BIN=.venv/bin/python
-
-ROOT_DIR="${ROOT_DIR:-/home/qibo/qibotn}"
-PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
-SCHEDULER_HOST="${SCHEDULER_HOST:-10.20.1.103}"
-SCHEDULER_PORT="${SCHEDULER_PORT:-8786}"
-DASHBOARD_ADDRESS="${DASHBOARD_ADDRESS:-:8787}"
-WORKER_HOSTS="${WORKER_HOSTS:-10.20.1.103 10.20.6.101}"
-NWORKERS="${NWORKERS:-84}"
-NTHREADS="${NTHREADS:-1}"
-MEMORY_LIMIT="${MEMORY_LIMIT:-0}"
-LOCAL_DIRECTORY="${LOCAL_DIRECTORY:-/tmp/qibotn-dask}"
-LOG_DIR="${LOG_DIR:-$ROOT_DIR/logs/dask}"
-SSH_BIN="${SSH_BIN:-ssh}"
-DASK_WORKER_TTL="${DASK_WORKER_TTL:-24 hours}"
-DASK_TICK_LIMIT="${DASK_TICK_LIMIT:-30 minutes}"
-DASK_LOST_WORKER_TIMEOUT="${DASK_LOST_WORKER_TIMEOUT:-30 minutes}"
-
-SCHEDULER_ADDR="tcp://${SCHEDULER_HOST}:${SCHEDULER_PORT}"
-
-is_local_host() {
-  local host="$1"
-  [[ "$host" == "localhost" || "$host" == "127.0.0.1" ]] && return 0
-  [[ "$host" == "$(hostname)" ]] && return 0
-  [[ "$host" == "$(hostname -f 2>/dev/null || true)" ]] && return 0
-  hostname -I 2>/dev/null | tr ' ' '\n' | grep -qx "$host"
-}
-
-run_on_host() {
-  local host="$1"
-  shift
-  local cmd="$*"
-  if is_local_host "$host"; then
-    bash -lc "$cmd"
-  else
-    "$SSH_BIN" "$host" "bash -lc $(printf '%q' "$cmd")"
-  fi
-}
-
-start_scheduler() {
-  local host="$SCHEDULER_HOST"
-  local log="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.log"
-  local pid_file="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.pid"
-  run_on_host "$host" "
-    set -euo pipefail
-    cd '$ROOT_DIR'
-    mkdir -p '$LOG_DIR'
-    if [[ -s '$pid_file' ]]; then
-      pid=\$(cat '$pid_file')
-      if kill -0 \"\$pid\" 2>/dev/null; then
-        echo \"scheduler already running on $host pid=\$pid\"
-        exit 0
-      fi
-    fi
-    DASK_DISTRIBUTED__SCHEDULER__WORKER_TTL='$DASK_WORKER_TTL' \
-    DASK_DISTRIBUTED__ADMIN__TICK__LIMIT='$DASK_TICK_LIMIT' \
-    DASK_DISTRIBUTED__DEPLOY__LOST_WORKER_TIMEOUT='$DASK_LOST_WORKER_TIMEOUT' \
-    setsid '$PYTHON_BIN' -m distributed.cli.dask_scheduler \
-      --host '$SCHEDULER_HOST' \
-      --port '$SCHEDULER_PORT' \
-      --dashboard-address '$DASHBOARD_ADDRESS' \
-      > '$log' 2>&1 < /dev/null &
-    pid=\$!
-    echo \"\$pid\" > '$pid_file'
-    echo \"scheduler host=$host pid=\$pid addr=$SCHEDULER_ADDR log=$log\"
-  "
-}
-
-start_worker() {
-  local host="$1"
-  local log="$LOG_DIR/worker_${host}.log"
-  local pid_file="$LOG_DIR/worker_${host}.pid"
-  run_on_host "$host" "
-    set -euo pipefail
-    cd '$ROOT_DIR'
-    mkdir -p '$LOG_DIR' '$LOCAL_DIRECTORY'
-    if [[ -s '$pid_file' ]]; then
-      pid=\$(cat '$pid_file')
-      if kill -0 \"\$pid\" 2>/dev/null; then
-        echo \"worker already running on $host pid=\$pid\"
-        exit 0
-      fi
-    fi
-    TCM_ENABLE=1 \
-    DASK_DISTRIBUTED__SCHEDULER__WORKER_TTL='$DASK_WORKER_TTL' \
-    DASK_DISTRIBUTED__ADMIN__TICK__LIMIT='$DASK_TICK_LIMIT' \
-    DASK_DISTRIBUTED__DEPLOY__LOST_WORKER_TIMEOUT='$DASK_LOST_WORKER_TIMEOUT' \
-    setsid '$PYTHON_BIN' -m distributed.cli.dask_worker \
-      '$SCHEDULER_ADDR' \
-      --host '$host' \
-      --nworkers '$NWORKERS' \
-      --nthreads '$NTHREADS' \
-      --memory-limit '$MEMORY_LIMIT' \
-      --local-directory '$LOCAL_DIRECTORY' \
-      > '$log' 2>&1 < /dev/null &
-    pid=\$!
-    echo \"\$pid\" > '$pid_file'
-    echo \"worker host=$host pid=\$pid scheduler=$SCHEDULER_ADDR log=$log\"
-  "
-}
-
-stop_host() {
-  local host="$1"
-  local scheduler_pid_file="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.pid"
-  local worker_pid_file="$LOG_DIR/worker_${host}.pid"
-  run_on_host "$host" "
-    set +e
-    for pid_file in '$worker_pid_file' '$scheduler_pid_file'; do
-      [[ -f \"\$pid_file\" ]] || continue
-      if [[ \"\$pid_file\" == '$scheduler_pid_file' && '$host' != '$SCHEDULER_HOST' ]]; then
-        continue
-      fi
-      pid=\$(cat \"\$pid_file\")
-      kill \"\$pid\" 2>/dev/null || true
-      rm -f \"\$pid_file\"
-    done
-    pkill -f '[d]istributed.cli.dask_worker.*$SCHEDULER_ADDR'
-    pkill -f '[d]istributed.cli.dask_scheduler.*--port $SCHEDULER_PORT'
-    true
-  "
-}
-
-status_host() {
-  local host="$1"
-  local scheduler_pid_file="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.pid"
-  local worker_pid_file="$LOG_DIR/worker_${host}.pid"
-  echo "--------------------------------------------------------------------------------"
-  echo "host=$host"
-  run_on_host "$host" "
-    set +e
-    for pid_file in '$worker_pid_file' '$scheduler_pid_file'; do
-      [[ -f \"\$pid_file\" ]] || continue
-      if [[ \"\$pid_file\" == '$scheduler_pid_file' && '$host' != '$SCHEDULER_HOST' ]]; then
-        continue
-      fi
-      pid=\$(cat \"\$pid_file\")
-      if kill -0 \"\$pid\" 2>/dev/null; then
-        ps -p \"\$pid\" -o pid,ppid,stat,etime,cmd --no-headers
-      else
-        echo \"stale pid_file=\$pid_file pid=\$pid\"
-      fi
-    done
-    pgrep -af '[d]istributed.cli.dask' || true
-  "
-}
-
-case "${1:-help}" in
-  start)
-    start_scheduler
-    sleep 2
-    for host in $WORKER_HOSTS; do
-      start_worker "$host"
-    done
-    echo
-    echo "Dask scheduler: $SCHEDULER_ADDR"
-    echo "Dashboard: http://$SCHEDULER_HOST$DASHBOARD_ADDRESS"
-    ;;
-  stop)
-    for host in $WORKER_HOSTS; do
-      stop_host "$host"
-    done
-    stop_host "$SCHEDULER_HOST"
-    ;;
-  status)
-    status_host "$SCHEDULER_HOST"
-    for host in $WORKER_HOSTS; do
-      [[ "$host" == "$SCHEDULER_HOST" ]] && continue
-      status_host "$host"
-    done
-    ;;
-  restart)
-    "$0" stop
-    sleep 2
-    "$0" start
-    ;;
-  help|*)
-    cat <<EOF
-Usage: tools/manage_tn_dask_cluster.sh [start|stop|restart|status]
-
-Defaults:
-  SCHEDULER_HOST=$SCHEDULER_HOST
-  SCHEDULER_PORT=$SCHEDULER_PORT
-  WORKER_HOSTS="$WORKER_HOSTS"
-  NWORKERS=$NWORKERS
-  NTHREADS=$NTHREADS
-  ROOT_DIR=$ROOT_DIR
-  PYTHON_BIN=$PYTHON_BIN
-  DASK_WORKER_TTL="$DASK_WORKER_TTL"
-  DASK_TICK_LIMIT=$DASK_TICK_LIMIT
-  DASK_LOST_WORKER_TIMEOUT=$DASK_LOST_WORKER_TIMEOUT
-
-Search command after start:
-  TCM_ENABLE=1 python -u tools/tn_contest_runner.py search \\
-    --case main1 \\
-    --dask-address $SCHEDULER_ADDR \\
-    --torch-threads 48 \\
-    --dtype complex64 \\
-    --tn-search-repeats 2048 \\
-    --tn-search-time 300
-EOF
-    exit 2
-    ;;
-esac
diff --git a/tools/mpi_torch_thread_probe.py b/tools/mpi_torch_thread_probe.py
deleted file mode 100644
index 7b02104..0000000
--- a/tools/mpi_torch_thread_probe.py
+++ /dev/null
@@ -1,182 +0,0 @@
-#!/usr/bin/env python
-"""Probe MPI rank placement and whether torch CPU ops use multiple threads.
-
-Run this under mpirun/mpiexec to check:
-
-* which CPUs each rank is allowed to run on,
-* whether torch sees the requested intra-op thread count, and
-* whether a large CPU tensor op actually consumes more CPU time than wall time.
-
-The script is intentionally small and self-contained so it can be used to debug
-MPI launcher affinity and torch OpenMP behavior independently from the TN code
-path.
-"""
-
-from __future__ import annotations
-
-import argparse
-import os
-import socket
-import time
-from pathlib import Path
-
-from mpi4py import MPI
-
-
-def _dtype_from_name(name):
-    import torch
-
-    mapping = {
-        "float32": torch.float32,
-        "float64": torch.float64,
-        "complex64": torch.complex64,
-        "complex128": torch.complex128,
-    }
-    return mapping[name]
-
-
-def _make_tensor(shape, dtype):
-    import torch
-
-    if dtype in (torch.complex64, torch.complex128):
-        base = torch.float32 if dtype == torch.complex64 else torch.float64
-        return torch.complex(
-            torch.randn(shape, dtype=base),
-            torch.randn(shape, dtype=base),
-        )
-    return torch.randn(shape, dtype=dtype)
-
-
-def _bench(label, fn, iters, warmup=2):
-    for _ in range(warmup):
-        fn()
-
-    start_wall = time.perf_counter()
-    start_cpu = time.process_time()
-    checksum = 0.0
-    for _ in range(iters):
-        value = fn()
-        checksum += float(value)
-    wall = time.perf_counter() - start_wall
-    cpu = time.process_time() - start_cpu
-    ratio = cpu / wall if wall > 0 else float("inf")
-    print(
-        f"{label} wall={wall:.3f}s cpu={cpu:.3f}s cpu_over_wall={ratio:.2f} "
-        f"checksum={checksum:.6e}",
-        flush=True,
-    )
-
-
-def _visible_numa_nodes():
-    nodes = []
-    for path in sorted(Path("/sys/devices/system/node").glob("node[0-9]*")):
-        cpulist = path / "cpulist"
-        if cpulist.exists():
-            nodes.append(f"{path.name}:{cpulist.read_text(encoding='utf-8').strip()}")
-    return ",".join(nodes) if nodes else "unknown"
-
-
-def _dtype_nbytes(name):
-    return {
-        "float32": 4,
-        "float64": 8,
-        "complex64": 8,
-        "complex128": 16,
-    }[name]
-
-
-def _format_gib(nbytes):
-    return f"{nbytes / (1024 ** 3):.2f}GiB"
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--threads", type=int, default=48)
-    parser.add_argument("--n", type=int, default=4096)
-    parser.add_argument("--iters", type=int, default=4)
-    parser.add_argument("--dtype", choices=("float32", "float64", "complex64", "complex128"), default="float32")
-    parser.add_argument("--op", choices=("matmul", "tensordot", "both"), default="both")
-    parser.add_argument(
-        "--affinity-only",
-        action="store_true",
-        help="Print MPI/torch placement diagnostics without allocating tensors.",
-    )
-    args = parser.parse_args()
-
-    os.environ.setdefault("OMP_NUM_THREADS", str(args.threads))
-    os.environ.setdefault("MKL_NUM_THREADS", str(args.threads))
-    os.environ.setdefault("OMP_PROC_BIND", "close")
-    os.environ.setdefault("OMP_PLACES", "cores")
-
-    import torch
-
-    comm = MPI.COMM_WORLD
-    rank = comm.Get_rank()
-    size = comm.Get_size()
-
-    torch.set_num_threads(args.threads)
-    try:
-        torch.set_num_interop_threads(1)
-    except Exception:
-        pass
-
-    dtype = _dtype_from_name(args.dtype)
-    affinity = sorted(os.sched_getaffinity(0))
-    allowed_list = ""
-    try:
-        with open("/proc/self/status", encoding="utf-8") as f:
-            for line in f:
-                if line.startswith("Cpus_allowed_list:"):
-                    allowed_list = line.split(":", 1)[1].strip()
-                    break
-    except OSError:
-        pass
-
-    print(
-        f"rank={rank}/{size} host={socket.gethostname()} pid={os.getpid()} "
-        f"affinity_len={len(affinity)} allowed={allowed_list} "
-        f"torch_threads={torch.get_num_threads()} "
-        f"torch_interop={torch.get_num_interop_threads()} "
-        f"OMP_NUM_THREADS={os.environ.get('OMP_NUM_THREADS')} "
-        f"MKL_NUM_THREADS={os.environ.get('MKL_NUM_THREADS')} "
-        f"OMP_PROC_BIND={os.environ.get('OMP_PROC_BIND')} "
-        f"OMP_PLACES={os.environ.get('OMP_PLACES')} "
-        f"visible_numa={_visible_numa_nodes()}",
-        flush=True,
-    )
-
-    if rank == 0:
-        print(torch.__config__.parallel_info(), flush=True)
-        input_bytes = args.n * args.n * _dtype_nbytes(args.dtype)
-        min_live_bytes = 3 * input_bytes
-        print(
-            f"matrix_n={args.n} dtype={args.dtype} "
-            f"one_matrix={_format_gib(input_bytes)} "
-            f"approx_min_live_per_rank={_format_gib(min_live_bytes)} "
-            f"approx_min_live_all_ranks={_format_gib(min_live_bytes * size)}",
-            flush=True,
-        )
-    comm.Barrier()
-    if args.affinity_only:
-        return
-
-    a = _make_tensor((args.n, args.n), dtype)
-    b = _make_tensor((args.n, args.n), dtype)
-
-    def run_matmul():
-        value = (a @ b).sum()
-        return value.real.item() if value.is_complex() else value.item()
-
-    def run_tensordot():
-        value = torch.tensordot(a, b, dims=1)
-        value = value.sum()
-        return value.real.item() if value.is_complex() else value.item()
-
-    if args.op in ("matmul", "both"):
-        _bench("matmul", run_matmul, args.iters)
-    if args.op in ("tensordot", "both"):
-        _bench("tensordot", run_tensordot, args.iters)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tools/mps_contest_runner.py b/tools/mps_contest_runner.py
deleted file mode 100644
index 353cc3e..0000000
--- a/tools/mps_contest_runner.py
+++ /dev/null
@@ -1,313 +0,0 @@
-#!/usr/bin/env python
-"""Contest-style multi-node Vidal/MPS expectation runner."""
-
-from __future__ import annotations
-
-import argparse
-import math
-import sys
-import time
-from dataclasses import dataclass
-from pathlib import Path
-
-import numpy as np
-from mpi4py import MPI
-from qibo import Circuit, gates, hamiltonians
-from qibo.symbols import X, Y, Z
-
-ROOT = Path(__file__).resolve().parents[1]
-SRC = ROOT / "src"
-if str(SRC) not in sys.path:
-    sys.path.insert(0, str(SRC))
-
-from qibotn.backends.vidal import VidalBackend  # noqa: E402
-from qibotn.expectation_runner import exact_for_observable  # noqa: E402
-
-
-@dataclass(frozen=True)
-class CaseSpec:
-    circuit_kind: str
-    observables: tuple[str, ...]
-    nqubits: int
-    nlayers: int
-    bond: int | None
-    seed: int
-
-
-CASES = {
-    "main1": CaseSpec(
-        circuit_kind="reversed_cnot",
-        observables=("ring_xz",),
-        nqubits=128,
-        nlayers=24,
-        bond=512,
-        seed=31001,
-    ),
-    "main2": CaseSpec(
-        circuit_kind="rxx_rzz",
-        observables=("open_zz", "range2_xx", "mixed_local"),
-        nqubits=128,
-        nlayers=32,
-        bond=1024,
-        seed=31002,
-    ),
-    "strong": CaseSpec(
-        circuit_kind="scramble",
-        observables=("ring_xz", "long_z_string", "dense3_spread"),
-        nqubits=256,
-        nlayers=48,
-        bond=2048,
-        seed=41001,
-    ),
-}
-
-
-def optional_int(text):
-    if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
-        return None
-    return int(text)
-
-
-def optional_float(text):
-    if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
-        return None
-    return float(text)
-
-
-def format_optional(value, fmt="g"):
-    return "None" if value is None else format(value, fmt)
-
-
-def set_torch_threads(nthreads):
-    try:
-        import torch
-
-        torch.set_num_threads(nthreads)
-    except Exception:
-        pass
-
-
-def add_single_qubit_layer(circuit, nqubits, rng, include_rx=False):
-    for qubit in range(nqubits):
-        circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi)))
-        circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi)))
-        if include_rx:
-            circuit.add(gates.RX(qubit, theta=rng.uniform(-math.pi, math.pi)))
-
-
-def build_circuit(kind, nqubits, nlayers, seed):
-    rng = np.random.default_rng(seed)
-    circuit = Circuit(nqubits)
-
-    for layer in range(nlayers):
-        if kind == "reversed_cnot":
-            add_single_qubit_layer(circuit, nqubits, rng)
-            for qubit in range(0, nqubits - 1, 2):
-                gate = gates.CNOT(qubit + 1, qubit) if layer % 2 else gates.CNOT(qubit, qubit + 1)
-                circuit.add(gate)
-            for qubit in range(1, nqubits - 1, 2):
-                gate = gates.CNOT(qubit + 1, qubit) if layer % 2 == 0 else gates.CNOT(qubit, qubit + 1)
-                circuit.add(gate)
-
-        elif kind == "rxx_rzz":
-            add_single_qubit_layer(circuit, nqubits, rng, include_rx=True)
-            for qubit in range(layer % 2, nqubits - 1, 2):
-                circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.9, 0.9)))
-                circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.9, 0.9)))
-
-        elif kind == "scramble":
-            add_single_qubit_layer(circuit, nqubits, rng, include_rx=True)
-            for qubit in range(layer % 2, nqubits - 1, 2):
-                circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.8, 0.8)))
-                circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.8, 0.8)))
-                if layer % 5 == 4:
-                    circuit.add(gates.SWAP(qubit, qubit + 1))
-
-        else:
-            raise ValueError(f"Unknown circuit kind {kind!r}.")
-
-    return circuit
-
-
-def dense_observable(nqubits, qubits, seed, dim):
-    del nqubits
-    rng = np.random.default_rng(seed)
-    raw = rng.normal(size=(dim, dim)) + 1j * rng.normal(size=(dim, dim))
-    matrix = (raw + raw.conj().T) / 2.0
-    matrix = matrix / np.linalg.norm(matrix)
-    return {"matrix": matrix, "qubits": list(qubits)}
-
-
-def observable(kind, nqubits, seed):
-    q1 = nqubits // 4
-    q2 = nqubits // 2
-    q3 = (3 * nqubits) // 4
-    last = nqubits - 1
-
-    if kind == "boundary_ZZ_q1":
-        return hamiltonians.SymbolicHamiltonian(form=Z(q1 - 1) * Z(q1))
-    if kind == "boundary_ZZ_q2":
-        return hamiltonians.SymbolicHamiltonian(form=Z(q2 - 1) * Z(q2))
-    if kind == "boundary_ZZ_q3":
-        return hamiltonians.SymbolicHamiltonian(form=Z(q3 - 1) * Z(q3))
-    if kind == "long_Z_5_sites":
-        return hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(q1) * Z(q2) * Z(q3) * Z(last))
-    if kind == "mixed_XZYZX":
-        return hamiltonians.SymbolicHamiltonian(form=X(0) * Z(q1) * Y(q2) * Z(q3) * X(last))
-    if kind == "ring_xz":
-        form = 0
-        for qubit in range(nqubits):
-            form += 0.5 * X(qubit) * Z((qubit + 1) % nqubits)
-        return hamiltonians.SymbolicHamiltonian(form=form)
-    if kind == "open_zz":
-        form = 0
-        for qubit in range(nqubits - 1):
-            form += (1.0 / max(1, nqubits - 1)) * Z(qubit) * Z(qubit + 1)
-        return hamiltonians.SymbolicHamiltonian(form=form)
-    if kind == "range2_xx":
-        form = 0
-        for qubit in range(nqubits - 2):
-            form += (1.0 / max(1, nqubits - 2)) * X(qubit) * X(qubit + 2)
-        return hamiltonians.SymbolicHamiltonian(form=form)
-    if kind == "mixed_local":
-        form = 0.25 * X(0) - 0.5 * Z(last) + 0.125 * X(q1) * Z(q2) * Y(q3)
-        return hamiltonians.SymbolicHamiltonian(form=form)
-    if kind == "complex_iZ0":
-        return hamiltonians.SymbolicHamiltonian(form=1.0j * Z(0))
-    if kind == "dense2_mid":
-        return dense_observable(nqubits, (q2 - 1, q2), seed + 101, 4)
-    if kind == "dense3_spread":
-        return dense_observable(nqubits, (q1, q2, q3), seed + 202, 8)
-    raise ValueError(f"Unknown observable kind {kind!r}.")
-
-
-def selected_observables(args, case):
-    if args.observables:
-        return tuple(args.observables)
-    if args.obs_filter:
-        return tuple(x.strip() for x in args.obs_filter.split(",") if x.strip())
-    return case.observables
-
-
-def apply_case_defaults(args):
-    case = CASES[args.case]
-    if args.nqubits is None:
-        args.nqubits = case.nqubits
-    if args.nlayers is None:
-        args.nlayers = case.nlayers
-    if args.bond == "case-default":
-        args.bond = case.bond
-    if args.seed is None:
-        args.seed = case.seed
-    args.observables = selected_observables(args, case)
-
-
-def run_case(args):
-    set_torch_threads(args.torch_threads)
-    comm = MPI.COMM_WORLD
-    rank = comm.Get_rank()
-    size = comm.Get_size()
-
-    case = CASES[args.case]
-    circuit = build_circuit(case.circuit_kind, args.nqubits, args.nlayers, args.seed)
-
-    if rank == 0:
-        print("=" * 88, flush=True)
-        print(
-            "backend=vidal_mps "
-            f"case={args.case} circuit={case.circuit_kind} ranks={size} "
-            f"nqubits={args.nqubits} nlayers={args.nlayers} gates={len(circuit.queue)} "
-            f"bond={format_optional(args.bond)} cut_ratio={format_optional(args.cut_ratio)} "
-            f"torch_threads={args.torch_threads} seed={args.seed} "
-            f"observables={','.join(args.observables)}",
-            flush=True,
-        )
-        print("observable exact value abs_error rel_error seconds trunc_sum trunc_max status", flush=True)
-
-    for obs_name in args.observables:
-        obs = observable(obs_name, args.nqubits, args.seed)
-        exact = None
-        if args.exact and rank == 0:
-            if args.nqubits > args.exact_max_qubits:
-                raise ValueError(
-                    f"--exact is limited to {args.exact_max_qubits} qubits by default."
-                )
-            exact = exact_for_observable(circuit, obs, args.nqubits)
-
-        backend = VidalBackend()
-        backend.configure_tn_simulation(
-            max_bond_dimension=args.bond,
-            cut_ratio=args.cut_ratio,
-            tensor_module="torch",
-            mpi_approach="CT",
-            mpi_num_procs=size,
-            fallback=False,
-        )
-
-        comm.Barrier()
-        start = time.perf_counter()
-        try:
-            value = backend.expectation(
-                circuit,
-                obs,
-                preprocess=True,
-                compile_circuit=False,
-            )
-            status = "ok"
-        except Exception as exc:
-            value = np.nan
-            status = type(exc).__name__ + ":" + str(exc).split("\n", 1)[0]
-        seconds = time.perf_counter() - start
-
-        if rank == 0:
-            abs_error = float("nan") if exact is None else abs(value - exact)
-            rel_error = float("nan") if exact is None else abs_error / max(abs(exact), 1e-15)
-            exact_text = "nan" if exact is None else f"{exact:.16e}"
-            print(
-                f"{obs_name} {exact_text} {value!r} "
-                f"{abs_error:.6e} {rel_error:.6e} {seconds:.3f} "
-                f"{backend.last_truncation_error:.6e} "
-                f"{backend.last_max_truncation_error:.6e} {status}",
-                flush=True,
-            )
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("mode", choices=("run", "validate", "list"))
-    parser.add_argument("--case", choices=sorted(CASES), default="main1")
-    parser.add_argument("--observables", nargs="+")
-    parser.add_argument("--obs-filter", default="")
-    parser.add_argument("--nqubits", type=int)
-    parser.add_argument("--nlayers", type=int)
-    parser.add_argument("--bond", "--bonds", dest="bond", default="case-default")
-    parser.add_argument("--cut-ratio", type=optional_float, default=1e-12)
-    parser.add_argument("--seed", type=int)
-    parser.add_argument("--torch-threads", type=int, default=8)
-    parser.add_argument("--exact", action="store_true")
-    parser.add_argument("--exact-max-qubits", type=int, default=24)
-    args = parser.parse_args()
-
-    if args.mode == "list":
-        for name, case in CASES.items():
-            print(
-                f"{name}: circuit={case.circuit_kind} "
-                f"observables={','.join(case.observables)} "
-                f"nqubits={case.nqubits} nlayers={case.nlayers} "
-                f"bond={case.bond} seed={case.seed}"
-            )
-        return
-
-    apply_case_defaults(args)
-    if isinstance(args.bond, str):
-        args.bond = optional_int(args.bond)
-
-    if args.mode == "validate":
-        args.exact = True
-        args.nqubits = min(args.nqubits, args.exact_max_qubits)
-
-    run_case(args)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tools/profile_vidal_chrome.py b/tools/profile_vidal_chrome.py
deleted file mode 100644
index bf22276..0000000
--- a/tools/profile_vidal_chrome.py
+++ /dev/null
@@ -1,72 +0,0 @@
-"""Chrome trace profiler for the VidalBackend fast path."""
-
-from __future__ import annotations
-
-import argparse
-from pathlib import Path
-
-import torch
-from torch.profiler import ProfilerActivity, profile
-
-from qibotn.benchmark_cases import build_circuit, terms_to_dict, observable_terms
-from qibotn.expectation_runner import ExpectationConfig, run_cpu_expectation
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--nqubits", type=int, default=34)
-    parser.add_argument("--nlayers", type=int, default=20)
-    parser.add_argument("--bond", type=int, default=512)
-    parser.add_argument("--seed", type=int, default=42)
-    parser.add_argument("--torch-threads", type=int, default=32)
-    parser.add_argument("--cut-ratio", type=float, default=1e-12)
-    parser.add_argument("--profile-memory", action="store_true")
-    parser.add_argument("--rows", type=int, default=60)
-    args = parser.parse_args()
-
-    torch.set_num_threads(args.torch_threads)
-
-    prefix = f"profiles/vidal_n{args.nqubits}_l{args.nlayers}_b{args.bond}_t{args.torch_threads}"
-    trace_path = Path(f"{prefix}.json")
-    table_path = Path(f"{prefix}.txt")
-    trace_path.parent.mkdir(parents=True, exist_ok=True)
-
-    circuit = build_circuit("brickwall_cnot", args.nqubits, args.nlayers, args.seed)
-    observable = terms_to_dict(observable_terms("ring_xz", args.nqubits))
-    config = ExpectationConfig(
-        ansatz="mps",
-        bond=args.bond,
-        cut_ratio=args.cut_ratio,
-        tensor_module="torch",
-        torch_threads=args.torch_threads,
-    )
-
-    print(
-        f"profile vidal nqubits={args.nqubits} nlayers={args.nlayers} "
-        f"bond={args.bond} threads={args.torch_threads}"
-    )
-
-    with profile(
-        activities=[ProfilerActivity.CPU],
-        record_shapes=args.profile_memory,
-        profile_memory=args.profile_memory,
-        with_stack=args.profile_memory,
-    ) as prof:
-        result = run_cpu_expectation(circuit, observable, config)
-
-    table = (
-        f"expval={result.value:.16e}\n\n"
-        f"# sorted by self_cpu_time_total\n"
-        f"{prof.key_averages().table(sort_by='self_cpu_time_total', row_limit=args.rows)}\n\n"
-        f"# sorted by cpu_time_total\n"
-        f"{prof.key_averages().table(sort_by='cpu_time_total', row_limit=args.rows)}\n"
-    )
-
-    print(table, end="")
-    table_path.write_text(table, encoding="utf-8")
-    prof.export_chrome_trace(str(trace_path))
-    print(f"trace={trace_path}\ntable={table_path}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tools/qibojit_reference_expectation.py b/tools/qibojit_reference_expectation.py
deleted file mode 100644
index 429855a..0000000
--- a/tools/qibojit_reference_expectation.py
+++ /dev/null
@@ -1,109 +0,0 @@
-"""Compute and cache a qibojit state-vector reference for the ring-XZ observable."""
-
-import argparse
-import json
-import math
-import time
-from pathlib import Path
-
-import numpy as np
-import qibo
-from qibo import Circuit, gates
-
-
-def build_circuit(nqubits, nlayers, seed):
-    rng = np.random.default_rng(seed)
-    circuit = Circuit(nqubits)
-    for _ in range(nlayers):
-        for qubit in range(nqubits):
-            circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi)))
-            circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi)))
-        for qubit in range(0, nqubits - 1, 2):
-            circuit.add(gates.CNOT(qubit, qubit + 1))
-        for qubit in range(1, nqubits - 1, 2):
-            circuit.add(gates.CNOT(qubit, qubit + 1))
-    return circuit
-
-
-def ring_xz_expectation(state, nqubits, chunk_size):
-    value = 0.0
-    for qubit in range(nqubits):
-        next_qubit = (qubit + 1) % nqubits
-        x_flip = 1 << (nqubits - 1 - qubit)
-        z_shift = nqubits - 1 - next_qubit
-        term = 0.0
-        for start in range(0, state.size, chunk_size):
-            stop = min(start + chunk_size, state.size)
-            indices = np.arange(start, stop, dtype=np.int64)
-            z_bit = (indices >> z_shift) & 1
-            z_phase = 1 - 2 * z_bit
-            term += np.vdot(state[indices ^ x_flip], z_phase * state[start:stop]).real
-        value += 0.5 * term
-    return float(value)
-
-
-def default_output_path(nqubits, nlayers, seed):
-    return Path("references") / (
-        f"qibojit_ring_xz_n{nqubits}_l{nlayers}_seed{seed}.json"
-    )
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--nqubits", type=int, default=32)
-    parser.add_argument("--nlayers", type=int, default=3)
-    parser.add_argument("--seed", type=int, default=42)
-    parser.add_argument("--output")
-    parser.add_argument("--force", action="store_true")
-    parser.add_argument("--allow-large", action="store_true")
-    parser.add_argument("--max-state-gb", type=float, default=32.0)
-    parser.add_argument("--chunk-size", type=int, default=1 << 20)
-    args = parser.parse_args()
-
-    output = Path(args.output) if args.output else default_output_path(
-        args.nqubits, args.nlayers, args.seed
-    )
-    if output.exists() and not args.force:
-        with open(output, "r", encoding="utf-8") as f:
-            data = json.load(f)
-        print(f"loaded {output}")
-        print(f"expectation={float(data['expectation']):.16e}")
-        return
-
-    state_gb = (2**args.nqubits) * np.dtype(np.complex128).itemsize / (1024**3)
-    if state_gb > args.max_state_gb and not args.allow_large:
-        raise MemoryError(
-            f"Estimated state vector alone is {state_gb:.1f} GiB. "
-            "Pass --allow-large after confirming the node has enough memory."
-        )
-
-    qibo.set_backend("qibojit")
-    circuit = build_circuit(args.nqubits, args.nlayers, args.seed)
-
-    start = time.perf_counter()
-    state = circuit().state(numpy=True).reshape(-1)
-    expectation = ring_xz_expectation(state, args.nqubits, args.chunk_size)
-    elapsed = time.perf_counter() - start
-
-    data = {
-        "backend": "qibojit",
-        "observable": "0.5 * sum_i X_i Z_((i+1) mod n)",
-        "nqubits": args.nqubits,
-        "nlayers": args.nlayers,
-        "seed": args.seed,
-        "expectation": expectation,
-        "seconds": elapsed,
-        "state_vector_gib_estimate": state_gb,
-    }
-    output.parent.mkdir(parents=True, exist_ok=True)
-    with open(output, "w", encoding="utf-8") as f:
-        json.dump(data, f, indent=2, sort_keys=True)
-        f.write("\n")
-
-    print(f"saved {output}")
-    print(f"expectation={expectation:.16e}")
-    print(f"seconds={elapsed:.3f}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tools/qibotn_torch_mt_env.sh b/tools/qibotn_torch_mt_env.sh
deleted file mode 100644
index 838cdef..0000000
--- a/tools/qibotn_torch_mt_env.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/usr/bin/env bash
-# Shared runtime setup for CPU torch TN/MPS runs.
-#
-# This makes AOCL BLIS use the multithreaded library when available, which is
-# required for complex64 tensordot/cgemm to actually use all cores on this host.
-
-QIBOTN_BLIS_MT="${QIBOTN_BLIS_MT:-/home/aocc/aocl/5.2.0/aocc/lib_LP64/libblis-mt.so.5}"
-
-export BLIS_NUM_THREADS="${BLIS_NUM_THREADS:-${OMP_NUM_THREADS:-1}}"
-
-if [[ -f "$QIBOTN_BLIS_MT" ]]; then
-  case ":${LD_PRELOAD:-}:" in
-    *":$QIBOTN_BLIS_MT:"*)
-      ;;
-    *)
-      export LD_PRELOAD="${LD_PRELOAD:+$LD_PRELOAD:}$QIBOTN_BLIS_MT"
-      ;;
-  esac
-fi
-
-export OMP_PROC_BIND="${OMP_PROC_BIND:-close}"
-export OMP_PLACES="${OMP_PLACES:-cores}"
diff --git a/tools/run_cpu_large_cases.sh b/tools/run_cpu_large_cases.sh
deleted file mode 100755
index 59be311..0000000
--- a/tools/run_cpu_large_cases.sh
+++ /dev/null
@@ -1,128 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-# Large CPU expectation benchmarks for two-server runs.
-#
-# Defaults assume two Intel Xeon Platinum 8558P servers with about 500 GiB RAM
-# each.  Override HOSTFILE, PYTHON_BIN, MPIEXEC, or the per-case knobs below as
-# needed.
-
-ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
-cd "$ROOT_DIR"
-
-PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
-MPIEXEC="${MPIEXEC:-mpiexec}"
-HOSTFILE="${HOSTFILE:-hostfile}"
-
-MPS_RANKS="${MPS_RANKS:-8}"
-MPS_THREADS="${MPS_THREADS:-12}"
-TN_RANKS="${TN_RANKS:-12}"
-TN_THREADS="${TN_THREADS:-8}"
-
-export OMP_NUM_THREADS="${OMP_NUM_THREADS:-1}"
-export MKL_NUM_THREADS="${MKL_NUM_THREADS:-1}"
-source "$ROOT_DIR/tools/qibotn_torch_mt_env.sh"
-
-run_mpi() {
-  local ranks="$1"
-  shift
-  "$MPIEXEC" -hostfile "$HOSTFILE" -n "$ranks" "$PYTHON_BIN" "$@"
-}
-
-run_case() {
-  local title="$1"
-  shift
-  echo
-  echo "================================================================================"
-  echo "$title"
-  echo "================================================================================"
-  echo "HOSTFILE=$HOSTFILE PYTHON_BIN=$PYTHON_BIN MPIEXEC=$MPIEXEC"
-  echo "OMP_NUM_THREADS=$OMP_NUM_THREADS MKL_NUM_THREADS=$MKL_NUM_THREADS"
-  echo "$*"
-  "$@"
-}
-
-case "${1:-help}" in
-  smoke)
-    run_case "MPS MPI smoke: n=40 layers=30 bond=2048" \
-      run_mpi "$MPS_RANKS" benchmark_cpu_expectation.py \
-        --mpi --mps \
-        --nqubits "${MPS_SMOKE_NQ:-40}" \
-        --nlayers "${MPS_SMOKE_LAYERS:-30}" \
-        --bond "${MPS_SMOKE_BOND:-2048}" \
-        --torch-threads "$MPS_THREADS" \
-        --circuits brickwall_cnot reversed_cnot shifted_cz \
-        --observables ring_xz open_zz range2_xx
-
-    run_case "TN MPI smoke: n=32 layers=16 target_slices=12" \
-      run_mpi "$TN_RANKS" benchmark_cpu_expectation.py \
-        --mpi \
-        --nqubits "${TN_SMOKE_NQ:-32}" \
-        --nlayers "${TN_SMOKE_LAYERS:-16}" \
-        --torch-threads "$TN_THREADS" \
-        --circuits brickwall_cnot shifted_cz rxx_rzz \
-        --observables ring_xz open_zz range2_xx \
-        --tn-target-slices "${TN_SMOKE_SLICES:-12}"
-    ;;
-
-  mps-long)
-    run_case "MPS MPI long: n=64 layers=48 bond=4096" \
-      run_mpi "$MPS_RANKS" benchmark_cpu_expectation.py \
-        --mpi --mps \
-        --nqubits "${MPS_LONG_NQ:-64}" \
-        --nlayers "${MPS_LONG_LAYERS:-48}" \
-        --bond "${MPS_LONG_BOND:-4096}" \
-        --torch-threads "$MPS_THREADS" \
-        --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \
-        --observables ring_xz open_zz mixed_local range2_xx
-    ;;
-
-  mps-pressure)
-    run_case "MPS MPI pressure: n=80 layers=64 bond=4096" \
-      run_mpi "$MPS_RANKS" benchmark_cpu_expectation.py \
-        --mpi --mps \
-        --nqubits "${MPS_PRESSURE_NQ:-80}" \
-        --nlayers "${MPS_PRESSURE_LAYERS:-64}" \
-        --bond "${MPS_PRESSURE_BOND:-4096}" \
-        --torch-threads "$MPS_THREADS" \
-        --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz swap_scramble \
-        --observables ring_xz open_zz mixed_local range2_xx long_z_string
-    ;;
-
-  tn-long)
-    run_case "TN MPI long: n=36 layers=20 target_slices=24" \
-      run_mpi "$TN_RANKS" benchmark_cpu_expectation.py \
-        --mpi \
-        --nqubits "${TN_LONG_NQ:-36}" \
-        --nlayers "${TN_LONG_LAYERS:-20}" \
-        --torch-threads "$TN_THREADS" \
-        --circuits brickwall_cnot shifted_cz rxx_rzz \
-        --observables ring_xz open_zz range2_xx \
-        --tn-target-slices "${TN_LONG_SLICES:-24}"
-    ;;
-
-  all)
-    "$0" smoke
-    "$0" mps-long
-    "$0" tn-long
-    ;;
-
-  help|*)
-    cat >&2 <<'EOF'
-Usage: tools/run_cpu_large_cases.sh [smoke|mps-long|mps-pressure|tn-long|all]
-
-Common overrides:
-  HOSTFILE=hostfile
-  PYTHON_BIN=.venv/bin/python
-  MPIEXEC=mpiexec
-  MPS_RANKS=8 MPS_THREADS=12
-  TN_RANKS=12 TN_THREADS=8
-
-Scale overrides:
-  MPS_LONG_NQ=64 MPS_LONG_LAYERS=48 MPS_LONG_BOND=4096
-  MPS_PRESSURE_NQ=80 MPS_PRESSURE_LAYERS=64 MPS_PRESSURE_BOND=4096
-  TN_LONG_NQ=36 TN_LONG_LAYERS=20 TN_LONG_SLICES=24
-EOF
-    exit 2
-    ;;
-esac
diff --git a/tools/run_cpu_single_cases.sh b/tools/run_cpu_single_cases.sh
deleted file mode 100755
index b7f23e7..0000000
--- a/tools/run_cpu_single_cases.sh
+++ /dev/null
@@ -1,149 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-# Single-node CPU scale probes for expectation benchmarks.
-#
-# Intended for one 96-core / ~500 GiB RAM node.  The default "probe" mode runs
-# moderate MPS and TN cases first.  Larger modes are available after checking
-# runtime and memory from the probe output.
-
-ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
-cd "$ROOT_DIR"
-
-PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
-PYTHON_FLAGS="${PYTHON_FLAGS:--u}"
-MPIEXEC="${MPIEXEC:-mpiexec}"
-TIME_BIN="${TIME_BIN:-/usr/bin/time}"
-
-MPS_RANKS="${MPS_RANKS:-8}"
-MPS_THREADS="${MPS_THREADS:-12}"
-TN_RANKS="${TN_RANKS:-8}"
-TN_THREADS="${TN_THREADS:-12}"
-
-export OMP_NUM_THREADS="${OMP_NUM_THREADS:-1}"
-export MKL_NUM_THREADS="${MKL_NUM_THREADS:-1}"
-source "$ROOT_DIR/tools/qibotn_torch_mt_env.sh"
-
-estimate_mps_memory() {
-  local nqubits="$1"
-  local bond="$2"
-  "$PYTHON_BIN" - "$nqubits" "$bond" "$MPS_RANKS" <<'PY'
-import sys
-n = int(sys.argv[1])
-chi = int(sys.argv[2])
-ranks = int(sys.argv[3])
-resident = n * 2 * chi * chi * 16
-per_rank = resident / ranks
-print(
-    "MPS rough resident memory: "
-    f"total={resident / 1024**3:.1f} GiB "
-    f"per_rank={per_rank / 1024**3:.1f} GiB "
-    "(temporary eig/SVD workspaces are additional)"
-)
-PY
-}
-
-run_timed() {
-  echo
-  echo "--------------------------------------------------------------------------------"
-  echo "$*"
-  echo "--------------------------------------------------------------------------------"
-  "$TIME_BIN" -v "$@"
-}
-
-run_mps_case() {
-  local label="$1"
-  local nqubits="$2"
-  local nlayers="$3"
-  local bond="$4"
-  shift 4
-  echo
-  echo "================================================================================"
-  echo "$label"
-  echo "================================================================================"
-  echo "PYTHON_BIN=$PYTHON_BIN MPIEXEC=$MPIEXEC"
-  echo "MPS_RANKS=$MPS_RANKS MPS_THREADS=$MPS_THREADS"
-  echo "OMP_NUM_THREADS=$OMP_NUM_THREADS MKL_NUM_THREADS=$MKL_NUM_THREADS"
-  estimate_mps_memory "$nqubits" "$bond"
-  run_timed "$MPIEXEC" -n "$MPS_RANKS" "$PYTHON_BIN" $PYTHON_FLAGS benchmark_cpu_expectation.py \
-    --mpi --mps \
-    --nqubits "$nqubits" \
-    --nlayers "$nlayers" \
-    --bond "$bond" \
-    --torch-threads "$MPS_THREADS" \
-    "$@"
-}
-
-run_tn_case() {
-  local label="$1"
-  local nqubits="$2"
-  local nlayers="$3"
-  shift 3
-  echo
-  echo "================================================================================"
-  echo "$label"
-  echo "================================================================================"
-  echo "PYTHON_BIN=$PYTHON_BIN MPIEXEC=$MPIEXEC"
-  echo "TN_RANKS=$TN_RANKS TN_THREADS=$TN_THREADS"
-  echo "OMP_NUM_THREADS=$OMP_NUM_THREADS MKL_NUM_THREADS=$MKL_NUM_THREADS"
-  echo "TN memory is contraction-tree dependent; increase --tn-target-slices if RSS is high."
-  run_timed "$MPIEXEC" -n "$TN_RANKS" "$PYTHON_BIN" $PYTHON_FLAGS benchmark_cpu_expectation.py \
-    --mpi \
-    --nqubits "$nqubits" \
-    --nlayers "$nlayers" \
-    --torch-threads "$TN_THREADS" \
-    "$@"
-}
-
-case "${1:-help}" in
-  probe)
-    run_mps_case "MPS probe: n=40 layers=30 bond=2048" 40 30 2048 \
-      --circuits brickwall_cnot \
-      --observables ring_xz
-
-    run_tn_case "TN probe: n=28 layers=12 target_slices=8" 28 12 \
-      --circuits brickwall_cnot \
-      --observables ring_xz \
-      --tn-target-slices 8
-    ;;
-
-  mps-medium)
-    run_mps_case "MPS medium: n=56 layers=40 bond=3072" 56 40 3072 \
-      --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \
-      --observables ring_xz open_zz mixed_local range2_xx
-    ;;
-
-  mps-long)
-    run_mps_case "MPS long: n=64 layers=48 bond=4096" 64 48 4096 \
-      --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \
-      --observables ring_xz open_zz mixed_local range2_xx
-    ;;
-
-  tn-medium)
-    run_tn_case "TN medium: n=32 layers=16 target_slices=16" 32 16 \
-      --circuits brickwall_cnot shifted_cz rxx_rzz \
-      --observables ring_xz open_zz range2_xx \
-      --tn-target-slices 16
-    ;;
-
-  tn-long)
-    run_tn_case "TN long: n=36 layers=20 target_slices=32" 36 20 \
-      --circuits brickwall_cnot shifted_cz rxx_rzz \
-      --observables ring_xz open_zz range2_xx \
-      --tn-target-slices 32
-    ;;
-
-  help|*)
-    cat >&2 <<'EOF'
-Usage: tools/run_cpu_single_cases.sh [probe|mps-medium|mps-long|tn-medium|tn-long]
-
-Common overrides:
-  PYTHON_BIN=.venv/bin/python
-  MPIEXEC=mpiexec
-  MPS_RANKS=8 MPS_THREADS=12
-  TN_RANKS=8 TN_THREADS=12
-  OMP_NUM_THREADS=1 MKL_NUM_THREADS=1
-EOF
-    exit 2
-    ;;
-esac
diff --git a/tools/run_tn_custom.py b/tools/run_tn_custom.py
deleted file mode 100644
index 049ebed..0000000
--- a/tools/run_tn_custom.py
+++ /dev/null
@@ -1,243 +0,0 @@
-#!/usr/bin/env python
-"""Run TN expectation for a user-provided circuit and observable.
-
-The case module should define:
-
-    def build_circuit(nqubits, nlayers, seed): ...
-    def build_observable(nqubits, seed): ...
-
-``build_observable`` may return a Qibo SymbolicHamiltonian/form or the qibotn
-dict form:
-
-    {"terms": [
-        {"coefficient": 1.0, "operators": [("X", 0), ("Z", 1)]},
-    ]}
-
-For a single repeated Pauli string, pass ``--pauli-pattern`` instead of
-defining ``build_observable``.
-"""
-
-from __future__ import annotations
-
-import argparse
-import importlib.util
-import inspect
-import json
-import sys
-from pathlib import Path
-
-ROOT = Path(__file__).resolve().parents[1]
-SRC = ROOT / "src"
-if str(SRC) not in sys.path:
-    sys.path.insert(0, str(SRC))
-
-from qibotn.expectation_runner import (  # noqa: E402
-    ExpectationConfig,
-    exact_for_observable,
-    run_cpu_expectation,
-)
-
-
-def optional_int(text):
-    if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
-        return None
-    return int(text)
-
-
-def optional_float(text):
-    if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
-        return None
-    return float(text)
-
-
-def load_module(path):
-    path = Path(path).resolve()
-    spec = importlib.util.spec_from_file_location(path.stem, path)
-    if spec is None or spec.loader is None:
-        raise RuntimeError(f"Cannot import case module from {path}.")
-    module = importlib.util.module_from_spec(spec)
-    spec.loader.exec_module(module)
-    return module
-
-
-def call_builder(fn, **kwargs):
-    sig = inspect.signature(fn)
-    if any(p.kind == p.VAR_KEYWORD for p in sig.parameters.values()):
-        return fn(**kwargs)
-    accepted = {
-        name: value
-        for name, value in kwargs.items()
-        if name in sig.parameters
-    }
-    return fn(**accepted)
-
-
-def load_observable(args, module):
-    if args.pauli_pattern:
-        return {"pauli_string_pattern": args.pauli_pattern}
-    if args.observable_json:
-        with Path(args.observable_json).open() as f:
-            return json.load(f)
-    if hasattr(module, "build_observable"):
-        return call_builder(
-            module.build_observable,
-            nqubits=args.nqubits,
-            nlayers=args.nlayers,
-            seed=args.seed,
-        )
-    if hasattr(module, "OBSERVABLE"):
-        return module.OBSERVABLE
-    raise ValueError(
-        "No observable supplied. Define build_observable/OBSERVABLE in the case "
-        "module, or pass --pauli-pattern / --observable-json."
-    )
-
-
-def build_parallel_opts(args):
-    slicing_opts = {}
-    if args.tn_target_slices is not None:
-        slicing_opts["target_slices"] = args.tn_target_slices
-    if args.tn_target_size is not None:
-        slicing_opts["target_size"] = args.tn_target_size
-
-    opts = {
-        "slicing_opts": slicing_opts or None,
-        "search_workers": args.tn_search_workers or args.torch_threads,
-        "max_repeats": args.tn_search_repeats,
-        "max_time": args.tn_search_time,
-        "print_stats": not args.no_tn_stats,
-    }
-    if args.tn_search_backend is not None:
-        opts["search_backend"] = args.tn_search_backend
-    if args.dask_address is not None:
-        opts["dask_address"] = args.dask_address
-    if args.dask_close_workers:
-        opts["dask_close_workers"] = True
-    if args.tn_save_tree is not None:
-        opts["save_tree_path"] = args.tn_save_tree
-    if args.tn_load_tree is not None:
-        opts["load_tree_path"] = args.tn_load_tree
-    if args.tn_search_only:
-        opts["search_only"] = True
-    return opts
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Run CPU TN expectation for a custom qibo circuit module."
-    )
-    parser.add_argument("case_module", help="Python file defining build_circuit.")
-    parser.add_argument("--nqubits", type=int, required=True)
-    parser.add_argument("--nlayers", type=int, default=0)
-    parser.add_argument("--seed", type=int, default=42)
-    parser.add_argument("--mpi", action="store_true")
-    parser.add_argument("--exact", action="store_true")
-    parser.add_argument("--exact-max-qubits", type=int, default=24)
-    parser.add_argument("--bond", "--bonds", dest="bond", type=optional_int, default=1024)
-    parser.add_argument("--cut-ratio", type=optional_float, default=1e-12)
-    parser.add_argument("--torch-threads", type=int, default=8)
-    parser.add_argument("--quimb-backend", choices=("numpy", "torch"), default="torch")
-    parser.add_argument("--dtype", choices=("complex128", "complex64"), default="complex128")
-    parser.add_argument("--pauli-pattern")
-    parser.add_argument("--observable-json")
-    parser.add_argument("--tn-target-slices", type=int)
-    parser.add_argument("--tn-target-size", type=int, default=2**32)
-    parser.add_argument("--tn-search-workers", type=int)
-    parser.add_argument("--tn-search-repeats", type=int, default=128)
-    parser.add_argument("--tn-search-time", type=float, default=60.0)
-    parser.add_argument("--tn-search-backend", choices=("processpool", "dask"))
-    parser.add_argument("--dask-address")
-    parser.add_argument("--dask-close-workers", action="store_true")
-    parser.add_argument("--tn-save-tree")
-    parser.add_argument("--tn-load-tree")
-    parser.add_argument("--tn-search-only", action="store_true")
-    parser.add_argument("--no-tn-stats", action="store_true")
-    args = parser.parse_args()
-
-    rank = 0
-    if args.mpi:
-        from mpi4py import MPI
-
-        rank = MPI.COMM_WORLD.Get_rank()
-
-    module = load_module(args.case_module)
-    if not hasattr(module, "build_circuit"):
-        raise ValueError("case_module must define build_circuit.")
-
-    circuit = call_builder(
-        module.build_circuit,
-        nqubits=args.nqubits,
-        nlayers=args.nlayers,
-        seed=args.seed,
-    )
-    observable = load_observable(args, module)
-
-    config = ExpectationConfig(
-        ansatz="tn",
-        mpi=args.mpi,
-        bond=args.bond,
-        cut_ratio=args.cut_ratio,
-        tensor_module="torch",
-        quimb_backend=args.quimb_backend,
-        dtype=args.dtype,
-        torch_threads=args.torch_threads,
-        parallel_opts=build_parallel_opts(args),
-    )
-
-    if rank == 0:
-        mode = "MPI" if args.mpi else "serial"
-        print(
-            f"backend=cpu ansatz=TN mode={mode} case={Path(args.case_module).name} "
-            f"nqubits={args.nqubits} nlayers={args.nlayers} seed={args.seed} "
-            f"quimb_backend={args.quimb_backend} dtype={args.dtype} "
-            f"torch_threads={args.torch_threads}",
-            flush=True,
-        )
-        print("observable exact value abs_error rel_error seconds", flush=True)
-
-    exact = None
-    if args.exact and rank == 0:
-        if args.nqubits > args.exact_max_qubits:
-            raise ValueError(
-                f"--exact is limited to {args.exact_max_qubits} qubits by default."
-            )
-        exact = exact_for_observable(circuit, observable, args.nqubits)
-
-    result = run_cpu_expectation(circuit, observable, config)
-    if args.mpi and result.rank != 0:
-        return
-
-    abs_error = float("nan") if exact is None else abs(result.value - exact)
-    rel_error = float("nan") if exact is None else abs_error / max(abs(exact), 1e-15)
-    exact_text = "nan" if exact is None else f"{exact:.16e}"
-    print(
-        f"custom {exact_text} {result.value:.16e} "
-        f"{abs_error:.6e} {rel_error:.6e} {result.seconds:.3f}",
-        flush=True,
-    )
-
-    for stat in result.parallel_stats or ():
-        cost = stat["path_cost"]
-        search_stats = stat.get("search_stats", {})
-        print(
-            "tn_term_summary "
-            f"term={stat.get('term_index', 0)} "
-            f"search_seconds={stat.get('search_seconds', float('nan')):.3f} "
-            f"contract_seconds={stat.get('contract_seconds', float('nan')):.3f} "
-            f"completed_trials={search_stats.get('completed_trials', 'na')} "
-            f"finite_trials={search_stats.get('finite_trials', 'na')} "
-            f"failed_trials={search_stats.get('failed_trials', 'na')} "
-            f"requested_trials={search_stats.get('requested_trials', 'na')} "
-            f"best_score={search_stats.get('best_score', float('nan')):.6g} "
-            f"slices={cost.get('slices')} "
-            f"log10_flops={cost.get('log10_flops', float('nan')):.3f} "
-            f"log10_write={cost.get('log10_write', float('nan')):.3f} "
-            f"log2_size={cost.get('log2_size', float('nan')):.3f} "
-            f"peak_memory_gib={cost.get('peak_memory_gib', float('nan')):.3g} "
-            f"rank_slices={stat.get('rank_slices')}",
-            flush=True,
-        )
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tools/run_tn_dask_mpi_all.sh b/tools/run_tn_dask_mpi_all.sh
deleted file mode 100755
index b4ba0d1..0000000
--- a/tools/run_tn_dask_mpi_all.sh
+++ /dev/null
@@ -1,260 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
-cd "$ROOT_DIR"
-
-CASE="${CASE:-main1}"
-OBSERVABLES="${OBSERVABLES:-long_z_string}"
-NQUBITS="${NQUBITS:-34}"
-NLAYERS="${NLAYERS:-20}"
-TORCH_THREADS="${TORCH_THREADS:-48}"
-SEARCH_REPEATS="${SEARCH_REPEATS:-2048}"
-SEARCH_TIME="${SEARCH_TIME:-300}"
-TN_TARGET_SIZE="${TN_TARGET_SIZE:-17179869184}"
-TN_TARGET_SLICES="${TN_TARGET_SLICES:-}"
-
-PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
-DTYPE="${DTYPE:-complex64}"
-TREE_DIR="${TREE_DIR:-trees/contest_tn}"
-DASK_ADDRESS="${DASK_ADDRESS:-tcp://10.20.1.103:8786}"
-DASK_EXPECTED_WORKERS="${DASK_EXPECTED_WORKERS:-}"
-DASK_WAIT_FOR_WORKERS="${DASK_WAIT_FOR_WORKERS:-1}"
-DASK_WAIT_TIMEOUT="${DASK_WAIT_TIMEOUT:-600}"
-TN_DEBUG_TRIALS="${TN_DEBUG_TRIALS:-0}"
-MPIEXEC="${MPIEXEC:-mpirun}"
-MPIEXEC_FULL="${MPIEXEC_FULL:-}"
-MPI_HOSTS="${MPI_HOSTS:-}"
-MPI_HOSTFILE="${MPI_HOSTFILE:-${HOSTFILE:-}}"
-MPI_RANKS="${MPI_RANKS:-}"
-MPI_PE="${MPI_PE:-$TORCH_THREADS}"
-MPI_MAP_BY="${MPI_MAP_BY:-ppr:1:numa:PE=$MPI_PE}"
-MPI_BIND_TO="${MPI_BIND_TO:-core}"
-MPI_REPORT_BINDINGS="${MPI_REPORT_BINDINGS:-0}"
-MPI_EXPORT_ENV="${MPI_EXPORT_ENV:-1}"
-TN_CONTRACT_ENV_CHECK="${TN_CONTRACT_ENV_CHECK:-1}"
-SYNC_TREES="${SYNC_TREES:-1}"
-SYNC_HOSTS="${SYNC_HOSTS:-${WORKER_HOSTS:-}}"
-SSH_BIN="${SSH_BIN:-ssh}"
-DASK_CLUSTER_MANAGED="${DASK_CLUSTER_MANAGED:-0}"
-
-export TCM_ENABLE="${TCM_ENABLE:-1}"
-export OMP_NUM_THREADS="${OMP_NUM_THREADS:-$TORCH_THREADS}"
-export MKL_NUM_THREADS="${MKL_NUM_THREADS:-$TORCH_THREADS}"
-source "$ROOT_DIR/tools/qibotn_torch_mt_env.sh"
-
-tn_slice_args=(--tn-target-size "$TN_TARGET_SIZE")
-if [[ -n "$TN_TARGET_SLICES" ]]; then
-  tn_slice_args+=(--tn-target-slices "$TN_TARGET_SLICES")
-fi
-
-cleanup_dask_cluster() {
-  local status=$?
-  if [[ "$DASK_CLUSTER_MANAGED" == "1" ]]; then
-    set +e
-    tools/manage_tn_dask_cluster.sh stop >/dev/null 2>&1 || true
-  fi
-  exit "$status"
-}
-
-trap cleanup_dask_cluster EXIT INT TERM HUP
-
-sum_host_slots() {
-  local hosts="$1"
-  local total=0
-  local item slots
-  IFS=',' read -r -a host_items <<< "$hosts"
-  for item in "${host_items[@]}"; do
-    if [[ "$item" == *:* ]]; then
-      slots="${item##*:}"
-    else
-      slots=1
-    fi
-    total=$((total + slots))
-  done
-  echo "$total"
-}
-
-count_hosts() {
-  local hosts="$1"
-  local count=0
-  local item
-  IFS=' ' read -r -a host_items <<< "$hosts"
-  for item in "${host_items[@]}"; do
-    [[ -n "$item" ]] && count=$((count + 1))
-  done
-  echo "$count"
-}
-
-wait_for_dask_workers() {
-  [[ "$DASK_WAIT_FOR_WORKERS" == "1" ]] || return 0
-  local expected="$DASK_EXPECTED_WORKERS"
-  if [[ -z "$expected" && -n "$WORKER_HOSTS" ]]; then
-    expected=$(( $(count_hosts "$WORKER_HOSTS") * NWORKERS ))
-  fi
-  if [[ -z "$expected" || "$expected" -le 0 ]]; then
-    return 0
-  fi
-
-  echo "Waiting for Dask workers: expected=$expected timeout=${DASK_WAIT_TIMEOUT}s"
-  "$PYTHON_BIN" - "$DASK_ADDRESS" "$expected" "$DASK_WAIT_TIMEOUT" <<'PY'
-import sys
-import time
-from distributed import Client
-
-address, expected, timeout = sys.argv[1], int(sys.argv[2]), int(sys.argv[3])
-deadline = time.time() + timeout
-client = Client(address)
-try:
-    while True:
-        info = client.scheduler_info(n_workers=-1)
-        workers = info.get("workers", {})
-        count = len(workers)
-        if count >= expected:
-            print(f"dask_workers_ready count={count} expected={expected}", flush=True)
-            break
-        if time.time() >= deadline:
-            print(
-                f"dask_workers_wait_timeout count={count} expected={expected}",
-                flush=True,
-            )
-            break
-        time.sleep(2)
-finally:
-    client.close()
-PY
-}
-
-append_mpi_env_args() {
-  [[ "$MPI_EXPORT_ENV" == "1" ]] || return 0
-  mpi_prefix+=(
-    -x "LD_PRELOAD=${LD_PRELOAD:-}"
-    -x "BLIS_NUM_THREADS=$BLIS_NUM_THREADS"
-    -x "OMP_NUM_THREADS=$OMP_NUM_THREADS"
-    -x "MKL_NUM_THREADS=$MKL_NUM_THREADS"
-    -x "OMP_PROC_BIND=$OMP_PROC_BIND"
-    -x "OMP_PLACES=$OMP_PLACES"
-  )
-}
-
-build_mpi_prefix() {
-  if [[ -n "$MPIEXEC_FULL" ]]; then
-    # shellcheck disable=SC2206
-    mpi_prefix=($MPIEXEC_FULL)
-    append_mpi_env_args
-    return
-  fi
-
-  local ranks="$MPI_RANKS"
-  if [[ -z "$ranks" && -n "$MPI_HOSTS" ]]; then
-    ranks="$(sum_host_slots "$MPI_HOSTS")"
-  fi
-  if [[ -z "$ranks" ]]; then
-    ranks=2
-  fi
-
-  mpi_prefix=(
-    "$MPIEXEC"
-    --map-by "$MPI_MAP_BY"
-    --bind-to "$MPI_BIND_TO"
-    -np "$ranks"
-  )
-  if [[ "$MPI_REPORT_BINDINGS" == "1" ]]; then
-    mpi_prefix+=(--report-bindings)
-  fi
-  append_mpi_env_args
-  if [[ -n "$MPI_HOSTS" ]]; then
-    mpi_prefix+=(-host "$MPI_HOSTS")
-  elif [[ -n "$MPI_HOSTFILE" ]]; then
-    mpi_prefix+=(-hostfile "$MPI_HOSTFILE")
-  fi
-}
-
-is_local_host() {
-  local host="$1"
-  [[ "$host" == "localhost" || "$host" == "127.0.0.1" ]] && return 0
-  [[ "$host" == "$(hostname)" ]] && return 0
-  [[ "$host" == "$(hostname -f 2>/dev/null || true)" ]] && return 0
-  hostname -I 2>/dev/null | tr ' ' '\n' | grep -qx "$host"
-}
-
-sync_trees_to_hosts() {
-  [[ "$SYNC_TREES" == "1" ]] || return 0
-  [[ -n "$SYNC_HOSTS" ]] || return 0
-
-  local src_dir="$TREE_DIR"
-  local dst_dir="$TREE_DIR"
-  if [[ "$TREE_DIR" != /* ]]; then
-    src_dir="$ROOT_DIR/$TREE_DIR"
-    dst_dir="$ROOT_DIR/$TREE_DIR"
-  fi
-
-  for host in $SYNC_HOSTS; do
-    is_local_host "$host" && continue
-    echo "Sync tree dir to $host:$dst_dir"
-    "$SSH_BIN" "$host" "mkdir -p $(printf '%q' "$dst_dir")"
-    if command -v rsync >/dev/null 2>&1; then
-      rsync -a "$src_dir/" "$host:$dst_dir/"
-    else
-      scp -q "$src_dir"/*.pkl "$host:$dst_dir/"
-    fi
-  done
-}
-
-tools/manage_tn_dask_cluster.sh start
-DASK_CLUSTER_MANAGED=1
-wait_for_dask_workers
-
-echo "Search with dask: $DASK_ADDRESS"
-search_args=(
-  --case "$CASE"
-  --nqubits "$NQUBITS"
-  --nlayers "$NLAYERS"
-  --observables $OBSERVABLES
-  --tree-dir "$TREE_DIR"
-  --dask-address "$DASK_ADDRESS"
-  --torch-threads "$TORCH_THREADS"
-  --dtype "$DTYPE"
-  --tn-search-repeats "$SEARCH_REPEATS"
-  --tn-search-time "$SEARCH_TIME"
-  "${tn_slice_args[@]}"
-)
-if [[ -n "$DASK_EXPECTED_WORKERS" ]]; then
-  search_args+=(--dask-expected-workers "$DASK_EXPECTED_WORKERS")
-fi
-if [[ "$TN_DEBUG_TRIALS" == "1" ]]; then
-  search_args+=(--tn-debug-trials)
-fi
-"$PYTHON_BIN" -u tools/tn_contest_runner.py search "${search_args[@]}"
-
-sync_trees_to_hosts
-
-build_mpi_prefix
-echo "Contract with MPI: ${mpi_prefix[*]}"
-if [[ "$TN_CONTRACT_ENV_CHECK" == "1" ]]; then
-  "${mpi_prefix[@]}" "$PYTHON_BIN" -c "from mpi4py import MPI; import os; \
-import torch; \
-rank = MPI.COMM_WORLD.Get_rank(); \
-blis = []; \
-[blis.append(line.strip().split()[-1]) for line in open('/proc/self/maps') if 'libblis' in line and line.strip().split()[-1] not in blis]; \
-print('tn_contract_env ' + \
-      f'rank={rank} ' + \
-      f'LD_PRELOAD={os.environ.get(\"LD_PRELOAD\", \"\")} ' + \
-      f'BLIS_NUM_THREADS={os.environ.get(\"BLIS_NUM_THREADS\", \"\")} ' + \
-      f'OMP_NUM_THREADS={os.environ.get(\"OMP_NUM_THREADS\", \"\")} ' + \
-      f'MKL_NUM_THREADS={os.environ.get(\"MKL_NUM_THREADS\", \"\")} ' + \
-      f'OMP_PROC_BIND={os.environ.get(\"OMP_PROC_BIND\", \"\")} ' + \
-      f'OMP_PLACES={os.environ.get(\"OMP_PLACES\", \"\")} ' + \
-      f'torch_threads={torch.get_num_threads()} ' + \
-      f'blis={\";\".join(blis) if blis else \"missing\"}', flush=True)"
-fi
-"${mpi_prefix[@]}" "$PYTHON_BIN" -u tools/tn_contest_runner.py contract \
-  --mpi \
-  --case "$CASE" \
-  --nqubits "$NQUBITS" \
-  --nlayers "$NLAYERS" \
-  --observables $OBSERVABLES \
-  --tree-dir "$TREE_DIR" \
-  --torch-threads "$TORCH_THREADS" \
-  --dtype "$DTYPE" \
-  "${tn_slice_args[@]}"
diff --git a/tools/run_vidal_mpi_contest_cases.sh b/tools/run_vidal_mpi_contest_cases.sh
deleted file mode 100755
index cee84a4..0000000
--- a/tools/run_vidal_mpi_contest_cases.sh
+++ /dev/null
@@ -1,414 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-# Contest-style Vidal/MPI MPS cases.
-#
-# Usage:
-#   tools/run_vidal_mpi_contest_cases.sh main1
-#   tools/run_vidal_mpi_contest_cases.sh main2
-#   tools/run_vidal_mpi_contest_cases.sh strong
-#   tools/run_vidal_mpi_contest_cases.sh all
-#
-# Common overrides:
-#   PYTHON_BIN=.venv/bin/python
-#   MPIEXEC=mpirun
-#   MPI_HOSTS="node-1:2,node-2:2,node-3:2,node-0:2"
-#   MPI_RANKS=8
-#   MPI_PE=128
-#   MPI_MAP_BY=ppr:1:numa:PE=128
-#   MPI_BIND_TO=core
-#   MPIEXEC_FULL="mpirun --map-by ppr:1:numa:PE=128 --bind-to core -np 8 -host node-1:2,node-2:2,node-3:2,node-0:2"
-#   HOSTFILE=hostfile                 # optional; used only if the file exists
-#   RANKS=8                           # fallback if MPI_RANKS is not set
-#   TORCH_THREADS=8
-#   CUT_RATIO=1e-12
-#   OBS_FILTER="boundary_ZZ_q2 ring_xz dense3_spread complex_iZ0"
-#
-# Per-case overrides:
-#   MAIN1_NQ=128 MAIN1_LAYERS=50 MAIN1_BOND=1024 MAIN1_SEED=31001
-#   MAIN2_NQ=128 MAIN2_LAYERS=64 MAIN2_BOND=2048 MAIN2_SEED=31002
-#   STRONG_NQ=256 STRONG_LAYERS=64 STRONG_BOND=2048 STRONG_SEED=41001
-
-ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
-cd "$ROOT_DIR"
-
-PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
-MPIEXEC="${MPIEXEC:-mpirun}"
-MPIEXEC_FULL="${MPIEXEC_FULL:-}"
-MPI_HOSTS="${MPI_HOSTS:-}"
-MPI_HOSTFILE="${MPI_HOSTFILE:-${HOSTFILE:-}}"
-MPI_RANKS="${MPI_RANKS:-${RANKS:-}}"
-RANKS="${RANKS:-4}"
-TORCH_THREADS="${TORCH_THREADS:-1}"
-MPI_PE="${MPI_PE:-$TORCH_THREADS}"
-MPI_MAP_BY="${MPI_MAP_BY:-ppr:1:numa:PE=$MPI_PE}"
-MPI_BIND_TO="${MPI_BIND_TO:-core}"
-MPI_REPORT_BINDINGS="${MPI_REPORT_BINDINGS:-0}"
-MPI_EXPORT_ENV="${MPI_EXPORT_ENV:-1}"
-CUT_RATIO="${CUT_RATIO:-1e-12}"
-OBS_FILTER="${OBS_FILTER:-}"
-export OMP_NUM_THREADS="${OMP_NUM_THREADS:-$TORCH_THREADS}"
-export MKL_NUM_THREADS="${MKL_NUM_THREADS:-$TORCH_THREADS}"
-source "$ROOT_DIR/tools/qibotn_torch_mt_env.sh"
-
-RUNNER_DIR="$ROOT_DIR/.tmp"
-mkdir -p "$RUNNER_DIR"
-RUNNER="$(mktemp "$RUNNER_DIR/qibotn_vidal_contest.XXXXXX.py")"
-cleanup() {
-  rm -f "$RUNNER"
-}
-trap cleanup EXIT
-
-cat > "$RUNNER" <<'PY'
-from __future__ import annotations
-
-import argparse
-import math
-import time
-
-import numpy as np
-from mpi4py import MPI
-from qibo import Circuit, gates, hamiltonians
-from qibo.symbols import X, Y, Z
-
-from qibotn.backends.vidal import VidalBackend
-
-
-def set_torch_threads(nthreads):
-    try:
-        import torch
-
-        torch.set_num_threads(nthreads)
-    except Exception:
-        pass
-
-
-def build_circuit(kind, nqubits, nlayers, seed):
-    rng = np.random.default_rng(seed)
-    circuit = Circuit(nqubits)
-
-    for layer in range(nlayers):
-        for q in range(nqubits):
-            circuit.add(gates.RY(q, theta=rng.uniform(-math.pi, math.pi)))
-            circuit.add(gates.RZ(q, theta=rng.uniform(-math.pi, math.pi)))
-            if kind in ("rxx_rzz", "scramble"):
-                circuit.add(gates.RX(q, theta=rng.uniform(-math.pi, math.pi)))
-
-        if kind == "reversed_cnot":
-            for q in range(0, nqubits - 1, 2):
-                circuit.add(gates.CNOT(q + 1, q) if layer % 2 else gates.CNOT(q, q + 1))
-            for q in range(1, nqubits - 1, 2):
-                circuit.add(gates.CNOT(q + 1, q) if layer % 2 == 0 else gates.CNOT(q, q + 1))
-        elif kind == "rxx_rzz":
-            for q in range(layer % 2, nqubits - 1, 2):
-                circuit.add(gates.RXX(q, q + 1, theta=rng.uniform(-0.9, 0.9)))
-                circuit.add(gates.RZZ(q, q + 1, theta=rng.uniform(-0.9, 0.9)))
-        elif kind == "scramble":
-            for q in range(layer % 2, nqubits - 1, 2):
-                circuit.add(gates.RXX(q, q + 1, theta=rng.uniform(-0.8, 0.8)))
-                circuit.add(gates.RZZ(q, q + 1, theta=rng.uniform(-0.8, 0.8)))
-                if layer % 5 == 4:
-                    circuit.add(gates.SWAP(q, q + 1))
-        else:
-            raise ValueError(f"Unknown circuit kind {kind!r}.")
-
-    return circuit
-
-
-def ring_xz(nqubits):
-    form = 0
-    for q in range(nqubits):
-        form += 0.5 * X(q) * Z((q + 1) % nqubits)
-    return hamiltonians.SymbolicHamiltonian(form=form)
-
-
-def open_zz(nqubits):
-    form = 0
-    for q in range(nqubits - 1):
-        form += (1.0 / (nqubits - 1)) * Z(q) * Z(q + 1)
-    return hamiltonians.SymbolicHamiltonian(form=form)
-
-
-def range2_xx(nqubits):
-    form = 0
-    for q in range(nqubits - 2):
-        form += (1.0 / (nqubits - 2)) * X(q) * X(q + 2)
-    return hamiltonians.SymbolicHamiltonian(form=form)
-
-
-def dense_observable(nqubits, qubits, seed, dim):
-    rng = np.random.default_rng(seed)
-    raw = rng.normal(size=(dim, dim)) + 1j * rng.normal(size=(dim, dim))
-    matrix = (raw + raw.conj().T) / 2.0
-    matrix = matrix / np.linalg.norm(matrix)
-    return {"matrix": matrix, "qubits": list(qubits)}
-
-
-def observables_for_case(nqubits, seed):
-    q1 = nqubits // 4
-    q2 = nqubits // 2
-    q3 = (3 * nqubits) // 4
-    last = nqubits - 1
-
-    return [
-        ("boundary_ZZ_q1", hamiltonians.SymbolicHamiltonian(form=Z(q1 - 1) * Z(q1))),
-        ("boundary_ZZ_q2", hamiltonians.SymbolicHamiltonian(form=Z(q2 - 1) * Z(q2))),
-        ("boundary_ZZ_q3", hamiltonians.SymbolicHamiltonian(form=Z(q3 - 1) * Z(q3))),
-        (
-            "long_Z_5_sites",
-            hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(q1) * Z(q2) * Z(q3) * Z(last)),
-        ),
-        (
-            "mixed_XZYZX",
-            hamiltonians.SymbolicHamiltonian(form=X(0) * Z(q1) * Y(q2) * Z(q3) * X(last)),
-        ),
-        ("ring_xz", ring_xz(nqubits)),
-        ("open_zz", open_zz(nqubits)),
-        ("range2_xx", range2_xx(nqubits)),
-        ("complex_iZ0", hamiltonians.SymbolicHamiltonian(form=1.0j * Z(0))),
-        ("dense2_mid", dense_observable(nqubits, (q2 - 1, q2), seed + 101, 4)),
-        ("dense3_spread", dense_observable(nqubits, (q1, q2, q3), seed + 202, 8)),
-    ]
-
-
-def run_case(args):
-    set_torch_threads(args.torch_threads)
-    comm = MPI.COMM_WORLD
-    rank = comm.Get_rank()
-    size = comm.Get_size()
-
-    circuit = build_circuit(args.kind, args.nqubits, args.nlayers, args.seed)
-    observables = observables_for_case(args.nqubits, args.seed)
-    if args.obs_filter:
-        wanted = set(args.obs_filter.split(","))
-        observables = [(name, obs) for name, obs in observables if name in wanted]
-        if not observables:
-            raise ValueError(f"OBS_FILTER matched no observables: {args.obs_filter!r}")
-
-    if rank == 0:
-        print("=" * 88, flush=True)
-        print(
-            "case "
-            f"label={args.label} kind={args.kind} ranks={size} "
-            f"nqubits={args.nqubits} nlayers={args.nlayers} gates={len(circuit.queue)} "
-            f"bond={args.bond} cut_ratio={args.cut_ratio:g} "
-            f"torch_threads={args.torch_threads} seed={args.seed} "
-            f"obs_filter={args.obs_filter or 'all'}",
-            flush=True,
-        )
-        print(
-            "observable value seconds trunc_sum trunc_max status",
-            flush=True,
-        )
-
-    for obs_name, observable in observables:
-        backend = VidalBackend()
-        backend.configure_tn_simulation(
-            max_bond_dimension=args.bond,
-            cut_ratio=args.cut_ratio,
-            tensor_module="torch",
-            mpi_approach="CT",
-            mpi_num_procs=size,
-            fallback=False,
-        )
-
-        comm.Barrier()
-        start = time.perf_counter()
-        try:
-            value = backend.expectation(
-                circuit,
-                observable,
-                preprocess=True,
-                compile_circuit=False,
-            )
-            status = "ok"
-        except Exception as exc:  # pragma: no cover - printed for manual runs
-            value = np.nan
-            status = type(exc).__name__ + ":" + str(exc).split("\n", 1)[0]
-        seconds = time.perf_counter() - start
-
-        if rank == 0:
-            print(
-                f"{obs_name} {value!r} {seconds:.3f} "
-                f"{backend.last_truncation_error:.6e} "
-                f"{backend.last_max_truncation_error:.6e} {status}",
-                flush=True,
-            )
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--label", required=True)
-    parser.add_argument("--kind", choices=("reversed_cnot", "rxx_rzz", "scramble"), required=True)
-    parser.add_argument("--nqubits", type=int, required=True)
-    parser.add_argument("--nlayers", type=int, required=True)
-    parser.add_argument("--bond", type=int, required=True)
-    parser.add_argument("--cut-ratio", type=float, required=True)
-    parser.add_argument("--seed", type=int, required=True)
-    parser.add_argument("--torch-threads", type=int, required=True)
-    parser.add_argument("--obs-filter", default="")
-    run_case(parser.parse_args())
-
-
-if __name__ == "__main__":
-    main()
-PY
-
-sum_host_slots() {
-  local hosts="$1"
-  local total=0
-  local item slots
-  IFS=',' read -r -a host_items <<< "$hosts"
-  for item in "${host_items[@]}"; do
-    if [[ "$item" == *:* ]]; then
-      slots="${item##*:}"
-    else
-      slots=1
-    fi
-    total=$((total + slots))
-  done
-  echo "$total"
-}
-
-append_mpi_env_args() {
-  [[ "$MPI_EXPORT_ENV" == "1" ]] || return 0
-  mpi_prefix+=(
-    -x "LD_PRELOAD=${LD_PRELOAD:-}"
-    -x "BLIS_NUM_THREADS=$BLIS_NUM_THREADS"
-    -x "OMP_NUM_THREADS=$OMP_NUM_THREADS"
-    -x "MKL_NUM_THREADS=$MKL_NUM_THREADS"
-    -x "OMP_PROC_BIND=$OMP_PROC_BIND"
-    -x "OMP_PLACES=$OMP_PLACES"
-  )
-}
-
-build_mpi_prefix() {
-  if [[ -n "$MPIEXEC_FULL" ]]; then
-    # shellcheck disable=SC2206
-    mpi_prefix=($MPIEXEC_FULL)
-    append_mpi_env_args
-    return
-  fi
-
-  local ranks="$MPI_RANKS"
-  if [[ -z "$ranks" && -n "$MPI_HOSTS" ]]; then
-    ranks="$(sum_host_slots "$MPI_HOSTS")"
-  fi
-  if [[ -z "$ranks" ]]; then
-    ranks="$RANKS"
-  fi
-
-  mpi_prefix=(
-    "$MPIEXEC"
-    --map-by "$MPI_MAP_BY"
-    --bind-to "$MPI_BIND_TO"
-    -np "$ranks"
-  )
-  if [[ "$MPI_REPORT_BINDINGS" == "1" ]]; then
-    mpi_prefix+=(--report-bindings)
-  fi
-  append_mpi_env_args
-  if [[ -n "$MPI_HOSTS" ]]; then
-    mpi_prefix+=(-host "$MPI_HOSTS")
-  elif [[ -n "$MPI_HOSTFILE" ]]; then
-    mpi_prefix+=(-hostfile "$MPI_HOSTFILE")
-  fi
-}
-
-build_mpi_prefix
-
-run_case() {
-  local label="$1"
-  local kind="$2"
-  local nq="$3"
-  local layers="$4"
-  local bond="$5"
-  local seed="$6"
-
-  echo
-  echo "Running $label: kind=$kind nqubits=$nq layers=$layers bond=$bond seed=$seed"
-  echo "MPI: ${mpi_prefix[*]}"
-  "${mpi_prefix[@]}" "$PYTHON_BIN" -u "$ROOT_DIR/tools/vidal_mpi_contest_runner.py" \
-    --label "$label" \
-    --kind "$kind" \
-    --nqubits "$nq" \
-    --nlayers "$layers" \
-    --bond "$bond" \
-    --cut-ratio "$CUT_RATIO" \
-    --seed "$seed" \
-    --torch-threads "$TORCH_THREADS" \
-    --obs-filter "$(tr ' ' ',' <<< "$OBS_FILTER")"
-}
-
-case "${1:-help}" in
-  main1)
-    run_case \
-      "main1-reversed-cnot" \
-      "reversed_cnot" \
-      "${MAIN1_NQ:-128}" \
-      "${MAIN1_LAYERS:-50}" \
-      "${MAIN1_BOND:-1024}" \
-      "${MAIN1_SEED:-31001}"
-    ;;
-  main2)
-    run_case \
-      "main2-rxx-rzz" \
-      "rxx_rzz" \
-      "${MAIN2_NQ:-128}" \
-      "${MAIN2_LAYERS:-64}" \
-      "${MAIN2_BOND:-2048}" \
-      "${MAIN2_SEED:-31002}"
-    ;;
-  strong)
-    run_case \
-      "strong-scramble" \
-      "scramble" \
-      "${STRONG_NQ:-256}" \
-      "${STRONG_LAYERS:-64}" \
-      "${STRONG_BOND:-2048}" \
-      "${STRONG_SEED:-41001}"
-    ;;
-  all)
-    "$0" main1
-    "$0" main2
-    "$0" strong
-    ;;
-  smoke)
-    MAIN1_NQ="${MAIN1_NQ:-32}" \
-    MAIN1_LAYERS="${MAIN1_LAYERS:-6}" \
-    MAIN1_BOND="${MAIN1_BOND:-128}" \
-    "$0" main1
-    ;;
-  help|*)
-    cat >&2 <<'EOF'
-Usage: tools/run_vidal_mpi_contest_cases.sh [main1|main2|strong|all|smoke]
-
-Cases:
-  main1   128 qubits, 50 layers, reversed-CNOT brickwall, chi=1024
-  main2   128 qubits, 64 layers, RXX/RZZ brickwall, chi=2048
-  strong  256 qubits, 64 layers, RXX/RZZ + periodic SWAP scramble, chi=2048
-  smoke   Small syntax/runtime check of main1
-
-Common overrides:
-  PYTHON_BIN=.venv/bin/python
-  MPIEXEC=mpiexec
-  MPI_HOSTS="node-1:2,node-2:2,node-3:2,node-0:2"
-  MPI_RANKS=8
-  MPI_PE=128
-  MPI_MAP_BY=ppr:1:numa:PE=128
-  MPI_BIND_TO=core
-  MPIEXEC_FULL="mpirun --map-by ppr:1:numa:PE=128 --bind-to core -np 8 -host node-1:2,node-2:2,node-3:2,node-0:2"
-  HOSTFILE=hostfile
-  RANKS=8
-  TORCH_THREADS=8
-  CUT_RATIO=1e-12
-  OBS_FILTER="boundary_ZZ_q2 ring_xz dense3_spread complex_iZ0"
-
-Per-case overrides:
-  MAIN1_NQ=128 MAIN1_LAYERS=50 MAIN1_BOND=1024 MAIN1_SEED=31001
-  MAIN2_NQ=128 MAIN2_LAYERS=64 MAIN2_BOND=2048 MAIN2_SEED=31002
-  STRONG_NQ=256 STRONG_LAYERS=64 STRONG_BOND=2048 STRONG_SEED=41001
-EOF
-    exit 2
-    ;;
-esac
diff --git a/tools/run_vidal_segment_mpi_scan.sh b/tools/run_vidal_segment_mpi_scan.sh
deleted file mode 100755
index 49dc138..0000000
--- a/tools/run_vidal_segment_mpi_scan.sh
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/usr/bin/env bash
-set -euo pipefail
-
-NQ="${NQ:-34}"
-LAYERS="${LAYERS:-20}"
-BOND="${BOND:-512}"
-SEED="${SEED:-42}"
-RANKS="${RANKS:-1 2 4}"
-THREADS="${THREADS:-32 32 16}"
-PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
-MPIEXEC="${MPIEXEC:-mpiexec}"
-CIRCUIT="${CIRCUIT:-brickwall_cnot}"
-OBSERVABLE="${OBSERVABLE:-ring_xz}"
-EXACT="${EXACT:-0}"
-
-ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
-cd "$ROOT_DIR"
-
-if [[ "${1:-help}" != "run" ]]; then
-  cat >&2 <<'EOF'
-Usage: tools/run_vidal_segment_mpi_scan.sh run
-
-Overrides:
-  NQ=34 LAYERS=20 BOND=512 SEED=42
-  RANKS="1 2 4" THREADS="32 32 16"
-  CIRCUIT=brickwall_cnot OBSERVABLE=ring_xz
-  EXACT=1
-  PYTHON_BIN=.venv/bin/python MPIEXEC=mpiexec
-EOF
-  if [[ "${1:-help}" == "help" ]]; then
-    exit 0
-  fi
-  exit 2
-fi
-
-read -r -a ranks <<< "$RANKS"
-read -r -a threads <<< "$THREADS"
-
-if [[ "${#ranks[@]}" != "${#threads[@]}" ]]; then
-  echo "RANKS and THREADS must have the same number of entries." >&2
-  exit 2
-fi
-
-common=(
-  --nqubits "$NQ"
-  --nlayers "$LAYERS"
-  --bond "$BOND"
-  --seed "$SEED"
-  --mps
-  --circuits "$CIRCUIT"
-  --observables "$OBSERVABLE"
-)
-
-if [[ "$EXACT" == "1" ]]; then
-  common+=(--exact)
-fi
-
-for idx in "${!ranks[@]}"; do
-  nrank="${ranks[$idx]}"
-  nthr="${threads[$idx]}"
-  if [[ "$nrank" == "1" ]]; then
-    echo "== Vidal serial ranks=1 torch_threads=$nthr =="
-    "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
-      "${common[@]}" --torch-threads "$nthr"
-  else
-    echo "== Vidal segmented MPI ranks=$nrank torch_threads=$nthr =="
-    "$MPIEXEC" -n "$nrank" "$PYTHON_BIN" -u benchmark_cpu_expectation.py \
-      "${common[@]}" --torch-threads "$nthr" --mpi
-  fi
-done
diff --git a/tools/slice_existing_tree.py b/tools/slice_existing_tree.py
deleted file mode 100644
index 4e94e9c..0000000
--- a/tools/slice_existing_tree.py
+++ /dev/null
@@ -1,59 +0,0 @@
-"""Slice an existing saved cotengra tree without re-running path search."""
-
-from __future__ import annotations
-
-import argparse
-import pickle
-from pathlib import Path
-
-from qibotn.parallel import contraction_tree_costs
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("input", help="Input pickle saved by --tn-save-tree.")
-    parser.add_argument("output", help="Output pickle path.")
-    parser.add_argument("--term", type=int, default=0)
-    parser.add_argument("--target-slices", type=int, default=2)
-    parser.add_argument("--max-repeats", type=int, default=64)
-    parser.add_argument("--seed", type=int, default=42)
-    args = parser.parse_args()
-
-    input_path = Path(args.input)
-    output_path = Path(args.output)
-    with input_path.open("rb") as f:
-        payload = pickle.load(f)
-
-    trees = payload["trees"] if isinstance(payload, dict) else payload
-    if not isinstance(trees, (list, tuple)):
-        trees = [trees]
-    tree = trees[args.term]
-
-    print("original", contraction_tree_costs(tree), flush=True)
-    sliced = tree.slice(
-        target_slices=args.target_slices,
-        max_repeats=args.max_repeats,
-        seed=args.seed,
-    )
-    print("sliced", contraction_tree_costs(sliced), flush=True)
-    print(f"sliced_inds={sliced.sliced_inds}", flush=True)
-
-    new_trees = list(trees)
-    new_trees[args.term] = sliced
-
-    if isinstance(payload, dict):
-        out_payload = dict(payload)
-        out_payload["trees"] = new_trees
-        out_payload["costs"] = [contraction_tree_costs(t) for t in new_trees]
-        out_payload["nterms"] = len(new_trees)
-    else:
-        out_payload = new_trees
-
-    output_path.parent.mkdir(parents=True, exist_ok=True)
-    with output_path.open("wb") as f:
-        pickle.dump(out_payload, f)
-    print(f"saved {output_path}", flush=True)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tools/tn_contest_runner.py b/tools/tn_contest_runner.py
deleted file mode 100644
index 06ff913..0000000
--- a/tools/tn_contest_runner.py
+++ /dev/null
@@ -1,443 +0,0 @@
-#!/usr/bin/env python
-"""Contest-style CPU TN path search and contraction runner.
-
-This file is intentionally self-contained: define contest circuits and
-observables here, run path search once, then load the saved trees for repeated
-MPI contractions.
-"""
-
-from __future__ import annotations
-
-import argparse
-import math
-import os
-import subprocess
-import sys
-from dataclasses import dataclass
-from pathlib import Path
-from urllib.parse import urlparse
-
-import numpy as np
-from qibo import Circuit, gates, hamiltonians
-from qibo.symbols import X, Y, Z
-
-ROOT = Path(__file__).resolve().parents[1]
-SRC = ROOT / "src"
-if str(SRC) not in sys.path:
-    sys.path.insert(0, str(SRC))
-
-from qibotn.expectation_runner import (  # noqa: E402
-    ExpectationConfig,
-    exact_for_observable,
-    run_cpu_expectation,
-)
-
-
-@dataclass(frozen=True)
-class CaseSpec:
-    circuit_kind: str
-    observables: tuple[str, ...]
-    nqubits: int
-    nlayers: int
-    seed: int
-    target_slices: int | None = None
-
-
-CASES = {
-    "main1": CaseSpec(
-        circuit_kind="rxx_rzz_chain",
-        observables=("ring_xz",),
-        nqubits=37,
-        nlayers=20,
-        seed=31001,
-        target_slices=None,
-    ),
-    "main2": CaseSpec(
-        circuit_kind="scramble_chain",
-        observables=("open_zz", "range2_xx"),
-        nqubits=36,
-        nlayers=18,
-        seed=31002,
-        target_slices=None,
-    ),
-    "strong": CaseSpec(
-        circuit_kind="reversed_cnot",
-        observables=("ring_xz", "long_z_string"),
-        nqubits=40,
-        nlayers=24,
-        seed=41001,
-        target_slices=None,
-    ),
-}
-
-
-def optional_int(text):
-    if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
-        return None
-    return int(text)
-
-
-def optional_float(text):
-    if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
-        return None
-    return float(text)
-
-
-def set_torch_threads(nthreads):
-    try:
-        import torch
-
-        torch.set_num_threads(nthreads)
-    except Exception:
-        pass
-
-
-def add_single_qubit_layer(circuit, nqubits, rng, include_rx=False):
-    for qubit in range(nqubits):
-        circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi)))
-        circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi)))
-        if include_rx:
-            circuit.add(gates.RX(qubit, theta=rng.uniform(-math.pi, math.pi)))
-
-
-def build_circuit(kind, nqubits, nlayers, seed):
-    """Define contest circuits here."""
-    rng = np.random.default_rng(seed)
-    circuit = Circuit(nqubits)
-
-    for layer in range(nlayers):
-        if kind == "rxx_rzz_chain":
-            add_single_qubit_layer(circuit, nqubits, rng, include_rx=True)
-            for qubit in range(layer % 2, nqubits - 1, 2):
-                circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.9, 0.9)))
-                circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.9, 0.9)))
-
-        elif kind == "scramble_chain":
-            add_single_qubit_layer(circuit, nqubits, rng, include_rx=True)
-            for qubit in range(layer % 2, nqubits - 1, 2):
-                circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.8, 0.8)))
-                circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.8, 0.8)))
-                if layer % 5 == 4:
-                    circuit.add(gates.SWAP(qubit, qubit + 1))
-
-        elif kind == "reversed_cnot":
-            add_single_qubit_layer(circuit, nqubits, rng)
-            for qubit in range(0, nqubits - 1, 2):
-                gate = gates.CNOT(qubit + 1, qubit) if layer % 2 else gates.CNOT(qubit, qubit + 1)
-                circuit.add(gate)
-            for qubit in range(1, nqubits - 1, 2):
-                gate = gates.CNOT(qubit + 1, qubit) if layer % 2 == 0 else gates.CNOT(qubit, qubit + 1)
-                circuit.add(gate)
-
-        else:
-            raise ValueError(f"Unknown circuit kind {kind!r}.")
-
-    return circuit
-
-
-def pauli_sum_observable(kind, nqubits, seed):
-    """Define contest observables here.
-
-    TN path currently expects Pauli products / SymbolicHamiltonian terms.
-    Keep production contest observables Hermitian unless complex output is
-    explicitly required by the scoring rule.
-    """
-    del seed
-    if kind == "ring_xz":
-        form = 0
-        for qubit in range(nqubits):
-            form += 0.5 * X(qubit) * Z((qubit + 1) % nqubits)
-        return hamiltonians.SymbolicHamiltonian(form=form)
-
-    if kind == "open_zz":
-        form = 0
-        for qubit in range(nqubits - 1):
-            form += (1.0 / max(1, nqubits - 1)) * Z(qubit) * Z(qubit + 1)
-        return hamiltonians.SymbolicHamiltonian(form=form)
-
-    if kind == "range2_xx":
-        form = 0
-        for qubit in range(nqubits - 2):
-            form += (1.0 / max(1, nqubits - 2)) * X(qubit) * X(qubit + 2)
-        return hamiltonians.SymbolicHamiltonian(form=form)
-
-    if kind == "long_z_string":
-        stride = max(1, nqubits // 16)
-        form = None
-        for qubit in range(0, nqubits, stride):
-            form = Z(qubit) if form is None else form * Z(qubit)
-        return hamiltonians.SymbolicHamiltonian(form=form)
-
-    if kind == "mixed_local":
-        q1 = nqubits // 4
-        q2 = nqubits // 2
-        q3 = (3 * nqubits) // 4
-        form = 0.25 * X(0) - 0.5 * Z(nqubits - 1)
-        form += 0.125 * X(q1) * Z(q2) * Y(q3)
-        return hamiltonians.SymbolicHamiltonian(form=form)
-
-    raise ValueError(f"Unknown observable kind {kind!r}.")
-
-
-def tree_path(tree_dir, case_name, obs_name, nqubits, nlayers, target_slices):
-    slice_label = "auto" if target_slices is None else f"s{target_slices}"
-    return (
-        Path(tree_dir)
-        / f"{case_name}_{obs_name}_{nqubits}q{nlayers}l_{slice_label}.pkl"
-    )
-
-
-def build_parallel_opts(args, tree_file=None, search_only=False):
-    slicing_opts = {}
-    if args.tn_target_slices is not None:
-        slicing_opts["target_slices"] = args.tn_target_slices
-    if args.tn_target_size is not None:
-        slicing_opts["target_size"] = args.tn_target_size
-
-    opts = {
-        "slicing_opts": slicing_opts or None,
-        "search_workers": args.tn_search_workers or args.torch_threads,
-        "max_repeats": args.tn_search_repeats,
-        "max_time": args.tn_search_time,
-        "print_stats": False,
-    }
-    if args.tn_search_backend is not None:
-        opts["search_backend"] = args.tn_search_backend
-    if args.dask_address is not None:
-        opts["dask_address"] = args.dask_address
-    if args.dask_expected_workers is not None:
-        opts["dask_expected_workers"] = args.dask_expected_workers
-    if args.dask_close_workers:
-        opts["dask_close_workers"] = True
-    if args.tn_debug_trials:
-        opts["debug_trials"] = True
-    if search_only:
-        opts["search_only"] = True
-        opts["save_tree_path"] = str(tree_file)
-    elif tree_file is not None:
-        opts["load_tree_path"] = str(tree_file)
-    return opts
-
-
-def run_one(args, case_name, obs_name, mode):
-    case = CASES[case_name]
-    circuit = build_circuit(case.circuit_kind, args.nqubits, args.nlayers, args.seed)
-    observable = pauli_sum_observable(obs_name, args.nqubits, args.seed)
-    path = tree_path(
-        args.tree_dir,
-        case_name,
-        obs_name,
-        args.nqubits,
-        args.nlayers,
-        args.tn_target_slices,
-    )
-    path.parent.mkdir(parents=True, exist_ok=True)
-
-    rank = 0
-    if args.mpi:
-        from mpi4py import MPI
-
-        rank = MPI.COMM_WORLD.Get_rank()
-
-    if rank == 0:
-        print("=" * 88, flush=True)
-        print(
-            f"mode={mode} case={case_name} circuit={case.circuit_kind} "
-            f"observable={obs_name} nqubits={args.nqubits} nlayers={args.nlayers} "
-            f"seed={args.seed} gates={len(circuit.queue)} tree={path}",
-            flush=True,
-        )
-
-    if mode == "contract" and not path.exists():
-        raise FileNotFoundError(f"Missing tree file: {path}. Run search first.")
-
-    exact = None
-    if args.exact and rank == 0 and mode != "search":
-        if args.nqubits > args.exact_max_qubits:
-            raise ValueError(
-                f"--exact is limited to {args.exact_max_qubits} qubits by default."
-            )
-        exact = exact_for_observable(circuit, observable, args.nqubits)
-
-    config = ExpectationConfig(
-        ansatz="tn",
-        mpi=args.mpi,
-        bond=args.bond,
-        cut_ratio=args.cut_ratio,
-        tensor_module="torch",
-        quimb_backend=args.quimb_backend,
-        dtype=args.dtype,
-        torch_threads=args.torch_threads,
-        parallel_opts=build_parallel_opts(
-            args,
-            tree_file=path,
-            search_only=(mode == "search"),
-        ),
-    )
-    result = run_cpu_expectation(circuit, observable, config)
-    if args.mpi and result.rank != 0:
-        return
-
-    if mode == "search":
-        print(f"searched observable={obs_name} tree={path}", flush=True)
-    else:
-        abs_error = float("nan") if exact is None else abs(result.value - exact)
-        rel_error = float("nan") if exact is None else abs_error / max(abs(exact), 1e-15)
-        exact_text = "nan" if exact is None else f"{exact:.16e}"
-        print(
-            f"result observable={obs_name} exact={exact_text} "
-            f"value={result.value:.16e} abs_error={abs_error:.6e} "
-            f"rel_error={rel_error:.6e} seconds={result.seconds:.3f}",
-            flush=True,
-        )
-
-    for stat in result.parallel_stats or ():
-        cost = stat["path_cost"]
-        search_stats = stat.get("search_stats", {})
-        print(
-            "tn_term_summary "
-            f"observable={obs_name} "
-            f"term={stat.get('term_index', 0)} "
-            f"search_seconds={stat.get('search_seconds', float('nan')):.3f} "
-            f"contract_seconds={stat.get('contract_seconds', float('nan')):.3f} "
-            f"completed_trials={search_stats.get('completed_trials', 'na')} "
-            f"finite_trials={search_stats.get('finite_trials', 'na')} "
-            f"failed_trials={search_stats.get('failed_trials', 'na')} "
-            f"requested_trials={search_stats.get('requested_trials', 'na')} "
-            f"best_score={search_stats.get('best_score', float('nan')):.6g} "
-            f"slices={cost.get('nslices')} "
-            f"log10_flops={cost.get('log10_flops', float('nan')):.3f} "
-            f"log10_write={cost.get('log10_write', float('nan')):.3f} "
-            f"log2_size={cost.get('log2_size', float('nan')):.3f} "
-            f"peak_memory_gib={cost.get('peak_memory_gib', float('nan')):.3g} "
-            f"rank_slices={stat.get('rank_slices')}",
-            flush=True,
-        )
-
-
-def selected_observables(args, case):
-    if args.observables:
-        return tuple(args.observables)
-    if args.obs_filter:
-        return tuple(x.strip() for x in args.obs_filter.split(",") if x.strip())
-    return case.observables
-
-
-def apply_case_defaults(args):
-    case = CASES[args.case]
-    if args.nqubits is None:
-        args.nqubits = case.nqubits
-    if args.nlayers is None:
-        args.nlayers = case.nlayers
-    if args.seed is None:
-        args.seed = case.seed
-    if args.tn_target_slices is None:
-        args.tn_target_slices = case.target_slices
-    args.observables = selected_observables(args, case)
-
-
-def stop_dask_cluster(args):
-    if args.keep_dask or args.tn_search_backend != "dask" or not args.dask_address:
-        return
-    if args.mpi:
-        from mpi4py import MPI
-
-        if MPI.COMM_WORLD.Get_rank() != 0:
-            return
-    script = ROOT / "tools" / "manage_tn_dask_cluster.sh"
-    if not script.exists():
-        print(f"dask_stop_skipped reason=missing_script path={script}", flush=True)
-        return
-
-    env = os.environ.copy()
-    parsed = urlparse(args.dask_address)
-    if parsed.hostname:
-        env.setdefault("SCHEDULER_HOST", parsed.hostname)
-    if parsed.port:
-        env.setdefault("SCHEDULER_PORT", str(parsed.port))
-
-    print("dask_stop_after_search start", flush=True)
-    subprocess.run([str(script), "stop"], cwd=str(ROOT), env=env, check=False)
-    print("dask_stop_after_search done", flush=True)
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("mode", choices=("search", "contract", "all", "validate", "list"))
-    parser.add_argument("--case", choices=sorted(CASES), default="main1")
-    parser.add_argument("--observables", nargs="+")
-    parser.add_argument("--obs-filter", default="")
-    parser.add_argument("--tree-dir", default="trees/contest_tn")
-    parser.add_argument("--nqubits", type=int)
-    parser.add_argument("--nlayers", type=int)
-    parser.add_argument("--seed", type=int)
-    parser.add_argument("--mpi", action="store_true")
-    parser.add_argument("--exact", action="store_true")
-    parser.add_argument("--exact-max-qubits", type=int, default=24)
-    parser.add_argument("--bond", "--bonds", dest="bond", type=optional_int, default=1024)
-    parser.add_argument("--cut-ratio", type=optional_float, default=1e-12)
-    parser.add_argument("--torch-threads", type=int, default=8)
-    parser.add_argument("--quimb-backend", choices=("numpy", "torch"), default="torch")
-    parser.add_argument("--dtype", choices=("complex128", "complex64"), default="complex64")
-    parser.add_argument("--tn-target-slices", type=int)
-    parser.add_argument("--tn-target-size", type=int, default=2**34)
-    parser.add_argument("--tn-search-workers", type=int)
-    parser.add_argument("--tn-search-repeats", type=int, default=2048)
-    parser.add_argument("--tn-search-time", type=float, default=300.0)
-    parser.add_argument(
-        "--tn-search-backend",
-        choices=("processpool", "dask"),
-        default="dask",
-        help=(
-            "Path-search backend. Defaults to dask. Without --dask-address, "
-            "non-MPI search starts a local dask cluster."
-        ),
-    )
-    parser.add_argument("--dask-address")
-    parser.add_argument("--dask-expected-workers", type=int)
-    parser.add_argument("--dask-close-workers", action="store_true")
-    parser.add_argument(
-        "--keep-dask",
-        action="store_true",
-        help=(
-            "Keep an external dask cluster running after search. By default, "
-            "tools/manage_tn_dask_cluster.sh stop is called after search when "
-            "--dask-address is used."
-        ),
-    )
-    parser.add_argument(
-        "--tn-debug-trials",
-        action="store_true",
-        help="Print dask worker summary and per-trial start/done logs.",
-    )
-    parser.add_argument("--no-tn-stats", action="store_true")
-    args = parser.parse_args()
-
-    if args.mode == "list":
-        for name, case in CASES.items():
-            print(
-                f"{name}: circuit={case.circuit_kind} "
-                f"observables={','.join(case.observables)} "
-                f"nqubits={case.nqubits} nlayers={case.nlayers} "
-                f"seed={case.seed} target_slices={case.target_slices}"
-            )
-        return
-
-    apply_case_defaults(args)
-    set_torch_threads(args.torch_threads)
-
-    modes = ("search", "contract") if args.mode == "all" else (args.mode,)
-    if args.mode == "validate":
-        args.exact = True
-        args.nqubits = min(args.nqubits, args.exact_max_qubits)
-        modes = ("search", "contract")
-
-    for mode in modes:
-        for obs_name in args.observables:
-            run_one(args, args.case, obs_name, mode)
-        if mode == "search":
-            stop_dask_cluster(args)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tools/torch_profile_tn_complex64.py b/tools/torch_profile_tn_complex64.py
deleted file mode 100644
index b7392f9..0000000
--- a/tools/torch_profile_tn_complex64.py
+++ /dev/null
@@ -1,114 +0,0 @@
-"""Run the 34q/20L TN complex64 benchmark under torch.profiler briefly."""
-
-from __future__ import annotations
-
-import argparse
-import os
-import signal
-import sys
-from pathlib import Path
-
-from mpi4py import MPI
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--seconds", type=float, default=30.0)
-    parser.add_argument("--out-dir", default="torch_profiles/tn_complex64")
-    parser.add_argument("--torch-threads", type=int, default=48)
-    args = parser.parse_args()
-
-    repo_root = Path(__file__).resolve().parents[1]
-    os.chdir(repo_root)
-    sys.path.insert(0, str(repo_root))
-
-    import torch
-    from torch.profiler import ProfilerActivity, profile
-
-    comm = MPI.COMM_WORLD
-    rank = comm.Get_rank()
-    size = comm.Get_size()
-    out_dir = Path(args.out_dir)
-    if rank == 0:
-        out_dir.mkdir(parents=True, exist_ok=True)
-    comm.Barrier()
-
-    torch.set_num_threads(args.torch_threads)
-
-    def run_benchmark():
-        import benchmark_cpu_expectation
-
-        sys.argv = [
-            "benchmark_cpu_expectation.py",
-            "--mpi",
-            "--ansatz",
-            "tn",
-            "--nqubits",
-            "34",
-            "--nlayers",
-            "20",
-            "--circuits",
-            "rxx_rzz",
-            "--pauli-pattern",
-            "XZ",
-            "--tn-load-tree",
-            "trees/rxx_rzz_34q20l_s4.pkl",
-            "--quimb-backend",
-            "torch",
-            "--torch-threads",
-            str(args.torch_threads),
-            "--dtype",
-            "complex64",
-        ]
-        benchmark_cpu_expectation.main()
-
-    trace_path = out_dir / f"rank{rank}_trace.json"
-    stacks_path = out_dir / f"rank{rank}_stacks.txt"
-    summary_path = out_dir / f"rank{rank}_summary.txt"
-
-    prof = profile(
-        activities=[ProfilerActivity.CPU],
-        record_shapes=True,
-        profile_memory=True,
-        with_stack=True,
-    )
-
-    class ProfileTimeout(Exception):
-        pass
-
-    def alarm_handler(signum, frame):
-        raise ProfileTimeout()
-
-    old_handler = signal.signal(signal.SIGALRM, alarm_handler)
-    signal.setitimer(signal.ITIMER_REAL, args.seconds)
-    try:
-        with prof:
-            try:
-                run_benchmark()
-            except ProfileTimeout:
-                pass
-    finally:
-        signal.setitimer(signal.ITIMER_REAL, 0)
-        signal.signal(signal.SIGALRM, old_handler)
-
-    prof.export_chrome_trace(str(trace_path))
-    try:
-        prof.export_stacks(str(stacks_path), "self_cpu_time_total")
-    except Exception as exc:  # pragma: no cover - diagnostic only
-        stacks_path.write_text(f"export_stacks failed: {exc}\n", encoding="utf-8")
-
-    summary = prof.key_averages(group_by_stack_n=5).table(
-        sort_by="self_cpu_time_total",
-        row_limit=40,
-    )
-    summary_path.write_text(summary, encoding="utf-8")
-
-    print(
-        f"torch_profile_done rank={rank}/{size} "
-        f"trace={trace_path} summary={summary_path}",
-        flush=True,
-    )
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tools/validate_vidal_mpi_correctness.py b/tools/validate_vidal_mpi_correctness.py
deleted file mode 100644
index bce8e2d..0000000
--- a/tools/validate_vidal_mpi_correctness.py
+++ /dev/null
@@ -1,202 +0,0 @@
-"""Correctness checks for the Vidal/TEBD MPS fast path.
-
-The cases here intentionally cover more than the benchmark ring-XZ observable:
-different nearest-neighbor gate orientations and several Pauli-sum observables.
-Run serially to compare qibojit/statevector vs Vidal, or under MPI to compare
-the segmented Vidal executor.
-"""
-
-from __future__ import annotations
-
-import argparse
-import math
-import time
-
-import numpy as np
-import torch
-from qibo import Circuit, gates
-
-from qibotn.backends.vidal_mpi_segment import SegmentVidalMPIExecutor
-from qibotn.backends.vidal_tebd import VidalTEBDExecutor
-
-
-def build_circuit(kind, nqubits, nlayers, seed):
-    rng = np.random.default_rng(seed)
-    circuit = Circuit(nqubits)
-    for layer in range(nlayers):
-        for q in range(nqubits):
-            circuit.add(gates.RY(q, theta=rng.uniform(-math.pi, math.pi)))
-            circuit.add(gates.RZ(q, theta=rng.uniform(-math.pi, math.pi)))
-            if kind == "rx_ry_cz":
-                circuit.add(gates.RX(q, theta=rng.uniform(-math.pi, math.pi)))
-
-        if kind in ("brickwall", "reversed_cnot"):
-            for q in range(0, nqubits - 1, 2):
-                if kind == "reversed_cnot" and (layer % 2):
-                    circuit.add(gates.CNOT(q + 1, q))
-                else:
-                    circuit.add(gates.CNOT(q, q + 1))
-            for q in range(1, nqubits - 1, 2):
-                if kind == "reversed_cnot" and not (layer % 2):
-                    circuit.add(gates.CNOT(q + 1, q))
-                else:
-                    circuit.add(gates.CNOT(q, q + 1))
-        elif kind == "rx_ry_cz":
-            for q in range(layer % 2, nqubits - 1, 2):
-                circuit.add(gates.CZ(q, q + 1))
-        else:
-            raise ValueError(f"Unknown circuit kind {kind!r}.")
-    return circuit
-
-
-def observable_terms(kind, nqubits):
-    if kind == "ring_xz":
-        return [
-            (0.5, (("X", site), ("Z", (site + 1) % nqubits)))
-            for site in range(nqubits)
-        ]
-    if kind == "open_zz":
-        return [
-            (1.0 / (nqubits - 1), (("Z", site), ("Z", site + 1)))
-            for site in range(nqubits - 1)
-        ]
-    if kind == "mixed_local":
-        terms = [(0.25, (("X", 0),)), (-0.5, (("Z", nqubits - 1),))]
-        terms += [
-            (0.125, (("Y", site), ("Y", site + 1)))
-            for site in range(0, nqubits - 1, 3)
-        ]
-        return terms
-    raise ValueError(f"Unknown observable kind {kind!r}.")
-
-
-def exact_pauli_sum(circuit, terms, nqubits):
-    state = circuit().state(numpy=True).reshape(-1)
-    indices = np.arange(state.size, dtype=np.int64)
-    value = 0.0 + 0.0j
-    for coeff, ops in terms:
-        flipped = indices.copy()
-        phase = np.ones(state.size, dtype=np.complex128)
-        for name, site in ops:
-            shift = nqubits - 1 - site
-            bit = (indices >> shift) & 1
-            name = name.upper()
-            if name == "X":
-                flipped ^= 1 << shift
-            elif name == "Y":
-                flipped ^= 1 << shift
-                phase *= 1j * (1 - 2 * bit)
-            elif name == "Z":
-                phase *= 1 - 2 * bit
-            elif name != "I":
-                raise ValueError(f"Unsupported Pauli {name!r}.")
-        value += coeff * np.vdot(state[flipped], phase * state)
-    return float(value.real)
-
-
-def run_vidal(circuit, terms, nqubits, bond, tensor_module):
-    executor = VidalTEBDExecutor(
-        nqubits=nqubits,
-        max_bond=bond,
-        cut_ratio=1e-12,
-        tensor_module=tensor_module,
-    )
-    executor.run_circuit(circuit)
-    return float(executor.expectation_pauli_sum(terms))
-
-
-def run_segment_mpi(circuit, terms, nqubits, bond, tensor_module, comm):
-    executor = SegmentVidalMPIExecutor(
-        nqubits=nqubits,
-        max_bond=bond,
-        cut_ratio=1e-12,
-        tensor_module=tensor_module,
-        comm=comm,
-    )
-    executor.run_circuit(circuit)
-    return executor.expectation_pauli_sum_root(terms)
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--nqubits", type=int, default=16)
-    parser.add_argument("--nlayers", type=int, default=6)
-    parser.add_argument("--bond", "--bonds", dest="bond", type=int, default=512)
-    parser.add_argument("--seed", type=int, default=42)
-    parser.add_argument("--tensor-module", choices=("torch", "numpy"), default="torch")
-    parser.add_argument("--torch-threads", type=int, default=32)
-    parser.add_argument("--mpi", action="store_true")
-    parser.add_argument(
-        "--circuits",
-        nargs="+",
-        default=("brickwall", "reversed_cnot", "rx_ry_cz"),
-    )
-    parser.add_argument(
-        "--observables",
-        nargs="+",
-        default=("ring_xz", "open_zz", "mixed_local"),
-    )
-    args = parser.parse_args()
-
-    torch.set_num_threads(args.torch_threads)
-    comm = None
-    rank = 0
-    size = 1
-    if args.mpi:
-        from mpi4py import MPI
-
-        comm = MPI.COMM_WORLD
-        rank = comm.Get_rank()
-        size = comm.Get_size()
-
-    if rank == 0:
-        mode = f"vidal-segment-mpi/{size}" if args.mpi else "vidal"
-        print(
-            f"mode={mode} nqubits={args.nqubits} nlayers={args.nlayers} "
-            f"bond={args.bond} tensor_module={args.tensor_module}"
-        )
-        print("circuit observable exact value abs_error seconds")
-
-    for circuit_kind in args.circuits:
-        circuit = build_circuit(circuit_kind, args.nqubits, args.nlayers, args.seed)
-        exact = None
-        if rank == 0:
-            exact_values = {
-                obs: exact_pauli_sum(
-                    circuit, observable_terms(obs, args.nqubits), args.nqubits
-                )
-                for obs in args.observables
-            }
-        else:
-            exact_values = None
-        if comm is not None:
-            exact_values = comm.bcast(exact_values, root=0)
-
-        for obs_kind in args.observables:
-            terms = observable_terms(obs_kind, args.nqubits)
-            start = time.perf_counter()
-            if args.mpi:
-                value = run_segment_mpi(
-                    circuit,
-                    terms,
-                    args.nqubits,
-                    args.bond,
-                    args.tensor_module,
-                    comm,
-                )
-            else:
-                value = run_vidal(
-                    circuit, terms, args.nqubits, args.bond, args.tensor_module
-                )
-            if rank != 0:
-                continue
-            elapsed = time.perf_counter() - start
-            exact = exact_values[obs_kind]
-            print(
-                f"{circuit_kind} {obs_kind} {exact:.16e} {value:.16e} "
-                f"{abs(value - exact):.6e} {elapsed:.3f}"
-            )
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tools/vidal_mpi_contest_runner.py b/tools/vidal_mpi_contest_runner.py
deleted file mode 100644
index 405f47c..0000000
--- a/tools/vidal_mpi_contest_runner.py
+++ /dev/null
@@ -1,209 +0,0 @@
-from __future__ import annotations
-
-import argparse
-import math
-import time
-
-import numpy as np
-from mpi4py import MPI
-from qibo import Circuit, gates, hamiltonians
-from qibo.symbols import X, Y, Z
-
-from qibotn.backends.vidal import VidalBackend
-
-
-def optional_int(text):
-    if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
-        return None
-    return int(text)
-
-
-def optional_float(text):
-    if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
-        return None
-    return float(text)
-
-
-def format_optional(value, fmt="g"):
-    return "None" if value is None else format(value, fmt)
-
-
-def set_torch_threads(nthreads):
-    try:
-        import torch
-
-        torch.set_num_threads(nthreads)
-    except Exception:
-        pass
-
-
-def build_circuit(kind, nqubits, nlayers, seed):
-    rng = np.random.default_rng(seed)
-    circuit = Circuit(nqubits)
-
-    for layer in range(nlayers):
-        for q in range(nqubits):
-            circuit.add(gates.RY(q, theta=rng.uniform(-math.pi, math.pi)))
-            circuit.add(gates.RZ(q, theta=rng.uniform(-math.pi, math.pi)))
-            if kind in ("rxx_rzz", "scramble"):
-                circuit.add(gates.RX(q, theta=rng.uniform(-math.pi, math.pi)))
-
-        if kind == "reversed_cnot":
-            for q in range(0, nqubits - 1, 2):
-                circuit.add(gates.CNOT(q + 1, q) if layer % 2 else gates.CNOT(q, q + 1))
-            for q in range(1, nqubits - 1, 2):
-                circuit.add(gates.CNOT(q + 1, q) if layer % 2 == 0 else gates.CNOT(q, q + 1))
-        elif kind == "rxx_rzz":
-            for q in range(layer % 2, nqubits - 1, 2):
-                circuit.add(gates.RXX(q, q + 1, theta=rng.uniform(-0.9, 0.9)))
-                circuit.add(gates.RZZ(q, q + 1, theta=rng.uniform(-0.9, 0.9)))
-        elif kind == "scramble":
-            for q in range(layer % 2, nqubits - 1, 2):
-                circuit.add(gates.RXX(q, q + 1, theta=rng.uniform(-0.8, 0.8)))
-                circuit.add(gates.RZZ(q, q + 1, theta=rng.uniform(-0.8, 0.8)))
-                if layer % 5 == 4:
-                    circuit.add(gates.SWAP(q, q + 1))
-        else:
-            raise ValueError(f"Unknown circuit kind {kind!r}.")
-
-    return circuit
-
-
-def ring_xz(nqubits):
-    form = 0
-    for q in range(nqubits):
-        form += 0.5 * X(q) * Z((q + 1) % nqubits)
-    return hamiltonians.SymbolicHamiltonian(form=form)
-
-
-def open_zz(nqubits):
-    form = 0
-    for q in range(nqubits - 1):
-        form += (1.0 / (nqubits - 1)) * Z(q) * Z(q + 1)
-    return hamiltonians.SymbolicHamiltonian(form=form)
-
-
-def range2_xx(nqubits):
-    form = 0
-    for q in range(nqubits - 2):
-        form += (1.0 / (nqubits - 2)) * X(q) * X(q + 2)
-    return hamiltonians.SymbolicHamiltonian(form=form)
-
-
-def dense_observable(nqubits, qubits, seed, dim):
-    rng = np.random.default_rng(seed)
-    raw = rng.normal(size=(dim, dim)) + 1j * rng.normal(size=(dim, dim))
-    matrix = (raw + raw.conj().T) / 2.0
-    matrix = matrix / np.linalg.norm(matrix)
-    return {"matrix": matrix, "qubits": list(qubits)}
-
-
-def observables_for_case(nqubits, seed):
-    q1 = nqubits // 4
-    q2 = nqubits // 2
-    q3 = (3 * nqubits) // 4
-    last = nqubits - 1
-
-    return [
-        ("boundary_ZZ_q1", hamiltonians.SymbolicHamiltonian(form=Z(q1 - 1) * Z(q1))),
-        ("boundary_ZZ_q2", hamiltonians.SymbolicHamiltonian(form=Z(q2 - 1) * Z(q2))),
-        ("boundary_ZZ_q3", hamiltonians.SymbolicHamiltonian(form=Z(q3 - 1) * Z(q3))),
-        (
-            "long_Z_5_sites",
-            hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(q1) * Z(q2) * Z(q3) * Z(last)),
-        ),
-        (
-            "mixed_XZYZX",
-            hamiltonians.SymbolicHamiltonian(form=X(0) * Z(q1) * Y(q2) * Z(q3) * X(last)),
-        ),
-        ("ring_xz", ring_xz(nqubits)),
-        ("open_zz", open_zz(nqubits)),
-        ("range2_xx", range2_xx(nqubits)),
-        ("complex_iZ0", hamiltonians.SymbolicHamiltonian(form=1.0j * Z(0))),
-        ("dense2_mid", dense_observable(nqubits, (q2 - 1, q2), seed + 101, 4)),
-        ("dense3_spread", dense_observable(nqubits, (q1, q2, q3), seed + 202, 8)),
-    ]
-
-
-def run_case(args):
-    set_torch_threads(args.torch_threads)
-    comm = MPI.COMM_WORLD
-    rank = comm.Get_rank()
-    size = comm.Get_size()
-
-    circuit = build_circuit(args.kind, args.nqubits, args.nlayers, args.seed)
-    observables = observables_for_case(args.nqubits, args.seed)
-    if args.obs_filter:
-        wanted = set(args.obs_filter.split(","))
-        observables = [(name, obs) for name, obs in observables if name in wanted]
-        if not observables:
-            raise ValueError(f"OBS_FILTER matched no observables: {args.obs_filter!r}")
-
-    if rank == 0:
-        print("=" * 88, flush=True)
-        print(
-            "case "
-            f"label={args.label} kind={args.kind} ranks={size} "
-            f"nqubits={args.nqubits} nlayers={args.nlayers} gates={len(circuit.queue)} "
-            f"bond={format_optional(args.bond)} "
-            f"cut_ratio={format_optional(args.cut_ratio)} "
-            f"torch_threads={args.torch_threads} seed={args.seed} "
-            f"obs_filter={args.obs_filter or 'all'}",
-            flush=True,
-        )
-        print(
-            "observable value seconds trunc_sum trunc_max status",
-            flush=True,
-        )
-
-    for obs_name, observable in observables:
-        backend = VidalBackend()
-        backend.configure_tn_simulation(
-            max_bond_dimension=args.bond,
-            cut_ratio=args.cut_ratio,
-            tensor_module="torch",
-            mpi_approach="CT",
-            mpi_num_procs=size,
-            fallback=False,
-        )
-
-        comm.Barrier()
-        start = time.perf_counter()
-        try:
-            value = backend.expectation(
-                circuit,
-                observable,
-                preprocess=True,
-                compile_circuit=False,
-            )
-            status = "ok"
-        except Exception as exc:  # pragma: no cover - printed for manual runs
-            value = np.nan
-            status = type(exc).__name__ + ":" + str(exc).split("\n", 1)[0]
-        seconds = time.perf_counter() - start
-
-        if rank == 0:
-            print(
-                f"{obs_name} {value!r} {seconds:.3f} "
-                f"{backend.last_truncation_error:.6e} "
-                f"{backend.last_max_truncation_error:.6e} {status}",
-                flush=True,
-            )
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--label", required=True)
-    parser.add_argument("--kind", choices=("reversed_cnot", "rxx_rzz", "scramble"), required=True)
-    parser.add_argument("--nqubits", type=int, required=True)
-    parser.add_argument("--nlayers", type=int, required=True)
-    parser.add_argument("--bond", type=optional_int, required=True)
-    parser.add_argument("--cut-ratio", type=optional_float, required=True)
-    parser.add_argument("--seed", type=int, required=True)
-    parser.add_argument("--torch-threads", type=int, required=True)
-    parser.add_argument("--obs-filter", default="")
-    run_case(parser.parse_args())
-
-
-if __name__ == "__main__":
-    main()
diff --git a/trees/contest_tn/main1_long_z_string_34q20l_auto.pkl b/trees/contest_tn/main1_long_z_string_34q20l_auto.pkl
deleted file mode 100644
index e41f1f5..0000000
Binary files a/trees/contest_tn/main1_long_z_string_34q20l_auto.pkl and /dev/null differ