diff --git a/docs/contest_runners.md b/docs/contest_runners.md
index 8db6ddf..5298328 100644
--- a/docs/contest_runners.md
+++ b/docs/contest_runners.md
@@ -1,30 +1,40 @@
 # TN
 ```bash
-# qibotn目录下
-I_MPI_FABRICS=shm:ofi \
-I_MPI_OFI_PROVIDER=tcp \
-FI_PROVIDER=tcp \
+# search + contract，Open MPI 多节点：每节点 2 rank，每 rank 绑定 1 个 NUMA。
+# MPI_HOSTS 里每个节点写 :2，MPI_RANKS = 节点数 * 2。
+# 每个 rank 使用 MPI_PE 个 core；这台 2-NUMA AMD 节点用 MPI_PE=128。
+
+NQUBITS=40 \
+TN_DEBUG_TRIALS=1 \
+SCHEDULER_HOST=10.20.1.100 \
+DASK_ADDRESS=tcp://10.20.1.100:8786 \
+WORKER_HOSTS="10.20.1.100 10.20.1.101 10.20.1.102 10.20.1.103" \
 CASE=main1 \
 OBSERVABLES=long_z_string \
-NQUBITS=34 \
-NLAYERS=20 \
-TORCH_THREADS=48 \
-SEARCH_REPEATS=2048 \
-SEARCH_TIME=300 \
-SCHEDULER_HOST=10.20.1.103 \
-WORKER_HOSTS="10.20.1.103 10.20.6.101" \
-DASK_ADDRESS="tcp://10.20.1.103:8786" \
-NWORKERS=84 \
-NTHREADS=1 \
-MPIEXEC_FULL="mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2" \
+TORCH_THREADS=80 \
+MPI_PE=80 \
+MPI_MAP_BY=ppr:1:numa:PE=80 \
+MPI_BIND_TO=core \
+OMP_NUM_THREADS=80 \
+MKL_NUM_THREADS=80 \
+BLIS_NUM_THREADS=80 \
+MPI_HOSTS="node-0:2,node-1:2,node-2:2,node-3:2" \
+MPI_RANKS=8 \
+NWORKERS=96 \
+TN_TARGET_SIZE=17179869184 \
 tools/run_tn_dask_mpi_all.sh
 
 # 单独缩并contract计算
 
-I_MPI_FABRICS=shm:ofi \
-I_MPI_OFI_PROVIDER=tcp \
-FI_PROVIDER=tcp \
-mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2 \
+mpirun --map-by ppr:1:numa:PE=80 --bind-to core --report-bindings \
+  -x LD_PRELOAD=/home/aocc/aocl/5.2.0/aocc/lib_LP64/libblis-mt.so.5 \
+  -x BLIS_NUM_THREADS=80 \
+  -x OMP_NUM_THREADS=80 \
+  -x MKL_NUM_THREADS=80 \
+  -x OMP_PROC_BIND=close \
+  -x OMP_PLACES=cores \
+  -np 8 \
+  -host node-0:2,node-1:2,node-2:2,node-3:2 \
   .venv/bin/python -u tools/tn_contest_runner.py contract \
   --mpi \
   --case main1 \
@@ -32,22 +42,47 @@ mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2 \
   --nlayers 20 \
   --observables long_z_string \
   --tree-dir trees/contest_tn \
-  --torch-threads 48 \
+  --torch-threads 80 \
   --dtype complex64
 ```
 
 # MPS
 ```
-cd /home/yx/qibotn
+cd /home/qibo/qibotn
 
-I_MPI_FABRICS=shm:ofi \
-I_MPI_OFI_PROVIDER=tcp \
-FI_PROVIDER=tcp \
-MPIEXEC_FULL="mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2" \
+MPIEXEC=mpirun \
+MPI_HOSTS="node-2:4,node-3:4" \
+MPI_RANKS=8 \
+MPI_PE=48 \
+MPI_MAP_BY=ppr:2:numa:PE=48 \
+MPI_BIND_TO=core \
+MPI_REPORT_BINDINGS=1 \
 TORCH_THREADS=48 \
+OMP_NUM_THREADS=48 \
+MKL_NUM_THREADS=48 \
+BLIS_NUM_THREADS=48 \
 OBS_FILTER=ring_xz \
 MAIN1_NQ=128 \
 MAIN1_LAYERS=24 \
 MAIN1_BOND=1024 \
 tools/run_vidal_mpi_contest_cases.sh main1
-```
\ No newline at end of file
+
+
+
+MPIEXEC=mpirun \
+MPI_HOSTS="node-2:4" \
+MPI_RANKS=4 \
+MPI_PE=48 \
+MPI_MAP_BY=ppr:2:numa:PE=48 \
+MPI_BIND_TO=core \
+MPI_REPORT_BINDINGS=1 \
+TORCH_THREADS=48 \
+OMP_NUM_THREADS=48 \
+MKL_NUM_THREADS=48 \
+BLIS_NUM_THREADS=48 \
+OBS_FILTER=ring_xz \
+MAIN1_NQ=128 \
+MAIN1_LAYERS=24 \
+MAIN1_BOND=1024 \
+tools/run_vidal_mpi_contest_cases.sh main1
+```
diff --git a/docs/xianchang.md b/docs/xianchang.md
new file mode 100644
index 0000000..57411cc
--- /dev/null
+++ b/docs/xianchang.md
@@ -0,0 +1,42 @@
+mpirun --map-by ppr:1:numa:PE=80 --bind-to core --report-bindings \
+  -x LD_PRELOAD=/home/aocc/aocl/5.2.0/aocc/lib_LP64/libblis-mt.so.5 \
+  -x BLIS_NUM_THREADS=80 \
+  -x OMP_NUM_THREADS=80 \
+  -x MKL_NUM_THREADS=80 \
+  -x OMP_PROC_BIND=close \
+  -x OMP_PLACES=cores \
+  -np 4 \
+  -host node-0:2,node-1:2,node-2:2,node-3:2 \
+  .venv/bin/python -u tools/tn_contest_runner.py contract \
+  --mpi \
+  --case main1 \
+  --nqubits 34 \
+  --nlayers 20 \
+  --observables long_z_string \
+  --tree-dir trees/contest_tn \
+  --torch-threads 80 \
+  --dtype complex64
+
+
+SEARCH_TIME=300  NQUBITS=40 TN_DEBUG_TRIALS=1 SCHEDULER_HOST=10.20.1.102 DASK_ADDRESS=tcp://10.20.1.102:8786 WORKER_HOSTS="10.20.1.102 10.20.1.103" CASE=main1 OBSERVABLES=long_z_string TORCH_THREADS=80 MPI_PE=80 MPI_MAP_BY=ppr:1:numa:PE=80 MPI_BIND_TO=core OMP_NUM_THREADS=80 MKL_NUM_THREADS=80 BLIS_NUM_THREADS=80 MPI_HOSTS="node-2:2,node-3:2" MPI_RANKS=4 NWORKERS=128  TN_TARGET_SIZE=17179869184 tools/run_tn_dask_mpi_all.sh
+
+
+NQUBITS=40 \
+TN_DEBUG_TRIALS=1 \
+SCHEDULER_HOST=10.20.1.102 \
+DASK_ADDRESS=tcp://10.20.1.102:8786 \
+WORKER_HOSTS="10.20.1.102 10.20.1.103" \
+CASE=main1 \
+OBSERVABLES=long_z_string \
+TORCH_THREADS=80 \
+MPI_PE=80 \
+MPI_MAP_BY=ppr:1:numa:PE=80 \
+MPI_BIND_TO=core \
+OMP_NUM_THREADS=80 \
+MKL_NUM_THREADS=80 \
+BLIS_NUM_THREADS=80 \
+MPI_HOSTS="node-2:2,node-3:2" \
+MPI_RANKS=4 \
+NWORKERS=96 \
+TN_TARGET_SIZE=17179869184 \
+tools/run_tn_dask_mpi_all.sh
\ No newline at end of file
diff --git a/hostfile b/hostfile
index 19358eb..ea699da 100644
--- a/hostfile
+++ b/hostfile
@@ -1,2 +1,4 @@
-10.20.1.103:2
-10.20.6.101:2
+10.20.1.100
+10.20.1.101
+10.20.1.102
+10.20.1.103
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..7ac26d8
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,138 @@
+alembic==1.18.4
+annotated-types==0.7.0
+antlr4-python3-runtime==4.13.2
+anyio==4.13.0
+asttokens==3.0.1
+attrs==26.1.0
+autoray==0.8.10
+beautifulsoup4==4.14.3
+certifi==2026.4.22
+cffi==2.0.0
+charset-normalizer==3.4.7
+click==8.3.3
+cloudpickle==3.1.2
+cma==3.4.0
+colorlog==6.10.1
+contourpy==1.3.3
+cotengra==0.7.5
+coverage==7.13.5
+cryptography==47.0.0
+cycler==0.12.1
+cytoolz==1.1.0
+dask==2026.3.0
+decorator==5.2.1
+dill==0.4.1
+distributed==2026.3.0
+executing==2.2.1
+filelock==3.25.2
+fonttools==4.62.1
+fsspec==2026.2.0
+greenlet==3.3.2
+h11==0.16.0
+h5py==3.16.0
+html5lib==1.1
+httpcore==1.0.9
+httpx==0.27.2
+httpx-sse==0.4.3
+idna==3.13
+igraph==1.0.0
+iniconfig==2.3.0
+ipython==8.39.0
+jedi==0.19.2
+Jinja2==3.1.6
+joblib==1.5.3
+jsonschema==4.26.0
+jsonschema-specifications==2025.9.1
+kahypar==1.3.7
+kiwisolver==1.5.0
+llvmlite==0.44.0
+locket==1.0.0
+lxml==6.1.0
+Mako==1.3.10
+markdownify==1.2.2
+MarkupSafe==3.0.3
+matplotlib==3.10.8
+matplotlib-inline==0.2.1
+mcp==1.27.0
+mcp-server-fetch==2025.4.7
+mpi4py==4.1.1
+mpmath==1.3.0
+msgpack==1.1.2
+networkx==3.6.1
+numba==0.61.2
+numpy==2.0.1
+openqasm3==1.0.1
+opt_einsum==3.4.0
+optuna==4.8.0
+packaging==26.0
+parso==0.8.6
+partd==1.4.2
+pexpect==4.9.0
+pillow==12.2.0
+pluggy==1.6.0
+prompt_toolkit==3.0.52
+Protego==0.6.0
+protobuf==7.34.1
+psutil==5.9.8
+ptyprocess==0.7.0
+pure_eval==0.2.3
+py-spy==0.4.2
+pycparser==3.0
+pydantic==2.13.3
+pydantic-settings==2.14.0
+pydantic_core==2.46.3
+Pygments==2.20.0
+PyJWT==2.12.1
+pyparsing==3.3.2
+pytest==9.0.3
+pytest-cov==7.1.0
+pytest-env==1.6.0
+python-dateutil==2.9.0.post0
+python-dotenv==1.2.2
+python-multipart==0.0.26
+PyYAML==6.0.3
+qibo==0.3.2
+qibojit==0.1.15
+-e git+https://git.nudt.space/jaunatisblue/qibotn.git@4c7a10d026d514897dcc501b507fa604fb4e52d4#egg=qibotn
+qiskit==1.4.5
+qmatchatea==1.5.8
+qredtea==0.3.15
+qtealeaves==1.7.32
+quimb==1.13.0
+ray==2.55.1
+readabilipy==0.3.0
+referencing==0.37.0
+regex==2026.4.4
+requests==2.33.1
+rpds-py==0.30.0
+rustworkx==0.17.1
+scipy==1.17.1
+setuptools==70.2.0
+six==1.17.0
+sniffio==1.3.1
+sortedcontainers==2.4.0
+soupsieve==2.8.3
+SQLAlchemy==2.0.49
+sse-starlette==3.4.1
+stack-data==0.6.3
+starlette==1.0.0
+stevedore==5.7.0
+symengine==0.13.0
+sympy==1.13.1
+tabulate==0.9.0
+tblib==3.2.2
+texttable==1.7.0
+threadpoolctl==3.6.0
+toolz==1.1.0
+torch @ file:///home/qibo/qibotn/wheels/torch-2.10.0a0+a36e1d3-cp312-cp312-linux_x86_64.whl
+tornado==6.5.5
+tqdm==4.67.3
+traitlets==5.14.3
+typing-inspection==0.4.2
+typing_extensions==4.15.0
+urllib3==2.6.3
+uvicorn==0.46.0
+wcwidth==0.6.0
+webencodings==0.5.1
+zict==3.0.0
+
diff --git a/run_vidal_mps_cases.sh b/run_vidal_mps_cases.sh
index 66db610..93d0268 100755
--- a/run_vidal_mps_cases.sh
+++ b/run_vidal_mps_cases.sh
@@ -20,6 +20,7 @@ MPI_THREADS="${MPI_THREADS:-12}"
 
 export OMP_NUM_THREADS="${OMP_NUM_THREADS:-1}"
 export MKL_NUM_THREADS="${MKL_NUM_THREADS:-1}"
+source "$ROOT_DIR/tools/qibotn_torch_mt_env.sh"
 
 run() {
   echo
diff --git a/src/qibotn/backends/cpu.py b/src/qibotn/backends/cpu.py
index 27db0b5..83770a6 100644
--- a/src/qibotn/backends/cpu.py
+++ b/src/qibotn/backends/cpu.py
@@ -420,6 +420,7 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
         search_time = opts.get("max_time", 60)
         search_backend = opts.get("search_backend")
         dask_address = opts.get("dask_address")
+        dask_expected_workers = opts.get("dask_expected_workers")
         dask_close_workers = bool(opts.get("dask_close_workers", False))
         print_stats = bool(opts.get("print_stats", False))
         debug_trials = bool(opts.get("debug_trials", False))
@@ -502,6 +503,7 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
                     dask_address=dask_address,
                     debug_trials=debug_trials,
                     dask_close_workers=dask_close_workers,
+                    expected_workers=dask_expected_workers,
                 )
                 search_seconds = time.perf_counter() - search_start
             if tree is None:
diff --git a/src/qibotn/parallel.py b/src/qibotn/parallel.py
index 46ecc53..0fd577c 100644
--- a/src/qibotn/parallel.py
+++ b/src/qibotn/parallel.py
@@ -356,6 +356,7 @@ def _dask_search(
     optlib=None,
     debug_trials=False,
     close_workers=False,
+    expected_workers=None,
 ):
     """Run one centralized cotengra hyper-optimizer over a dask pool.
 
@@ -403,6 +404,8 @@ def _dask_search(
     retire_workers = []
     try:
         workers, worker_slots = _dask_worker_slots(client)
+        if expected_workers is not None:
+            worker_slots = max(worker_slots, int(expected_workers))
         if close_workers:
             retire_workers = list(workers)
         if debug_trials:
@@ -532,7 +535,7 @@ def parallel_path_search(tn, output_inds, method='processpool', total_repeats=10
                          max_time=300, n_workers=48, slicing_opts=None,
                          trial_timeout=None, search_backend=None,
                          dask_address=None, debug_trials=False,
-                         dask_close_workers=False):
+                         dask_close_workers=False, expected_workers=None):
     """Parallel contraction path search.
 
     Args:
@@ -576,6 +579,7 @@ def parallel_path_search(tn, output_inds, method='processpool', total_repeats=10
             n_workers=n_workers,
             debug_trials=debug_trials,
             close_workers=dask_close_workers,
+            expected_workers=expected_workers,
         )
     else:
         raise ValueError(f"Unknown method: {method}")
diff --git a/tools/README.md b/tools/README.md
index db62da6..284a712 100644
--- a/tools/README.md
+++ b/tools/README.md
@@ -16,3 +16,4 @@ Files here are intentionally secondary:
 - `benchmark_tn_mpi.py`, `benchmark_search.py`, `benchmark_slice.py`, `benchmark_contract_sliced.py`, `check_tree.py`: old TN path-search/slicing experiments.
 - `qibojit_reference_expectation.py`: state-vector reference helper.
 - `validate_vidal_mpi_correctness.py`: focused Vidal MPI correctness helper.
+- `mpi_torch_thread_probe.py`: MPI + torch OpenMP affinity and threading probe.
diff --git a/tools/benchmark_qredtea_svd_controls.py b/tools/benchmark_qredtea_svd_controls.py
new file mode 100644
index 0000000..4111c48
--- /dev/null
+++ b/tools/benchmark_qredtea_svd_controls.py
@@ -0,0 +1,157 @@
+#!/usr/bin/env python
+"""Benchmark qredtea/qtealeaves SVD control modes.
+
+This isolates the tensor split used by MPS updates: a rank-2 tensor is split
+with singular values contracted either left or right, then reconstructed to
+measure numerical error and timing.
+"""
+
+from __future__ import annotations
+
+import argparse
+import gc
+import statistics
+import time
+
+import torch
+
+import qmatchatea
+from qredtea.torchapi import QteaTorchTensor
+
+
+def _dtype(name: str):
+    return {
+        "complex64": torch.complex64,
+        "complex128": torch.complex128,
+        "float64": torch.float64,
+        "float32": torch.float32,
+    }[name]
+
+
+def _random_matrix(shape, dtype, seed):
+    gen = torch.Generator(device="cpu")
+    gen.manual_seed(seed)
+    if dtype.is_complex:
+        real_dtype = torch.float32 if dtype == torch.complex64 else torch.float64
+        real = torch.randn(shape, dtype=real_dtype, generator=gen)
+        imag = torch.randn(shape, dtype=real_dtype, generator=gen)
+        return torch.complex(real, imag).to(dtype)
+    return torch.randn(shape, dtype=dtype, generator=gen)
+
+
+def _sync():
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+
+
+def run_one(matrix, ctrl, max_bond, contract_singvals, repeats):
+    conv = qmatchatea.QCConvergenceParameters(
+        max_bond_dimension=max_bond,
+        cut_ratio=0.0,
+        svd_ctrl=ctrl,
+    )
+    qtensor = QteaTorchTensor.from_elem_array(matrix, dtype=matrix.dtype, device="cpu")
+
+    times = []
+    rel_error = None
+    kept = None
+    status = "ok"
+    error = ""
+
+    for i in range(repeats):
+        gc.collect()
+        _sync()
+        t0 = time.perf_counter()
+        try:
+            left, right, singvals, _ = qtensor.split_svd(
+                [0],
+                [1],
+                contract_singvals=contract_singvals,
+                conv_params=conv,
+            )
+        except Exception as exc:  # noqa: BLE001 - benchmark should keep going
+            status = "error"
+            error = repr(exc)
+            break
+        _sync()
+        times.append(time.perf_counter() - t0)
+
+        if i == repeats - 1:
+            left_matrix = left.elem.reshape(matrix.shape[0], -1)
+            right_matrix = right.elem.reshape(-1, matrix.shape[1])
+            recon = left_matrix @ right_matrix
+            rel_error = (
+                torch.linalg.vector_norm(matrix - recon)
+                / torch.linalg.vector_norm(matrix)
+            ).item()
+            kept = int(singvals.numel())
+
+    return {
+        "ctrl": ctrl,
+        "contract_singvals": contract_singvals,
+        "status": status,
+        "median_ms": float("nan") if not times else statistics.median(times) * 1000,
+        "min_ms": float("nan") if not times else min(times) * 1000,
+        "rel_error": rel_error,
+        "kept": kept,
+        "error": error,
+    }
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--shapes", nargs="+", default=("256x1024", "1024x256", "512x512"))
+    parser.add_argument("--max-bond", type=int, default=128)
+    parser.add_argument("--dtype", choices=("complex64", "complex128", "float32", "float64"), default="complex128")
+    parser.add_argument("--threads", type=int, default=8)
+    parser.add_argument("--repeats", type=int, default=3)
+    parser.add_argument(
+        "--controls",
+        nargs="+",
+        default=("A", "D", "V", "R", "E", "E!", "X", "X!"),
+    )
+    args = parser.parse_args()
+
+    torch.set_num_threads(args.threads)
+    dtype = _dtype(args.dtype)
+
+    print(
+        "svd_benchmark "
+        f"dtype={args.dtype} threads={torch.get_num_threads()} "
+        f"max_bond={args.max_bond} repeats={args.repeats}",
+        flush=True,
+    )
+    print(
+        "columns shape contract ctrl status median_ms min_ms kept rel_error error",
+        flush=True,
+    )
+
+    for shape_text in args.shapes:
+        m_text, n_text = shape_text.lower().split("x", 1)
+        shape = (int(m_text), int(n_text))
+        matrix = _random_matrix(shape, dtype, seed=sum(shape))
+        for contract_singvals in ("L", "R"):
+            for ctrl in args.controls:
+                result = run_one(
+                    matrix,
+                    ctrl=ctrl,
+                    max_bond=args.max_bond,
+                    contract_singvals=contract_singvals,
+                    repeats=args.repeats,
+                )
+                print(
+                    f"row shape={shape_text} "
+                    f"contract={contract_singvals} "
+                    f"ctrl={ctrl} "
+                    f"status={result['status']} "
+                    f"median_ms={result['median_ms']:.3f} "
+                    f"min_ms={result['min_ms']:.3f} "
+                    f"kept={result['kept']} "
+                    f"rel_error={result['rel_error']} "
+                    f"error={result['error']}",
+                    flush=True,
+                )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/manage_tn_dask_cluster.sh b/tools/manage_tn_dask_cluster.sh
index b91cd84..20c4e01 100755
--- a/tools/manage_tn_dask_cluster.sh
+++ b/tools/manage_tn_dask_cluster.sh
@@ -17,10 +17,10 @@ set -euo pipefail
 #   WORKER_HOSTS="10.20.1.103 10.20.6.101"
 #   NWORKERS=48
 #   NTHREADS=1
-#   ROOT_DIR=/home/yx/qibotn
+#   ROOT_DIR=/home/qibo/qibotn
 #   PYTHON_BIN=.venv/bin/python
 
-ROOT_DIR="${ROOT_DIR:-/home/yx/qibotn}"
+ROOT_DIR="${ROOT_DIR:-/home/qibo/qibotn}"
 PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
 SCHEDULER_HOST="${SCHEDULER_HOST:-10.20.1.103}"
 SCHEDULER_PORT="${SCHEDULER_PORT:-8786}"
diff --git a/tools/mpi_torch_thread_probe.py b/tools/mpi_torch_thread_probe.py
new file mode 100644
index 0000000..7b02104
--- /dev/null
+++ b/tools/mpi_torch_thread_probe.py
@@ -0,0 +1,182 @@
+#!/usr/bin/env python
+"""Probe MPI rank placement and whether torch CPU ops use multiple threads.
+
+Run this under mpirun/mpiexec to check:
+
+* which CPUs each rank is allowed to run on,
+* whether torch sees the requested intra-op thread count, and
+* whether a large CPU tensor op actually consumes more CPU time than wall time.
+
+The script is intentionally small and self-contained so it can be used to debug
+MPI launcher affinity and torch OpenMP behavior independently from the TN code
+path.
+"""
+
+from __future__ import annotations
+
+import argparse
+import os
+import socket
+import time
+from pathlib import Path
+
+from mpi4py import MPI
+
+
+def _dtype_from_name(name):
+    import torch
+
+    mapping = {
+        "float32": torch.float32,
+        "float64": torch.float64,
+        "complex64": torch.complex64,
+        "complex128": torch.complex128,
+    }
+    return mapping[name]
+
+
+def _make_tensor(shape, dtype):
+    import torch
+
+    if dtype in (torch.complex64, torch.complex128):
+        base = torch.float32 if dtype == torch.complex64 else torch.float64
+        return torch.complex(
+            torch.randn(shape, dtype=base),
+            torch.randn(shape, dtype=base),
+        )
+    return torch.randn(shape, dtype=dtype)
+
+
+def _bench(label, fn, iters, warmup=2):
+    for _ in range(warmup):
+        fn()
+
+    start_wall = time.perf_counter()
+    start_cpu = time.process_time()
+    checksum = 0.0
+    for _ in range(iters):
+        value = fn()
+        checksum += float(value)
+    wall = time.perf_counter() - start_wall
+    cpu = time.process_time() - start_cpu
+    ratio = cpu / wall if wall > 0 else float("inf")
+    print(
+        f"{label} wall={wall:.3f}s cpu={cpu:.3f}s cpu_over_wall={ratio:.2f} "
+        f"checksum={checksum:.6e}",
+        flush=True,
+    )
+
+
+def _visible_numa_nodes():
+    nodes = []
+    for path in sorted(Path("/sys/devices/system/node").glob("node[0-9]*")):
+        cpulist = path / "cpulist"
+        if cpulist.exists():
+            nodes.append(f"{path.name}:{cpulist.read_text(encoding='utf-8').strip()}")
+    return ",".join(nodes) if nodes else "unknown"
+
+
+def _dtype_nbytes(name):
+    return {
+        "float32": 4,
+        "float64": 8,
+        "complex64": 8,
+        "complex128": 16,
+    }[name]
+
+
+def _format_gib(nbytes):
+    return f"{nbytes / (1024 ** 3):.2f}GiB"
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--threads", type=int, default=48)
+    parser.add_argument("--n", type=int, default=4096)
+    parser.add_argument("--iters", type=int, default=4)
+    parser.add_argument("--dtype", choices=("float32", "float64", "complex64", "complex128"), default="float32")
+    parser.add_argument("--op", choices=("matmul", "tensordot", "both"), default="both")
+    parser.add_argument(
+        "--affinity-only",
+        action="store_true",
+        help="Print MPI/torch placement diagnostics without allocating tensors.",
+    )
+    args = parser.parse_args()
+
+    os.environ.setdefault("OMP_NUM_THREADS", str(args.threads))
+    os.environ.setdefault("MKL_NUM_THREADS", str(args.threads))
+    os.environ.setdefault("OMP_PROC_BIND", "close")
+    os.environ.setdefault("OMP_PLACES", "cores")
+
+    import torch
+
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    size = comm.Get_size()
+
+    torch.set_num_threads(args.threads)
+    try:
+        torch.set_num_interop_threads(1)
+    except Exception:
+        pass
+
+    dtype = _dtype_from_name(args.dtype)
+    affinity = sorted(os.sched_getaffinity(0))
+    allowed_list = ""
+    try:
+        with open("/proc/self/status", encoding="utf-8") as f:
+            for line in f:
+                if line.startswith("Cpus_allowed_list:"):
+                    allowed_list = line.split(":", 1)[1].strip()
+                    break
+    except OSError:
+        pass
+
+    print(
+        f"rank={rank}/{size} host={socket.gethostname()} pid={os.getpid()} "
+        f"affinity_len={len(affinity)} allowed={allowed_list} "
+        f"torch_threads={torch.get_num_threads()} "
+        f"torch_interop={torch.get_num_interop_threads()} "
+        f"OMP_NUM_THREADS={os.environ.get('OMP_NUM_THREADS')} "
+        f"MKL_NUM_THREADS={os.environ.get('MKL_NUM_THREADS')} "
+        f"OMP_PROC_BIND={os.environ.get('OMP_PROC_BIND')} "
+        f"OMP_PLACES={os.environ.get('OMP_PLACES')} "
+        f"visible_numa={_visible_numa_nodes()}",
+        flush=True,
+    )
+
+    if rank == 0:
+        print(torch.__config__.parallel_info(), flush=True)
+        input_bytes = args.n * args.n * _dtype_nbytes(args.dtype)
+        min_live_bytes = 3 * input_bytes
+        print(
+            f"matrix_n={args.n} dtype={args.dtype} "
+            f"one_matrix={_format_gib(input_bytes)} "
+            f"approx_min_live_per_rank={_format_gib(min_live_bytes)} "
+            f"approx_min_live_all_ranks={_format_gib(min_live_bytes * size)}",
+            flush=True,
+        )
+    comm.Barrier()
+    if args.affinity_only:
+        return
+
+    a = _make_tensor((args.n, args.n), dtype)
+    b = _make_tensor((args.n, args.n), dtype)
+
+    def run_matmul():
+        value = (a @ b).sum()
+        return value.real.item() if value.is_complex() else value.item()
+
+    def run_tensordot():
+        value = torch.tensordot(a, b, dims=1)
+        value = value.sum()
+        return value.real.item() if value.is_complex() else value.item()
+
+    if args.op in ("matmul", "both"):
+        _bench("matmul", run_matmul, args.iters)
+    if args.op in ("tensordot", "both"):
+        _bench("tensordot", run_tensordot, args.iters)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/qibotn_torch_mt_env.sh b/tools/qibotn_torch_mt_env.sh
new file mode 100644
index 0000000..838cdef
--- /dev/null
+++ b/tools/qibotn_torch_mt_env.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# Shared runtime setup for CPU torch TN/MPS runs.
+#
+# This makes AOCL BLIS use the multithreaded library when available, which is
+# required for complex64 tensordot/cgemm to actually use all cores on this host.
+
+QIBOTN_BLIS_MT="${QIBOTN_BLIS_MT:-/home/aocc/aocl/5.2.0/aocc/lib_LP64/libblis-mt.so.5}"
+
+export BLIS_NUM_THREADS="${BLIS_NUM_THREADS:-${OMP_NUM_THREADS:-1}}"
+
+if [[ -f "$QIBOTN_BLIS_MT" ]]; then
+  case ":${LD_PRELOAD:-}:" in
+    *":$QIBOTN_BLIS_MT:"*)
+      ;;
+    *)
+      export LD_PRELOAD="${LD_PRELOAD:+$LD_PRELOAD:}$QIBOTN_BLIS_MT"
+      ;;
+  esac
+fi
+
+export OMP_PROC_BIND="${OMP_PROC_BIND:-close}"
+export OMP_PLACES="${OMP_PLACES:-cores}"
diff --git a/tools/run_cpu_large_cases.sh b/tools/run_cpu_large_cases.sh
index ba02363..59be311 100755
--- a/tools/run_cpu_large_cases.sh
+++ b/tools/run_cpu_large_cases.sh
@@ -21,6 +21,7 @@ TN_THREADS="${TN_THREADS:-8}"
 
 export OMP_NUM_THREADS="${OMP_NUM_THREADS:-1}"
 export MKL_NUM_THREADS="${MKL_NUM_THREADS:-1}"
+source "$ROOT_DIR/tools/qibotn_torch_mt_env.sh"
 
 run_mpi() {
   local ranks="$1"
diff --git a/tools/run_cpu_single_cases.sh b/tools/run_cpu_single_cases.sh
index 720dbc9..b7f23e7 100755
--- a/tools/run_cpu_single_cases.sh
+++ b/tools/run_cpu_single_cases.sh
@@ -22,6 +22,7 @@ TN_THREADS="${TN_THREADS:-12}"
 
 export OMP_NUM_THREADS="${OMP_NUM_THREADS:-1}"
 export MKL_NUM_THREADS="${MKL_NUM_THREADS:-1}"
+source "$ROOT_DIR/tools/qibotn_torch_mt_env.sh"
 
 estimate_mps_memory() {
   local nqubits="$1"
diff --git a/tools/run_tn_dask_mpi_all.sh b/tools/run_tn_dask_mpi_all.sh
index c273534..b4ba0d1 100755
--- a/tools/run_tn_dask_mpi_all.sh
+++ b/tools/run_tn_dask_mpi_all.sh
@@ -11,25 +11,165 @@ NLAYERS="${NLAYERS:-20}"
 TORCH_THREADS="${TORCH_THREADS:-48}"
 SEARCH_REPEATS="${SEARCH_REPEATS:-2048}"
 SEARCH_TIME="${SEARCH_TIME:-300}"
-TN_TARGET_SIZE="${TN_TARGET_SIZE:-8589934592}"
+TN_TARGET_SIZE="${TN_TARGET_SIZE:-17179869184}"
 TN_TARGET_SLICES="${TN_TARGET_SLICES:-}"
 
 PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
 DTYPE="${DTYPE:-complex64}"
 TREE_DIR="${TREE_DIR:-trees/contest_tn}"
 DASK_ADDRESS="${DASK_ADDRESS:-tcp://10.20.1.103:8786}"
-MPIEXEC_FULL="${MPIEXEC_FULL:-mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2}"
+DASK_EXPECTED_WORKERS="${DASK_EXPECTED_WORKERS:-}"
+DASK_WAIT_FOR_WORKERS="${DASK_WAIT_FOR_WORKERS:-1}"
+DASK_WAIT_TIMEOUT="${DASK_WAIT_TIMEOUT:-600}"
+TN_DEBUG_TRIALS="${TN_DEBUG_TRIALS:-0}"
+MPIEXEC="${MPIEXEC:-mpirun}"
+MPIEXEC_FULL="${MPIEXEC_FULL:-}"
+MPI_HOSTS="${MPI_HOSTS:-}"
+MPI_HOSTFILE="${MPI_HOSTFILE:-${HOSTFILE:-}}"
+MPI_RANKS="${MPI_RANKS:-}"
+MPI_PE="${MPI_PE:-$TORCH_THREADS}"
+MPI_MAP_BY="${MPI_MAP_BY:-ppr:1:numa:PE=$MPI_PE}"
+MPI_BIND_TO="${MPI_BIND_TO:-core}"
+MPI_REPORT_BINDINGS="${MPI_REPORT_BINDINGS:-0}"
+MPI_EXPORT_ENV="${MPI_EXPORT_ENV:-1}"
+TN_CONTRACT_ENV_CHECK="${TN_CONTRACT_ENV_CHECK:-1}"
 SYNC_TREES="${SYNC_TREES:-1}"
 SYNC_HOSTS="${SYNC_HOSTS:-${WORKER_HOSTS:-}}"
 SSH_BIN="${SSH_BIN:-ssh}"
+DASK_CLUSTER_MANAGED="${DASK_CLUSTER_MANAGED:-0}"
 
 export TCM_ENABLE="${TCM_ENABLE:-1}"
+export OMP_NUM_THREADS="${OMP_NUM_THREADS:-$TORCH_THREADS}"
+export MKL_NUM_THREADS="${MKL_NUM_THREADS:-$TORCH_THREADS}"
+source "$ROOT_DIR/tools/qibotn_torch_mt_env.sh"
 
 tn_slice_args=(--tn-target-size "$TN_TARGET_SIZE")
 if [[ -n "$TN_TARGET_SLICES" ]]; then
   tn_slice_args+=(--tn-target-slices "$TN_TARGET_SLICES")
 fi
 
+cleanup_dask_cluster() {
+  local status=$?
+  if [[ "$DASK_CLUSTER_MANAGED" == "1" ]]; then
+    set +e
+    tools/manage_tn_dask_cluster.sh stop >/dev/null 2>&1 || true
+  fi
+  exit "$status"
+}
+
+trap cleanup_dask_cluster EXIT INT TERM HUP
+
+sum_host_slots() {
+  local hosts="$1"
+  local total=0
+  local item slots
+  IFS=',' read -r -a host_items <<< "$hosts"
+  for item in "${host_items[@]}"; do
+    if [[ "$item" == *:* ]]; then
+      slots="${item##*:}"
+    else
+      slots=1
+    fi
+    total=$((total + slots))
+  done
+  echo "$total"
+}
+
+count_hosts() {
+  local hosts="$1"
+  local count=0
+  local item
+  IFS=' ' read -r -a host_items <<< "$hosts"
+  for item in "${host_items[@]}"; do
+    [[ -n "$item" ]] && count=$((count + 1))
+  done
+  echo "$count"
+}
+
+wait_for_dask_workers() {
+  [[ "$DASK_WAIT_FOR_WORKERS" == "1" ]] || return 0
+  local expected="$DASK_EXPECTED_WORKERS"
+  if [[ -z "$expected" && -n "$WORKER_HOSTS" ]]; then
+    expected=$(( $(count_hosts "$WORKER_HOSTS") * NWORKERS ))
+  fi
+  if [[ -z "$expected" || "$expected" -le 0 ]]; then
+    return 0
+  fi
+
+  echo "Waiting for Dask workers: expected=$expected timeout=${DASK_WAIT_TIMEOUT}s"
+  "$PYTHON_BIN" - "$DASK_ADDRESS" "$expected" "$DASK_WAIT_TIMEOUT" <<'PY'
+import sys
+import time
+from distributed import Client
+
+address, expected, timeout = sys.argv[1], int(sys.argv[2]), int(sys.argv[3])
+deadline = time.time() + timeout
+client = Client(address)
+try:
+    while True:
+        info = client.scheduler_info(n_workers=-1)
+        workers = info.get("workers", {})
+        count = len(workers)
+        if count >= expected:
+            print(f"dask_workers_ready count={count} expected={expected}", flush=True)
+            break
+        if time.time() >= deadline:
+            print(
+                f"dask_workers_wait_timeout count={count} expected={expected}",
+                flush=True,
+            )
+            break
+        time.sleep(2)
+finally:
+    client.close()
+PY
+}
+
+append_mpi_env_args() {
+  [[ "$MPI_EXPORT_ENV" == "1" ]] || return 0
+  mpi_prefix+=(
+    -x "LD_PRELOAD=${LD_PRELOAD:-}"
+    -x "BLIS_NUM_THREADS=$BLIS_NUM_THREADS"
+    -x "OMP_NUM_THREADS=$OMP_NUM_THREADS"
+    -x "MKL_NUM_THREADS=$MKL_NUM_THREADS"
+    -x "OMP_PROC_BIND=$OMP_PROC_BIND"
+    -x "OMP_PLACES=$OMP_PLACES"
+  )
+}
+
+build_mpi_prefix() {
+  if [[ -n "$MPIEXEC_FULL" ]]; then
+    # shellcheck disable=SC2206
+    mpi_prefix=($MPIEXEC_FULL)
+    append_mpi_env_args
+    return
+  fi
+
+  local ranks="$MPI_RANKS"
+  if [[ -z "$ranks" && -n "$MPI_HOSTS" ]]; then
+    ranks="$(sum_host_slots "$MPI_HOSTS")"
+  fi
+  if [[ -z "$ranks" ]]; then
+    ranks=2
+  fi
+
+  mpi_prefix=(
+    "$MPIEXEC"
+    --map-by "$MPI_MAP_BY"
+    --bind-to "$MPI_BIND_TO"
+    -np "$ranks"
+  )
+  if [[ "$MPI_REPORT_BINDINGS" == "1" ]]; then
+    mpi_prefix+=(--report-bindings)
+  fi
+  append_mpi_env_args
+  if [[ -n "$MPI_HOSTS" ]]; then
+    mpi_prefix+=(-host "$MPI_HOSTS")
+  elif [[ -n "$MPI_HOSTFILE" ]]; then
+    mpi_prefix+=(-hostfile "$MPI_HOSTFILE")
+  fi
+}
+
 is_local_host() {
   local host="$1"
   [[ "$host" == "localhost" || "$host" == "127.0.0.1" ]] && return 0
@@ -62,25 +202,52 @@ sync_trees_to_hosts() {
 }
 
 tools/manage_tn_dask_cluster.sh start
+DASK_CLUSTER_MANAGED=1
+wait_for_dask_workers
 
 echo "Search with dask: $DASK_ADDRESS"
-"$PYTHON_BIN" -u tools/tn_contest_runner.py search \
-  --case "$CASE" \
-  --nqubits "$NQUBITS" \
-  --nlayers "$NLAYERS" \
-  --observables $OBSERVABLES \
-  --tree-dir "$TREE_DIR" \
-  --dask-address "$DASK_ADDRESS" \
-  --torch-threads "$TORCH_THREADS" \
-  --dtype "$DTYPE" \
-  --tn-search-repeats "$SEARCH_REPEATS" \
-  --tn-search-time "$SEARCH_TIME" \
+search_args=(
+  --case "$CASE"
+  --nqubits "$NQUBITS"
+  --nlayers "$NLAYERS"
+  --observables $OBSERVABLES
+  --tree-dir "$TREE_DIR"
+  --dask-address "$DASK_ADDRESS"
+  --torch-threads "$TORCH_THREADS"
+  --dtype "$DTYPE"
+  --tn-search-repeats "$SEARCH_REPEATS"
+  --tn-search-time "$SEARCH_TIME"
   "${tn_slice_args[@]}"
+)
+if [[ -n "$DASK_EXPECTED_WORKERS" ]]; then
+  search_args+=(--dask-expected-workers "$DASK_EXPECTED_WORKERS")
+fi
+if [[ "$TN_DEBUG_TRIALS" == "1" ]]; then
+  search_args+=(--tn-debug-trials)
+fi
+"$PYTHON_BIN" -u tools/tn_contest_runner.py search "${search_args[@]}"
 
 sync_trees_to_hosts
 
-echo "Contract with MPI: $MPIEXEC_FULL"
-read -r -a mpi_prefix <<< "$MPIEXEC_FULL"
+build_mpi_prefix
+echo "Contract with MPI: ${mpi_prefix[*]}"
+if [[ "$TN_CONTRACT_ENV_CHECK" == "1" ]]; then
+  "${mpi_prefix[@]}" "$PYTHON_BIN" -c "from mpi4py import MPI; import os; \
+import torch; \
+rank = MPI.COMM_WORLD.Get_rank(); \
+blis = []; \
+[blis.append(line.strip().split()[-1]) for line in open('/proc/self/maps') if 'libblis' in line and line.strip().split()[-1] not in blis]; \
+print('tn_contract_env ' + \
+      f'rank={rank} ' + \
+      f'LD_PRELOAD={os.environ.get(\"LD_PRELOAD\", \"\")} ' + \
+      f'BLIS_NUM_THREADS={os.environ.get(\"BLIS_NUM_THREADS\", \"\")} ' + \
+      f'OMP_NUM_THREADS={os.environ.get(\"OMP_NUM_THREADS\", \"\")} ' + \
+      f'MKL_NUM_THREADS={os.environ.get(\"MKL_NUM_THREADS\", \"\")} ' + \
+      f'OMP_PROC_BIND={os.environ.get(\"OMP_PROC_BIND\", \"\")} ' + \
+      f'OMP_PLACES={os.environ.get(\"OMP_PLACES\", \"\")} ' + \
+      f'torch_threads={torch.get_num_threads()} ' + \
+      f'blis={\";\".join(blis) if blis else \"missing\"}', flush=True)"
+fi
 "${mpi_prefix[@]}" "$PYTHON_BIN" -u tools/tn_contest_runner.py contract \
   --mpi \
   --case "$CASE" \
diff --git a/tools/run_vidal_mpi_contest_cases.sh b/tools/run_vidal_mpi_contest_cases.sh
index f2524e7..cee84a4 100755
--- a/tools/run_vidal_mpi_contest_cases.sh
+++ b/tools/run_vidal_mpi_contest_cases.sh
@@ -11,10 +11,15 @@ set -euo pipefail
 #
 # Common overrides:
 #   PYTHON_BIN=.venv/bin/python
-#   MPIEXEC=mpiexec
-#   MPIEXEC_FULL="mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2"
+#   MPIEXEC=mpirun
+#   MPI_HOSTS="node-1:2,node-2:2,node-3:2,node-0:2"
+#   MPI_RANKS=8
+#   MPI_PE=128
+#   MPI_MAP_BY=ppr:1:numa:PE=128
+#   MPI_BIND_TO=core
+#   MPIEXEC_FULL="mpirun --map-by ppr:1:numa:PE=128 --bind-to core -np 8 -host node-1:2,node-2:2,node-3:2,node-0:2"
 #   HOSTFILE=hostfile                 # optional; used only if the file exists
-#   RANKS=8
+#   RANKS=8                           # fallback if MPI_RANKS is not set
 #   TORCH_THREADS=8
 #   CUT_RATIO=1e-12
 #   OBS_FILTER="boundary_ZZ_q2 ring_xz dense3_spread complex_iZ0"
@@ -28,12 +33,23 @@ ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
 cd "$ROOT_DIR"
 
 PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
-MPIEXEC="${MPIEXEC:-mpiexec}"
-HOSTFILE="${HOSTFILE:-}"
+MPIEXEC="${MPIEXEC:-mpirun}"
+MPIEXEC_FULL="${MPIEXEC_FULL:-}"
+MPI_HOSTS="${MPI_HOSTS:-}"
+MPI_HOSTFILE="${MPI_HOSTFILE:-${HOSTFILE:-}}"
+MPI_RANKS="${MPI_RANKS:-${RANKS:-}}"
 RANKS="${RANKS:-4}"
 TORCH_THREADS="${TORCH_THREADS:-1}"
+MPI_PE="${MPI_PE:-$TORCH_THREADS}"
+MPI_MAP_BY="${MPI_MAP_BY:-ppr:1:numa:PE=$MPI_PE}"
+MPI_BIND_TO="${MPI_BIND_TO:-core}"
+MPI_REPORT_BINDINGS="${MPI_REPORT_BINDINGS:-0}"
+MPI_EXPORT_ENV="${MPI_EXPORT_ENV:-1}"
 CUT_RATIO="${CUT_RATIO:-1e-12}"
 OBS_FILTER="${OBS_FILTER:-}"
+export OMP_NUM_THREADS="${OMP_NUM_THREADS:-$TORCH_THREADS}"
+export MKL_NUM_THREADS="${MKL_NUM_THREADS:-$TORCH_THREADS}"
+source "$ROOT_DIR/tools/qibotn_torch_mt_env.sh"
 
 RUNNER_DIR="$ROOT_DIR/.tmp"
 mkdir -p "$RUNNER_DIR"
@@ -238,15 +254,68 @@ if __name__ == "__main__":
     main()
 PY
 
-if [[ -n "${MPIEXEC_FULL:-}" ]]; then
-  read -r -a mpi_prefix <<< "$MPIEXEC_FULL"
-else
-  mpi_prefix=("$MPIEXEC")
-  if [[ -n "$HOSTFILE" && -f "$HOSTFILE" ]]; then
-    mpi_prefix+=("-hostfile" "$HOSTFILE")
+sum_host_slots() {
+  local hosts="$1"
+  local total=0
+  local item slots
+  IFS=',' read -r -a host_items <<< "$hosts"
+  for item in "${host_items[@]}"; do
+    if [[ "$item" == *:* ]]; then
+      slots="${item##*:}"
+    else
+      slots=1
+    fi
+    total=$((total + slots))
+  done
+  echo "$total"
+}
+
+append_mpi_env_args() {
+  [[ "$MPI_EXPORT_ENV" == "1" ]] || return 0
+  mpi_prefix+=(
+    -x "LD_PRELOAD=${LD_PRELOAD:-}"
+    -x "BLIS_NUM_THREADS=$BLIS_NUM_THREADS"
+    -x "OMP_NUM_THREADS=$OMP_NUM_THREADS"
+    -x "MKL_NUM_THREADS=$MKL_NUM_THREADS"
+    -x "OMP_PROC_BIND=$OMP_PROC_BIND"
+    -x "OMP_PLACES=$OMP_PLACES"
+  )
+}
+
+build_mpi_prefix() {
+  if [[ -n "$MPIEXEC_FULL" ]]; then
+    # shellcheck disable=SC2206
+    mpi_prefix=($MPIEXEC_FULL)
+    append_mpi_env_args
+    return
   fi
-  mpi_prefix+=("-n" "$RANKS")
-fi
+
+  local ranks="$MPI_RANKS"
+  if [[ -z "$ranks" && -n "$MPI_HOSTS" ]]; then
+    ranks="$(sum_host_slots "$MPI_HOSTS")"
+  fi
+  if [[ -z "$ranks" ]]; then
+    ranks="$RANKS"
+  fi
+
+  mpi_prefix=(
+    "$MPIEXEC"
+    --map-by "$MPI_MAP_BY"
+    --bind-to "$MPI_BIND_TO"
+    -np "$ranks"
+  )
+  if [[ "$MPI_REPORT_BINDINGS" == "1" ]]; then
+    mpi_prefix+=(--report-bindings)
+  fi
+  append_mpi_env_args
+  if [[ -n "$MPI_HOSTS" ]]; then
+    mpi_prefix+=(-host "$MPI_HOSTS")
+  elif [[ -n "$MPI_HOSTFILE" ]]; then
+    mpi_prefix+=(-hostfile "$MPI_HOSTFILE")
+  fi
+}
+
+build_mpi_prefix
 
 run_case() {
   local label="$1"
@@ -323,7 +392,12 @@ Cases:
 Common overrides:
   PYTHON_BIN=.venv/bin/python
   MPIEXEC=mpiexec
-  MPIEXEC_FULL="mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2"
+  MPI_HOSTS="node-1:2,node-2:2,node-3:2,node-0:2"
+  MPI_RANKS=8
+  MPI_PE=128
+  MPI_MAP_BY=ppr:1:numa:PE=128
+  MPI_BIND_TO=core
+  MPIEXEC_FULL="mpirun --map-by ppr:1:numa:PE=128 --bind-to core -np 8 -host node-1:2,node-2:2,node-3:2,node-0:2"
   HOSTFILE=hostfile
   RANKS=8
   TORCH_THREADS=8
diff --git a/tools/tn_contest_runner.py b/tools/tn_contest_runner.py
index 40de960..06ff913 100644
--- a/tools/tn_contest_runner.py
+++ b/tools/tn_contest_runner.py
@@ -47,7 +47,7 @@ CASES = {
     "main1": CaseSpec(
         circuit_kind="rxx_rzz_chain",
         observables=("ring_xz",),
-        nqubits=34,
+        nqubits=37,
         nlayers=20,
         seed=31001,
         target_slices=None,
@@ -205,6 +205,8 @@ def build_parallel_opts(args, tree_file=None, search_only=False):
         opts["search_backend"] = args.tn_search_backend
     if args.dask_address is not None:
         opts["dask_address"] = args.dask_address
+    if args.dask_expected_workers is not None:
+        opts["dask_expected_workers"] = args.dask_expected_workers
     if args.dask_close_workers:
         opts["dask_close_workers"] = True
     if args.tn_debug_trials:
@@ -378,7 +380,7 @@ def main():
     parser.add_argument("--quimb-backend", choices=("numpy", "torch"), default="torch")
     parser.add_argument("--dtype", choices=("complex128", "complex64"), default="complex64")
     parser.add_argument("--tn-target-slices", type=int)
-    parser.add_argument("--tn-target-size", type=int, default=2**32)
+    parser.add_argument("--tn-target-size", type=int, default=2**34)
     parser.add_argument("--tn-search-workers", type=int)
     parser.add_argument("--tn-search-repeats", type=int, default=2048)
     parser.add_argument("--tn-search-time", type=float, default=300.0)
@@ -392,6 +394,7 @@ def main():
         ),
     )
     parser.add_argument("--dask-address")
+    parser.add_argument("--dask-expected-workers", type=int)
     parser.add_argument("--dask-close-workers", action="store_true")
     parser.add_argument(
         "--keep-dask",
diff --git a/trees/contest_tn/main1_long_z_string_34q20l_auto.pkl b/trees/contest_tn/main1_long_z_string_34q20l_auto.pkl
index 55ac205..e41f1f5 100644
Binary files a/trees/contest_tn/main1_long_z_string_34q20l_auto.pkl and b/trees/contest_tn/main1_long_z_string_34q20l_auto.pkl differ
diff --git a/trees/contest_tn/main1_ring_xz_8q2l_s1.pkl b/trees/contest_tn/main1_ring_xz_8q2l_s1.pkl
deleted file mode 100644
index f2180f4..0000000
Binary files a/trees/contest_tn/main1_ring_xz_8q2l_s1.pkl and /dev/null differ
diff --git a/trees/contest_tn/smoke_rxx_rzz_34q20l_xz_auto.pkl b/trees/contest_tn/smoke_rxx_rzz_34q20l_xz_auto.pkl
deleted file mode 100644
index 9c221b7..0000000
Binary files a/trees/contest_tn/smoke_rxx_rzz_34q20l_xz_auto.pkl and /dev/null differ
diff --git a/trees/contest_tn/smoke_rxx_rzz_34q20l_xz_repeat192.pkl b/trees/contest_tn/smoke_rxx_rzz_34q20l_xz_repeat192.pkl
deleted file mode 100644
index 34f7300..0000000
Binary files a/trees/contest_tn/smoke_rxx_rzz_34q20l_xz_repeat192.pkl and /dev/null differ
diff --git a/trees/contest_tn/smoke_rxx_rzz_34q20l_xz_timeout_stop.pkl b/trees/contest_tn/smoke_rxx_rzz_34q20l_xz_timeout_stop.pkl
deleted file mode 100644
index dd1b7a6..0000000
Binary files a/trees/contest_tn/smoke_rxx_rzz_34q20l_xz_timeout_stop.pkl and /dev/null differ
diff --git a/trees/rxx_rzz_30q20l.pkl b/trees/rxx_rzz_30q20l.pkl
deleted file mode 100644
index ea1d3e3..0000000
Binary files a/trees/rxx_rzz_30q20l.pkl and /dev/null differ
diff --git a/trees/rxx_rzz_30q20l_from_existing_s2_check.pkl b/trees/rxx_rzz_30q20l_from_existing_s2_check.pkl
deleted file mode 100644
index cee2158..0000000
Binary files a/trees/rxx_rzz_30q20l_from_existing_s2_check.pkl and /dev/null differ
diff --git a/trees/rxx_rzz_30q20l_from_existing_s4.pkl b/trees/rxx_rzz_30q20l_from_existing_s4.pkl
deleted file mode 100644
index cee2158..0000000
Binary files a/trees/rxx_rzz_30q20l_from_existing_s4.pkl and /dev/null differ
diff --git a/trees/rxx_rzz_34q20l_s4.pkl b/trees/rxx_rzz_34q20l_s4.pkl
deleted file mode 100644
index f09fef0..0000000
Binary files a/trees/rxx_rzz_34q20l_s4.pkl and /dev/null differ