代码封装

修改库
简化代码；加入.venv下内容
2026-05-18 22:58:57 +08:00 · 2026-05-17 23:02:14 +08:00 · 2026-05-18 02:47:40 +08:00 · 2026-05-18 01:37:19 +08:00 · 2026-05-15 11:11:20 +08:00 · 2026-05-15 09:32:26 +08:00
53 changed files with 45932 additions and 711 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -2,10 +2,9 @@
 __pycache__/
 *.py[cod]
 *$py.class
-
+data/
 # C extensions
 *.so
-
 # Distribution / packaging
 .Python
 build/
@@ -160,3 +159,13 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 .devenv
+
+
+# yx
+bak/
+path/
+profiles/
+vtune_expval/
+perf*
+experiments/
+references/
--- a/.venv/lib/python3.12/site-packages/cotengra/contract.py
+++ b/.venv/lib/python3.12/site-packages/cotengra/contract.py
--- a/.venv/lib/python3.12/site-packages/cotengra/core.py
+++ b/.venv/lib/python3.12/site-packages/cotengra/core.py
--- a/.venv/lib/python3.12/site-packages/cotengra/hyperoptimizers/hyper.py
+++ b/.venv/lib/python3.12/site-packages/cotengra/hyperoptimizers/hyper.py
--- a/.venv/lib/python3.12/site-packages/cotengra/parallel.py
+++ b/.venv/lib/python3.12/site-packages/cotengra/parallel.py
@@ -0,0 +1,583 @@
+"""Interface for parallelism."""
+
+import atexit
+import collections
+import functools
+import importlib
+import inspect
+import numbers
+import operator
+import warnings
+
+_AUTO_BACKEND = None
+
+# check for loky, joblib (vendors loky), then default to concurrent.futures
+have_loky = importlib.util.find_spec("loky") is not None
+have_joblib = importlib.util.find_spec("joblib") is not None
+if have_loky or have_joblib:
+    _DEFAULT_BACKEND = "loky"
+else:
+    _DEFAULT_BACKEND = "concurrent.futures"
+
+
+@functools.lru_cache(None)
+def choose_default_num_workers():
+    import os
+
+    if "COTENGRA_NUM_WORKERS" in os.environ:
+        return int(os.environ["COTENGRA_NUM_WORKERS"])
+
+    if "OMP_NUM_THREADS" in os.environ:
+        return int(os.environ["OMP_NUM_THREADS"])
+
+    return os.cpu_count()
+
+
+def get_pool(n_workers=None, maybe_create=False, backend=None):
+    """Get a parallel pool."""
+    if backend is None:
+        backend = _DEFAULT_BACKEND
+
+    if backend == "dask":
+        return _get_pool_dask(n_workers=n_workers, maybe_create=maybe_create)
+
+    if backend == "ray":
+        return _get_pool_ray(n_workers=n_workers, maybe_create=maybe_create)
+
+    # above backends are distributed, don't specify n_workers
+    if n_workers is None:
+        n_workers = choose_default_num_workers()
+
+    if backend == "loky":
+        get_reusable_executor = get_loky_get_reusable_executor()
+        return get_reusable_executor(max_workers=n_workers)
+
+    if backend == "concurrent.futures":
+        return _get_process_pool_cf(n_workers=n_workers)
+
+    if backend == "threads":
+        return _get_thread_pool_cf(n_workers=n_workers)
+
+
+@functools.lru_cache(None)
+def _infer_backed_cached(pool_class):
+    if pool_class.__name__ == "RayExecutor":
+        return "ray"
+
+    path = pool_class.__module__.split(".")
+
+    if path[0] == "concurrent":
+        return "concurrent.futures"
+
+    if path[0] == "joblib":
+        return "loky"
+
+    if path[0] == "distributed":
+        return "dask"
+
+    return path[0]
+
+
+def _infer_backend(pool):
+    """Return the backend type of ``pool`` - cached for speed."""
+    return _infer_backed_cached(pool.__class__)
+
+
+def get_n_workers(pool=None):
+    """Extract how many workers our pool has (mostly for working out how many
+    tasks to pre-dispatch).
+    """
+    if pool is None:
+        pool = get_pool()
+
+    try:
+        return pool._max_workers
+    except AttributeError:
+        pass
+
+    backend = _infer_backend(pool)
+
+    if backend == "dask":
+        workers = pool.scheduler_info(n_workers=-1)["workers"]
+        return sum(int(w.get("nthreads", 1) or 1) for w in workers.values())
+
+    if backend == "ray":
+        while True:
+            try:
+                return int(get_ray().available_resources()["CPU"])
+            except KeyError:
+                import time
+
+                time.sleep(1e-3)
+
+    if backend == "mpi4py":
+        from mpi4py import MPI
+
+        return MPI.COMM_WORLD.size
+
+    raise ValueError(f"Can't find number of workers in pool {pool}.")
+
+
+def parse_parallel_arg(parallel):
+    """ """
+    global _AUTO_BACKEND
+
+    if parallel == "auto":
+        return get_pool(maybe_create=False, backend=_AUTO_BACKEND)
+
+    if parallel is False:
+        return None
+
+    if parallel is True:
+        if _AUTO_BACKEND is None:
+            _AUTO_BACKEND = _DEFAULT_BACKEND
+        parallel = _AUTO_BACKEND
+
+    if isinstance(parallel, numbers.Integral):
+        _AUTO_BACKEND = _DEFAULT_BACKEND
+        return get_pool(
+            n_workers=parallel, maybe_create=True, backend=_DEFAULT_BACKEND
+        )
+
+    if parallel == "loky":
+        return get_pool(maybe_create=True, backend="loky")
+
+    if parallel == "concurrent.futures":
+        return get_pool(maybe_create=True, backend="concurrent.futures")
+
+    if parallel == "threads":
+        return get_pool(maybe_create=True, backend="threads")
+
+    if parallel == "dask":
+        _AUTO_BACKEND = "dask"
+        return get_pool(maybe_create=True, backend="dask")
+
+    if parallel == "ray":
+        _AUTO_BACKEND = "ray"
+        return get_pool(maybe_create=True, backend="ray")
+
+    return parallel
+
+
+def set_parallel_backend(backend):
+    """Create a parallel pool of type ``backend`` which registers it as the
+    default for ``'auto'`` parallel.
+    """
+    return parse_parallel_arg(backend)
+
+
+def maybe_leave_pool(pool):
+    """Logic required for nested parallelism in dask.distributed."""
+    if _infer_backend(pool) == "dask":
+        return _maybe_leave_pool_dask()
+
+
+def maybe_rejoin_pool(is_worker, pool):
+    """Logic required for nested parallelism in dask.distributed."""
+    if is_worker and _infer_backend(pool) == "dask":
+        _rejoin_pool_dask()
+
+
+def submit(pool, fn, *args, **kwargs):
+    """Interface for submitting ``fn(*args, **kwargs)`` to ``pool``."""
+    if _infer_backend(pool) == "dask":
+        kwargs.setdefault("pure", False)
+    return pool.submit(fn, *args, **kwargs)
+
+
+def scatter(pool, data):
+    """Interface for maybe turning ``data`` into a remote object or reference."""
+    if _infer_backend(pool) in ("dask", "ray"):
+        return pool.scatter(data)
+    return data
+
+
+def can_scatter(pool):
+    """Whether ``pool`` can make objects remote."""
+    return _infer_backend(pool) in ("dask", "ray")
+
+
+def should_nest(pool):
+    """Given argument ``pool`` should we try nested parallelism."""
+    if pool is None:
+        return False
+    backend = _infer_backend(pool)
+    if backend in ("ray", "dask"):
+        return backend
+    return False
+
+
+# ---------------------------------- loky ----------------------------------- #
+
+
+@functools.lru_cache(1)
+def get_loky_get_reusable_executor():
+    try:
+        from loky import get_reusable_executor
+    except ImportError:
+        from joblib.externals.loky import get_reusable_executor
+    return get_reusable_executor
+
+
+# --------------------------- concurrent.futures ---------------------------- #
+
+
+class CachedProcessPoolExecutor:
+    def __init__(self):
+        self._pool = None
+        self._n_workers = -1
+        atexit.register(self.shutdown)
+
+    def __call__(self, n_workers=None):
+        if n_workers != self._n_workers:
+            from concurrent.futures import ProcessPoolExecutor
+
+            self.shutdown()
+            self._pool = ProcessPoolExecutor(n_workers)
+            self._n_workers = n_workers
+        return self._pool
+
+    def is_initialized(self):
+        return self._pool is not None
+
+    def shutdown(self):
+        if self._pool is not None:
+            self._pool.shutdown()
+            self._pool = None
+
+    def __del__(self):
+        self.shutdown()
+
+
+ProcessPoolHandler = CachedProcessPoolExecutor()
+
+
+def _get_process_pool_cf(n_workers=None):
+    return ProcessPoolHandler(n_workers)
+
+
+class CachedThreadPoolExecutor:
+    def __init__(self):
+        self._pool = None
+        self._n_workers = -1
+        atexit.register(self.shutdown)
+
+    def __call__(self, n_workers=None):
+        if n_workers != self._n_workers:
+            from concurrent.futures import ThreadPoolExecutor
+
+            self.shutdown()
+            self._pool = ThreadPoolExecutor(n_workers)
+            self._n_workers = n_workers
+        return self._pool
+
+    def is_initialized(self):
+        return self._pool is not None
+
+    def shutdown(self):
+        if self._pool is not None:
+            self._pool.shutdown()
+            self._pool = None
+
+    def __del__(self):
+        self.shutdown()
+
+
+ThreadPoolHandler = CachedThreadPoolExecutor()
+
+
+def _get_thread_pool_cf(n_workers=None):
+    return ThreadPoolHandler(n_workers)
+
+
+# ---------------------------------- DASK ----------------------------------- #
+
+
+def _get_pool_dask(n_workers=None, maybe_create=False):
+    """Maybe get an existing or create a new dask.distrbuted client.
+
+    Parameters
+    ----------
+    n_workers : None or int, optional
+        The number of workers to request if creating a new client.
+    maybe_create : bool, optional
+        Whether to create an new local cluster and client if no existing client
+        is found.
+
+    Returns
+    -------
+    None or dask.distributed.Client
+    """
+    try:
+        from dask.distributed import get_client
+    except ImportError:
+        if not maybe_create:
+            return None
+        else:
+            raise
+
+    try:
+        client = get_client()
+    except ValueError:
+        if not maybe_create:
+            return None
+
+        import shutil
+        import tempfile
+
+        from dask.distributed import Client, LocalCluster
+
+        local_directory = tempfile.mkdtemp()
+        lc = LocalCluster(
+            n_workers=n_workers,
+            threads_per_worker=1,
+            local_directory=local_directory,
+            memory_limit=0,
+        )
+        client = Client(lc)
+
+        warnings.warn(
+            "Parallel specified but no existing global dask client found... "
+            "created one (with {} workers).".format(get_n_workers(client))
+        )
+
+        @atexit.register
+        def delete_local_dask_directory():
+            shutil.rmtree(local_directory, ignore_errors=True)
+
+    if n_workers is not None:
+        current_n_workers = get_n_workers(client)
+        if n_workers != current_n_workers:
+            warnings.warn(
+                "Found existing client (with {} workers which) doesn't match "
+                "the requested {}... using it instead.".format(
+                    current_n_workers, n_workers
+                )
+            )
+
+    return client
+
+
+def _maybe_leave_pool_dask():
+    try:
+        from dask.distributed import secede
+
+        secede()  # for nested parallelism
+        is_dask_worker = True
+    except (ImportError, ValueError):
+        is_dask_worker = False
+    return is_dask_worker
+
+
+def _rejoin_pool_dask():
+    from dask.distributed import rejoin
+
+    rejoin()
+
+
+# ----------------------------------- RAY ----------------------------------- #
+
+
+@functools.lru_cache(None)
+def get_ray():
+    """ """
+    import ray
+
+    return ray
+
+
+class RayFuture:
+    """Basic ``concurrent.futures`` like future wrapping a ray ``ObjectRef``."""
+
+    __slots__ = ("_obj", "_cancelled")
+
+    def __init__(self, obj):
+        self._obj = obj
+        self._cancelled = False
+
+    def result(self, timeout=None):
+        return get_ray().get(self._obj, timeout=timeout)
+
+    def done(self):
+        return self._cancelled or bool(
+            get_ray().wait([self._obj], timeout=0)[0]
+        )
+
+    def cancel(self):
+        get_ray().cancel(self._obj)
+        self._cancelled = True
+
+
+def _unpack_futures_tuple(x):
+    return tuple(map(_unpack_futures, x))
+
+
+def _unpack_futures_list(x):
+    return list(map(_unpack_futures, x))
+
+
+def _unpack_futures_dict(x):
+    return {k: _unpack_futures(v) for k, v in x.items()}
+
+
+def _unpack_futures_identity(x):
+    return x
+
+
+_unpack_dispatch = collections.defaultdict(
+    lambda: _unpack_futures_identity,
+    {
+        RayFuture: operator.attrgetter("_obj"),
+        tuple: _unpack_futures_tuple,
+        list: _unpack_futures_list,
+        dict: _unpack_futures_dict,
+    },
+)
+
+
+def _unpack_futures(x):
+    """Allows passing futures by reference - takes e.g. args and kwargs and
+    replaces all ``RayFuture`` objects with their underyling ``ObjectRef``
+    within all nested tuples, lists and dicts.
+
+    [Subclassing ``ObjectRef`` might avoid needing this.]
+    """
+    return _unpack_dispatch[x.__class__](x)
+
+
+@functools.lru_cache(2**14)
+def get_remote_fn(fn, **remote_opts):
+    """Cached retrieval of remote function."""
+    ray = get_ray()
+    if remote_opts:
+        return ray.remote(**remote_opts)(fn)
+    return ray.remote(fn)
+
+
+@functools.lru_cache(2**14)
+def get_fn_as_remote_object(fn):
+    ray = get_ray()
+    return ray.put(fn)
+
+
+@functools.lru_cache(None)
+def get_deploy(**remote_opts):
+    """Alternative for 'non-function' callables - e.g. partial
+    functions - pass the callable object too.
+    """
+    ray = get_ray()
+
+    def deploy(fn, *args, **kwargs):
+        return fn(*args, **kwargs)
+
+    if remote_opts:
+        return ray.remote(**remote_opts)(deploy)
+    return ray.remote(deploy)
+
+
+class RayExecutor:
+    """Basic ``concurrent.futures`` like interface using ``ray``."""
+
+    def __init__(self, *args, default_remote_opts=None, **kwargs):
+        ray = get_ray()
+        if not ray.is_initialized():
+            ray.init(*args, **kwargs)
+
+        self.default_remote_opts = (
+            {} if default_remote_opts is None else dict(default_remote_opts)
+        )
+
+    def _maybe_inject_remote_opts(self, remote_opts=None):
+        """Return the default remote options, possibly overriding some with
+        those supplied by a ``submit call``.
+        """
+        ropts = self.default_remote_opts
+        if remote_opts is not None:
+            ropts = {**ropts, **remote_opts}
+        return ropts
+
+    def submit(self, fn, *args, pure=False, remote_opts=None, **kwargs):
+        """Remotely run ``fn(*args, **kwargs)``, returning a ``RayFuture``."""
+        # want to pass futures by reference
+        args = _unpack_futures_tuple(args)
+        kwargs = _unpack_futures_dict(kwargs)
+
+        ropts = self._maybe_inject_remote_opts(remote_opts)
+
+        # this is the same test ray uses to accept functions
+        if inspect.isfunction(fn):
+            # can use the faster cached remote function
+            obj = get_remote_fn(fn, **ropts).remote(*args, **kwargs)
+        else:
+            fn_obj = get_fn_as_remote_object(fn)
+            obj = get_deploy(**ropts).remote(fn_obj, *args, **kwargs)
+
+        return RayFuture(obj)
+
+    def map(self, func, *iterables, remote_opts=None):
+        """Remote map ``func`` over arguments ``iterables``."""
+        ropts = self._maybe_inject_remote_opts(remote_opts)
+        remote_fn = get_remote_fn(func, **ropts)
+        objs = tuple(map(remote_fn.remote, *iterables))
+        ray = get_ray()
+        return map(ray.get, objs)
+
+    def scatter(self, data):
+        """Push ``data`` into the distributed store, returning an ``ObjectRef``
+        that can be supplied to ``submit`` calls for example.
+        """
+        ray = get_ray()
+        return ray.put(data)
+
+    def shutdown(self):
+        """Shutdown the parent ray cluster, this ``RayExecutor`` instance
+        itself does not need any cleanup.
+        """
+        get_ray().shutdown()
+
+
+_RAY_EXECUTOR = None
+
+
+def _get_pool_ray(n_workers=None, maybe_create=False):
+    """Maybe get an existing or create a new RayExecutor, thus initializing,
+    ray.
+
+    Parameters
+    ----------
+    n_workers : None or int, optional
+        The number of workers to request if creating a new client.
+    maybe_create : bool, optional
+        Whether to create initialize ray and return a RayExecutor if not
+        initialized already.
+
+    Returns
+    -------
+    None or RayExecutor
+    """
+    try:
+        import ray
+    except ImportError:
+        if not maybe_create:
+            return None
+        else:
+            raise
+
+    global _RAY_EXECUTOR
+
+    if (_RAY_EXECUTOR is None) or (not ray.is_initialized()):
+        if not maybe_create:
+            return None
+        _RAY_EXECUTOR = RayExecutor(num_cpus=n_workers)
+
+    if n_workers is not None:
+        current_n_workers = get_n_workers(_RAY_EXECUTOR)
+        if n_workers != current_n_workers:
+            warnings.warn(
+                "Found initialized ray (with {} workers which) doesn't match "
+                "the requested {}... sticking with old number.".format(
+                    current_n_workers, n_workers
+                )
+            )
+
+    return _RAY_EXECUTOR
--- a/.venv/lib/python3.12/site-packages/qmatchatea/py_emulator.py
+++ b/.venv/lib/python3.12/site-packages/qmatchatea/py_emulator.py
--- a/.venv/lib/python3.12/site-packages/qredtea/torchapi/qteatorchtensor.py
+++ b/.venv/lib/python3.12/site-packages/qredtea/torchapi/qteatorchtensor.py
--- a/.venv/lib/python3.12/site-packages/qtealeaves/emulator/mpi_mps_simulator.py
+++ b/.venv/lib/python3.12/site-packages/qtealeaves/emulator/mpi_mps_simulator.py
@@ -0,0 +1,691 @@
+# This code is part of qtealeaves.
+#
+# This code is licensed under the Apache License, Version 2.0. You may
+# obtain a copy of this license in the LICENSE.txt file in the root directory
+# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# Any modifications or derivative works of this code must retain this
+# copyright notice, and modified files need to carry a notice indicating
+# that they have been altered from the originals.
+
+"""
+The module contains a the MPI version of the MPS simulator.
+
+Code for the MPI simulations should be run as:
+
+.. code-block::
+   mpiexec -n 4 python my_mpi_script.py
+
+where we used 4 processes as an example.
+"""
+import os
+
+import numpy as np
+
+from qtealeaves.convergence_parameters import TNConvergenceParameters
+from qtealeaves.tensors import TensorBackend
+from qtealeaves.tooling.mpisupport import MPI, TN_MPI_TYPES
+
+from .mps_simulator import MPS
+
+__all__ = ["MPIMPS"]
+
+
+def _mpi_array_dtype(array):
+    """Return the MPI dtype for numpy arrays and CPU tensor buffers."""
+    dtype = array.dtype
+    if hasattr(dtype, "str"):
+        return TN_MPI_TYPES[dtype.str]
+
+    # qredtea torch singular values are raw torch.Tensor objects, not
+    # QteaTorchTensor instances, so they do not expose dtype_mpi().
+    import torch
+
+    return {
+        torch.complex128: MPI.DOUBLE_COMPLEX,
+        torch.complex64: MPI.COMPLEX,
+        torch.float64: MPI.DOUBLE_PRECISION,
+        torch.float32: MPI.REAL,
+        torch.int64: MPI.INT,
+    }[dtype]
+
+
+def _mpi_send_array(comm, array, to_):
+    if hasattr(array, "resolve_conj"):
+        array = array.resolve_conj().contiguous()
+    comm.Send([array, _mpi_array_dtype(array)], to_)
+
+
+def _mpi_empty_like(array, shape):
+    if hasattr(array, "resolve_conj"):
+        import torch
+
+        return torch.empty(shape, dtype=array.dtype, device="cpu")
+    return np.empty(shape, array.dtype)
+
+
+def _mpi_recv_array(comm, template, shape, from_):
+    array = _mpi_empty_like(template, shape)
+    comm.Recv([array, _mpi_array_dtype(array)], from_)
+    if hasattr(template, "device") and hasattr(array, "to"):
+        array = array.to(device=template.device)
+    return array
+
+
+# pylint: disable-next=too-many-instance-attributes
+class MPIMPS(MPS):
+    """
+    MPI version of the MPS emulator that divides the MPS between the different nodes
+
+    Parameters
+    ----------
+    num_sites: int
+        Number of sites
+    convergence_parameters: :py:class:`TNConvergenceParameters`
+        Class for handling convergence parameters. In particular, in the MPS simulator we are
+        interested in:
+        - the *maximum bond dimension* :math:`\\chi`;
+        - the *cut ratio* :math:`\\epsilon` after which the singular
+            values are neglected, i.e. if :math:`\\lamda_1` is the
+            bigger singular values then after an SVD we neglect all the
+            singular values such that :math:`\\frac{\\lambda_i}{\\lambda_1}\\leq\\epsilon`
+    local_dim: int or list of ints, optional
+        Local dimension of the degrees of freedom. Default to 2.
+        If a list is given, then it must have length num_sites.
+    initialize: str, optional
+        The method for the initialization. Default to "vacuum"
+        Available:
+        - "vacuum", for the |000...0> state
+        - "random", for a random state at given bond dimension
+    tensor_backend : `None` or instance of :class:`TensorBackend`
+        Default for `None` is :class:`QteaTensor` with np.complex128 on CPU.
+
+    """
+
+    # pylint: disable-next=too-many-arguments
+    def __init__(
+        self,
+        num_sites,
+        convergence_parameters,
+        local_dim=2,
+        initialize="vacuum",
+        tensor_backend=None,
+    ):
+        if MPI is None:
+            raise ImportError("No module mpi4py found in python environment")
+        # MPI variables
+        # pylint: disable-next=c-extension-no-member
+        self.comm = MPI.COMM_WORLD
+        self.size = self.comm.Get_size()
+        self.rank = self.comm.Get_rank()
+        self.tot_sites = num_sites
+
+        # Number of sites in the local MPS
+        modulus = num_sites % self.size
+        local_num_size = int(np.floor(num_sites // self.size))
+        self.indexes = [0] + [
+            local_num_size + 1 if ii < modulus else local_num_size
+            for ii in range(self.size)
+        ]
+        local_num_size = self.indexes[self.rank + 1]
+
+        # indexes takes into account which indexes are in each core
+        self.indexes = np.cumsum(self.indexes)
+
+        # The par_map is a dicrionary where the index is the position of the
+        # sites in the full chain, while the value the position on the
+        # subchain in this process
+        self.par_map = dict(
+            zip(
+                np.arange(
+                    self.indexes[self.rank], self.indexes[self.rank + 1], dtype=int
+                ),
+                np.arange(local_num_size, dtype=int),
+            )
+        )
+
+        # Auxiliary site for the boundaries
+        if self.rank < self.size - 1:
+            local_num_size += 1
+
+        if not np.isscalar(local_dim):
+            local_dim = local_dim[
+                self.indexes[self.rank] : self.indexes[self.rank + 1]
+                + int(self.rank != (self.size - 1))
+            ]
+
+        super().__init__(
+            local_num_size,
+            convergence_parameters,
+            local_dim=local_dim,
+            initialize=initialize,
+            tensor_backend=tensor_backend,
+        )
+
+        # MPS initializetion not aware of device
+        self.convert(self.tensor_backend.dtype, self.tensor_backend.memory_device)
+
+    @property
+    def mpi_dtype(self):
+        """Return the MPI version of the MPS dtype (going via first tensor)"""
+        return TN_MPI_TYPES[np.dtype(self[0].dtype).str]
+
+    def get_tensor_of_site(self, idx):
+        """Retrieve tensor of specifc site."""
+        return self[self.par_map[idx]]
+
+    def apply_one_site_operator(self, op, pos):
+        """
+        Applies a one operator `op` to the site `pos` of the MPIMPS.
+        Instead of communicating the changes on the boundaries we
+        perform an additional contraction.
+
+        Parameters
+        ----------
+        op: numpy array shape (local_dim, local_dim)
+            Matrix representation of the quantum gate
+        pos: int
+            Position of the qubit where to apply `op`.
+        """
+        # Apply the gate on the right MPS
+        if pos in self.par_map:
+            super().apply_one_site_operator(op, self.par_map[pos])
+
+        # For one-qubit gates it is more convenient to apply them both to
+        # the real and auxiliary qubits if they are on the boundaries
+        elif pos - 1 in self.par_map:
+            super().apply_one_site_operator(op, self.num_sites - 1)
+
+        return None
+
+    # pylint: disable-next=too-many-arguments
+    def apply_two_site_operator(self, op, pos, swap=False, svd=None, parallel=None):
+        """
+        Applies a two-site operator `op` to the site `pos`, `pos+1` of the MPS.
+        Then, perform the necessary communications between the interested
+        process and the process
+
+        Parameters
+        ----------
+        op: numpy array shape (local_dim, local_dim, local_dim, local_dim)
+            Matrix representation of the quantum gate
+        pos: int or list of ints
+            Position of the qubit where to apply `op`. If a list is passed,
+            the two sites should be adjacent. The first index is assumed to
+            be the control, and the second the target. The swap argument is
+            overwritten if a list is passed.
+        swap: bool
+            If True swaps the operator. This means that instead of the
+            first contraction in the following we get the second.
+            It is written is a list of pos is passed.
+        svd : None
+            Required for compatibility. Can be only True.
+        parallel: None
+            Required for compatibility. Can be only True
+
+        Returns
+        -------
+        singular_values_cutted: ndarray
+            Array of singular values cutted, normalized to the biggest singular value
+
+        """
+        if not np.isscalar(pos) and len(pos) == 2:
+            pos = min(pos[0], pos[1])
+        elif not np.isscalar(pos):
+            raise ValueError(
+                f"pos should be only scalar or len 2 array-like, not len {len(pos)}"
+            )
+
+        # Hardcoded but necessary for compatibility
+        svd = True
+        if parallel is None:
+            parallel_env = os.environ.get("QTEALEAVES_MPIMPS_PARALLEL", "1").lower()
+            parallel = parallel_env not in ("0", "false", "no", "off")
+
+        if pos in self.par_map:
+            res = super().apply_two_site_operator(
+                op, self.par_map[pos], swap, svd=svd, parallel=parallel
+            )
+
+            # Send the information back to the auxiliary if it was the first site
+            if self.par_map[pos] == 0 and self.rank > 0:
+                self.mpi_send_tensor(self[0], to_=self.rank - 1)
+                _mpi_send_array(self.comm, self.singvals[1], self.rank - 1)
+
+            # Send the information towards the next if it was the last site
+            elif self.par_map[pos] == self.num_sites - 2 and self.rank < self.size - 1:
+                self.mpi_send_tensor(self[self.num_sites - 1], to_=self.rank + 1)
+                _mpi_send_array(
+                    self.comm, self.singvals[self.num_sites - 1], self.rank + 1
+                )
+
+        else:
+            res = []
+            # Receive the information from the MPS on the right
+            if pos == self.indexes[self.rank + 1] and self.rank < self.size - 1:
+                tens = self.mpi_receive_tensor(from_=self.rank + 1)
+
+                self[self.num_sites - 1] = tens
+
+                singvals = _mpi_recv_array(
+                    self.comm,
+                    self.singvals[self.num_sites],
+                    tens.shape[2],
+                    self.rank + 1,
+                )
+                self._singvals[self.num_sites] = singvals
+
+            # Receive the information from the MPS from the left
+            if pos == self.indexes[self.rank] - 1 and self.rank > 0:
+                tens = self.mpi_receive_tensor(from_=self.rank - 1)
+                self[0] = tens
+
+                singvals = _mpi_recv_array(
+                    self.comm,
+                    self.singvals[0],
+                    tens.shape[0],
+                    self.rank - 1,
+                )
+                self._singvals[0] = singvals
+
+        return res
+
+    def apply_projective_operator(self, site, selected_output=None, remove=False):
+        """
+        Apply a projective operator to the site **site**, and give the measurement as output.
+        You can also decide to select a given output for the measurement, if the probability is
+        non-zero. Finally, you have the possibility of removing the site after the measurement.
+
+        Parameters
+        ----------
+        site: int
+            Index of the site you want to measure
+        selected_output: int, optional
+            If provided, the selected state is measured. Throw an error if the probability of the
+            state is 0
+        remove: bool, optional
+            If True, the measured index is traced away after the measurement. Default to False.
+
+        Returns
+        -------
+        meas_state: int | None
+            Measured state or None if site not in this part of the MPI-MPS.
+        state_prob : float | None
+            Probability of measuring the output state or None if site not
+            in this part of the MPI-MPS.
+        """
+        self.reinstall_isometry_serial()
+        if site in self.par_map:
+            res = super().apply_projective_operator(
+                self.par_map[site], selected_output, remove
+            )
+        else:
+            res = (None, None)
+
+        # Move informations to further right
+        self.reinstall_isometry_serial(left=False, from_site=site)
+        # Move information to the left
+        self.reinstall_isometry_serial()
+
+        return res
+
+    # pylint: disable-next=arguments-differ
+    def reinstall_isometry_serial(self, left=False, from_site=None):
+        """
+        Reinstall the isometry center on position 0 of the full MPS.
+
+        This step is serial because we have to serially pass the information
+        along the MPS. It cannot be parallelized.
+
+        Parameters
+        ----------
+        left: bool, optional
+            If True, reinstall the isometry to the left.
+            If False, to the right. Defaulto to False
+        from_site: int, optional
+            The site from which the isometrization should start.
+            By default None, i.e. the other end of the MPS chain.
+
+        Returns
+        -------
+        None
+        """
+        if from_site is None:
+            from_site = self.num_sites - 1 if left else 0
+        extrem = np.nonzero(from_site <= self.indexes)[0][0]
+
+        if left:
+            boundaries = (extrem, -1, -1)
+            tidx = 0
+            to_ = self.rank - 1
+            from_ = self.rank + 1
+        else:
+            boundaries = (extrem, self.size, 1)
+            tidx = self.num_sites - 1
+            to_ = self.rank + 1
+            from_ = self.rank - 1
+
+        for ii in range(*boundaries):
+            if self.rank == ii:
+                self._first_non_orthogonal_left = self.num_sites - 1
+                self._first_non_orthogonal_right = self.num_sites - 1
+                requires_singvals = self._requires_singvals
+                self._requires_singvals = True
+                if left:
+                    self.right_canonize(0, False, True)
+                else:
+                    self.left_canonize(self.num_sites - 1, False, True)
+                self._requires_singvals = requires_singvals
+
+                # Send tensor
+                if (self.rank > 0 and left) or (self.rank + 1 < self.size and not left):
+                    self.mpi_send_tensor(self[tidx], to_=to_)
+
+            elif (self.rank == ii - 1 and left) or (self.rank == ii + 1 and not left):
+                # Receive tensor
+                tens = self.mpi_receive_tensor(from_=from_)
+                self[self.num_sites - 1 - tidx] = tens
+
+    # pylint: disable-next=arguments-differ
+    def reinstall_isometry_parallel(self, num_cycles):
+        """
+        Reinstall the isometry by applying identities to all even sites and
+        to all odd sites, and repeating for `num_cycles` cycles.
+        The reinstallation is exact for `num_cycles=num_sites/2`.
+        Method from https://arxiv.org/abs/2312.02667
+
+        This step is serial because we have to serially pass the information
+        along the MPS. It cannot be parallelized.
+
+        Parameters
+        ----------
+        num_cycles: int
+            Number of cycles for reinstalling the isometry
+
+        Returns
+        -------
+        None
+        """
+        for _ in range(num_cycles):
+            # Apply on all even sites
+            for ii in range(0, self.tot_sites - 1, 2):
+                self.apply_two_site_operator(
+                    self[0].eye_like(4), ii, svd=True, parallel=True
+                )
+            # Apply on all odd sites
+            for ii in range(1, self.tot_sites - 1, 2):
+                self.apply_two_site_operator(
+                    self[0].eye_like(4), ii, svd=True, parallel=True
+                )
+
+    def mpi_gather_tn(self):
+        """
+        Gather the tensors on process 0.
+        We do not use MPI.comm.Gather because we would gather lists of np.arrays
+        without using the np.array advantages, making it slower than the single
+        communications.
+
+        Returns
+        -------
+        list on np.ndarray or None
+            List of tensors on the rank 0 process, None on the others
+        """
+        self.comm.Barrier()
+        if self.rank != 0:
+            num_tensors = (
+                self.num_sites if self.rank == self.size - 1 else self.num_sites - 1
+            )
+            for jj in range(num_tensors):
+                self.mpi_send_tensor(self[jj], to_=0)
+            tensor_list = None
+        else:
+            tensor_list = [None for _ in range(self.tot_sites)]
+            tensor_list[: self.num_sites - 1] = self.tensors[:-1]
+
+            tidx = self.num_sites - 1
+            for ii in range(1, self.size):
+                num_tensors = self.indexes[ii + 1] - self.indexes[ii]
+                for jj in range(num_tensors):
+                    tens = self.mpi_receive_tensor(from_=ii)
+                    tensor_list[tidx + jj] = tens
+                tidx += num_tensors
+
+        self.comm.Barrier()
+
+        return tensor_list
+
+    def mpi_scatter_tn(self, tensor_list):
+        """
+        Scatter the tensors on process 0.
+        We do not use MPI.comm.Scatter because we would gather lists of np.arrays
+        without using the np.array advantages, making it slower than the single
+        communications.
+
+        Parameters
+        ----------
+        tensor_list : list of lists of np.ndarrays
+            The index i of the list is sent to the rank i
+
+        Returns
+        -------
+        list on np.ndarray or None
+            List of tensors on the rank 0 process, None on the others
+        """
+        self.comm.Barrier()
+        if self.rank == 0:
+            for ridx, sub_tensorlist in enumerate(tensor_list[1:]):
+                for idx, tens in enumerate(sub_tensorlist):
+                    self.mpi_send_tensor(tens, to_=ridx + 1)
+
+            tensor_list = tensor_list[0]
+        else:
+            num_tensors = len(tensor_list[self.rank])
+            tensor_list = [None for _ in range(num_tensors)]
+            for idx in range(num_tensors):
+                tens = self.mpi_receive_tensor(from_=0)
+                tensor_list[idx] = tens
+
+        self.comm.Barrier()
+
+        return tensor_list
+
+    def to_tensor_list(self):
+        """
+        Return the tensor list of the full MPS. Thus, here there are
+        communications between the different processes and all the tensorlist
+        is returned on process 0
+
+        Returns
+        -------
+        list of np.ndarray or None
+            List of tensors on the rank 0 process, None on the others
+        """
+        return self.mpi_gather_tn()
+
+    def to_statevector(self, qiskit_order=False, max_qubit_equivalent=20):
+        """
+        Serially compute the statevector
+
+        Parameters
+        ----------
+        qiskit_order: bool, optional
+            weather to use qiskit ordering or the theoretical one. For
+            example the state |011> has 0 in the first position for the
+            theoretical ordering, while for qiskit ordering it is on the
+            last position.
+        max_qubit_equivalent: int, optional
+            Maximum number of qubit sites the MPS can have and still be
+            transformed into a statevector.
+            If the number of sites is greater, it will throw an exception.
+            Default to 20.
+
+        Returns
+        -------
+        np.ndarray or None
+            Statevector on process 0, None on the others
+        """
+
+        tensorlist = self.to_tensor_list()
+        if self.rank == 0:
+            mps = MPS.from_tensor_list(tensorlist)
+            statevect = mps.to_statevector(qiskit_order, max_qubit_equivalent)
+        else:
+            statevect = None
+
+        return statevect
+
+    @classmethod
+    def from_tensor_list(
+        cls,
+        tensor_list,
+        conv_params=None,
+        tensor_backend=None,
+        target_device=None,
+    ):
+        """
+        Initialize the MPS tensors using a list of correctly shaped tensors
+
+        Parameters
+        ----------
+        tensor_list : list of ndarrays or cupy arrays
+            List of tensor for initializing the MPS
+        conv_params : :py:class:`TNConvergenceParameters`, optional
+            Convergence parameters for the new MPS. If None, the maximum bond
+            bond dimension possible is assumed, and a cut_ratio=1e-9.
+            Default to None.
+        tensor_backend : `None` or instance of :class:`TensorBackend`
+            Default for `None` is :class:`QteaTensor` with np.complex128 on CPU.
+        target_device: None | str, optional
+            If `None`, take memory device of tensor backend.
+            If string is `any`, do not convert. Otherwise,
+            use string as device string.
+
+        Returns
+        -------
+        obj : :py:class:`MPIMPS`
+            The MPIMPS class
+        """
+        mismatches = [
+            tensor_list[ii].shape[2] != tensor_list[ii + 1].shape[0]
+            for ii in range(len(tensor_list) - 1)
+        ]
+        if any(mismatches):
+            msg = f"Mismatches for tensors equals to True: {mismatches}."
+            raise ValueError(f"Dimension mismatch when constructing MPS:{msg}")
+
+        if conv_params is None:
+            max_bond_dim = max(elem.shape[2] for elem in tensor_list)
+            conv_params = TNConvergenceParameters(max_bond_dimension=int(max_bond_dim))
+        if tensor_backend is None:
+            # Have to resolve it here in case target device is not given
+            tensor_backend = TensorBackend()
+        if target_device is None:
+            target_device = tensor_backend.memory_device
+        elif target_device == "any":
+            target_device = None
+
+        local_dim = [elem.shape[1] for elem in tensor_list]
+        obj = cls(
+            len(tensor_list), conv_params, local_dim, tensor_backend=tensor_backend
+        )
+
+        # Convert data type (lateron device if GPU enabled?)
+        for elem in tensor_list:
+            elem.convert(obj.tensor_backend.dtype, target_device)
+
+        if obj.rank == 0:
+            tensorlist = [
+                tensor_list[
+                    obj.indexes[rank] : obj.indexes[rank + 1]
+                    + int(rank != obj.size - 1)
+                ]
+                for rank in range(obj.size)
+            ]
+        else:
+            list_sizes = obj.indexes[1:] - obj.indexes[:-1] + 1
+            list_sizes[-1] -= 1
+            tensorlist = [
+                [None for _ in range(list_sizes[rank])] for rank in range(obj.size)
+            ]
+
+        tensor_list = obj.mpi_scatter_tn(tensorlist)
+        obj._tensors = tensor_list
+
+        return obj
+
+    @classmethod
+    def from_statevector(
+        cls,
+        statevector,
+        local_dim=2,
+        conv_params=None,
+        tensor_backend=None,
+    ):
+        """Serially decompose the statevector and then initialize the MPS"""
+        mps = MPS.from_statevector(
+            statevector, local_dim, conv_params, tensor_backend=tensor_backend
+        )
+
+        return cls.from_tensor_list(
+            mps.to_tensor_list(), conv_params, tensor_backend=tensor_backend
+        )
+
+    # ---------------------------
+    # ----- MEASURE METHODS -----
+    # ---------------------------
+
+    def meas_local(self, op_list):
+        """
+        Measure a local observable along all sites of the MPS
+
+        Parameters
+        ----------
+        op_list : list of :class:`_AbstractQteaTensor`
+            local operator to measure on each site
+
+        Return
+        ------
+        measures : ndarray, shape (num_sites)
+            Measures of the local operator along each site on rank-0
+        """
+        res = super().meas_local(op_list)
+
+        # Call back on the site 0 the results
+        if self.rank != 0:
+            self.comm.Send([res, self.mpi_dtype[res.dtype.str]], 0)
+            tot_res = None
+        else:
+            tot_res = np.empty(self.tot_sites, dtype=res.dtype)
+            tot_res[: self.num_sites - 1] = res[:-1]
+
+            tidx = self.num_sites - 1
+            for ii in range(1, self.size):
+                num_tensors = self.indexes[ii] - self.indexes[ii - 1]
+                self.comm.Recv(
+                    [tot_res[tidx : tidx + num_tensors], self.mpi_dtype[res.dtype.str]],
+                    ii,
+                )
+                tidx += num_tensors
+
+        return tot_res
+
+    def _get_eff_op_on_pos(self, pos):
+        """
+        Obtain the list of effective operators adjacent
+        to the position pos and the index where they should
+        be contracted
+
+        Parameters
+        ----------
+        pos : int
+            Index of the tensor w.r.t. which we have to retrieve
+            the effective operators
+
+        Returns
+        -------
+        list of IndexedOperators
+            List of effective operators
+        list of ints
+            Indexes where the operators should be contracted
+        """
+        raise NotImplementedError("This function has to be overwritten")
--- a/.venv/lib/python3.12/site-packages/qtealeaves/operators/tnoperators.py
+++ b/.venv/lib/python3.12/site-packages/qtealeaves/operators/tnoperators.py
@@ -0,0 +1,416 @@
+# This code is part of qtealeaves.
+#
+# This code is licensed under the Apache License, Version 2.0. You may
+# obtain a copy of this license in the LICENSE.txt file in the root directory
+# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# Any modifications or derivative works of this code must retain this
+# copyright notice, and modified files need to carry a notice indicating
+# that they have been altered from the originals.
+
+"""
+Generic base class for operators.
+"""
+
+# pylint: disable=too-many-locals
+
+# pylint: disable-next=no-name-in-module
+from collections import OrderedDict
+
+import numpy as np
+
+from qtealeaves.tooling.operatorstrings import _op_string_mul
+from qtealeaves.tooling.parameterized import _ParameterizedClass
+
+__all__ = ["TNOperators"]
+
+
+class _DefaultMapping:
+    """Callable default mapping to avoid recreating closures during lookups."""
+
+    def __init__(self, default_key):
+        self.default_key = default_key
+
+    # pylint: disable-next=unused-argument
+    def __call__(self, site_idx):
+        return self.default_key
+
+
+class TNOperators(_ParameterizedClass):
+    """
+    Generic class to write operators. This class contains no pre-defined
+    operators. It allows you to start from scratch if no other operator
+    class fulfills your needs.
+
+    **Arguments**
+
+    set_names : list of str, optional
+        Name of the operators sets.
+        Default to `default`
+
+    mapping_func : callable (or `None`), optional
+        Mapping the site index to an operator. Arguments
+        `site_idx` must be accepted.
+        Default to `None` (default mapping to only operator set)
+    """
+
+    def __init__(self, set_names="default", mapping_func=None):
+        if isinstance(set_names, str):
+            set_names = [set_names]
+
+        self._ops_dicts = {}
+        for name in set_names:
+            if not isinstance(name, str):
+                raise TypeError(f"Set names must be str, but got `{type(name)}`.")
+            self._ops_dicts[name] = OrderedDict()
+        self._set_names = tuple(self._ops_dicts.keys())
+        self._one_unique = len(self._set_names) == 1
+
+        if mapping_func is None:
+            self._mapping_func = _DefaultMapping(self._set_names[0])
+        else:
+            self._mapping_func = mapping_func
+
+        # Mapping of operators (to avoid equal operators being defined twice)
+        # Can be set, e.g., for 2nd order operators.
+        self._has_2nd_order = False
+        self._mapping_op = {}
+
+    @property
+    def one_unique(self):
+        """Flag if only one operators set exists (True) or multiple (False)."""
+        return self._one_unique
+
+    @property
+    def mapping_func(self):
+        """Mapping function for site to operator set name."""
+        return self._mapping_func
+
+    @property
+    def set_names(self):
+        """Return operator set names as list of strings."""
+        return list(self._set_names)
+
+    def __len__(self):
+        """Lenght of TNOperators defined as number of operator sets."""
+        return len(self._ops_dicts)
+
+    def __contains__(self, key):
+        """Check if a key is inside the operators."""
+        key_a, key_b = self._parse_key(key)
+        if key_a not in self._ops_dicts:
+            return False
+
+        return key_b in self._ops_dicts[key_a]
+
+    def __delitem__(self, key):
+        """Delete entry in operators."""
+        key_a, key_b = self._parse_key(key)
+        del self._ops_dicts[key_a][key_b]
+
+    def __getitem__(self, key):
+        """Extract entry by key."""
+        key_a, key_b = self._parse_key(key)
+        return self._ops_dicts[key_a][key_b]
+
+    def __setitem__(self, key, value):
+        """Set entry by key."""
+        key_a, key_b = self._parse_key(key, callee_set=True)
+        self._ops_dicts[key_a][key_b] = value
+
+    def __iter__(self):
+        """Iterate through all keys (of all operators sets)."""
+        for key, value in self._ops_dicts.items():
+            for subkey in value:
+                yield (key, subkey)
+
+    def items(self):
+        """Iterate throught all (key, value) pairs of all operators sets."""
+        for key, value in self._ops_dicts.items():
+            for subkey, subvalue in value.items():
+                yield (key, subkey), subvalue
+
+    def _parse_key(self, key, callee_set=False):
+        """
+        Parse the key and split into operator set key and operator name key.
+
+        **Arguments**
+
+        key : tuple (or str)
+            Key as tuple of length two (or operator name).
+
+        callee_set : bool, optional
+            Indicate if callee is `__setitem__`.
+            Default to `False`.
+        """
+        if isinstance(key, str) and self.one_unique:
+            return self._set_names[0], key
+
+        if isinstance(key, str):
+            raise ValueError("Operators are not unique, indicate index.")
+
+        if len(key) != 2:
+            raise ValueError("Operators are not unique, indicate index.")
+
+        if isinstance(key[0], str):
+            # str for operator set name
+            key_0 = key[0]
+        elif isinstance(key[0], (int, np.int64)) and callee_set:
+            raise ValueError("Cannot set entry via integer entry (per site).")
+        elif isinstance(key[0], (int, np.int64)):
+            # int for site, use mapping
+            # pylint: disable-next=not-callable
+            key_0 = self.mapping_func(key[0])
+        else:
+            raise ValueError(f"First entry must be set name or int, but `{key[0]}`.")
+
+        if not isinstance(key[1], str):
+            raise ValueError(
+                f"Second entry must specify operator name, but `{key[1]}`."
+            )
+
+        return key_0, key[1]
+
+    def get_operator(self, site_idx_1d, operator_name, params):
+        """
+        Provide a method to return any operator, either defined via
+        a callable or directly as a matrix.
+
+        **Arguments**
+
+        site_idx_1d : int, str
+            If int, site where we need the operator. Mapping will evaluate what
+            to return.
+            If str, name of operator set.
+
+        operator_name : str
+            Tag/identifier of the operator.
+
+        params : dict
+            Simulation parameters as a dictionary; dict is passed
+            to callable.
+        """
+        if isinstance(site_idx_1d, (int, np.int64)):
+            # pylint: disable-next=not-callable
+            key_0 = self._mapping_func(site_idx_1d)
+        else:
+            key_0 = site_idx_1d
+        op_mat = self.eval_numeric_param(self._ops_dicts[key_0][operator_name], params)
+        return op_mat
+
+    def get_local_links(self, num_sites, params):
+        """
+        Extract the local links from the operators.
+
+        **Arguments**
+
+        num_sites : integer
+            Number of sites.
+
+        params : dict
+            Dictionary with parameterization of the simulation.
+        """
+        local_links = []
+        for ii in range(num_sites):
+            eye = self.get_operator(ii, "id", params)
+
+            if hasattr(eye, "links"):
+                local_links.append(eye.links[1])
+            else:
+                # When constructing H, we call this with numpy tensors
+                local_links.append(eye.shape[0])
+
+        return local_links
+
+    def transform(self, transformation, **kwargs):
+        """
+        Generate a new :class:`TNOperators` by transforming the
+        current instance.
+
+        **Arguments**
+
+        transformation : callable
+            Accepting key and value as arguments plus potential
+            keyword arguments.
+
+        **kwargs : key-word arguments
+            Will be passed to `transformation`
+        """
+        new_ops = TNOperators(set_names=self.set_names, mapping_func=self.mapping_func)
+        for key, value in self.items():
+            new_ops[key] = transformation(key, value, **kwargs)
+
+        return new_ops
+
+    def check_alternative_op(self, set_name, key):
+        """
+        Check entry for alternative operators, i.e., the sigma_x squared
+        is the identity.
+
+        Arguments
+        ---------
+
+        set_name : str
+            Search in this set name of operators. (Set names allow
+            different Hilbert spaces on different sites.)
+
+        key : str
+            Operator represented as key. Check if there is an alternative
+            key for this key.
+
+        Returns
+        -------
+
+        alternative_key : None | str
+            If `None`, no alternative key is given or the corresponding
+            dictionary for checking is not set. If str, then this operator
+            has the same representation as `key`.
+        """
+        set_dict = self._mapping_op.get(set_name, None)
+        if set_dict is None:
+            return None
+
+        return self._mapping_op[set_name].get(key, None)
+
+    # pylint: disable-next=too-many-branches
+    def generate_products_2nd_order(
+        self,
+        left_conj=False,
+        left_transpose=False,
+        right_conj=False,
+        right_transpose=False,
+    ):
+        """
+        Generate all possible multiplications (matrix-matrix multiplications) of
+        the operator set, i.e., [A, B, ...] generators [A*A, A*B, B*A, B*B, ...].
+        Transformation can be taken into account on top.
+
+        Arguments
+        ---------
+
+        left_conj : Boolean
+            Tells if the left operator needs to be complex conjugated.
+            Default is False.
+
+        right_transpose : Boolean
+            Tells if the left operator needs to be transposed.
+            Default is False.
+
+        right_conj : Boolean
+            Tells if the right operator needs to be complex conjugated.
+            Default is False.
+
+        right_transpose : Boolean
+            Tells if the right operator needs to be transposed.
+            Default is False.
+
+        Returns
+        -------
+
+        None (in-place update of the operator dictionary)
+
+        """
+        if self._has_2nd_order:
+            return
+
+        self._has_2nd_order = True
+
+        additional_ops = {}
+        for set_name in self.set_names:
+            additional_ops[set_name] = {}
+
+            for op_str_a, op_a in self._ops_dicts[set_name].items():
+                for op_str_b, op_b in self._ops_dicts[set_name].items():
+                    op_str = _op_string_mul("", op_str_a, left_conj, left_transpose)
+                    op_str = _op_string_mul(
+                        op_str, op_str_b, right_conj, right_transpose
+                    )
+                    op_str = op_str_a + "*" + op_str_b
+                    if (op_str_a == "id") and (op_str_b == "id"):
+                        additional_ops[set_name][op_str] = "id"
+                    elif (
+                        (op_str_a == "id")
+                        and (not right_conj)
+                        and (not right_transpose)
+                    ):
+                        additional_ops[set_name][op_str] = op_str_b
+                    elif (
+                        (op_str_b == "id") and (not left_conj) and (not left_transpose)
+                    ):
+                        additional_ops[set_name][op_str] = op_str_a
+                    elif op_a.has_symmetry:
+                        tmp_a = _op_transformation(op_a, left_conj, left_transpose)
+                        tmp_b = _op_transformation(op_b, right_conj, right_transpose)
+
+                        op = tmp_a.tensordot(tmp_b, [(2,), (1,)])
+                        _, op = op.split_qr([0, 3, 1, 4], [2, 5])
+                        op, _ = op.split_rq([0, 1], [2, 3, 4])
+                        additional_ops[set_name][op_str] = op
+                    else:
+                        tmp_a = _op_transformation(op_a, left_conj, left_transpose)
+                        tmp_b = _op_transformation(op_b, right_conj, right_transpose)
+
+                        op = tmp_a.einsum("ijkl,akbd->ijbl", tmp_b)
+                        additional_ops[set_name][op_str] = op
+
+            # Check they are really new and not identical to existing ones
+            # to get the smallest set
+            to_be_added = {}
+            to_be_reset = {}
+            for key, op in additional_ops[set_name].items():
+                if isinstance(op, str):
+                    continue
+
+                for key_ii, op_ii in self._ops_dicts[set_name].items():
+                    if op.are_equal(op_ii, tol=10 * op.dtype_eps):
+                        to_be_reset[key] = key_ii
+                    else:
+                        to_be_added[key] = op
+                        continue
+
+            for key, value in to_be_added.items():
+                self._ops_dicts[set_name][key] = value
+
+            for key, value in to_be_reset.items():
+                additional_ops[set_name][key] = value
+
+        for set_name, set_dict in additional_ops.items():
+            if set_name not in self._mapping_op:
+                self._mapping_op[set_name] = {}
+
+            for key, value in set_dict.items():
+                self._mapping_op[set_name][key] = value
+
+
+def _op_transformation(op, is_conj, is_transpose):
+    """
+    Carry out the transformation on an operator.
+
+    Arguments
+    ---------
+
+    op : :class:`_AbstractQteaTensor`
+        Tensor to be transformed. We assume rank-4 tensors.
+
+    is_conj : bool
+        Flag if conjugate is applied.
+
+    is_transpose : bool
+        Flag if transpose is applied.
+
+    Returns
+    -------
+
+    new_op : :class:`_AbstractQteaTensor`
+        Operator after the transformations.
+    """
+    if is_conj and is_transpose:
+        new_op = op.conj().transpose([0, 2, 1, 3])
+    elif is_conj:
+        new_op = op.conj()
+    elif is_transpose:
+        new_op = op.transpose([0, 2, 1, 3])
+    else:
+        new_op = op
+
+    return new_op
--- a/.venv/lib/python3.12/site-packages/qtealeaves/tooling/mapping.py
+++ b/.venv/lib/python3.12/site-packages/qtealeaves/tooling/mapping.py
--- a/.venv/lib/python3.12/site-packages/qtealeaves/tooling/permutations.py
+++ b/.venv/lib/python3.12/site-packages/qtealeaves/tooling/permutations.py
@@ -0,0 +1,55 @@
+# This code is part of qtealeaves.
+#
+# This code is licensed under the Apache License, Version 2.0. You may
+# obtain a copy of this license in the LICENSE.txt file in the root directory
+# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
+#
+# Any modifications or derivative works of this code must retain this
+# copyright notice, and modified files need to carry a notice indicating
+# that they have been altered from the originals.
+
+"""
+Common permutations often used in tensor network methods.
+"""
+from functools import lru_cache
+
+__all__ = []
+
+
+@lru_cache(maxsize=None)
+def _transpose_idx1(num_legs, contracted_idx):
+    """Move second last index instead of last in `_transpose_idx`."""
+    return _transpose_idx(num_legs - 1, contracted_idx) + (num_legs - 1,)
+
+
+@lru_cache(maxsize=None)
+def _transpose_idx2(num_legs, contracted_idx):
+    """Move third last index instead of last in `_transpose_idx`."""
+    return _transpose_idx(num_legs - 2, contracted_idx) + (
+        num_legs - 2,
+        num_legs - 1,
+    )
+
+
+@lru_cache(maxsize=None)
+def _transpose_idx(num_legs, contracted_idx):
+    """
+    Transpose in the original order the indexes
+    of a n-legs tensor contracted over the
+    index `contracted_idx`
+
+    Parameters
+    ----------
+    contracted_idx : int
+        Index over which there has been a contraction
+
+    Returns
+    -------
+    tuple
+        Indexes for the transposition
+    """
+    if contracted_idx > num_legs - 1:
+        raise ValueError(
+            f"Cannot contract leg {contracted_idx} of tensor with {num_legs} legs"
+        )
+    return (*range(contracted_idx), num_legs - 1, *range(contracted_idx, num_legs - 1))
--- a/.venv/lib/python3.12/site-packages/quimb/tensor/circuit.py
+++ b/.venv/lib/python3.12/site-packages/quimb/tensor/circuit.py
--- a/.venv/lib/python3.12/site-packages/quimb/tensor/tensor_core.py
+++ b/.venv/lib/python3.12/site-packages/quimb/tensor/tensor_core.py
--- a/.venv/lib/python3.12/site-packages/quimb/tensor/tn1d/core.py
+++ b/.venv/lib/python3.12/site-packages/quimb/tensor/tn1d/core.py
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ Tensor Network Types:
 Tensor Network contractions to:

 - dense vectors
- expecation values of given Pauli string
+- expectation values of given Pauli strings or Pauli-sum observables

 The supported HPC configurations are:

@@ -26,6 +26,27 @@ Currently, the supported tensor network libraries are:
 - [cuQuantum](https://github.com/NVIDIA/cuQuantum), an NVIDIA SDK of optimized libraries and tools for accelerating quantum computing workflows.
 - [quimb](https://quimb.readthedocs.io/en/latest/), an easy but fast python library for ‘quantum information many-body’ calculations, focusing primarily on tensor networks.

+## CPU expectation benchmarks
+
+Use the library APIs directly:
+
+```py
+import qibotn
+
+records = qibotn.run_cpu_benchmark_cases(
+    ansatz="mps",
+    nqubits=40,
+    nlayers=10,
+    bond=2048,
+    circuits=("brickwall_cnot",),
+    observables=("ring_xz",),
+)
+```
+
+For generic TN use `ansatz="tn"`.  Contest/custom runners are available as
+`qibotn.run_contest_tn_case`, `qibotn.run_custom_tn_expectation`,
+`qibotn.run_contest_mps_case`, and `qibotn.run_vidal_validation_cases`.
+
 ## Installation

 To get started:
--- a/data/tree_q25_l10.pkl
+++ b/data/tree_q25_l10.pkl
--- a/data/tree_q25_l10_sliced.pkl
+++ b/data/tree_q25_l10_sliced.pkl
--- a/data/tree_q30_l10.pkl
+++ b/data/tree_q30_l10.pkl
--- a/data/tree_q30_l10_sliced.pkl
+++ b/data/tree_q30_l10_sliced.pkl
--- a/doc/make.bat
+++ b/doc/make.bat
@@ -1,35 +1,35 @@
-@ECHO OFF
-
-pushd %~dp0
-
-REM Command file for Sphinx documentation
-
-if "%SPHINXBUILD%" == "" (
-	set SPHINXBUILD=sphinx-build
-)
-set SOURCEDIR=source
-set BUILDDIR=build
-
-%SPHINXBUILD% >NUL 2>NUL
-if errorlevel 9009 (
-	echo.
-	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
-	echo.installed, then set the SPHINXBUILD environment variable to point
-	echo.to the full path of the 'sphinx-build' executable. Alternatively you
-	echo.may add the Sphinx directory to PATH.
-	echo.
-	echo.If you don't have Sphinx installed, grab it from
-	echo.https://www.sphinx-doc.org/
-	exit /b 1
-)
-
-if "%1" == "" goto help
-
-%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-goto end
-
-:help
-%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-
-:end
-popd
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.https://www.sphinx-doc.org/
+	exit /b 1
+)
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
--- a/docs/contest_runners.md
+++ b/docs/contest_runners.md
@@ -0,0 +1,12 @@
+# Contest Runners
+
+The reusable implementations live in `src/qibotn/backends/`.
+
+- `qibotn.run_contest_tn_case`: quimb+torch TN search/contract cases.
+- `qibotn.run_contest_mps_case`: Vidal/MPS contest expectation cases.
+- `qibotn.run_vidal_mpi_contest_case`: direct Vidal MPI observable sweep.
+- `qibotn.run_custom_tn_expectation`: custom quimb+torch TN cases.
+
+`src/qibotn/backends/quimb.py` holds the TN helpers,
+`src/qibotn/backends/qmatchatea.py` holds the qmatchatea MPS helpers,
+and `src/qibotn/backends/vidal.py` holds the Vidal helpers.
--- a/docs/home.md
+++ b/docs/home.md
@@ -0,0 +1,26 @@
+# qibotn
+
+Core reusable code lives under `src/qibotn/`. Prefer importing from `qibotn`
+or `qibotn.backends.*`; benchmark and runner helpers have been folded into the
+package instead of being kept as standalone scripts.
+
+- `backends/quimb.py`: TN + torch helpers for quimb.
+- `backends/qmatchatea.py`: qmatchatea + torch MPS helpers.
+- `backends/vidal.py`: Vidal + torch helpers.
+- `contest_cases.py`: shared contest circuits, observables, and case specs.
+- `torch_utils.py`: shared torch array/thread helpers.
+
+Quimb TN reusable entrypoints include `build_quimb_backend_circuit`,
+`build_expectation_tn`, `run_quimb_torch_expectation`,
+`compare_quimb_gate_merge`, `compare_quimb_gate_merge_expectation`,
+`profile_quimb_torch_expectation`, and `time_quimb_contract_implementations`.
+
+Common public imports include `qibotn.cpu_expectation`,
+`qibotn.mps_expectation`, `qibotn.run_qmatchatea_expectation`,
+`qibotn.run_vidal_expectation`, `qibotn.build_contest_circuit`, and
+`qibotn.build_contest_observable`.
+
+Former script entrypoints are available as importable functions:
+`qibotn.run_cpu_benchmark_cases`, `qibotn.run_contest_tn_case`,
+`qibotn.run_custom_tn_expectation`, `qibotn.run_contest_mps_case`,
+`qibotn.run_vidal_mpi_contest_case`, and `qibotn.run_vidal_validation_cases`.
--- a/4
+++ b/4
@@ -0,0 +1,4 @@
+10.20.1.100
+10.20.1.101
+10.20.1.102
+10.20.1.103
--- a/poetry.lock
+++ b/poetry.lock
@@ -1733,14 +1733,14 @@ files = [

 [[package]]
 name = "mako"
-version = "1.3.10"
+version = "1.3.11"
 description = "A super-fast templating language that borrows the best ideas from the existing templating languages."
 optional = false
 python-versions = ">=3.8"
 groups = ["main"]
 files = [
-    {file = "mako-1.3.10-py3-none-any.whl", hash = "sha256:baef24a52fc4fc514a0887ac600f9f1cff3d82c61d4d700a1fa84d597b88db59"},
-    {file = "mako-1.3.10.tar.gz", hash = "sha256:99579a6f39583fa7e5630a28c3c1f440e4e97a414b80372649c0ce338da2ea28"},
+    {file = "mako-1.3.11-py3-none-any.whl", hash = "sha256:e372c6e333cf004aa736a15f425087ec977e1fcbd2966aae7f17c8dc1da27a77"},
+    {file = "mako-1.3.11.tar.gz", hash = "sha256:071eb4ab4c5010443152255d77db7faa6ce5916f35226eb02dc34479b6858069"},
 ]

 [package.dependencies]
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,11 +31,13 @@ cuquantum-python-cu12 = { version = "^25.9.1", optional = true }
 qmatchatea = { version = "^1.4.3", optional = true }
 qiskit = { version = "^1.4.0", optional = true }
 qtealeaves = { version = "^1.5.20", optional = true }
+distributed = { version = ">=2024", optional = true }


 [tool.poetry.extras]
 cuda = ["cupy-cuda12x", "cuda-toolkit", "nvidia-nccl-cu12", "cuquantum-python-cu12", "mpi4py"]
 qmatchatea = ["qmatchatea"]
+dask = ["distributed"]

 [tool.poetry.group.docs]
 optional = true
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,139 @@
+alembic==1.18.4
+annotated-types==0.7.0
+antlr4-python3-runtime==4.13.2
+anyio==4.13.0
+asttokens==3.0.1
+attrs==26.1.0
+autoray==0.8.10
+beautifulsoup4==4.14.3
+certifi==2026.4.22
+cffi==2.0.0
+charset-normalizer==3.4.7
+click==8.3.3
+cloudpickle==3.1.2
+cma==3.4.0
+colorlog==6.10.1
+contourpy==1.3.3
+cotengra==0.7.5
+coverage==7.13.5
+cryptography==47.0.0
+cycler==0.12.1
+cytoolz==1.1.0
+dask==2026.3.0
+decorator==5.2.1
+dill==0.4.1
+distributed==2026.3.0
+executing==2.2.1
+filelock==3.25.2
+fonttools==4.62.1
+fsspec==2026.2.0
+greenlet==3.3.2
+h11==0.16.0
+h5py==3.16.0
+html5lib==1.1
+httpcore==1.0.9
+httpx==0.27.2
+httpx-sse==0.4.3
+idna==3.13
+igraph==1.0.0
+iniconfig==2.3.0
+ipython==8.39.0
+jedi==0.19.2
+Jinja2==3.1.6
+joblib==1.5.3
+jsonschema==4.26.0
+jsonschema-specifications==2025.9.1
+kahypar==1.3.7
+kiwisolver==1.5.0
+llvmlite==0.44.0
+locket==1.0.0
+lxml==6.1.0
+Mako==1.3.10
+markdownify==1.2.2
+MarkupSafe==3.0.3
+matplotlib==3.10.8
+matplotlib-inline==0.2.1
+mcp==1.27.0
+mcp-server-fetch==2025.4.7
+mpi4py==4.1.1
+mpmath==1.3.0
+msgpack==1.1.2
+networkx==3.6.1
+numba==0.61.2
+numpy @ file:///home/yx/numpy
+openqasm3==1.0.1
+opt_einsum==3.4.0
+optuna==4.8.0
+packaging==26.0
+parso==0.8.6
+partd==1.4.2
+pexpect==4.9.0
+pillow==12.2.0
+pluggy==1.6.0
+prompt_toolkit==3.0.52
+Protego==0.6.0
+protobuf==7.34.1
+psutil==5.9.8
+ptyprocess==0.7.0
+pure_eval==0.2.3
+py-spy==0.4.2
+pycparser==3.0
+pydantic==2.13.3
+pydantic-settings==2.14.0
+pydantic_core==2.46.3
+Pygments==2.20.0
+PyJWT==2.12.1
+pyparsing==3.3.2
+pytest==9.0.3
+pytest-cov==7.1.0
+pytest-env==1.6.0
+python-dateutil==2.9.0.post0
+python-dotenv==1.2.2
+python-multipart==0.0.26
+PyYAML==6.0.3
+qibo==0.3.2
+qibojit==0.1.15
+-e git+https://git.nudt.space/jaunatisblue/qibotn.git@eed42dcfa9739c609a58f7367fe403abf2e992a9#egg=qibotn
+qiskit==1.4.5
+qmatchatea==1.5.8
+qredtea==0.3.15
+qtealeaves==1.7.32
+quimb==1.13.0
+ray==2.55.1
+readabilipy==0.3.0
+referencing==0.37.0
+regex==2026.4.4
+requests==2.33.1
+rpds-py==0.30.0
+rustworkx==0.17.1
+scipy @ file:///home/yx/scipy
+setuptools==70.2.0
+six==1.17.0
+sniffio==1.3.1
+sortedcontainers==2.4.0
+soupsieve==2.8.3
+SQLAlchemy==2.0.49
+sse-starlette==3.4.1
+stack-data==0.6.3
+starlette==1.0.0
+stevedore==5.7.0
+symengine==0.13.0
+sympy==1.14.0
+tabulate==0.9.0
+tblib==3.2.2
+texttable==1.7.0
+threadpoolctl==3.6.0
+toolz==1.1.0
+torch==2.11.0+cpu
+torchaudio==2.11.0+cpu
+torchvision==0.26.0+cpu
+tornado==6.5.5
+tqdm==4.67.3
+traitlets==5.14.3
+typing-inspection==0.4.2
+typing_extensions==4.15.0
+urllib3==2.6.3
+uvicorn==0.46.0
+wcwidth==0.6.0
+webencodings==0.5.1
+zict==3.0.0
--- a/src/qibotn/init.py
+++ b/src/qibotn/init.py
@@ -1,5 +1,131 @@
 import importlib.metadata as im

-from qibotn.backends import MetaBackend
-
 __version__ = im.version(__package__)
+
+_LAZY_EXPORTS = {
+    "MetaBackend": ("qibotn.backends", "MetaBackend"),
+    "cpu_backend": ("qibotn.expectation_runner", "cpu_backend"),
+    "cpu_expectation": ("qibotn.expectation_runner", "cpu_expectation"),
+    "mps_expectation": ("qibotn.expectation_runner", "mps_expectation"),
+    "cpu_runcard": ("qibotn.expectation_runner", "cpu_runcard"),
+    "ExpectationConfig": ("qibotn.expectation_runner", "ExpectationConfig"),
+    "exact_for_observable": ("qibotn.expectation_runner", "exact_for_observable"),
+    "run_cpu_expectation": ("qibotn.expectation_runner", "run_cpu_expectation"),
+    "cpu_benchmark_parallel_opts": (
+        "qibotn.expectation_runner",
+        "cpu_benchmark_parallel_opts",
+    ),
+    "run_cpu_benchmark_cases": (
+        "qibotn.expectation_runner",
+        "run_cpu_benchmark_cases",
+    ),
+    "build_benchmark_circuit": ("qibotn.benchmark_cases", "build_circuit"),
+    "benchmark_observable_terms": ("qibotn.benchmark_cases", "observable_terms"),
+    "exact_pauli_sum": ("qibotn.benchmark_cases", "exact_pauli_sum"),
+    "ring_xz_statevector_expectation": (
+        "qibotn.benchmark_cases",
+        "ring_xz_statevector_expectation",
+    ),
+    "terms_to_dict": ("qibotn.benchmark_cases", "terms_to_dict"),
+    "build_contest_circuit": ("qibotn.contest_cases", "build_contest_circuit"),
+    "build_contest_observable": (
+        "qibotn.contest_cases",
+        "build_contest_observable",
+    ),
+    "contest_cases": ("qibotn.contest_cases", "CASES"),
+    "analyze_contraction_tree": ("qibotn.parallel", "analyze_contraction_tree"),
+    "load_tree_payload": ("qibotn.parallel", "load_tree_payload"),
+    "save_tree_payload": ("qibotn.parallel", "save_tree_payload"),
+    "slice_tree_payload": ("qibotn.parallel", "slice_tree_payload"),
+    "make_qmatchatea_backend": (
+        "qibotn.backends.qmatchatea",
+        "make_qmatchatea_backend",
+    ),
+    "build_qmatchatea_backend": (
+        "qibotn.backends.qmatchatea",
+        "build_qmatchatea_backend",
+    ),
+    "benchmark_qmatchatea_svd_control": (
+        "qibotn.backends.qmatchatea",
+        "benchmark_qmatchatea_svd_control",
+    ),
+    "run_qmatchatea_expectation": (
+        "qibotn.backends.qmatchatea",
+        "run_qmatchatea_expectation",
+    ),
+    "exact_mps_expectation": (
+        "qibotn.backends.qmatchatea",
+        "exact_mps_expectation",
+    ),
+    "make_vidal_backend": ("qibotn.backends.vidal", "make_vidal_backend"),
+    "compare_vidal_backend_qmatchatea": (
+        "qibotn.backends.vidal",
+        "compare_vidal_backend_qmatchatea",
+    ),
+    "run_vidal_expectation": ("qibotn.backends.vidal", "run_vidal_expectation"),
+    "run_segmented_vidal_ring_xz": (
+        "qibotn.backends.vidal",
+        "run_segmented_vidal_ring_xz",
+    ),
+    "build_expectation_tn": ("qibotn.backends.quimb", "build_expectation_tn"),
+    "build_quimb_circuit_stats": (
+        "qibotn.backends.quimb",
+        "build_quimb_circuit_stats",
+    ),
+    "compare_quimb_gate_merge": (
+        "qibotn.backends.quimb",
+        "compare_quimb_gate_merge",
+    ),
+    "compare_quimb_gate_merge_expectation": (
+        "qibotn.backends.quimb",
+        "compare_quimb_gate_merge_expectation",
+    ),
+    "contract_tn": ("qibotn.backends.quimb", "contract_tn"),
+    "load_custom_case_module": ("qibotn.backends.quimb", "load_custom_case_module"),
+    "profile_quimb_torch_expectation": (
+        "qibotn.backends.quimb",
+        "profile_quimb_torch_expectation",
+    ),
+    "qibo_circuit_to_quimb_torch": (
+        "qibotn.backends.quimb",
+        "qibo_circuit_to_quimb_torch",
+    ),
+    "search_contraction_tree": ("qibotn.backends.quimb", "search_contraction_tree"),
+    "sorted_tree": ("qibotn.backends.quimb", "sorted_tree"),
+    "run_contest_tn_case": ("qibotn.backends.quimb", "run_contest_tn_case"),
+    "run_custom_tn_expectation": (
+        "qibotn.backends.quimb",
+        "run_custom_tn_expectation",
+    ),
+    "time_quimb_contract_implementations": (
+        "qibotn.backends.quimb",
+        "time_quimb_contract_implementations",
+    ),
+    "run_contest_mps_case": ("qibotn.backends.vidal", "run_contest_mps_case"),
+    "run_vidal_mpi_contest_case": (
+        "qibotn.backends.vidal",
+        "run_vidal_mpi_contest_case",
+    ),
+    "run_vidal_validation_cases": (
+        "qibotn.backends.vidal",
+        "run_vidal_validation_cases",
+    ),
+    "pauli_pattern": ("qibotn.observables", "pauli_pattern"),
+    "pauli_sum": ("qibotn.observables", "pauli_sum"),
+}
+
+
+def __getattr__(name):
+    try:
+        module_name, object_name = _LAZY_EXPORTS[name]
+    except KeyError:
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}") from None
+
+    from importlib import import_module
+
+    value = getattr(import_module(module_name), object_name)
+    globals()[name] = value
+    return value
+
+
+__all__ = sorted([*_LAZY_EXPORTS, "__version__"])
--- a/src/qibotn/backends/init.py
+++ b/src/qibotn/backends/init.py
@@ -1,11 +1,8 @@
-from typing import Union
-
 from qibo.config import raise_error

 from qibotn.backends.abstract import QibotnBackend
-from qibotn.backends.cutensornet import CuTensorNet  # pylint: disable=E0401

-PLATFORMS = ("cutensornet", "quimb", "qmatchatea")
+PLATFORMS = ("cutensornet", "cpu", "quimb", "qmatchatea", "vidal")


 class MetaBackend:
@@ -23,11 +20,17 @@ class MetaBackend:
        """

        if platform == "cutensornet":  # pragma: no cover
+            from qibotn.backends.cutensornet import CuTensorNet
+
            return CuTensorNet(runcard)
+        elif platform == "cpu":
+            from qibotn.backends.cpu import CpuTensorNet
+
+            return CpuTensorNet(runcard)
        elif platform == "quimb":  # pragma: no cover
            import qibotn.backends.quimb as qmb

-            quimb_backend = kwargs.get("quimb_backend", "numpy")
+            quimb_backend = kwargs.get("quimb_backend", "torch")
            contraction_optimizer = kwargs.get("contraction_optimizer", "auto-hq")
            return qmb.BACKENDS[quimb_backend](
                quimb_backend=quimb_backend, contraction_optimizer=contraction_optimizer
@@ -36,6 +39,10 @@ class MetaBackend:
            from qibotn.backends.qmatchatea import QMatchaTeaBackend

            return QMatchaTeaBackend()
+        elif platform == "vidal":
+            from qibotn.backends.vidal import VidalBackend
+
+            return VidalBackend()
        else:
            raise_error(
                NotImplementedError,
@@ -48,8 +55,8 @@ class MetaBackend:
        for platform in PLATFORMS:
            try:
                MetaBackend.load(platform=platform)
-                available = True
-            except:
-                available = False
-            available_backends[platform] = available
+            except (ImportError, NotImplementedError, TypeError, ValueError):
+                available_backends[platform] = False
+            else:
+                available_backends[platform] = True
        return available_backends
--- a/src/qibotn/backends/cpu.py
+++ b/src/qibotn/backends/cpu.py
@@ -0,0 +1,729 @@
+"""CPU tensor-network backend with cutensornet-style runcard support."""
+
+from __future__ import annotations
+
+import os
+import pickle
+import time
+from pathlib import Path
+
+import numpy as np
+from qibo import hamiltonians
+from qibo.backends import NumpyBackend
+from qibo.config import raise_error
+
+from qibotn.backends.abstract import QibotnBackend
+from qibotn.backends.vidal import (
+    _observable_mpo_tensors,
+    _unsupported_reason,
+)
+from qibotn.observables import check_observable
+from qibotn.torch_utils import arrays_to_backend, torch_cpu_array, torch_dtype
+
+
+def _as_bool_or_dict(value, name):
+    if isinstance(value, (bool, dict)):
+        return value
+    raise TypeError(f"{name} has an unexpected type")
+
+
+def _bind_numa_node(rank):
+    """Bind the calling process (or thread) to the NUMA node for *rank*.
+
+    The MPI rank is converted to a local (per-node) rank through the
+    environment variables commonly set by Open MPI, MVAPICH, and Slurm.
+    The process CPU affinity and NUMA memory policy are set accordingly.
+
+    Returns the NUMA domain that was selected, or ``None`` if the binding
+    could not be determined.
+    """
+    current_affinity = os.sched_getaffinity(0)
+    online_cpus = set(range(os.cpu_count() or 1))
+    if current_affinity and current_affinity != online_cpus:
+        # MPI launchers such as Intel MPI often pin local ranks correctly
+        # before Python starts.  Do not narrow that placement further.
+        return None
+
+    local_rank = rank
+    for name in (
+        "OMPI_COMM_WORLD_LOCAL_RANK",
+        "MV2_COMM_WORLD_LOCAL_RANK",
+        "MPI_LOCALRANKID",
+        "I_MPI_LOCAL_RANK",
+        "SLURM_LOCALID",
+    ):
+        try:
+            local_rank = int(os.environ[name])
+            break
+        except (KeyError, ValueError):
+            pass
+
+    domains = _available_numa_domains()
+    if not domains:
+        return None
+
+    local_size = _local_world_size()
+    assigned_domains = domains[local_rank::local_size]
+    if not assigned_domains:
+        assigned_domains = [domains[local_rank % len(domains)]]
+
+    domain = assigned_domains[0]
+    cpus = set()
+    for selected in assigned_domains:
+        cpulist = f"/sys/devices/system/node/node{selected}/cpulist"
+        try:
+            cpus.update(_parse_cpu_list(open(cpulist, encoding="utf-8").read().strip()))
+        except (FileNotFoundError, OSError):
+            pass
+    try:
+        if cpus:
+            os.sched_setaffinity(0, cpus)
+    except OSError:
+        pass
+
+    try:
+        import ctypes
+
+        libnuma = ctypes.CDLL("libnuma.so.1")
+        if libnuma.numa_available() >= 0:
+            libnuma.numa_run_on_node(ctypes.c_int(domain))
+            libnuma.numa_set_preferred(ctypes.c_int(domain))
+    except Exception:
+        pass
+
+    return domain
+
+
+def _available_numa_domains():
+    nodes = []
+    base = Path("/sys/devices/system/node")
+    try:
+        for path in base.glob("node[0-9]*"):
+            try:
+                nodes.append(int(path.name[4:]))
+            except ValueError:
+                pass
+    except OSError:
+        return []
+    return sorted(nodes)
+
+
+def _local_world_size():
+    for name in (
+        "OMPI_COMM_WORLD_LOCAL_SIZE",
+        "MV2_COMM_WORLD_LOCAL_SIZE",
+        "MPI_LOCALNRANKS",
+        "I_MPI_LOCAL_SIZE",
+        "SLURM_NTASKS_PER_NODE",
+    ):
+        value = os.environ.get(name)
+        if not value:
+            continue
+        try:
+            return max(1, int(str(value).split("(", 1)[0]))
+        except ValueError:
+            pass
+    return 1
+
+
+def _parse_cpu_list(text):
+    cpus = set()
+    for item in text.split(","):
+        item = item.strip()
+        if not item:
+            continue
+        if "-" in item:
+            start, stop = item.split("-", 1)
+            cpus.update(range(int(start), int(stop) + 1))
+        else:
+            cpus.add(int(item))
+    return cpus
+
+
+class CpuTensorNet(QibotnBackend, NumpyBackend):
+    """CPU replacement for the cutensornet runcard execution surface.
+
+    The backend preserves the high-level runcard knobs used by the GPU backend:
+    ``MPI_enabled``, ``MPS_enabled`` and ``expectation_enabled``.  Generic TN
+    work is delegated to quimb on CPU; MPS expectation uses the Vidal fast path
+    when the circuit is nearest-neighbor and falls back to quimb otherwise.
+    """
+
+    def __init__(self, runcard=None):
+        super().__init__()
+        self.name = "qibotn"
+        self.platform = "cpu"
+        self.precision = "double"
+        self.configure_tn_simulation(runcard)
+
+    def configure_tn_simulation(self, runcard=None):
+        runcard = {} if runcard is None else runcard
+        self.rank = 0
+        self.MPI_enabled = bool(runcard.get("MPI_enabled", False))
+        self.NCCL_enabled = bool(runcard.get("NCCL_enabled", False))
+        if self.NCCL_enabled:
+            raise_error(NotImplementedError, "NCCL is only available for GPU backends.")
+
+        expectation = runcard.get("expectation_enabled", False)
+        if expectation is True:
+            self.expectation_enabled = True
+            self.observable = None
+        elif expectation is False:
+            self.expectation_enabled = False
+            self.observable = None
+        elif isinstance(expectation, (dict, hamiltonians.SymbolicHamiltonian)):
+            self.expectation_enabled = True
+            self.observable = expectation
+        else:
+            raise TypeError("expectation_enabled has an unexpected type")
+
+        mps = _as_bool_or_dict(runcard.get("MPS_enabled", False), "MPS_enabled")
+        self.MPS_enabled = bool(mps)
+        self.mps_options = mps if isinstance(mps, dict) else {}
+
+        self.max_bond_dimension = runcard.get(
+            "max_bond_dimension",
+            self.mps_options.get("max_bond_dimension", 512),
+        )
+        self.cut_ratio = runcard.get(
+            "cut_ratio",
+            self.mps_options.get(
+                "cut_ratio",
+                self.mps_options.get("svd_method", {}).get("abs_cutoff", 1e-12),
+            ),
+        )
+        self.tensor_module = runcard.get("tensor_module", "torch")
+        self.dtype = runcard.get("dtype", "complex128")
+        self.compile_circuit = bool(runcard.get("compile_circuit", False))
+        self.preprocess = bool(runcard.get("preprocess", False))
+        self.mpi_term_batch_size = runcard.get(
+            "mpi_term_batch_size",
+            runcard.get("term_batch_size", None),
+        )
+        self.torch_threads = runcard.get("torch_threads", None)
+        self.quimb_backend = runcard.get("quimb_backend", "torch")
+        self.contraction_optimizer = runcard.get("contraction_optimizer", "auto-hq")
+        self.parallel_opts = runcard.get("parallel_opts", {})
+        self.parallel_stats = []
+
+    def execute_circuit(
+        self,
+        circuit,
+        initial_state=None,
+        nshots=None,
+        prob_type=None,
+        return_array=False,
+        **prob_kwargs,
+    ):
+        if initial_state is not None:
+            raise_error(NotImplementedError, "QiboTN CPU backend does not support initial state.")
+
+        if self.torch_threads is not None and self.tensor_module == "torch":
+            import torch
+
+            torch.set_num_threads(self.torch_threads)
+
+        if self.expectation_enabled:
+            value = self.expectation(circuit, self.observable)
+            if self.MPI_enabled and self.rank > 0:
+                return np.asarray([0], dtype=np.int64)
+            dtype = np.complex128 if np.iscomplexobj(value) else np.float64
+            return np.asarray([value], dtype=dtype)
+
+        backend = self._quimb_backend()
+        backend.configure_tn_simulation(
+            ansatz="mps" if self.MPS_enabled else None,
+            max_bond_dimension=self.max_bond_dimension if self.MPS_enabled else None,
+            svd_cutoff=self.cut_ratio,
+        )
+        return backend.execute_circuit(
+            circuit=circuit,
+            nshots=nshots,
+            return_array=return_array,
+        )
+
+    def expectation(self, circuit, observable=None, preprocess=None, compile_circuit=None):
+        mpo_tensors = _observable_mpo_tensors(observable, circuit.nqubits)
+        if mpo_tensors is None:
+            observable = check_observable(observable, circuit.nqubits)
+        use_preprocess = self.preprocess if preprocess is None else preprocess
+        if mpo_tensors is not None and not self.MPS_enabled:
+            raise_error(
+                NotImplementedError,
+                "MPO expectation is currently supported only by the Vidal MPS path.",
+            )
+
+        if self.MPS_enabled:
+            reason = _unsupported_reason(circuit)
+            if reason is None or self.compile_circuit or use_preprocess:
+                return self._vidal_expectation(
+                    circuit,
+                    observable,
+                    preprocess=use_preprocess,
+                    compile_circuit=compile_circuit,
+                )
+
+        backend = self._quimb_backend()
+        backend.configure_tn_simulation(
+            ansatz="mps" if self.MPS_enabled else None,
+            max_bond_dimension=self.max_bond_dimension if self.MPS_enabled else None,
+            svd_cutoff=self.cut_ratio,
+        )
+        if self.MPI_enabled:
+            return self._quimb_expectation_mpi(backend, circuit, observable)
+        return self._quimb_expectation_processpool(backend, circuit, observable)
+
+    def _vidal_expectation(
+        self, circuit, observable, preprocess=False, compile_circuit=None
+    ):
+        if compile_circuit is None:
+            compile_circuit = self.compile_circuit
+        if self.MPI_enabled:
+            from mpi4py import MPI
+
+            self.rank = MPI.COMM_WORLD.Get_rank()
+
+        from qibotn.backends.vidal import VidalBackend
+
+        backend = VidalBackend()
+        backend.configure_tn_simulation(
+            max_bond_dimension=self.max_bond_dimension,
+            cut_ratio=self.cut_ratio,
+            tensor_module=self.tensor_module,
+            compile_circuit=compile_circuit,
+            mpi_approach="CT" if self.MPI_enabled else "SR",
+            mpi_term_batch_size=self.mpi_term_batch_size,
+            fallback=False,
+        )
+        value = backend.expectation(
+            circuit,
+            observable,
+            preprocess=preprocess,
+            compile_circuit=compile_circuit,
+        )
+        self.rank = getattr(backend, "rank", self.rank)
+        self.last_truncation_error = getattr(backend, "last_truncation_error", np.nan)
+        self.last_max_truncation_error = getattr(
+            backend, "last_max_truncation_error", np.nan
+        )
+        return value
+
+    def _quimb_backend(self):
+        import qibotn.backends.quimb as qmb
+
+        backend = qmb.BACKENDS[self.quimb_backend](
+            quimb_backend=self.quimb_backend,
+            contraction_optimizer=self.contraction_optimizer,
+        )
+        backend.dtype = self.dtype
+        return backend
+
+    def _bind_rank_to_numa_domain(self, rank):
+        self.numa_domain = _bind_numa_node(rank)
+
+    def _default_search_workers(self, nranks=1):
+        if self.torch_threads:
+            return max(1, int(self.torch_threads))
+        return max(1, (os.cpu_count() or 1) // max(1, nranks))
+
+    def _quimb_expectation_processpool(self, backend, circuit, observable):
+        return self._quimb_expectation_search(
+            backend,
+            circuit,
+            observable,
+            method="processpool",
+            comm=None,
+        )
+
+    def _quimb_expectation_mpi(self, backend, circuit, observable):
+        from mpi4py import MPI
+
+        comm = MPI.COMM_WORLD
+        self.rank = comm.Get_rank()
+        self._bind_rank_to_numa_domain(self.rank)
+
+        return self._quimb_expectation_search(
+            backend,
+            circuit,
+            observable,
+            method="mpi",
+            comm=comm,
+        )
+
+    def _quimb_expectation_search(self, backend, circuit, observable, method, comm=None):
+        rank = comm.Get_rank() if comm is not None else 0
+        size = comm.Get_size() if comm is not None else 1
+        self.rank = rank
+
+        from qibotn.observables import extract_gates_and_qubits
+        from qibotn.parallel import (
+            contraction_tree_costs,
+            parallel_contract,
+            parallel_path_search,
+        )
+        from qibotn.backends.quimb import (
+            PAULI_DENSE_MAX_QUBITS,
+            _pauli_term_to_dense_operator,
+            pauli_product_expectation_tn,
+        )
+
+        opts = dict(self.parallel_opts)
+        user_slicing_opts = opts.get("slicing_opts")
+        search_workers = opts.get("search_workers", self._default_search_workers(size))
+        search_repeats = opts.get("max_repeats", 128)
+        search_time = opts.get("max_time", 60)
+        search_backend = opts.get("search_backend")
+        dask_address = opts.get("dask_address")
+        dask_expected_workers = opts.get("dask_expected_workers")
+        dask_close_workers = bool(opts.get("dask_close_workers", False))
+        print_stats = bool(opts.get("print_stats", False))
+        debug_trials = bool(opts.get("debug_trials", False))
+        search_seed = int(opts.get("search_seed", 0))
+        merge_1q = opts.get("merge_1q", "auto")
+        merge_2q = opts.get("merge_2q", "auto")
+        sort_contract_indices = opts.get("sort_contract_indices", "auto")
+        if sort_contract_indices == "auto":
+            sort_contract_indices = self.quimb_backend == "torch"
+        search_only = bool(opts.get("search_only", False))
+        save_tree_path = opts.get("save_tree_path")
+        load_tree_path = opts.get("load_tree_path")
+        loaded_trees = None
+        saved_trees = []
+        saved_costs = []
+
+        def term_stats(
+            term_index,
+            factors,
+            path_cost,
+            search_stats,
+            tree_slices,
+            slice_assignment,
+            rank_slices,
+            search_seconds,
+            contract_seconds,
+        ):
+            return {
+                "term_index": term_index,
+                "term_factors": tuple(factors),
+                "path_cost": path_cost,
+                "search_stats": search_stats,
+                "tree_slices": tree_slices,
+                "slice_assignment": slice_assignment,
+                "rank_slices": rank_slices,
+                "search_seconds": search_seconds,
+                "contract_seconds": contract_seconds,
+                "search_workers": search_workers,
+                "search_repeats": search_repeats,
+                "search_time": search_time,
+                "search_backend": search_backend or method,
+                "search_seed": search_seed,
+                "merge_1q": merge_1q,
+                "merge_2q": merge_2q,
+                "dask_address": dask_address,
+                "numa_domain": getattr(self, "numa_domain", None),
+            }
+
+        if load_tree_path:
+            with Path(load_tree_path).open("rb") as f:
+                payload = pickle.load(f)
+            loaded_trees = payload["trees"] if isinstance(payload, dict) else payload
+            if not isinstance(loaded_trees, (list, tuple)):
+                loaded_trees = [loaded_trees]
+
+        qc = backend._qibo_circuit_to_quimb(
+            circuit,
+            quimb_circuit_type=backend.circuit_ansatz,
+            gate_opts={
+                "max_bond": self.max_bond_dimension,
+                "cutoff": self.cut_ratio,
+            },
+            merge_1q=merge_1q,
+            merge_2q=merge_2q,
+        )
+
+        total_value = 0.0 + 0.0j
+        terms = extract_gates_and_qubits(observable)
+        for term_index, (coeff, factors) in enumerate(terms):
+            if not factors:
+                if self.rank == 0:
+                    total_value += coeff
+                continue
+
+            if len(factors) > PAULI_DENSE_MAX_QUBITS:
+                tn = pauli_product_expectation_tn(
+                    qc,
+                    factors,
+                    simplify_sequence="ADCRS",
+                    simplify_atol=1e-12,
+                )
+            else:
+                op, where = _pauli_term_to_dense_operator(factors)
+                if self.quimb_backend == "torch":
+                    op = torch_cpu_array(op, dtype=torch_dtype(self.dtype))
+                tn = qc.local_expectation(
+                    op,
+                    where,
+                    rehearse="tn",
+                    simplify_sequence="ADCRS",
+                    simplify_atol=1e-12,
+                )
+            slicing_opts = self._mpi_slicing_opts(
+                user_slicing_opts,
+            )
+
+            if loaded_trees is not None:
+                if term_index >= len(loaded_trees):
+                    raise ValueError(
+                        f"Loaded tree file has {len(loaded_trees)} tree(s), "
+                        f"but term {term_index} was requested."
+                    )
+                tree = loaded_trees[term_index]
+                search_seconds = 0.0
+                if self.rank == 0 and print_stats:
+                    print(
+                        f"tn_tree_loaded term={term_index} path={load_tree_path}",
+                        flush=True,
+                    )
+            else:
+                search_start = time.perf_counter()
+                tree = parallel_path_search(
+                    tn,
+                    tn.outer_inds(),
+                    method="dask" if method != "mpi" and search_backend == "dask" else method,
+                    total_repeats=search_repeats,
+                    max_time=search_time,
+                    n_workers=search_workers,
+                    slicing_opts=slicing_opts,
+                    trial_timeout=opts.get("trial_timeout"),
+                    search_backend=search_backend,
+                    dask_address=dask_address,
+                    debug_trials=debug_trials,
+                    dask_close_workers=dask_close_workers,
+                    expected_workers=dask_expected_workers,
+                    search_seed=search_seed,
+                )
+                search_seconds = time.perf_counter() - search_start
+            if tree is None:
+                raise RuntimeError("Failed to find a contraction tree for CPU TN MPI.")
+            if sort_contract_indices and hasattr(tree, "sort_contraction_indices"):
+                tree.sort_contraction_indices(
+                    priority=opts.get("sort_contract_indices_priority", "flops"),
+                    make_output_contig=True,
+                    make_contracted_contig=True,
+                    reset=True,
+                )
+            if self.parallel_opts.get("contract_implementation") == "cpp":
+                from qibotn.torch_contractor import prepare_torch_cpp_contractor
+
+                prepare_torch_cpp_contractor(tree)
+
+            path_cost = contraction_tree_costs(tree)
+            search_stats = getattr(tree, "qibotn_search_stats", {})
+            if save_tree_path and loaded_trees is None:
+                saved_trees.append(tree)
+                saved_costs.append(path_cost)
+            if self.rank == 0 and print_stats:
+                print(
+                    "tn_search_done "
+                    f"term={term_index} "
+                    f"search_seconds={search_seconds:.3f} "
+                    f"completed_trials={search_stats.get('completed_trials', 'na')} "
+                    f"finite_trials={search_stats.get('finite_trials', 'na')} "
+                    f"failed_trials={search_stats.get('failed_trials', 'na')} "
+                    f"requested_trials={search_stats.get('requested_trials', search_repeats)} "
+                    f"best_score={search_stats.get('best_score', float('nan')):.6g} "
+                    f"slices={path_cost['nslices']} "
+                    f"log10_flops={path_cost['log10_flops']:.3f} "
+                    f"log10_write={path_cost['log10_write']:.3f} "
+                    f"log2_size={path_cost['log2_size']:.3f} "
+                    f"log10_combo={path_cost['log10_combo']:.3f} "
+                    f"peak_memory_gib={path_cost['peak_memory_gib']:.6g}",
+                    flush=True,
+                )
+
+            if search_only:
+                self.parallel_stats.append(
+                    term_stats(
+                        term_index,
+                        factors,
+                        path_cost,
+                        search_stats,
+                        int(getattr(tree, "multiplicity", 1)),
+                        "search_only",
+                        [],
+                        search_seconds,
+                        0.0,
+                    )
+                )
+                continue
+
+            if comm is None and int(getattr(tree, "multiplicity", 1)) <= 1:
+                if self.rank == 0:
+                    contract_start = time.perf_counter()
+                    value = self._contract_term_unsliced(tn, tree, backend)
+                    contract_seconds = time.perf_counter() - contract_start
+                    if print_stats:
+                        print(
+                            "tn_contract_done "
+                            f"term={term_index} "
+                            f"contract_seconds={contract_seconds:.3f}",
+                            flush=True,
+                        )
+                    self.parallel_stats.append(
+                        term_stats(
+                            term_index,
+                            factors,
+                            path_cost,
+                            search_stats,
+                            1,
+                            "root",
+                            [1] + [0] * (size - 1),
+                            search_seconds,
+                            contract_seconds,
+                        )
+                    )
+                    total_value += coeff * complex(value)
+                continue
+
+            if comm is None:
+                contract_start = time.perf_counter()
+                value = self._contract_term_unsliced(tn, tree, backend)
+                contract_seconds = time.perf_counter() - contract_start
+                if print_stats:
+                    print(
+                        "tn_contract_done "
+                        f"term={term_index} "
+                        f"contract_seconds={contract_seconds:.3f}",
+                        flush=True,
+                    )
+                self.parallel_stats.append(
+                    term_stats(
+                        term_index,
+                        factors,
+                        path_cost,
+                        search_stats,
+                        int(getattr(tree, "multiplicity", 1)),
+                        "local",
+                        [int(getattr(tree, "multiplicity", 1))],
+                        search_seconds,
+                        contract_seconds,
+                    )
+                )
+                total_value += coeff * complex(np.asarray(value).reshape(-1)[0])
+                continue
+
+            contract_start = time.perf_counter()
+            arrays = self._term_arrays(tn, backend)
+            contract_implementation = self._contract_implementation(backend)
+            value, stats = parallel_contract(
+                tree,
+                arrays,
+                method="mpi",
+                comm=comm,
+                return_stats=True,
+                implementation=contract_implementation,
+            )
+            contract_seconds = time.perf_counter() - contract_start
+            gathered_stats = comm.gather(stats, root=0)
+            if rank == 0:
+                if print_stats:
+                    print(
+                        "tn_contract_done "
+                        f"term={term_index} "
+                        f"contract_seconds={contract_seconds:.3f}",
+                        flush=True,
+                    )
+                self.parallel_stats.append(
+                    term_stats(
+                        term_index,
+                        factors,
+                        path_cost,
+                        search_stats,
+                        stats.nslices,
+                        stats.assignment,
+                        [item.local_slices for item in gathered_stats],
+                        search_seconds,
+                        contract_seconds,
+                    )
+                )
+                total_value += coeff * complex(np.asarray(value).reshape(-1)[0])
+
+        if self.rank == 0 and save_tree_path and loaded_trees is None:
+            path = Path(save_tree_path)
+            path.parent.mkdir(parents=True, exist_ok=True)
+            with path.open("wb") as f:
+                pickle.dump(
+                    {
+                        "trees": saved_trees,
+                        "costs": saved_costs,
+                        "nterms": len(saved_trees),
+                    },
+                    f,
+                    protocol=pickle.HIGHEST_PROTOCOL,
+                )
+            if print_stats:
+                print(
+                    f"tn_tree_saved path={save_tree_path} nterms={len(saved_trees)}",
+                    flush=True,
+                )
+
+        if search_only:
+            return np.nan
+
+        return np.nan if rank != 0 else float(np.real(total_value))
+
+    def _contract_implementation(self, backend):
+        implementation = self.parallel_opts.get("contract_implementation")
+        if implementation is None and backend.backend == "torch":
+            return "autoray"
+        return implementation
+
+    def _contract_term_unsliced(self, tn, tree, backend):
+        contract_implementation = self._contract_implementation(backend)
+        if contract_implementation == "cpp":
+            if backend.backend != "torch":
+                raise ValueError("contract_implementation='cpp' requires torch backend.")
+            from qibotn.torch_contractor import contract_tree_cpp
+
+            arrays = arrays_to_backend(tn.arrays, "torch", dtype=self.dtype)
+            nslices = int(getattr(tree, "multiplicity", 1))
+            if nslices > 1:
+                total = None
+                for slice_id in range(nslices):
+                    value = contract_tree_cpp(tree, tree.slice_arrays(arrays, slice_id))
+                    total = value if total is None else total + value
+                return total
+            return contract_tree_cpp(tree, arrays)
+
+        if backend.backend == "torch":
+            for tensor in tn.tensors:
+                tensor._data = torch_cpu_array(
+                    tensor._data,
+                    dtype=torch_dtype(self.dtype),
+                )
+            return tn.contract(
+                all,
+                output_inds=(),
+                optimize=tree,
+                backend="torch",
+                implementation=contract_implementation,
+            )
+
+        return tn.contract(
+            all,
+            output_inds=(),
+            optimize=tree,
+            backend=backend.backend,
+            implementation=contract_implementation,
+        )
+
+    def _mpi_slicing_opts(self, user_slicing_opts):
+        return None if user_slicing_opts is None else dict(user_slicing_opts)
+
+    def _term_arrays(self, tn, backend):
+        return arrays_to_backend(
+            tn.arrays,
+            backend.backend,
+            engine=backend.engine,
+            dtype=self.dtype,
+        )
--- a/src/qibotn/backends/cutensornet_helpers.py
+++ b/src/qibotn/backends/cutensornet_helpers.py
@@ -0,0 +1,321 @@
+"""cuTensorNet circuit and MPS conversion helpers."""
+
+from __future__ import annotations
+
+import numpy as np
+
+try:
+    import cupy as cp
+    import cuquantum.bindings.cutensornet as cutn
+    from cuquantum.tensornet import contract, contract_path
+    from cuquantum.tensornet.experimental import contract_decompose
+except ImportError:  # pragma: no cover - exercised on CPU-only installations
+    cp = None
+    cutn = None
+    contract = None
+    contract_path = None
+    contract_decompose = None
+
+
+def _require_cupy():
+    if cp is None:
+        raise ImportError(
+            "The cuQuantum circuit converter requires cupy. "
+            "Install the GPU dependencies or use the CPU backend."
+        )
+    return cp
+
+
+def _require_cutensornet():
+    if cp is None or cutn is None:
+        raise ImportError(
+            "The cuQuantum MPS converter requires cupy and cuquantum. "
+            "Install the GPU dependencies or use the CPU backend."
+        )
+
+
+def _require_tensornet_mps():
+    if cp is None or contract is None or contract_decompose is None:
+        raise ImportError(
+            "The cuQuantum MPS helpers require cupy and cuquantum. "
+            "Install the GPU dependencies or use the CPU backend."
+        )
+
+
+def _require_contract():
+    if contract is None or contract_path is None:
+        raise ImportError(
+            "The cuQuantum MPS contraction helper requires cuquantum. "
+            "Install the GPU dependencies or use the CPU backend."
+        )
+
+
+class QiboCircuitToEinsum:
+    """Convert a Qibo circuit to cuQuantum interleaved TN operands."""
+
+    def __init__(self, circuit, dtype="complex128"):
+        self.backend = _require_cupy()
+        self.dtype = getattr(self.backend, dtype)
+        self.init_basis_map(self.backend, dtype)
+        self.init_intermediate_circuit(circuit)
+        self.circuit = circuit
+
+    def state_vector_operands(self):
+        input_bitstring = "0" * len(self.active_qubits)
+        input_operands = self._get_bitstring_tensors(input_bitstring)
+        mode_labels, qubits_frontier, next_frontier = self._init_mode_labels_from_qubits(
+            self.active_qubits
+        )
+        gate_mode_labels, gate_operands = self._parse_gates_to_mode_labels_operands(
+            self.gate_tensors, qubits_frontier, next_frontier
+        )
+        operands = input_operands + gate_operands
+        mode_labels += gate_mode_labels
+        out_list = [qubits_frontier[key] for key in qubits_frontier]
+        operand_exp_interleave = [x for y in zip(operands, mode_labels) for x in y]
+        operand_exp_interleave.append(out_list)
+        return operand_exp_interleave
+
+    def _init_mode_labels_from_qubits(self, qubits):
+        nqubits = len(qubits)
+        frontier_dict = {q: i for i, q in enumerate(qubits)}
+        mode_labels = [[i] for i in range(nqubits)]
+        return mode_labels, frontier_dict, nqubits
+
+    def _get_bitstring_tensors(self, bitstring):
+        return [self.basis_map[ibit] for ibit in bitstring]
+
+    def _parse_gates_to_mode_labels_operands(self, gates, qubits_frontier, next_frontier):
+        mode_labels = []
+        operands = []
+        for tensor, gate_qubits in gates:
+            operands.append(tensor)
+            input_mode_labels = []
+            output_mode_labels = []
+            for qubit in gate_qubits:
+                input_mode_labels.append(qubits_frontier[qubit])
+                output_mode_labels.append(next_frontier)
+                qubits_frontier[qubit] = next_frontier
+                next_frontier += 1
+            mode_labels.append(output_mode_labels + input_mode_labels)
+        return mode_labels, operands
+
+    def op_shape_from_qubits(self, nqubits):
+        return (2, 2) * nqubits
+
+    def init_intermediate_circuit(self, circuit):
+        self.gate_tensors = []
+        gates_qubits = []
+        for gate in circuit.queue:
+            gate_qubits = gate.control_qubits + gate.target_qubits
+            gates_qubits.extend(gate_qubits)
+            required_shape = self.op_shape_from_qubits(len(gate_qubits))
+            self.gate_tensors.append(
+                (
+                    self.backend.asarray(gate.matrix(), dtype=self.dtype).reshape(
+                        required_shape
+                    ),
+                    gate_qubits,
+                )
+            )
+        self.active_qubits = np.unique(gates_qubits)
+
+    def init_basis_map(self, backend, dtype):
+        asarray = backend.asarray
+        self.basis_map = {
+            "0": asarray([1, 0], dtype=dtype),
+            "1": asarray([0, 1], dtype=dtype),
+        }
+
+    def init_inverse_circuit(self, circuit):
+        self.gate_tensors_inverse = []
+        gates_qubits_inverse = []
+        for gate in circuit.queue:
+            gate_qubits = gate.control_qubits + gate.target_qubits
+            gates_qubits_inverse.extend(gate_qubits)
+            required_shape = self.op_shape_from_qubits(len(gate_qubits))
+            self.gate_tensors_inverse.append(
+                (self.backend.asarray(gate.matrix()).reshape(required_shape), gate_qubits)
+            )
+        self.active_qubits_inverse = np.unique(gates_qubits_inverse)
+
+    def get_pauli_gates(self, pauli_map, dtype="complex128", backend=None):
+        if backend is None:
+            backend = _require_cupy()
+        asarray = backend.asarray
+        operand_map = {
+            "I": asarray([[1, 0], [0, 1]], dtype=dtype),
+            "X": asarray([[0, 1], [1, 0]], dtype=dtype),
+            "Y": asarray([[0, -1j], [1j, 0]], dtype=dtype),
+            "Z": asarray([[1, 0], [0, -1]], dtype=dtype),
+        }
+        gates = []
+        for qubit, pauli_char in pauli_map.items():
+            operand = operand_map.get(pauli_char)
+            if operand is None:
+                raise ValueError("pauli string character must be one of I/X/Y/Z")
+            gates.append((operand, (qubit,)))
+        return gates
+
+    def expectation_operands(self, ham_gates):
+        input_bitstring = "0" * self.circuit.nqubits
+        input_operands = self._get_bitstring_tensors(input_bitstring)
+        mode_labels, qubits_frontier, next_frontier = self._init_mode_labels_from_qubits(
+            range(self.circuit.nqubits)
+        )
+        gate_mode_labels, gate_operands = self._parse_gates_to_mode_labels_operands(
+            self.gate_tensors, qubits_frontier, next_frontier
+        )
+        operands = input_operands + gate_operands
+        mode_labels += gate_mode_labels
+
+        self.init_inverse_circuit(self.circuit.invert())
+        next_frontier = max(qubits_frontier.values()) + 1
+        gates_inverse = ham_gates + self.gate_tensors_inverse
+        gate_mode_labels_inverse, gate_operands_inverse = (
+            self._parse_gates_to_mode_labels_operands(
+                gates_inverse, qubits_frontier, next_frontier
+            )
+        )
+        mode_labels = (
+            mode_labels
+            + gate_mode_labels_inverse
+            + [[qubits_frontier[ix]] for ix in range(self.circuit.nqubits)]
+        )
+        operands = operands + gate_operands_inverse + operands[: self.circuit.nqubits]
+        operand_exp_interleave = [x for y in zip(operands, mode_labels) for x in y]
+        operand_exp_interleave.append([])
+        return operand_exp_interleave
+
+
+def initial_mps(num_qubits, dtype):
+    _require_tensornet_mps()
+    state_tensor = cp.asarray([1, 0], dtype=dtype).reshape(1, 2, 1)
+    return [state_tensor] * num_qubits
+
+
+def mps_site_right_swap(mps_tensors, i, **kwargs):
+    _require_tensornet_mps()
+    left, _, right = contract_decompose(
+        "ipj,jqk->iqj,jpk",
+        *mps_tensors[i : i + 2],
+        algorithm=kwargs.get("algorithm", None),
+        options=kwargs.get("options", None),
+    )
+    mps_tensors[i : i + 2] = (left, right)
+    return mps_tensors
+
+
+def apply_mps_gate(mps_tensors, gate, qubits, **kwargs):
+    _require_tensornet_mps()
+    n_qubits = len(qubits)
+    if n_qubits == 1:
+        site = qubits[0]
+        mps_tensors[site] = contract(
+            "ipj,qp->iqj",
+            mps_tensors[site],
+            gate,
+            options=kwargs.get("options", None),
+        )
+    elif n_qubits == 2:
+        left, right = qubits
+        if left > right:
+            return apply_mps_gate(
+                mps_tensors, gate.transpose(1, 0, 3, 2), (right, left), **kwargs
+            )
+        if left + 1 == right:
+            a_tensor, _, b_tensor = contract_decompose(
+                "ipj,jqk,rspq->irj,jsk",
+                *mps_tensors[left : left + 2],
+                gate,
+                algorithm=kwargs.get("algorithm", None),
+                options=kwargs.get("options", None),
+            )
+            mps_tensors[left : left + 2] = (a_tensor, b_tensor)
+        else:
+            mps_site_right_swap(mps_tensors, left, **kwargs)
+            apply_mps_gate(mps_tensors, gate, (left + 1, right), **kwargs)
+            mps_site_right_swap(mps_tensors, left, **kwargs)
+    else:
+        raise NotImplementedError("Only one- and two-qubit gates supported")
+
+
+class QiboCircuitToMPS:
+    """Convert a Qibo circuit to a cuTensorNet MPS representation."""
+
+    def __init__(self, circ_qibo, gate_algo, dtype="complex128", rand_seed=0):
+        _require_cutensornet()
+        np.random.seed(rand_seed)
+        cp.random.seed(rand_seed)
+        self.num_qubits = circ_qibo.nqubits
+        self.handle = cutn.create()
+        self.dtype = dtype
+        self.mps_tensors = initial_mps(self.num_qubits, dtype=dtype)
+        circuitconvertor = QiboCircuitToEinsum(circ_qibo, dtype=dtype)
+        for gate, qubits in circuitconvertor.gate_tensors:
+            apply_mps_gate(
+                self.mps_tensors,
+                gate,
+                qubits,
+                algorithm=gate_algo,
+                options={"handle": self.handle},
+            )
+
+    def __del__(self):
+        handle = getattr(self, "handle", None)
+        if cutn is not None and handle is not None:
+            cutn.destroy(handle)
+
+
+class MPSContractionHelper:
+    """Contract cuTensorNet MPS tensors to norms, states, or expectations."""
+
+    def __init__(self, num_qubits):
+        self.num_qubits = num_qubits
+        self.bra_modes = [(2 * i, 2 * i + 1, 2 * i + 2) for i in range(num_qubits)]
+        offset = 2 * num_qubits + 1
+        self.ket_modes = [
+            (i + offset, 2 * i + 1, i + 1 + offset) for i in range(num_qubits)
+        ]
+
+    def contract_norm(self, mps_tensors, options=None):
+        interleaved_inputs = []
+        for i, tensor in enumerate(mps_tensors):
+            interleaved_inputs.extend(
+                [tensor, self.bra_modes[i], tensor.conj(), self.ket_modes[i]]
+            )
+        interleaved_inputs.append([])
+        return self._contract(interleaved_inputs, options=options).real
+
+    def contract_state_vector(self, mps_tensors, options=None):
+        interleaved_inputs = []
+        for i, tensor in enumerate(mps_tensors):
+            interleaved_inputs.extend([tensor, self.bra_modes[i]])
+        output_modes = tuple([bra_modes[1] for bra_modes in self.bra_modes])
+        interleaved_inputs.append(output_modes)
+        return self._contract(interleaved_inputs, options=options)
+
+    def contract_expectation(
+        self, mps_tensors, operator, qubits, options=None, normalize=False
+    ):
+        interleaved_inputs = []
+        extra_mode = 3 * self.num_qubits + 2
+        operator_modes = [None] * len(qubits) + [self.bra_modes[q][1] for q in qubits]
+        qubits = list(qubits)
+        for i, tensor in enumerate(mps_tensors):
+            interleaved_inputs.extend([tensor, self.bra_modes[i]])
+            ket_modes = self.ket_modes[i]
+            if i in qubits:
+                ket_modes = (ket_modes[0], extra_mode, ket_modes[2])
+                operator_modes[qubits.index(i)] = extra_mode
+                extra_mode += 1
+            interleaved_inputs.extend([tensor.conj(), ket_modes])
+        interleaved_inputs.extend([operator, tuple(operator_modes)])
+        interleaved_inputs.append([])
+        norm = self.contract_norm(mps_tensors, options=options) if normalize else 1
+        return self._contract(interleaved_inputs, options=options) / norm
+
+    def _contract(self, interleaved_inputs, options=None):
+        _require_contract()
+        path = contract_path(*interleaved_inputs, options=options)[0]
+        return contract(*interleaved_inputs, options=options, optimize={"path": path})
--- a/src/qibotn/backends/qmatchatea.py
+++ b/src/qibotn/backends/qmatchatea.py
@@ -1,6 +1,9 @@
 """Implementation of Quantum Matcha Tea backend."""

+from __future__ import annotations
+
 import re
+import time
 from dataclasses import dataclass

 import numpy as np
@@ -9,8 +12,11 @@ import qmatchatea
 import qtealeaves
 from qibo.backends import NumpyBackend
 from qibo.config import raise_error
+from qmatchatea.utils import MPISettings

 from qibotn.backends.abstract import QibotnBackend
+from qibotn.benchmark_cases import exact_pauli_sum
+from qibotn.observables import check_observable
 from qibotn.result import TensorNetworkResult


@@ -38,6 +44,14 @@ class QMatchaTeaBackend(QibotnBackend, NumpyBackend):
        trunc_tracking_mode: str = "C",
        svd_control: str = "A",
        ini_bond_dimension: int = 1,
+        tensor_module: str = "numpy",
+        compile_circuit: bool = False,
+        cache_gate_tensors: bool = True,
+        track_memory: bool = False,
+        mpi_approach: str = "SR",
+        mpi_num_procs: int = 1,
+        mpi_where_barriers: int = -1,
+        mpi_isometrization: int = -1,
    ):
        """Configure TN simulation given Quantum Matcha Tea interface.

@@ -75,6 +89,18 @@ class QMatchaTeaBackend(QibotnBackend, NumpyBackend):
            ini_bond_dimension=ini_bond_dimension,
        )
        self.ansatz = ansatz
+        self.tensor_module = tensor_module
+        self.compile_circuit = compile_circuit
+        self.cache_gate_tensors = cache_gate_tensors
+        self.track_memory = track_memory
+        self.mpi_settings = MPISettings(
+            mpi_approach=mpi_approach,
+            num_procs=mpi_num_procs,
+            where_barriers=mpi_where_barriers,
+            isometrization=mpi_isometrization,
+        )
+        if hasattr(self, "qmatchatea_backend"):
+            self._setup_backend_specifics()

    def _setup_backend_specifics(self):
        """Configure qmatchatea QCBackend object."""
@@ -88,12 +114,15 @@ class QMatchaTeaBackend(QibotnBackend, NumpyBackend):
            else "Z" if self.precision == "double" else "A"
        )

-        # TODO: once MPI is available for Python, integrate it here
        self.qmatchatea_backend = qmatchatea.QCBackend(
            precision=qmatchatea_precision,
            device=qmatchatea_device,
            ansatz=self.ansatz,
+            tensor_module=self.tensor_module,
+            mpi_settings=self.mpi_settings,
        )
+        self.qmatchatea_backend.cache_gate_tensors = self.cache_gate_tensors
+        self.qmatchatea_backend.track_memory = self.track_memory

    def execute_circuit(
        self,
@@ -193,7 +222,7 @@ class QMatchaTeaBackend(QibotnBackend, NumpyBackend):
            statevector=statevector,
        )

-    def expectation(self, circuit, observable):
+    def expectation(self, circuit, observable, preprocess=True, compile_circuit=None):
        """Compute the expectation value of a Qibo-friendly ``observable`` on
        the Tensor Network constructed from a Qibo ``circuit``.

@@ -216,8 +245,14 @@ class QMatchaTeaBackend(QibotnBackend, NumpyBackend):
            simulation setup.
        """

+        observable = check_observable(observable, circuit.nqubits)
+
        # From Qibo to Qiskit
-        circuit = self._qibocirc_to_qiskitcirc(circuit)
+        circuit = self._qibocirc_to_qiskitcirc(
+            circuit,
+            preprocess=preprocess,
+            compile_circuit=compile_circuit,
+        )
        run_qk_params = qmatchatea.preprocessing.qk_transpilation_params(False)

        operators = qmatchatea.QCOperators()
@@ -234,19 +269,37 @@ class QMatchaTeaBackend(QibotnBackend, NumpyBackend):
            operators=operators,
        )

+        if self.qmatchatea_backend.mpi_approach != "SR":
+            from qtealeaves.tooling.mpisupport import MPI
+
+            if MPI is not None and MPI.COMM_WORLD.Get_rank() != 0:
+                return np.nan
+
        return np.real(results.observables["custom_hamiltonian"])

-    def _qibocirc_to_qiskitcirc(self, qibo_circuit) -> qiskit.QuantumCircuit:
+    def _qibocirc_to_qiskitcirc(
+        self, qibo_circuit, preprocess=True, compile_circuit=None
+    ) -> qiskit.QuantumCircuit:
        """Convert a Qibo Circuit into a Qiskit Circuit."""
        # Convert the circuit to QASM 2.0 to qiskit
        qasm_circuit = qibo_circuit.to_qasm()
        qiskit_circuit = qiskit.QuantumCircuit.from_qasm_str(qasm_circuit)

+        if compile_circuit is None:
+            compile_circuit = self.compile_circuit
+
+        if not preprocess:
+            if compile_circuit:
+                qiskit_circuit = qmatchatea.tensor_compiler(qiskit_circuit)
+            return qiskit_circuit
+
        # Transpile the circuit to adapt it to the linear structure of the MPS,
        # with the constraint of having only the gates basis_gates
        qiskit_circuit = qmatchatea.preprocessing.preprocess(
            qiskit_circuit,
-            qk_params=qmatchatea.preprocessing.qk_transpilation_params(),
+            qk_params=qmatchatea.preprocessing.qk_transpilation_params(
+                tensor_compiler=compile_circuit
+            ),
        )
        return qiskit_circuit

@@ -315,3 +368,207 @@ class QMatchaTeaBackend(QibotnBackend, NumpyBackend):
            use_itpo=False,
        )
        return obs_sum
+
+
+@dataclass(frozen=True)
+class QMatchaTeaExpectationResult:
+    value: float
+    seconds: float
+    backend: object
+
+
+@dataclass(frozen=True)
+class QMatchaTeaBuildResult:
+    backend: object
+    build_seconds: float
+
+
+@dataclass(frozen=True)
+class QMatchaTeaSvdControlResult:
+    ctrl: str
+    contract_singvals: str
+    status: str
+    median_ms: float
+    min_ms: float
+    rel_error: float | None
+    kept: int | None
+    error: str
+
+
+def make_qmatchatea_backend(
+    *,
+    bond=10,
+    cut_ratio=1e-9,
+    tensor_module="torch",
+    svd_control="E!",
+    compile_circuit=True,
+    track_memory=False,
+    mpi_approach="SR",
+    mpi_num_procs=1,
+    mpi_where_barriers=-1,
+    mpi_isometrization=-1,
+):
+    backend = QMatchaTeaBackend()
+    backend.configure_tn_simulation(
+        ansatz="MPS",
+        max_bond_dimension=bond,
+        cut_ratio=cut_ratio,
+        svd_control=svd_control,
+        tensor_module=tensor_module,
+        compile_circuit=compile_circuit,
+        track_memory=track_memory,
+        mpi_approach=mpi_approach,
+        mpi_num_procs=mpi_num_procs,
+        mpi_where_barriers=mpi_where_barriers,
+        mpi_isometrization=mpi_isometrization,
+    )
+    return backend
+
+
+def build_qmatchatea_backend(
+    *,
+    bond=10,
+    cut_ratio=1e-9,
+    tensor_module="torch",
+    svd_control="E!",
+    compile_circuit=True,
+    track_memory=False,
+    mpi_approach="SR",
+    mpi_num_procs=1,
+    mpi_where_barriers=-1,
+    mpi_isometrization=-1,
+):
+    start = time.perf_counter()
+    backend = make_qmatchatea_backend(
+        bond=bond,
+        cut_ratio=cut_ratio,
+        tensor_module=tensor_module,
+        svd_control=svd_control,
+        compile_circuit=compile_circuit,
+        track_memory=track_memory,
+        mpi_approach=mpi_approach,
+        mpi_num_procs=mpi_num_procs,
+        mpi_where_barriers=mpi_where_barriers,
+        mpi_isometrization=mpi_isometrization,
+    )
+    return QMatchaTeaBuildResult(backend=backend, build_seconds=time.perf_counter() - start)
+
+
+def exact_mps_expectation(circuit, observable, nqubits):
+    if isinstance(observable, dict) and "terms" in observable:
+        terms = [
+            (
+                term["coefficient"],
+                tuple((name, site) for name, site in term["operators"]),
+            )
+            for term in observable["terms"]
+        ]
+        return exact_pauli_sum(circuit, terms, nqubits)
+
+    hamiltonian = check_observable(observable, nqubits)
+    return float(hamiltonian.expectation_from_state(circuit().state(numpy=True)).real)
+
+
+def run_qmatchatea_expectation(
+    circuit,
+    observable,
+    *,
+    bond=10,
+    cut_ratio=1e-9,
+    tensor_module="torch",
+    svd_control="E!",
+    compile_circuit=True,
+    preprocess=True,
+    track_memory=False,
+    mpi_approach="SR",
+    mpi_num_procs=1,
+    mpi_where_barriers=-1,
+    mpi_isometrization=-1,
+):
+    built = build_qmatchatea_backend(
+        bond=bond,
+        cut_ratio=cut_ratio,
+        tensor_module=tensor_module,
+        svd_control=svd_control,
+        compile_circuit=compile_circuit,
+        track_memory=track_memory,
+        mpi_approach=mpi_approach,
+        mpi_num_procs=mpi_num_procs,
+        mpi_where_barriers=mpi_where_barriers,
+        mpi_isometrization=mpi_isometrization,
+    )
+    start = time.perf_counter()
+    value = built.backend.expectation(
+        circuit,
+        observable,
+        preprocess=preprocess,
+        compile_circuit=compile_circuit,
+    )
+    return QMatchaTeaExpectationResult(
+        value=float(np.real(value)),
+        seconds=time.perf_counter() - start,
+        backend=built.backend,
+    )
+
+
+def benchmark_qmatchatea_svd_control(matrix, *, ctrl, max_bond, contract_singvals, repeats):
+    import gc
+    import statistics
+
+    import torch
+
+    from qredtea.torchapi import QteaTorchTensor
+
+    conv = qmatchatea.QCConvergenceParameters(
+        max_bond_dimension=max_bond,
+        cut_ratio=0.0,
+        svd_ctrl=ctrl,
+    )
+    qtensor = QteaTorchTensor.from_elem_array(matrix, dtype=matrix.dtype, device="cpu")
+
+    times = []
+    rel_error = None
+    kept = None
+    status = "ok"
+    error = ""
+
+    for i in range(repeats):
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+        t0 = time.perf_counter()
+        try:
+            left, right, singvals, _ = qtensor.split_svd(
+                [0],
+                [1],
+                contract_singvals=contract_singvals,
+                conv_params=conv,
+            )
+        except Exception as exc:  # noqa: BLE001
+            status = "error"
+            error = repr(exc)
+            break
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+        times.append(time.perf_counter() - t0)
+
+        if i == repeats - 1:
+            left_matrix = left.elem.reshape(matrix.shape[0], -1)
+            right_matrix = right.elem.reshape(-1, matrix.shape[1])
+            recon = left_matrix @ right_matrix
+            rel_error = (
+                torch.linalg.vector_norm(matrix - recon)
+                / torch.linalg.vector_norm(matrix)
+            ).item()
+            kept = int(singvals.numel())
+
+    return QMatchaTeaSvdControlResult(
+        ctrl=ctrl,
+        contract_singvals=contract_singvals,
+        status=status,
+        median_ms=float("nan") if not times else statistics.median(times) * 1000,
+        min_ms=float("nan") if not times else min(times) * 1000,
+        rel_error=rel_error,
+        kept=kept,
+        error=error,
+    )
--- a/src/qibotn/backends/quimb.py
+++ b/src/qibotn/backends/quimb.py
--- a/src/qibotn/backends/vidal.py
+++ b/src/qibotn/backends/vidal.py
@@ -0,0 +1,986 @@
+"""Vidal/TEBD fast-path backend with qmatchatea fallback.
+
+This backend targets MPS-friendly one-dimensional circuits: one-qubit gates and
+adjacent two-qubit gates, measured with Pauli-sum expectation values. Unsupported
+features fall back to the qmatchatea backend so the public behavior remains
+usable while the fast path is expanded.
+"""
+
+from __future__ import annotations
+
+import re
+import time
+from dataclasses import dataclass
+
+import numpy as np
+from qibo.backends import NumpyBackend
+
+from qibotn.backends.abstract import QibotnBackend
+from qibotn.backends.qmatchatea import QMatchaTeaBackend
+from qibotn.backends.vidal_mpi_segment import SegmentVidalMPIExecutor
+from qibotn.backends.vidal_tebd import VidalTEBDExecutor, _gate_sites
+from qibotn.observables import check_observable
+
+
+def _symbolic_hamiltonian_to_pauli_terms(hamiltonian):
+    terms = []
+    factor_pattern = re.compile(r"([^\d]+)(\d+)")
+    for term in hamiltonian.terms:
+        ops = []
+        for factor in term.factors:
+            match = factor_pattern.match(str(factor))
+            if match is None:
+                raise ValueError(f"Unsupported observable factor {factor!r}.")
+            name = match.group(1).upper()
+            if name not in ("I", "X", "Y", "Z"):
+                raise ValueError(f"Unsupported observable operator {name!r}.")
+            if name != "I":
+                ops.append((name, int(match.group(2))))
+        terms.append((complex(term.coefficient), tuple(ops)))
+    return terms
+
+
+def _symbolic_hamiltonian_to_operator_terms(hamiltonian):
+    terms = []
+    factor_pattern = re.compile(r"([^\d]+)(\d+)")
+    paulis = {
+        "I": np.eye(2, dtype=np.complex128),
+        "X": np.array([[0, 1], [1, 0]], dtype=np.complex128),
+        "Y": np.array([[0, -1j], [1j, 0]], dtype=np.complex128),
+        "Z": np.array([[1, 0], [0, -1]], dtype=np.complex128),
+    }
+    for term in hamiltonian.terms:
+        ops_by_site = {}
+        for factor in term.factors:
+            site = getattr(factor, "target_qubit", None)
+            matrix = getattr(factor, "matrix", None)
+            if site is None or matrix is None:
+                match = factor_pattern.match(str(factor))
+                if match is None:
+                    raise ValueError(f"Unsupported observable factor {factor!r}.")
+                name = match.group(1).upper()
+                if name not in paulis:
+                    raise ValueError(f"Unsupported observable operator {name!r}.")
+                site = int(match.group(2))
+                matrix = paulis[name]
+            matrix = np.asarray(matrix, dtype=np.complex128)
+            site = int(site)
+            if site in ops_by_site:
+                ops_by_site[site] = ops_by_site[site] @ matrix
+            else:
+                ops_by_site[site] = matrix
+        terms.append((complex(term.coefficient), tuple(ops_by_site.items())))
+    return terms
+
+
+def _dense_operator_to_product_terms(coeff, qubits, matrix):
+    """Expand a dense k-local operator into product-matrix terms.
+
+    The dense matrix basis is ordered by the provided ``qubits`` sequence.  For
+    example, ``qubits=[2, 5]`` means matrix rows/columns are ordered as
+    ``|q2 q5>``.
+    """
+    qubits = tuple(int(qubit) for qubit in qubits)
+    if len(set(qubits)) != len(qubits):
+        raise ValueError("Dense observable qubits must be unique.")
+    matrix = np.asarray(matrix, dtype=np.complex128)
+    dim = 2 ** len(qubits)
+    if matrix.shape != (dim, dim):
+        raise ValueError(
+            "Dense observable matrix shape must be "
+            f"({dim}, {dim}) for {len(qubits)} qubits."
+        )
+
+    units = [
+        np.array([[1, 0], [0, 0]], dtype=np.complex128),
+        np.array([[0, 1], [0, 0]], dtype=np.complex128),
+        np.array([[0, 0], [1, 0]], dtype=np.complex128),
+        np.array([[0, 0], [0, 1]], dtype=np.complex128),
+    ]
+    terms = []
+    for row in range(dim):
+        for col in range(dim):
+            value = complex(coeff) * complex(matrix[row, col])
+            if value == 0:
+                continue
+            ops = []
+            for offset, site in enumerate(qubits):
+                shift = len(qubits) - offset - 1
+                out_bit = (row >> shift) & 1
+                in_bit = (col >> shift) & 1
+                ops.append((site, units[2 * out_bit + in_bit]))
+            terms.append((value, tuple(ops)))
+    return terms
+
+
+def _dense_observable_to_operator_terms(observable):
+    if not isinstance(observable, dict):
+        return None
+
+    if "matrix" in observable:
+        terms = [observable]
+    else:
+        terms = observable.get("dense_terms")
+        if terms is None:
+            raw_terms = observable.get("terms")
+            if not raw_terms or not any("matrix" in term for term in raw_terms):
+                return None
+            terms = raw_terms
+
+    operator_terms = []
+    for term in terms:
+        if "matrix" not in term:
+            raise ValueError("Dense observable terms must include a matrix.")
+        qubits = term.get("qubits", term.get("sites"))
+        if qubits is None:
+            raise ValueError("Dense observable terms must include qubits or sites.")
+        operator_terms.extend(
+            _dense_operator_to_product_terms(
+                term.get("coefficient", 1.0),
+                qubits,
+                term["matrix"],
+            )
+        )
+    return operator_terms
+
+
+def _operator_terms_to_mpo(terms, nqubits):
+    """Build an exact direct-sum MPO for product-operator terms.
+
+    This intentionally favors correctness and generality over compression: an
+    ``m``-term sum becomes an MPO with bond dimension ``m``.  Local Hamiltonians
+    can be compressed later without changing the public expectation path.
+    """
+    identity = np.eye(2, dtype=np.complex128)
+    expanded_terms = []
+    for coeff, ops in terms:
+        local_ops = [identity for _ in range(nqubits)]
+        for site, matrix in ops:
+            site = int(site)
+            if site < 0 or site >= nqubits:
+                raise ValueError(f"Observable site {site} is outside the circuit.")
+            matrix = np.asarray(matrix, dtype=np.complex128)
+            if matrix.shape != (2, 2):
+                raise ValueError("Only qubit local operators with shape (2, 2) are supported.")
+            local_ops[site] = matrix
+        expanded_terms.append((complex(coeff), local_ops))
+
+    if not expanded_terms:
+        raise ValueError("Cannot build an MPO from an empty observable.")
+
+    bond_dim = len(expanded_terms)
+    mpo = []
+    for site in range(nqubits):
+        left_dim = 1 if site == 0 else bond_dim
+        right_dim = 1 if site == nqubits - 1 else bond_dim
+        tensor = np.zeros((left_dim, 2, 2, right_dim), dtype=np.complex128)
+        for term_index, (coeff, local_ops) in enumerate(expanded_terms):
+            left = 0 if site == 0 else term_index
+            right = 0 if site == nqubits - 1 else term_index
+            op = coeff * local_ops[site] if site == 0 else local_ops[site]
+            tensor[left, :, :, right] += op
+        mpo.append(tensor)
+    return mpo
+
+
+def _observable_mpo_tensors(observable, nqubits=None):
+    if isinstance(observable, dict):
+        if "mpo_tensors" in observable:
+            return observable["mpo_tensors"]
+        if "mpo" in observable:
+            return observable["mpo"]
+        if nqubits is not None:
+            terms = _dense_observable_to_operator_terms(observable)
+            if terms is not None:
+                return _operator_terms_to_mpo(terms, nqubits)
+    return None
+
+
+def _unsupported_reason(circuit):
+    for gate in circuit.queue:
+        name = getattr(gate, "name", gate.__class__.__name__)
+        sites = _gate_sites(gate)
+        if not sites:
+            return f"gate {name} has no target qubits"
+        if len(sites) > 2:
+            return f"gate {name} acts on {len(sites)} qubits"
+        if len(sites) == 2 and abs(sites[0] - sites[1]) != 1:
+            return f"gate {name} is non-adjacent on qubits {sites}"
+        if not hasattr(gate, "matrix"):
+            return f"gate {name} does not expose a matrix"
+    return None
+
+
+def _can_route_non_adjacent(circuit):
+    """True if the circuit's only unsupported feature is non-adjacent 2Q gates.
+
+    SWAP routing can fix non-adjacent gates at compile time.  Multi-qubit
+    gates and matrix-less gates are truly unsupported.
+    """
+    for gate in circuit.queue:
+        sites = _gate_sites(gate)
+        if not sites:
+            return False
+        if len(sites) > 2:
+            return False
+        if not hasattr(gate, "matrix"):
+            return False
+    return True
+
+
+@dataclass
+class _PreparedCircuit:
+    nqubits: int
+    queue: list
+
+
+def _decompose_gate_for_mps(gate, nqubits, stack=()):
+    sites = _gate_sites(gate)
+    if len(sites) <= 2:
+        return [gate]
+    if gate in stack or not hasattr(gate, "decompose"):
+        name = getattr(gate, "name", gate.__class__.__name__)
+        raise ValueError(f"gate {name} acts on {len(sites)} qubits")
+
+    free = [qubit for qubit in range(nqubits) if qubit not in sites]
+    try:
+        decomposed = gate.decompose(*free, use_toffolis=False, method="standard")
+    except TypeError:
+        decomposed = gate.decompose(*free)
+    if not decomposed or decomposed == [gate]:
+        name = getattr(gate, "name", gate.__class__.__name__)
+        raise ValueError(f"gate {name} could not be decomposed for Vidal MPS")
+
+    result = []
+    for item in decomposed:
+        result.extend(_decompose_gate_for_mps(item, nqubits, stack + (gate,)))
+    return result
+
+
+def _prepare_circuit_for_mps(circuit, decompose=True):
+    if not decompose:
+        return circuit
+    queue = []
+    for gate in circuit.queue:
+        queue.extend(_decompose_gate_for_mps(gate, circuit.nqubits))
+    return _PreparedCircuit(nqubits=circuit.nqubits, queue=queue)
+
+
+@dataclass
+class VidalBackend(QibotnBackend, NumpyBackend):
+    """QiboTN backend using Vidal/TEBD when possible.
+
+    The fast path supports:
+    - one-qubit gates with ``gate.matrix()``;
+    - adjacent two-qubit gates with ``gate.matrix()``;
+    - Qibo ``SymbolicHamiltonian`` / qibotn dict Pauli-sum expectation values;
+    - MPI chain segmentation through ``mpi_approach="CT"``.
+
+    Unsupported operations are delegated to qmatchatea.
+    """
+
+    def __init__(self):
+        super().__init__()
+        self.name = "qibotn"
+        self.platform = "vidal"
+        self.precision = "double"
+        self.rank = 0
+        self.last_truncation_error = 0.0
+        self.last_max_truncation_error = 0.0
+        self.configure_tn_simulation()
+
+    def configure_tn_simulation(
+        self,
+        ansatz: str = "MPS",
+        max_bond_dimension: int | None = 10,
+        cut_ratio: float | None = 1e-9,
+        trunc_tracking_mode: str = "C",
+        svd_control: str = "E!",
+        ini_bond_dimension: int = 1,
+        tensor_module: str = "torch",
+        compile_circuit: bool = False,
+        cache_gate_tensors: bool = True,
+        track_memory: bool = False,
+        mpi_approach: str = "SR",
+        mpi_num_procs: int = 1,
+        mpi_where_barriers: int = -1,
+        mpi_isometrization: int = -1,
+        mpi_term_batch_size: int | None = None,
+        fallback: bool = True,
+    ):
+        self.ansatz = ansatz
+        self.max_bond_dimension = max_bond_dimension
+        self.cut_ratio = cut_ratio
+        self.trunc_tracking_mode = trunc_tracking_mode
+        self.svd_control = svd_control
+        self.ini_bond_dimension = ini_bond_dimension
+        self.tensor_module = tensor_module
+        self.compile_circuit = compile_circuit
+        self.cache_gate_tensors = cache_gate_tensors
+        self.track_memory = track_memory
+        self.mpi_approach = mpi_approach.upper()
+        self.mpi_num_procs = mpi_num_procs
+        self.mpi_where_barriers = mpi_where_barriers
+        self.mpi_isometrization = mpi_isometrization
+        self.mpi_term_batch_size = mpi_term_batch_size
+        self.fallback = fallback
+        self._fallback_backend = None
+
+    def _setup_backend_specifics(self):
+        return None
+
+    def _qmatchatea_fallback(self):
+        if self._fallback_backend is None:
+            backend = QMatchaTeaBackend()
+            backend.configure_tn_simulation(
+                ansatz=self.ansatz,
+                max_bond_dimension=self.max_bond_dimension,
+                cut_ratio=self.cut_ratio,
+                trunc_tracking_mode=self.trunc_tracking_mode,
+                svd_control=self.svd_control,
+                ini_bond_dimension=self.ini_bond_dimension,
+                tensor_module=self.tensor_module,
+                compile_circuit=self.compile_circuit,
+                cache_gate_tensors=self.cache_gate_tensors,
+                track_memory=self.track_memory,
+                mpi_approach=self.mpi_approach,
+                mpi_num_procs=self.mpi_num_procs,
+                mpi_where_barriers=self.mpi_where_barriers,
+                mpi_isometrization=self.mpi_isometrization,
+            )
+            self._fallback_backend = backend
+        return self._fallback_backend
+
+    def _fallback_or_raise(self, reason):
+        if not self.fallback:
+            raise NotImplementedError(reason)
+        return self._qmatchatea_fallback()
+
+    def _preprocess_circuit(self, circuit, compile_circuit):
+        """Decompose unsupported multi-qubit gates for the local Vidal path."""
+        return _prepare_circuit_for_mps(circuit, decompose=True)
+
+    def _run_fast_executor(self, circuit, compile_circuit=True):
+        if self.mpi_approach == "CT":
+            from mpi4py import MPI
+
+            self.rank = MPI.COMM_WORLD.Get_rank()
+            executor = SegmentVidalMPIExecutor(
+                nqubits=circuit.nqubits,
+                max_bond=self.max_bond_dimension,
+                cut_ratio=self.cut_ratio,
+                tensor_module=self.tensor_module,
+                comm=MPI.COMM_WORLD,
+            )
+        else:
+            self.rank = 0
+            executor = VidalTEBDExecutor(
+                nqubits=circuit.nqubits,
+                max_bond=self.max_bond_dimension,
+                cut_ratio=self.cut_ratio,
+                tensor_module=self.tensor_module,
+            )
+        executor.run_circuit(circuit, compile_circuit=compile_circuit)
+        return executor
+
+    def expectation(self, circuit, observable, preprocess=True, compile_circuit=None):
+        if self.ansatz.upper() != "MPS":
+            backend = self._fallback_or_raise("VidalBackend supports only MPS.")
+            return backend.expectation(circuit, observable, preprocess, compile_circuit)
+
+        original_circuit = circuit
+        if compile_circuit is None:
+            compile_circuit = self.compile_circuit
+
+        if preprocess:
+            try:
+                circuit = self._preprocess_circuit(circuit, compile_circuit)
+            except Exception as exc:
+                backend = self._fallback_or_raise(
+                    f"VidalBackend preprocessing failed: {exc}"
+                )
+                return backend.expectation(
+                    original_circuit, observable, preprocess, compile_circuit
+                )
+
+        reason = _unsupported_reason(circuit)
+        if reason is not None:
+            # Non-adjacent gates can be routed at compile time
+            if compile_circuit and _can_route_non_adjacent(circuit):
+                pass  # proceed with Vidal + SWAP routing
+            else:
+                backend = self._fallback_or_raise(reason)
+                return backend.expectation(
+                    original_circuit, observable, preprocess, compile_circuit
+                )
+
+        executor = self._run_fast_executor(circuit, compile_circuit=compile_circuit)
+        self.last_truncation_error = float(
+            executor.global_truncation_error()
+            if hasattr(executor, "global_truncation_error")
+            else executor.truncation_error
+        )
+        self.last_max_truncation_error = float(
+            executor.global_max_truncation_error()
+            if hasattr(executor, "global_max_truncation_error")
+            else executor.max_truncation_error
+        )
+
+        mpo_tensors = _observable_mpo_tensors(observable, circuit.nqubits)
+        if mpo_tensors is not None:
+            if self.mpi_approach == "CT":
+                value = executor.expectation_mpo_root(mpo_tensors)
+                from qtealeaves.tooling.mpisupport import MPI
+
+                if MPI is not None and MPI.COMM_WORLD.Get_rank() != 0:
+                    return np.nan
+                return value
+            return executor.expectation_mpo(mpo_tensors)
+
+        hamiltonian = check_observable(observable, circuit.nqubits)
+        try:
+            terms = _symbolic_hamiltonian_to_operator_terms(hamiltonian)
+        except ValueError as exc:
+            backend = self._fallback_or_raise(str(exc))
+            return backend.expectation(
+                original_circuit, observable, preprocess, compile_circuit
+            )
+
+        mpo_tensors = _operator_terms_to_mpo(terms, circuit.nqubits)
+        if self.mpi_approach == "CT":
+            value = executor.expectation_mpo_root(mpo_tensors)
+            from qtealeaves.tooling.mpisupport import MPI
+
+            if MPI is not None and MPI.COMM_WORLD.Get_rank() != 0:
+                return np.nan
+            return value
+        return executor.expectation_mpo(mpo_tensors)
+
+    def execute_circuit(
+        self,
+        circuit,
+        initial_state=None,
+        nshots=None,
+        prob_type=None,
+        return_array=False,
+        **prob_kwargs,
+    ):
+        backend = self._fallback_or_raise(
+            "VidalBackend.execute_circuit is delegated to qmatchatea."
+        )
+        return backend.execute_circuit(
+            circuit,
+            initial_state=initial_state,
+            nshots=nshots,
+            prob_type=prob_type,
+            return_array=return_array,
+            **prob_kwargs,
+        )
+
+
+@dataclass(frozen=True)
+class VidalExpectationResult:
+    value: float
+    seconds: float
+    backend: object
+
+
+@dataclass(frozen=True)
+class VidalBackendComparisonResult:
+    circuit: object
+    observable: object
+    exact: float | None
+    qmatchatea: VidalExpectationResult | None
+    vidal: VidalExpectationResult
+    qmatchatea_error: float | None
+    vidal_error: float | None
+
+
+@dataclass(frozen=True)
+class VidalProfileResult:
+    value: float
+    trace_path: object
+    table_path: object
+    table: str
+
+
+def make_vidal_backend(
+    *,
+    bond=10,
+    cut_ratio=1e-9,
+    tensor_module="torch",
+    compile_circuit=False,
+    mpi_approach="SR",
+    mpi_num_procs=1,
+    mpi_where_barriers=-1,
+    mpi_isometrization=-1,
+    mpi_term_batch_size=None,
+    fallback=True,
+):
+    backend = VidalBackend()
+    backend.configure_tn_simulation(
+        max_bond_dimension=bond,
+        cut_ratio=cut_ratio,
+        tensor_module=tensor_module,
+        compile_circuit=compile_circuit,
+        mpi_approach=mpi_approach,
+        mpi_num_procs=mpi_num_procs,
+        mpi_where_barriers=mpi_where_barriers,
+        mpi_isometrization=mpi_isometrization,
+        mpi_term_batch_size=mpi_term_batch_size,
+        fallback=fallback,
+    )
+    return backend
+
+
+def run_vidal_expectation(
+    circuit,
+    observable,
+    *,
+    bond=10,
+    cut_ratio=1e-9,
+    tensor_module="torch",
+    compile_circuit=False,
+    preprocess=True,
+    mpi_approach="SR",
+    mpi_num_procs=1,
+    mpi_where_barriers=-1,
+    mpi_isometrization=-1,
+    mpi_term_batch_size=None,
+    fallback=True,
+):
+    backend = make_vidal_backend(
+        bond=bond,
+        cut_ratio=cut_ratio,
+        tensor_module=tensor_module,
+        compile_circuit=compile_circuit,
+        mpi_approach=mpi_approach,
+        mpi_num_procs=mpi_num_procs,
+        mpi_where_barriers=mpi_where_barriers,
+        mpi_isometrization=mpi_isometrization,
+        mpi_term_batch_size=mpi_term_batch_size,
+        fallback=fallback,
+    )
+    start = time.perf_counter()
+    value = backend.expectation(
+        circuit,
+        observable,
+        preprocess=preprocess,
+        compile_circuit=compile_circuit,
+    )
+    return VidalExpectationResult(
+        value=float(np.real(value)),
+        seconds=time.perf_counter() - start,
+        backend=backend,
+    )
+
+
+def run_segmented_vidal_ring_xz(
+    circuit,
+    *,
+    max_bond=10,
+    cut_ratio=1e-9,
+    tensor_module="torch",
+    comm,
+):
+    from qibotn.backends.vidal_mpi_segment import run_segment_vidal_mpi_ring_xz
+
+    start = time.perf_counter()
+    value, timings = run_segment_vidal_mpi_ring_xz(
+        circuit,
+        max_bond=max_bond,
+        cut_ratio=cut_ratio,
+        tensor_module=tensor_module,
+        comm=comm,
+    )
+    return VidalExpectationResult(
+        value=float(np.real(value)),
+        seconds=time.perf_counter() - start,
+        backend=timings,
+    )
+
+
+def compare_vidal_backend_qmatchatea(
+    circuit,
+    observable,
+    *,
+    bond=512,
+    cut_ratio=1e-12,
+    tensor_module="torch",
+    exact=None,
+    skip_qmatchatea=False,
+    qmatchatea_compile_circuit=True,
+    qmatchatea_svd_control="E!",
+    vidal_compile_circuit=True,
+    vidal_fallback=True,
+):
+    qmatchatea_result = None
+    if not skip_qmatchatea:
+        qmatchatea_backend = QMatchaTeaBackend()
+        qmatchatea_backend.configure_tn_simulation(
+            ansatz="MPS",
+            max_bond_dimension=bond,
+            cut_ratio=cut_ratio,
+            svd_control=qmatchatea_svd_control,
+            tensor_module=tensor_module,
+            compile_circuit=qmatchatea_compile_circuit,
+            track_memory=False,
+        )
+        start = time.perf_counter()
+        qmatchatea_value = qmatchatea_backend.expectation(
+            circuit,
+            observable,
+            preprocess=False,
+            compile_circuit=qmatchatea_compile_circuit,
+        )
+        qmatchatea_result = VidalExpectationResult(
+            value=float(np.real(qmatchatea_value)),
+            seconds=time.perf_counter() - start,
+            backend=qmatchatea_backend,
+        )
+
+    vidal_backend = VidalBackend()
+    vidal_backend.configure_tn_simulation(
+        ansatz="MPS",
+        max_bond_dimension=bond,
+        cut_ratio=cut_ratio,
+        tensor_module=tensor_module,
+        compile_circuit=vidal_compile_circuit,
+        fallback=vidal_fallback,
+    )
+    start = time.perf_counter()
+    vidal_value = vidal_backend.expectation(
+        circuit,
+        observable,
+        preprocess=False,
+        compile_circuit=vidal_compile_circuit,
+    )
+    vidal_result = VidalExpectationResult(
+        value=float(np.real(vidal_value)),
+        seconds=time.perf_counter() - start,
+        backend=vidal_backend,
+    )
+
+    qmatchatea_error = None
+    vidal_error = None
+    if exact is not None:
+        if qmatchatea_result is not None:
+            qmatchatea_error = abs(qmatchatea_result.value - exact)
+        vidal_error = abs(vidal_result.value - exact)
+
+    return VidalBackendComparisonResult(
+        circuit=circuit,
+        observable=observable,
+        exact=exact,
+        qmatchatea=qmatchatea_result,
+        vidal=vidal_result,
+        qmatchatea_error=qmatchatea_error,
+        vidal_error=vidal_error,
+    )
+
+
+def profile_vidal_expectation(
+    circuit,
+    observable,
+    *,
+    bond=512,
+    cut_ratio=1e-12,
+    torch_threads=32,
+    trace_path,
+    table_path,
+    profile_memory=False,
+    rows=60,
+):
+    import torch
+    from torch.profiler import ProfilerActivity, profile
+
+    from qibotn.expectation_runner import ExpectationConfig, run_cpu_expectation
+
+    torch.set_num_threads(torch_threads)
+    config = ExpectationConfig(
+        ansatz="mps",
+        bond=bond,
+        cut_ratio=cut_ratio,
+        tensor_module="torch",
+        torch_threads=torch_threads,
+    )
+
+    with profile(
+        activities=[ProfilerActivity.CPU],
+        record_shapes=profile_memory,
+        profile_memory=profile_memory,
+        with_stack=profile_memory,
+    ) as prof:
+        result = run_cpu_expectation(circuit, observable, config)
+
+    table = (
+        f"expval={result.value:.16e}\n\n"
+        f"# sorted by self_cpu_time_total\n"
+        f"{prof.key_averages().table(sort_by='self_cpu_time_total', row_limit=rows)}\n\n"
+        f"# sorted by cpu_time_total\n"
+        f"{prof.key_averages().table(sort_by='cpu_time_total', row_limit=rows)}\n"
+    )
+    table_path.parent.mkdir(parents=True, exist_ok=True)
+    table_path.write_text(table, encoding="utf-8")
+    prof.export_chrome_trace(str(trace_path))
+    return VidalProfileResult(
+        value=result.value,
+        trace_path=trace_path,
+        table_path=table_path,
+        table=table,
+    )
+
+
+CONTEST_MPS_BONDS = {"main1": 512, "main2": 1024, "strong": 2048}
+CONTEST_VIDAL_OBSERVABLES = (
+    "boundary_ZZ_q1",
+    "boundary_ZZ_q2",
+    "boundary_ZZ_q3",
+    "long_Z_5_sites",
+    "mixed_XZYZX",
+    "ring_xz",
+    "open_zz",
+    "range2_xx",
+    "complex_iZ0",
+    "dense2_mid",
+    "dense3_spread",
+)
+
+
+def run_contest_mps_case(
+    case_name="main1",
+    *,
+    observables=None,
+    obs_filter="",
+    nqubits=None,
+    nlayers=None,
+    bond="case-default",
+    cut_ratio=1e-12,
+    seed=None,
+    torch_threads=8,
+    exact=False,
+    exact_max_qubits=24,
+):
+    """Run a shared contest-style Vidal/MPS expectation case."""
+    from qibotn.contest_cases import CASES, build_contest_circuit, build_contest_observable
+    from qibotn.expectation_runner import exact_for_observable
+    from qibotn.torch_utils import set_torch_threads
+
+    from mpi4py import MPI
+
+    set_torch_threads(torch_threads)
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    size = comm.Get_size()
+    case = CASES[case_name]
+    nqubits = case.nqubits if nqubits is None else nqubits
+    nlayers = case.nlayers if nlayers is None else nlayers
+    seed = case.seed if seed is None else seed
+    if bond == "case-default":
+        bond = CONTEST_MPS_BONDS.get(case_name, 1024)
+    if observables is None:
+        observables = tuple(x.strip() for x in obs_filter.split(",") if x.strip()) or case.observables
+
+    circuit = build_contest_circuit(case.circuit_kind, nqubits, nlayers, seed)
+    records = []
+    for obs_name in observables:
+        observable = build_contest_observable(obs_name, nqubits, seed)
+        exact_value = None
+        if exact and rank == 0:
+            if nqubits > exact_max_qubits:
+                raise ValueError(f"exact reference is limited to {exact_max_qubits} qubits.")
+            exact_value = exact_for_observable(circuit, observable, nqubits)
+
+        backend = VidalBackend()
+        backend.configure_tn_simulation(
+            max_bond_dimension=bond,
+            cut_ratio=cut_ratio,
+            tensor_module="torch",
+            mpi_approach="CT",
+            mpi_num_procs=size,
+            fallback=False,
+        )
+
+        comm.Barrier()
+        start = time.perf_counter()
+        value = backend.expectation(
+            circuit,
+            observable,
+            preprocess=True,
+            compile_circuit=False,
+        )
+        seconds = time.perf_counter() - start
+        if rank == 0:
+            records.append(
+                {
+                    "case": case,
+                    "observable": obs_name,
+                    "value": value,
+                    "seconds": seconds,
+                    "exact": exact_value,
+                    "abs_error": None if exact_value is None else abs(value - exact_value),
+                    "rel_error": (
+                        None
+                        if exact_value is None
+                        else abs(value - exact_value) / max(abs(exact_value), 1e-15)
+                    ),
+                    "truncation_error": backend.last_truncation_error,
+                    "max_truncation_error": backend.last_max_truncation_error,
+                }
+            )
+    return records
+
+
+def run_vidal_mpi_contest_case(
+    *,
+    label,
+    kind,
+    nqubits,
+    nlayers,
+    bond,
+    cut_ratio,
+    seed,
+    torch_threads,
+    obs_filter="",
+):
+    """Run the direct Vidal MPI contest observable sweep."""
+    from qibotn.contest_cases import build_contest_circuit, build_contest_observable
+    from qibotn.torch_utils import set_torch_threads
+
+    from mpi4py import MPI
+
+    del label
+    set_torch_threads(torch_threads)
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    size = comm.Get_size()
+    circuit = build_contest_circuit(kind, nqubits, nlayers, seed)
+    names = CONTEST_VIDAL_OBSERVABLES
+    if obs_filter:
+        wanted = set(obs_filter.split(","))
+        names = tuple(name for name in names if name in wanted)
+        if not names:
+            raise ValueError(f"obs_filter matched no observables: {obs_filter!r}")
+
+    records = []
+    for obs_name in names:
+        observable = build_contest_observable(obs_name, nqubits, seed)
+        backend = VidalBackend()
+        backend.configure_tn_simulation(
+            max_bond_dimension=bond,
+            cut_ratio=cut_ratio,
+            tensor_module="torch",
+            mpi_approach="CT",
+            mpi_num_procs=size,
+            fallback=False,
+        )
+        comm.Barrier()
+        start = time.perf_counter()
+        value = backend.expectation(
+            circuit,
+            observable,
+            preprocess=True,
+            compile_circuit=False,
+        )
+        seconds = time.perf_counter() - start
+        if rank == 0:
+            records.append(
+                {
+                    "observable": obs_name,
+                    "value": value,
+                    "seconds": seconds,
+                    "truncation_error": backend.last_truncation_error,
+                    "max_truncation_error": backend.last_max_truncation_error,
+                }
+            )
+    return records
+
+
+def build_vidal_validation_circuit(kind, nqubits, nlayers, seed):
+    """Build the circuit family used by Vidal correctness checks."""
+    from qibotn.benchmark_cases import build_circuit
+
+    aliases = {"brickwall": "brickwall_cnot"}
+    return build_circuit(aliases.get(kind, kind), nqubits, nlayers, seed)
+
+
+def run_vidal_validation_cases(
+    *,
+    nqubits=16,
+    nlayers=6,
+    bond=512,
+    seed=42,
+    tensor_module="torch",
+    torch_threads=32,
+    mpi=False,
+    circuits=("brickwall", "reversed_cnot", "rx_ry_cz"),
+    observables=("ring_xz", "open_zz", "mixed_local"),
+):
+    """Run Vidal/TEBD correctness checks against dense statevector references."""
+    from qibotn.benchmark_cases import exact_pauli_sum, observable_terms
+    from qibotn.backends.vidal_tebd import VidalTEBDExecutor
+    from qibotn.torch_utils import set_torch_threads
+
+    set_torch_threads(torch_threads)
+    comm = None
+    rank = 0
+    if mpi:
+        from mpi4py import MPI
+
+        from qibotn.backends.vidal_mpi_segment import SegmentVidalMPIExecutor
+
+        comm = MPI.COMM_WORLD
+        rank = comm.Get_rank()
+    else:
+        SegmentVidalMPIExecutor = None
+
+    records = []
+    for circuit_kind in circuits:
+        circuit = build_vidal_validation_circuit(circuit_kind, nqubits, nlayers, seed)
+        if rank == 0:
+            exact_values = {
+                obs: exact_pauli_sum(circuit, observable_terms(obs, nqubits), nqubits)
+                for obs in observables
+            }
+        else:
+            exact_values = None
+        if comm is not None:
+            exact_values = comm.bcast(exact_values, root=0)
+
+        for obs_kind in observables:
+            terms = observable_terms(obs_kind, nqubits)
+            start = time.perf_counter()
+            if mpi:
+                executor = SegmentVidalMPIExecutor(
+                    nqubits=nqubits,
+                    max_bond=bond,
+                    cut_ratio=1e-12,
+                    tensor_module=tensor_module,
+                    comm=comm,
+                )
+                executor.run_circuit(circuit)
+                value = executor.expectation_pauli_sum_root(terms)
+            else:
+                executor = VidalTEBDExecutor(
+                    nqubits=nqubits,
+                    max_bond=bond,
+                    cut_ratio=1e-12,
+                    tensor_module=tensor_module,
+                )
+                executor.run_circuit(circuit)
+                value = float(executor.expectation_pauli_sum(terms))
+            if rank != 0:
+                continue
+            seconds = time.perf_counter() - start
+            exact = exact_values[obs_kind]
+            records.append(
+                {
+                    "circuit": circuit_kind,
+                    "observable": obs_kind,
+                    "exact": exact,
+                    "value": value,
+                    "abs_error": abs(value - exact),
+                    "seconds": seconds,
+                }
+            )
+    return records
--- a/src/qibotn/backends/vidal_mpi_segment.py
+++ b/src/qibotn/backends/vidal_mpi_segment.py
@@ -0,0 +1,524 @@
+"""Segmented MPI Vidal/TEBD executor.
+
+Each rank owns a contiguous interval of sites. Gates fully inside an interval
+are applied locally. Only two-site gates crossing a rank boundary communicate
+the neighboring edge tensor and the resulting boundary update.
+"""
+
+from __future__ import annotations
+
+import time
+from dataclasses import dataclass
+
+import numpy as np
+from mpi4py import MPI
+
+from qibotn.backends.vidal_tebd import (
+    _asarray,
+    _backend_module,
+    _build_theta_svd_matrix,
+    _disjoint_batches,
+    _fuse_one_site_blocks,
+    _gate_sites,
+    _is_two_qubit_batch,
+    _make_two_site_update,
+    _ones,
+    _real_if_close,
+    _route_non_adjacent_gates,
+    _svd,
+    _tensor_update_from_numpy,
+    _tensor_update_to_numpy,
+    _to_float,
+    _to_numpy,
+    _transpose,
+    VidalTEBDExecutor,
+)
+
+_EDGE_TAG = 1701
+_UPDATE_TAG = 1702
+_EXPECT_ENV_TAG = 1703
+_EXPECT_RESULT_TAG = 1704
+
+
+def _partition_sites(nsites, nranks):
+    base = nsites // nranks
+    rem = nsites % nranks
+    starts = [0]
+    for rank in range(nranks):
+        starts.append(starts[-1] + base + int(rank < rem))
+    return starts
+
+
+@dataclass
+class SegmentVidalMPIExecutor:
+    nqubits: int
+    max_bond: int | None
+    comm: object
+    cut_ratio: float | None = 1e-12
+    tensor_module: str = "torch"
+
+    def __post_init__(self):
+        self.rank = self.comm.Get_rank()
+        self.size = self.comm.Get_size()
+        self.starts = _partition_sites(self.nqubits, self.size)
+        self.start = self.starts[self.rank]
+        self.end = self.starts[self.rank + 1]
+        if self.start == self.end:
+            raise ValueError("SegmentVidalMPIExecutor requires at least one site per rank.")
+
+        from qibotn.backends.cpu import _bind_numa_node
+
+        self.numa_domain = _bind_numa_node(self.rank)
+
+        self.xp = _backend_module(self.tensor_module)
+        if self.xp is np:
+            self.dtype = np.complex128
+            self.device = None
+        else:
+            self.dtype = self.xp.complex128
+            self.device = self.xp.device("cpu")
+
+        self.gammas = {}
+        for site in range(self.start, self.end):
+            self.gammas[site] = _asarray(
+                self.xp, [[[1.0 + 0.0j], [0.0 + 0.0j]]], self.dtype
+            )
+
+        self.lambdas = {
+            bond: _ones(self.xp, 1, self.dtype, self.device)
+            for bond in range(self.start, self.end + 1)
+        }
+        self._accumulated_truncation_error = 0.0
+        self._max_truncation_error = 0.0
+
+    @property
+    def truncation_error(self):
+        return self._accumulated_truncation_error
+
+    def global_truncation_error(self):
+        return self.comm.allreduce(self._accumulated_truncation_error, op=MPI.SUM)
+
+    @property
+    def max_truncation_error(self):
+        return self._max_truncation_error
+
+    def global_max_truncation_error(self):
+        return self.comm.allreduce(self._max_truncation_error, op=MPI.MAX)
+
+    def owns_site(self, site):
+        return self.start <= site < self.end
+
+    def owner_of(self, site):
+        return int(np.searchsorted(self.starts, site, side="right") - 1)
+
+    def run_circuit(self, circuit, compile_circuit=True):
+        timings = {
+            "local_compute": 0.0,
+            "edge_exchange": 0.0,
+            "boundary_compute": 0.0,
+            "boundary_update": 0.0,
+            "one_site": 0.0,
+            "gather": 0.0,
+        }
+
+        gates = circuit.queue
+        if compile_circuit:
+            gates = _route_non_adjacent_gates(gates, circuit.nqubits)
+            gates = _fuse_one_site_blocks(gates)
+        for batch in _disjoint_batches(gates):
+            if _is_two_qubit_batch(batch):
+                self._apply_two_site_batch(batch, timings)
+            else:
+                tic = time.perf_counter()
+                for gate in batch:
+                    sites = _gate_sites(gate)
+                    if len(sites) == 1 and self.owns_site(sites[0]):
+                        op = _asarray(self.xp, gate.matrix(), self.dtype)
+                        self.apply_one_site(op, sites[0])
+                    elif len(sites) == 2:
+                        self._apply_two_site_batch([gate], timings)
+                    elif len(sites) > 2:
+                        raise NotImplementedError("Only one- and two-qubit gates are supported.")
+                timings["one_site"] += time.perf_counter() - tic
+
+        return timings
+
+    def apply_one_site(self, op, pos):
+        self.gammas[pos] = self.xp.einsum("st,atb->asb", op, self.gammas[pos])
+
+    def _apply_two_site_batch(self, batch, timings):
+        local_gates = []
+        boundary_specs = []
+        recv_left_update = False
+        for gate in batch:
+            sites = _gate_sites(gate)
+            if abs(sites[0] - sites[1]) != 1:
+                raise NotImplementedError("Segment Vidal supports adjacent two-qubit gates only.")
+            left, right = sorted(sites)
+            left_owner = self.owner_of(left)
+            right_owner = self.owner_of(right)
+            if left_owner == self.rank and right_owner == self.rank:
+                local_gates.append(gate)
+            elif left_owner == self.rank:
+                boundary_specs.append((gate, left, right))
+            elif right_owner == self.rank:
+                recv_left_update = True
+
+        tic = time.perf_counter()
+        edge_send_req = None
+        if recv_left_update:
+            edge_send_req = self.comm.isend(
+                self._edge_payload(), dest=self.rank - 1, tag=_EDGE_TAG
+            )
+        right_edge = (
+            self.comm.recv(source=self.rank + 1, tag=_EDGE_TAG)
+            if boundary_specs
+            else None
+        )
+        timings["edge_exchange"] += time.perf_counter() - tic
+
+        boundary_update = None
+        tic = time.perf_counter()
+        for gate, left, right in boundary_specs:
+            boundary_update = self._compute_boundary_update(
+                gate, left, right, right_edge
+            )
+        timings["boundary_compute"] += time.perf_counter() - tic
+
+        tic = time.perf_counter()
+        update_send_req = None
+        if boundary_update is not None:
+            update_send_req = self.comm.isend(
+                boundary_update, dest=self.rank + 1, tag=_UPDATE_TAG
+            )
+        timings["boundary_update"] += time.perf_counter() - tic
+
+        tic = time.perf_counter()
+        local_items = [
+            self._compute_owned_two_site_update(gate)
+            for gate in local_gates
+        ]
+        timings["local_compute"] += time.perf_counter() - tic
+
+        tic = time.perf_counter()
+        left_boundary_update = (
+            self.comm.recv(source=self.rank - 1, tag=_UPDATE_TAG)
+            if recv_left_update
+            else None
+        )
+        if update_send_req is not None:
+            update_send_req.wait()
+        if edge_send_req is not None:
+            edge_send_req.wait()
+        timings["boundary_update"] += time.perf_counter() - tic
+
+        for update in local_items:
+            self._install_update(update)
+        if boundary_update is not None:
+            self._install_update(boundary_update)
+        if left_boundary_update is not None:
+            self._install_update(left_boundary_update)
+
+    def _edge_payload(self):
+        return {
+            "start": self.start,
+            "end": self.end,
+            "gamma_start": _to_numpy(self.gammas[self.start]),
+            "lambda_after_start": _to_numpy(self.lambdas[self.start + 1]),
+        }
+
+    def _compute_owned_two_site_update(self, gate):
+        sites = _gate_sites(gate)
+        op = _asarray(self.xp, gate.matrix(), self.dtype)
+        left, right = sites
+        if left > right:
+            left, right = right, left
+            op = _transpose(self.xp, op.reshape(2, 2, 2, 2), (1, 0, 3, 2)).reshape(4, 4)
+        item = self._build_item(
+            left,
+            op,
+            self.lambdas[left],
+            self.lambdas[left + 1],
+            self.lambdas[left + 2],
+            self.gammas[left],
+            self.gammas[right],
+        )
+        split = _svd(self.xp, item["matrix"])
+        return _make_two_site_update(
+            item, *split, self.max_bond, self.cut_ratio, self.xp
+        )
+
+    def _compute_boundary_update(self, gate, left, right, remote):
+        op = _asarray(self.xp, gate.matrix(), self.dtype)
+        sites = _gate_sites(gate)
+        if sites[0] > sites[1]:
+            op = _transpose(self.xp, op.reshape(2, 2, 2, 2), (1, 0, 3, 2)).reshape(4, 4)
+
+        gamma_right = _asarray(self.xp, remote["gamma_start"], self.dtype)
+        lam_right = _asarray(
+            self.xp,
+            remote["lambda_after_start"],
+            self.xp.float64 if self.xp is not np else np.float64,
+        )
+        item = self._build_item(
+            left,
+            op,
+            self.lambdas[left],
+            self.lambdas[left + 1],
+            lam_right,
+            self.gammas[left],
+            gamma_right,
+        )
+        split = _svd(self.xp, item["matrix"])
+        return _tensor_update_to_numpy(
+            _make_two_site_update(
+                item, *split, self.max_bond, self.cut_ratio, self.xp
+            )
+        )
+
+    def _build_item(self, site, op, lam_left, lam_mid, lam_right, gamma_left, gamma_right):
+        result = _build_theta_svd_matrix(
+            op, self.xp, lam_left, lam_mid, lam_right, gamma_left, gamma_right
+        )
+        result["site"] = site
+        result["lam_left"] = lam_left
+        result["lam_right"] = lam_right
+        return result
+
+    def _install_update(self, update):
+        if isinstance(update["left"], np.ndarray):
+            update = _tensor_update_from_numpy(self.xp, update, self.dtype)
+        truncation_error = update.get("truncation_error", 0.0)
+        self._accumulated_truncation_error += truncation_error
+        self._max_truncation_error = max(
+            self._max_truncation_error,
+            truncation_error,
+        )
+        site = update["site"]
+        if self.owns_site(site):
+            self.gammas[site] = update["left"]
+        if self.owns_site(site + 1):
+            self.gammas[site + 1] = update["right"]
+        if self.start <= site + 1 <= self.end:
+            self.lambdas[site + 1] = update["lambda"]
+
+    def gather_full_state(self):
+        payload = {
+            "start": self.start,
+            "end": self.end,
+            "gammas": {site: _to_numpy(tensor) for site, tensor in self.gammas.items()},
+            "lambdas": {bond: _to_numpy(tensor) for bond, tensor in self.lambdas.items()},
+        }
+        return self.comm.gather(payload, root=0)
+
+    def expectation_pauli_sum_root(self, terms, term_batch_size=None):
+        paulis = {
+            "I": self._eye(2),
+            "X": _asarray(self.xp, [[0, 1], [1, 0]], self.dtype),
+            "Y": _asarray(self.xp, [[0, -1j], [1j, 0]], self.dtype),
+            "Z": _asarray(self.xp, [[1, 0], [0, -1]], self.dtype),
+        }
+        operator_terms = [
+            (
+                coeff,
+                tuple((site, paulis[name.upper()]) for name, site in ops),
+            )
+            for coeff, ops in terms
+        ]
+        return self.expectation_operator_sum_root(
+            operator_terms,
+            term_batch_size=term_batch_size,
+        )
+
+    def expectation_operator_sum_root(self, terms, term_batch_size=None):
+        if term_batch_size is None:
+            term_batch_size = max(1, len(terms))
+        norm = self._distributed_product_expectation({})
+        total = 0.0 + 0.0j
+        for start in range(0, len(terms), int(term_batch_size)):
+            batch = terms[start : start + int(term_batch_size)]
+            values = self._distributed_operator_batch_expectation(batch, norm)
+            if self.rank == 0:
+                for (coeff, _), term_value in zip(batch, values):
+                    total += complex(coeff) * complex(term_value)
+        return None if self.rank != 0 else _real_if_close(total / norm)
+
+    def _eye(self, size):
+        if self.xp is np:
+            return np.eye(size, dtype=self.dtype)
+        return self.xp.eye(size, dtype=self.dtype, device=self.device)
+
+    def _distributed_product_expectation(self, operators):
+        if self.rank == 0:
+            env = self._segment_product_environment(operators)
+            if self.size == 1:
+                return env.reshape(-1)[0]
+            self.comm.send(_to_numpy(env), dest=1, tag=_EXPECT_ENV_TAG)
+            return self.comm.recv(source=self.size - 1, tag=_EXPECT_RESULT_TAG)
+
+        incoming = self.comm.recv(source=self.rank - 1, tag=_EXPECT_ENV_TAG)
+        env = self._segment_product_environment(operators, incoming)
+        if self.rank == self.size - 1:
+            self.comm.send(_to_numpy(env).reshape(-1)[0], dest=0, tag=_EXPECT_RESULT_TAG)
+        else:
+            self.comm.send(_to_numpy(env), dest=self.rank + 1, tag=_EXPECT_ENV_TAG)
+        return None
+
+    def _segment_product_environment(self, operators, incoming=None):
+        if incoming is None:
+            env = _asarray(
+                self.xp,
+                np.eye(len(self.lambdas[self.start]), dtype=np.complex128),
+                self.dtype,
+            )
+        else:
+            env = _asarray(self.xp, incoming, self.dtype)
+
+        identity = self._eye(2)
+        for site in range(self.start, self.end):
+            tensor = self.gammas[site] * self.lambdas[site + 1].reshape(1, 1, -1)
+            op = operators.get(site, identity)
+            env = self.xp.einsum(
+                "xy,xsb,st,ytd->bd", env, self._conj(tensor), op, tensor
+            )
+        return env
+
+    def _distributed_operator_batch_expectation(self, terms, norm):
+        if not terms:
+            return []
+        if all(not ops for _, ops in terms):
+            return [norm] * len(terms) if self.rank == 0 else None
+
+        batch_ops = [
+            {int(site): _asarray(self.xp, matrix, self.dtype) for site, matrix in ops}
+            for _, ops in terms
+        ]
+        if self.rank == 0:
+            env = self._segment_operator_batch_environment(batch_ops)
+            if self.size == 1:
+                return list(env.reshape(len(terms), -1)[:, 0])
+            self.comm.send(_to_numpy(env), dest=1, tag=_EXPECT_ENV_TAG)
+            return self.comm.recv(source=self.size - 1, tag=_EXPECT_RESULT_TAG)
+
+        incoming = self.comm.recv(source=self.rank - 1, tag=_EXPECT_ENV_TAG)
+        env = self._segment_operator_batch_environment(batch_ops, incoming)
+        if self.rank == self.size - 1:
+            values = list(_to_numpy(env).reshape(len(terms), -1)[:, 0])
+            self.comm.send(values, dest=0, tag=_EXPECT_RESULT_TAG)
+        else:
+            self.comm.send(_to_numpy(env), dest=self.rank + 1, tag=_EXPECT_ENV_TAG)
+        return None
+
+    def _segment_operator_batch_environment(self, batch_ops, incoming=None):
+        batch_size = len(batch_ops)
+        if incoming is None:
+            dim = len(self.lambdas[self.start])
+            env = _asarray(
+                self.xp,
+                np.tile(np.eye(dim, dtype=np.complex128), (batch_size, 1, 1)),
+                self.dtype,
+            )
+        else:
+            env = _asarray(self.xp, incoming, self.dtype)
+
+        identity = self._eye(2)
+        for site in range(self.start, self.end):
+            tensor = self.gammas[site] * self.lambdas[site + 1].reshape(1, 1, -1)
+            ops = self.xp.stack(
+                [operators.get(site, identity) for operators in batch_ops],
+                axis=0,
+            )
+            env = self.xp.einsum(
+                "nxy,xsb,nst,ytd->nbd",
+                env,
+                self._conj(tensor),
+                ops,
+                tensor,
+            )
+        return env
+
+    def _conj(self, tensor):
+        return np.conjugate(tensor) if self.xp is np else tensor.conj()
+
+    def expectation_mpo_root(self, mpo_tensors):
+        if len(mpo_tensors) != self.nqubits:
+            raise ValueError(
+                f"Expected {self.nqubits} MPO tensors, got {len(mpo_tensors)}."
+            )
+        norm = self._distributed_product_expectation({})
+        if self.rank == 0:
+            env = self._segment_mpo_environment(mpo_tensors)
+            if self.size == 1:
+                return _real_if_close(env.reshape(-1)[0] / norm)
+            self.comm.send(_to_numpy(env), dest=1, tag=_EXPECT_ENV_TAG)
+            value = self.comm.recv(source=self.size - 1, tag=_EXPECT_RESULT_TAG)
+            return _real_if_close(value / norm)
+
+        incoming = self.comm.recv(source=self.rank - 1, tag=_EXPECT_ENV_TAG)
+        env = self._segment_mpo_environment(mpo_tensors, incoming)
+        if self.rank == self.size - 1:
+            self.comm.send(
+                _to_numpy(env).reshape(-1)[0],
+                dest=0,
+                tag=_EXPECT_RESULT_TAG,
+            )
+        else:
+            self.comm.send(_to_numpy(env), dest=self.rank + 1, tag=_EXPECT_ENV_TAG)
+        return None
+
+    def _segment_mpo_environment(self, mpo_tensors, incoming=None):
+        if incoming is None:
+            left_dim = len(self.lambdas[self.start])
+            env = _asarray(
+                self.xp,
+                np.zeros((left_dim, 1, left_dim), dtype=np.complex128),
+                self.dtype,
+            )
+            env[:, 0, :] = self._eye(left_dim)
+        else:
+            env = _asarray(self.xp, incoming, self.dtype)
+
+        for site in range(self.start, self.end):
+            mpo = _asarray(self.xp, mpo_tensors[site], self.dtype)
+            if mpo.ndim != 4 or mpo.shape[1:3] != (2, 2):
+                raise ValueError(
+                    "Each MPO tensor must have shape "
+                    "(left_bond, 2, 2, right_bond)."
+                )
+            tensor = self.gammas[site] * self.lambdas[site + 1].reshape(1, 1, -1)
+            env = self.xp.einsum(
+                "xlc,xub,lutr,ctd->brd",
+                env,
+                self._conj(tensor),
+                mpo,
+                tensor,
+            )
+        return env
+
+    def expectation_ring_xz_root(self):
+        terms = [
+            (0.5, (("X", site), ("Z", (site + 1) % self.nqubits)))
+            for site in range(self.nqubits)
+        ]
+        return self.expectation_pauli_sum_root(terms)
+
+
+def run_segment_vidal_mpi_ring_xz(
+    circuit,
+    max_bond,
+    comm,
+    cut_ratio=1e-12,
+    tensor_module="torch",
+):
+    executor = SegmentVidalMPIExecutor(
+        nqubits=circuit.nqubits,
+        max_bond=max_bond,
+        cut_ratio=cut_ratio,
+        tensor_module=tensor_module,
+        comm=comm,
+    )
+    timings = executor.run_circuit(circuit)
+    tic = time.perf_counter()
+    value = executor.expectation_ring_xz_root()
+    timings["gather"] = time.perf_counter() - tic
+    return value, timings
--- a/src/qibotn/backends/vidal_tebd.py
+++ b/src/qibotn/backends/vidal_tebd.py
@@ -0,0 +1,605 @@
+"""Vidal/TEBD MPS executor for layer-parallel circuit simulation.
+
+This module is intentionally small and focused on the circuit family used by the
+MPS benchmarks: one-qubit gates and adjacent two-qubit gates on a 1D chain.  It
+keeps the state in Vidal form, so gates acting on disjoint bonds can be applied
+in parallel without moving a global mixed-canonical center.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import numpy as np
+
+
+def _backend_module(tensor_module):
+    if tensor_module == "torch":
+        import torch
+
+        return torch
+    if tensor_module == "numpy":
+        return np
+    raise ValueError(f"Unsupported tensor module {tensor_module!r}.")
+
+
+def _asarray(xp, value, dtype):
+    if xp is np:
+        return np.asarray(value, dtype=dtype)
+    return xp.as_tensor(value, dtype=dtype)
+
+
+def _ones(xp, size, dtype, device=None):
+    if xp is np:
+        return np.ones(size, dtype=np.float64 if dtype == np.complex128 else np.float32)
+    real_dtype = xp.float64 if dtype == xp.complex128 else xp.float32
+    return xp.ones(size, dtype=real_dtype, device=device)
+
+
+def _eye(xp, size, dtype, device=None):
+    if xp is np:
+        return np.eye(size, dtype=dtype)
+    return xp.eye(size, dtype=dtype, device=device)
+
+
+def _conj(xp, tensor):
+    return np.conjugate(tensor) if xp is np else tensor.conj()
+
+
+def _transpose(xp, tensor, axes):
+    return np.transpose(tensor, axes) if xp is np else tensor.permute(*axes)
+
+
+def _vdot(xp, left, right):
+    if xp is np:
+        return np.vdot(left.reshape(-1), right.reshape(-1))
+    return xp.vdot(left.reshape(-1), right.reshape(-1))
+
+
+def _to_float(x):
+    if hasattr(x, "detach"):
+        return float(x.detach().cpu().item())
+    return float(x)
+
+
+def _to_scalar(x):
+    if hasattr(x, "detach"):
+        return x.detach().cpu().item()
+    if isinstance(x, np.ndarray):
+        return x.item()
+    return x
+
+
+def _real_if_close(x, tol=1000):
+    value = np.real_if_close(x, tol=tol)
+    return value.item() if isinstance(value, np.ndarray) else value
+
+
+def _to_numpy(tensor):
+    if hasattr(tensor, "detach"):
+        return tensor.detach().cpu().numpy()
+    return np.asarray(tensor)
+
+
+def _tensor_update_to_numpy(update):
+    result = {
+        "site": int(update["site"]),
+        "left": _to_numpy(update["left"]),
+        "right": _to_numpy(update["right"]),
+        "lambda": _to_numpy(update["lambda"]),
+    }
+    if "truncation_error" in update:
+        result["truncation_error"] = float(update["truncation_error"])
+    return result
+
+
+def _tensor_update_from_numpy(xp, update, dtype):
+    if xp is np:
+        return update
+    result = {
+        "site": update["site"],
+        "left": _asarray(xp, update["left"], dtype),
+        "right": _asarray(xp, update["right"], dtype),
+        "lambda": xp.as_tensor(
+            update["lambda"],
+            dtype=xp.float64 if dtype == xp.complex128 else xp.float32,
+        ),
+    }
+    if "truncation_error" in update:
+        result["truncation_error"] = float(update["truncation_error"])
+    return result
+
+
+def _svd(xp, matrix):
+    return _svd_eigh(xp, matrix)
+
+
+def _svd_eigh(xp, matrix):
+    """SVD through Hermitian eigendecomposition.
+
+    This mirrors the E-style path that is fast for the benchmark matrices and
+    avoids torch's slower general-purpose SVD for many small/medium splits.
+    """
+
+    m_dim, n_dim = matrix.shape
+    if m_dim <= n_dim:
+        gram = matrix @ _conj(xp, matrix).T
+        eigvals, eigvecs = _eigh(xp, gram)
+        eigvals, eigvecs = _sort_eigh_desc(xp, eigvals, eigvecs)
+        singvals = _sqrt_clamped(xp, eigvals)
+        inv_s = _safe_inverse(xp, singvals)
+        vh = (_conj(xp, eigvecs).T @ matrix) * inv_s.reshape(-1, 1)
+        return eigvecs, singvals, vh
+
+    gram = _conj(xp, matrix).T @ matrix
+    eigvals, eigvecs = _eigh(xp, gram)
+    eigvals, eigvecs = _sort_eigh_desc(xp, eigvals, eigvecs)
+    singvals = _sqrt_clamped(xp, eigvals)
+    inv_s = _safe_inverse(xp, singvals)
+    umat = (matrix @ eigvecs) * inv_s.reshape(1, -1)
+    return umat, singvals, _conj(xp, eigvecs).T
+
+
+def _eigh(xp, matrix):
+    if xp is np:
+        return np.linalg.eigh(matrix)
+    return xp.linalg.eigh(matrix)
+
+
+def _sort_eigh_desc(xp, eigvals, eigvecs):
+    if xp is np:
+        return eigvals[::-1].copy(), eigvecs[:, ::-1].copy()
+    return xp.flip(eigvals, dims=(0,)), xp.flip(eigvecs, dims=(1,))
+
+
+def _sqrt_clamped(xp, eigvals):
+    if xp is np:
+        return np.sqrt(np.maximum(eigvals.real, 0.0))
+    return xp.sqrt(xp.clamp(eigvals.real, min=0.0))
+
+
+def _safe_inverse(xp, values):
+    if xp is np:
+        return np.where(values > 1e-300, 1.0 / values, 0.0)
+    return xp.where(values > 1e-300, 1.0 / values, xp.zeros_like(values))
+
+
+@dataclass
+class VidalTEBDExecutor:
+    nqubits: int
+    max_bond: int | None
+    cut_ratio: float | None = 1e-12
+    tensor_module: str = "torch"
+
+    def __post_init__(self):
+        self.xp = _backend_module(self.tensor_module)
+        if self.xp is np:
+            self.dtype = np.complex128
+            self.device = None
+        else:
+            self.dtype = self.xp.complex128
+            self.device = self.xp.device("cpu")
+
+        self.gammas = []
+        for _ in range(self.nqubits):
+            tensor = _asarray(self.xp, [[[1.0 + 0.0j], [0.0 + 0.0j]]], self.dtype)
+            self.gammas.append(tensor)
+        self.lambdas = [
+            _ones(self.xp, 1, self.dtype, self.device) for _ in range(self.nqubits + 1)
+        ]
+        self._accumulated_truncation_error = 0.0
+        self._max_truncation_error = 0.0
+
+    def run_circuit(self, circuit, compile_circuit=True):
+        gates = circuit.queue
+        if compile_circuit:
+            gates = _route_non_adjacent_gates(gates, circuit.nqubits)
+            gates = _fuse_one_site_blocks(gates)
+        for batch in _disjoint_batches(gates):
+            for gate in batch:
+                self._apply_gate(gate)
+
+    @property
+    def truncation_error(self):
+        return self._accumulated_truncation_error
+
+    @property
+    def max_truncation_error(self):
+        return self._max_truncation_error
+
+    def _apply_gate(self, gate):
+        sites = _gate_sites(gate)
+        matrix = _asarray(self.xp, gate.matrix(), self.dtype)
+        if len(sites) == 1:
+            self.apply_one_site(matrix, sites[0])
+        elif len(sites) == 2:
+            if abs(sites[0] - sites[1]) != 1:
+                raise NotImplementedError("VidalTEBDExecutor supports adjacent gates only.")
+            self.apply_two_site(matrix, sites[0], sites[1])
+        else:
+            raise NotImplementedError("Only one- and two-qubit gates are supported.")
+
+    def apply_one_site(self, op, pos):
+        # op[out_phys, in_phys] * gamma[left, in_phys, right]
+        self.gammas[pos] = self.xp.einsum("st,atb->asb", op, self.gammas[pos])
+
+    def apply_two_site(self, op, left_pos, right_pos):
+        item = self._build_two_site_matrix(op, left_pos, right_pos)
+        umat, singvals, vh = _svd(self.xp, item["matrix"])
+        self._install_two_site_split(item, umat, singvals, vh)
+
+    def _build_two_site_matrix(self, op, left_pos, right_pos):
+        if left_pos > right_pos:
+            left_pos, right_pos = right_pos, left_pos
+            op = _transpose(self.xp, op.reshape(2, 2, 2, 2), (1, 0, 3, 2)).reshape(
+                4, 4
+            )
+
+        i = left_pos
+        result = _build_theta_svd_matrix(
+            op, self.xp,
+            self.lambdas[i], self.lambdas[i + 1], self.lambdas[i + 2],
+            self.gammas[i], self.gammas[i + 1],
+        )
+        result["site"] = i
+        result["lam_left"] = self.lambdas[i]
+        result["lam_right"] = self.lambdas[i + 2]
+        return result
+
+    def _install_two_site_split(self, item, umat, singvals, vh):
+        update = _make_two_site_update(item, umat, singvals, vh,
+                                       self.max_bond, self.cut_ratio, self.xp)
+        self._accumulated_truncation_error += update["truncation_error"]
+        self._max_truncation_error = max(
+            self._max_truncation_error,
+            update["truncation_error"],
+        )
+        i = update["site"]
+        self.gammas[i] = update["left"]
+        self.gammas[i + 1] = update["right"]
+        self.lambdas[i + 1] = update["lambda"]
+
+    def expectation_ring_xz(self):
+        return self.expectation_pauli_sum(
+            [
+                (0.5, (("X", site), ("Z", (site + 1) % self.nqubits)))
+                for site in range(self.nqubits)
+            ]
+        )
+
+    def expectation_pauli_sum(self, terms):
+        paulis = {
+            "I": _eye(self.xp, 2, self.dtype, self.device),
+            "X": _asarray(self.xp, [[0, 1], [1, 0]], self.dtype),
+            "Y": _asarray(self.xp, [[0, -1j], [1j, 0]], self.dtype),
+            "Z": _asarray(self.xp, [[1, 0], [0, -1]], self.dtype),
+        }
+        operator_terms = [
+            (
+                coeff,
+                tuple((site, paulis[name.upper()]) for name, site in ops),
+            )
+            for coeff, ops in terms
+        ]
+        return self.expectation_operator_sum(operator_terms)
+
+    def expectation_operator_sum(self, terms):
+        value = 0.0 + 0.0j
+        norm = self.norm()
+        for coeff, ops in terms:
+            operators = {
+                int(site): _asarray(self.xp, matrix, self.dtype)
+                for site, matrix in ops
+            }
+            if len(ops) == 0:
+                term_value = norm
+            elif len(operators) == 1:
+                site, matrix = next(iter(operators.items()))
+                term_value = _to_scalar(self._expect_one_site(site, matrix))
+            elif len(operators) == 2 and abs(max(operators) - min(operators)) == 1:
+                site0, site1 = sorted(operators)
+                term_value = _to_scalar(
+                    self._expect_adjacent(site0, operators[site0], operators[site1])
+                )
+            else:
+                term_value = _to_scalar(self.expect_product_operators(operators))
+            value += complex(coeff) * complex(term_value)
+        return _real_if_close(value / norm)
+
+    def _expect_one_site(self, site, op):
+        theta = self.xp.einsum(
+            "a,asb,b->asb",
+            self.lambdas[site],
+            self.gammas[site],
+            self.lambdas[site + 1],
+        )
+        op_theta = self.xp.einsum("us,asb->aub", op, theta)
+        return _vdot(self.xp, theta, op_theta)
+
+    def _expect_adjacent(self, site, op_left, op_right):
+        theta = self.xp.einsum(
+            "a,asb,b,btc,c->astc",
+            self.lambdas[site],
+            self.gammas[site],
+            self.lambdas[site + 1],
+            self.gammas[site + 1],
+            self.lambdas[site + 2],
+        )
+        op_theta = self.xp.einsum("us,vt,astc->auvc", op_left, op_right, theta)
+        return _vdot(self.xp, theta, op_theta)
+
+    def expect_product_operators(self, operators):
+        env = _asarray(self.xp, [[1.0 + 0.0j]], self.dtype)
+        identity = _eye(self.xp, 2, self.dtype, self.device)
+        for site in range(self.nqubits):
+            tensor = self.gammas[site] * self.lambdas[site + 1].reshape(1, 1, -1)
+            op = operators.get(site, identity)
+            env = self.xp.einsum(
+                "xy,xsb,st,ytd->bd", env, _conj(self.xp, tensor), op, tensor
+            )
+        return env.reshape(-1)[0]
+
+    def norm(self):
+        return float(np.real(_to_scalar(self.expect_product_operators({}))))
+
+    def expectation_mpo(self, mpo_tensors):
+        """Compute ``<psi|MPO|psi> / <psi|psi>``.
+
+        MPO tensors are expected in ``(left_bond, phys_out, phys_in, right_bond)``
+        order, with physical dimension 2 on every site.
+        """
+        if len(mpo_tensors) != self.nqubits:
+            raise ValueError(
+                f"Expected {self.nqubits} MPO tensors, got {len(mpo_tensors)}."
+            )
+        env = _asarray(self.xp, [[[1.0 + 0.0j]]], self.dtype)
+        for site, raw_mpo in enumerate(mpo_tensors):
+            mpo = _asarray(self.xp, raw_mpo, self.dtype)
+            if mpo.ndim != 4 or mpo.shape[1:3] != (2, 2):
+                raise ValueError(
+                    "Each MPO tensor must have shape "
+                    "(left_bond, 2, 2, right_bond)."
+                )
+            tensor = self.gammas[site] * self.lambdas[site + 1].reshape(1, 1, -1)
+            env = self.xp.einsum(
+                "xlc,xub,lutr,ctd->brd",
+                env,
+                _conj(self.xp, tensor),
+                mpo,
+                tensor,
+            )
+        return _real_if_close(_to_scalar(env.reshape(-1)[0]) / self.norm())
+
+
+def _build_theta_svd_matrix(op, xp, lam_left, lam_mid, lam_right, gamma_left, gamma_right):
+    """Merge and apply a two-site gate, returning the SVD-ready matrix."""
+    theta = xp.einsum(
+        "a,asb,b,btc,c->astc",
+        lam_left, gamma_left, lam_mid, gamma_right, lam_right,
+    )
+    gate = op.reshape(2, 2, 2, 2)
+    theta = xp.einsum("uvst,astc->auvc", gate, theta)
+    chi_left = theta.shape[0]
+    chi_right = theta.shape[3]
+    return {
+        "chi_left": chi_left,
+        "chi_right": chi_right,
+        "matrix": theta.reshape(chi_left * 2, 2 * chi_right),
+    }
+
+
+def _choose_bond(singvals, max_bond, cut_ratio, xp):
+    max_possible = int(singvals.shape[0])
+    keep = max_possible if max_bond is None else min(max_possible, int(max_bond))
+    if cut_ratio is not None and cut_ratio > 0 and max_possible > 0:
+        threshold = singvals[0] * cut_ratio
+        if xp is np:
+            ratio_keep = int(np.count_nonzero(singvals > threshold))
+        else:
+            ratio_keep = int((singvals > threshold).sum().detach().cpu().item())
+        keep = min(keep, max(1, ratio_keep))
+    return keep
+
+
+def _divide_left_lambda(tensor, lambdas, xp):
+    if xp is np:
+        safe = np.where(np.abs(lambdas) > 1e-300, lambdas, 1.0)
+    else:
+        safe = xp.where(xp.abs(lambdas) > 1e-300, lambdas, xp.ones_like(lambdas))
+    return tensor / safe.reshape(-1, 1, 1)
+
+
+def _divide_right_lambda(tensor, lambdas, xp):
+    if xp is np:
+        safe = np.where(np.abs(lambdas) > 1e-300, lambdas, 1.0)
+    else:
+        safe = xp.where(xp.abs(lambdas) > 1e-300, lambdas, xp.ones_like(lambdas))
+    return tensor / safe.reshape(1, 1, -1)
+
+
+def _make_two_site_update(item, umat, singvals, vh, max_bond, cut_ratio, xp):
+    keep = _choose_bond(singvals, max_bond, cut_ratio, xp)
+    umat = umat[:, :keep]
+    kept = singvals[:keep]
+    cut = singvals[keep:]
+    vh = vh[:keep, :]
+
+    discarded_weight = 0.0
+    if cut.shape[0] > 0:
+        norm_kept = (kept * kept).sum()
+        norm_cut = (cut * cut).sum()
+        discarded_weight = float(_to_float(norm_cut))
+        kept = kept / xp.sqrt(norm_kept / (norm_kept + norm_cut))
+
+    new_left = umat.reshape(item["chi_left"], 2, keep)
+    new_right = vh.reshape(keep, 2, item["chi_right"])
+    new_left = _divide_left_lambda(new_left, item["lam_left"], xp)
+    new_right = _divide_right_lambda(new_right, item["lam_right"], xp)
+    return {
+        "site": item["site"],
+        "left": new_left,
+        "right": new_right,
+        "lambda": kept,
+        "truncation_error": discarded_weight,
+    }
+
+
+def _gate_sites(gate):
+    controls = tuple(getattr(gate, "control_qubits", ()))
+    targets = tuple(getattr(gate, "target_qubits", ()))
+    if controls:
+        return controls + targets
+    return targets
+
+
+# ── SWAP routing for non-adjacent two-qubit gates ──────────────────────
+
+class _SWAPGate:
+    """Minimal SWAP gate wrapper for routing non-adjacent gates."""
+    name = "swap"
+    control_qubits = ()
+
+    def __init__(self, left, right):
+        self.target_qubits = (left, right)
+
+    def matrix(self):
+        return np.array(
+            [[1, 0, 0, 0],
+             [0, 0, 1, 0],
+             [0, 1, 0, 0],
+             [0, 0, 0, 1]],
+            dtype=complex,
+        )
+
+
+class _RoutedTwoQubitGate:
+    """Wraps a two-qubit gate with remapped physical sites after SWAP routing."""
+    name = "routed_two_qubit"
+    control_qubits = ()
+
+    def __init__(self, original_gate, physical_sites):
+        self.target_qubits = tuple(physical_sites)
+        self._matrix = original_gate.matrix()
+
+    def matrix(self):
+        return self._matrix
+
+
+def _route_non_adjacent_gates(gates, nqubits):
+    """Insert SWAP networks to make all two-qubit gates adjacent.
+
+    For each non-adjacent two-qubit gate, inserts SWAP gates to bring the
+    farther qubit adjacent, applies the original gate, then inserts reverse
+    SWAPs to restore the qubit ordering.  The resulting gate sequence
+    contains only adjacent two-qubit gates and is safe for VidalTEBDExecutor.
+    """
+    routed = []
+    for gate in gates:
+        sites = _gate_sites(gate)
+        if len(sites) <= 1:
+            routed.append(gate)
+            continue
+
+        left, right = sorted(sites)
+        if right - left == 1:
+            routed.append(gate)
+            continue
+
+        # Move qubit 'right' leftwards to sit at left+1
+        for pos in range(right - 1, left, -1):
+            routed.append(_SWAPGate(pos, pos + 1))
+
+        # Apply the original gate in its original qubit order.  For gates like
+        # CNOT(5, 0), sorting the routed sites would swap control and target.
+        physical_map = {left: left, right: left + 1}
+        routed.append(_RoutedTwoQubitGate(gate, [physical_map[site] for site in sites]))
+
+        # Reverse SWAPs to restore original ordering
+        for pos in range(left + 1, right):
+            routed.append(_SWAPGate(pos, pos + 1))
+
+    return routed
+
+
+def _disjoint_batches(gates):
+    batches = []
+    current = []
+    touched = set()
+    current_arity = None
+    for gate in gates:
+        sites = _gate_sites(gate)
+        arity = len(sites)
+        site_set = set(sites)
+        if current and (current_arity != arity or touched & site_set):
+            batches.append(current)
+            current = []
+            touched = set()
+            current_arity = None
+        current.append(gate)
+        touched |= site_set
+        current_arity = arity
+    if current:
+        batches.append(current)
+    return batches
+
+
+def _is_two_qubit_batch(batch):
+    return batch and all(len(_gate_sites(gate)) == 2 for gate in batch)
+
+
+class _FusedOneSiteGate:
+    name = "fused_one_site"
+
+    def __init__(self, site, matrix):
+        self.target_qubits = (site,)
+        self.control_qubits = ()
+        self._matrix = matrix
+
+    def matrix(self):
+        return self._matrix
+
+
+def _fuse_one_site_blocks(gates):
+    fused = []
+    block = []
+
+    def flush_block():
+        nonlocal block
+        if not block:
+            return
+        per_site = {}
+        for gate in block:
+            site = _gate_sites(gate)[0]
+            mat = gate.matrix()
+            if site in per_site:
+                per_site[site] = mat @ per_site[site]
+            else:
+                per_site[site] = mat
+        for site in sorted(per_site):
+            fused.append(_FusedOneSiteGate(site, per_site[site]))
+        block = []
+
+    for gate in gates:
+        if len(_gate_sites(gate)) == 1:
+            block.append(gate)
+            continue
+        flush_block()
+        fused.append(gate)
+    flush_block()
+    return fused
+
+
+def run_vidal_ring_xz(
+    circuit,
+    max_bond,
+    cut_ratio=1e-12,
+    tensor_module="torch",
+):
+    executor = VidalTEBDExecutor(
+        nqubits=circuit.nqubits,
+        max_bond=max_bond,
+        cut_ratio=cut_ratio,
+        tensor_module=tensor_module,
+    )
+    executor.run_circuit(circuit)
+    return executor.expectation_ring_xz()
--- a/src/qibotn/benchmark_cases.py
+++ b/src/qibotn/benchmark_cases.py
@@ -0,0 +1,171 @@
+"""Reusable benchmark circuits and observables for expectation runs."""
+
+from __future__ import annotations
+
+import math
+
+import numpy as np
+from qibo import Circuit, gates
+
+
+CIRCUITS = (
+    "brickwall_cnot",
+    "reversed_cnot",
+    "shifted_cz",
+    "rx_ry_cz",
+    "rxx_rzz",
+    "swap_scramble",
+    "ghz_ladder",
+)
+
+OBSERVABLES = (
+    "ring_xz",
+    "open_zz",
+    "mixed_local",
+    "range2_xx",
+    "long_z_string",
+)
+
+
+def parse_names(raw, valid, label):
+    if raw == ["all"]:
+        return list(valid)
+    unknown = sorted(set(raw) - set(valid))
+    if unknown:
+        raise ValueError(f"Unknown {label}: {', '.join(unknown)}")
+    return raw
+
+
+def build_circuit(kind, nqubits, nlayers, seed):
+    rng = np.random.default_rng(seed)
+    circuit = Circuit(nqubits)
+
+    if kind == "ghz_ladder":
+        circuit.add(gates.H(0))
+        for qubit in range(nqubits - 1):
+            circuit.add(gates.CNOT(qubit, qubit + 1))
+        return circuit
+
+    for layer in range(nlayers):
+        for qubit in range(nqubits):
+            circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi)))
+            circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi)))
+            if kind in ("rx_ry_cz", "rxx_rzz", "swap_scramble"):
+                circuit.add(gates.RX(qubit, theta=rng.uniform(-math.pi, math.pi)))
+
+        if kind == "brickwall_cnot":
+            add_brickwall(circuit, nqubits, gates.CNOT, layer, reverse=False)
+        elif kind == "reversed_cnot":
+            add_brickwall(circuit, nqubits, gates.CNOT, layer, reverse=True)
+        elif kind in ("shifted_cz", "rx_ry_cz"):
+            for qubit in range(layer % 2, nqubits - 1, 2):
+                circuit.add(gates.CZ(qubit, qubit + 1))
+        elif kind == "rxx_rzz":
+            for qubit in range(layer % 2, nqubits - 1, 2):
+                circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.7, 0.7)))
+                circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.7, 0.7)))
+        elif kind == "swap_scramble":
+            for qubit in range(layer % 2, nqubits - 1, 2):
+                circuit.add(gates.CZ(qubit, qubit + 1))
+                if layer % 4 == 3:
+                    circuit.add(gates.SWAP(qubit, qubit + 1))
+        else:
+            raise ValueError(f"Unknown circuit kind {kind!r}.")
+
+    return circuit
+
+
+def add_brickwall(circuit, nqubits, gate, layer, reverse):
+    for qubit in range(0, nqubits - 1, 2):
+        if reverse and layer % 2:
+            circuit.add(gate(qubit + 1, qubit))
+        else:
+            circuit.add(gate(qubit, qubit + 1))
+    for qubit in range(1, nqubits - 1, 2):
+        if reverse and not layer % 2:
+            circuit.add(gate(qubit + 1, qubit))
+        else:
+            circuit.add(gate(qubit, qubit + 1))
+
+
+def observable_terms(kind, nqubits):
+    if kind == "ring_xz":
+        return [
+            (0.5, (("X", site), ("Z", (site + 1) % nqubits)))
+            for site in range(nqubits)
+        ]
+    if kind == "open_zz":
+        return [
+            (1.0 / (nqubits - 1), (("Z", site), ("Z", site + 1)))
+            for site in range(nqubits - 1)
+        ]
+    if kind == "mixed_local":
+        terms = [(0.25, (("X", 0),)), (-0.5, (("Z", nqubits - 1),))]
+        terms += [
+            (0.125, (("Y", site), ("Y", site + 1)))
+            for site in range(0, nqubits - 1, 3)
+        ]
+        return terms
+    if kind == "range2_xx":
+        return [
+            (1.0 / max(1, nqubits - 2), (("X", site), ("X", site + 2)))
+            for site in range(nqubits - 2)
+        ]
+    if kind == "long_z_string":
+        stride = max(1, nqubits // 16)
+        return [(1.0, tuple(("Z", site) for site in range(0, nqubits, stride)))]
+    raise ValueError(f"Unknown observable kind {kind!r}.")
+
+
+def terms_to_dict(terms):
+    return {
+        "terms": [
+            {
+                "coefficient": float(np.real(coeff)),
+                "operators": [(name, int(site)) for name, site in ops],
+            }
+            for coeff, ops in terms
+        ]
+    }
+
+
+def exact_pauli_sum(circuit, terms, nqubits):
+    state = circuit().state(numpy=True).reshape(-1)
+    indices = np.arange(state.size, dtype=np.int64)
+    value = 0.0 + 0.0j
+    for coeff, ops in terms:
+        flipped = indices.copy()
+        phase = np.ones(state.size, dtype=np.complex128)
+        for name, site in ops:
+            shift = nqubits - 1 - site
+            bit = (indices >> shift) & 1
+            if name == "X":
+                flipped ^= 1 << shift
+            elif name == "Y":
+                flipped ^= 1 << shift
+                phase *= 1j * (1 - 2 * bit)
+            elif name == "Z":
+                phase *= 1 - 2 * bit
+            elif name != "I":
+                raise ValueError(f"Unsupported Pauli {name!r}.")
+        value += coeff * np.vdot(state[flipped], phase * state)
+    return float(value.real)
+
+
+def ring_xz_statevector_expectation(state, nqubits, chunk_size=1 << 20):
+    """Compute ``0.5 * sum_i X_i Z_(i+1)`` from a dense state vector."""
+    state = np.asarray(state).reshape(-1)
+    value = 0.0
+    for qubit in range(nqubits):
+        next_qubit = (qubit + 1) % nqubits
+        x_flip = 1 << (nqubits - 1 - qubit)
+        z_shift = nqubits - 1 - next_qubit
+        term = 0.0
+        for start in range(0, state.size, chunk_size):
+            stop = min(start + chunk_size, state.size)
+            indices = np.arange(start, stop, dtype=np.int64)
+            z_bit = (indices >> z_shift) & 1
+            z_phase = 1 - 2 * z_bit
+            term += np.vdot(state[indices ^ x_flip], z_phase * state[start:stop]).real
+        value += 0.5 * term
+    return float(value)
--- a/src/qibotn/circuit_convertor.py
+++ b/src/qibotn/circuit_convertor.py
@@ -1,246 +0,0 @@
-import cupy as cp
-import numpy as np
-
-# Reference: https://github.com/NVIDIA/cuQuantum/tree/main/python/samples/cutensornet/circuit_converter
-
-
-class QiboCircuitToEinsum:
-    """Convert a circuit to a Tensor Network (TN) representation.
-
-    The circuit is first processed to an intermediate form by grouping each gate matrix
-    with its corresponding qubit it is acting on to a list. It is then converted to an
-    equivalent TN expression through the class function state_vector_operands()
-    following the Einstein summation convention in the interleave format.
-
-    See document for detail of the format: https://docs.nvidia.com/cuda/cuquantum/python/api/generated/cuquantum.contract.html
-
-    The output is to be used by cuQuantum's contract() for computation of the
-    state vectors of the circuit.
-    """
-
-    def __init__(self, circuit, dtype="complex128"):
-        self.backend = cp
-        self.dtype = getattr(self.backend, dtype)
-        self.init_basis_map(self.backend, dtype)
-        self.init_intermediate_circuit(circuit)
-        self.circuit = circuit
-
-    def state_vector_operands(self):
-        """Create the operands for dense vector computation in the interleave
-        format.
-
-        Returns:
-            Operands for the contraction in the interleave format.
-        """
-        input_bitstring = "0" * len(self.active_qubits)
-
-        input_operands = self._get_bitstring_tensors(input_bitstring)
-
-        (
-            mode_labels,
-            qubits_frontier,
-            next_frontier,
-        ) = self._init_mode_labels_from_qubits(self.active_qubits)
-
-        gate_mode_labels, gate_operands = self._parse_gates_to_mode_labels_operands(
-            self.gate_tensors, qubits_frontier, next_frontier
-        )
-
-        operands = input_operands + gate_operands
-        mode_labels += gate_mode_labels
-
-        out_list = []
-        for key in qubits_frontier:
-            out_list.append(qubits_frontier[key])
-
-        operand_exp_interleave = [x for y in zip(operands, mode_labels) for x in y]
-        operand_exp_interleave.append(out_list)
-        return operand_exp_interleave
-
-    def _init_mode_labels_from_qubits(self, qubits):
-        n = len(qubits)
-        frontier_dict = {q: i for i, q in enumerate(qubits)}
-        mode_labels = [[i] for i in range(n)]
-        return mode_labels, frontier_dict, n
-
-    def _get_bitstring_tensors(self, bitstring):
-        return [self.basis_map[ibit] for ibit in bitstring]
-
-    def _parse_gates_to_mode_labels_operands(
-        self, gates, qubits_frontier, next_frontier
-    ):
-        mode_labels = []
-        operands = []
-
-        for tensor, gate_qubits in gates:
-            operands.append(tensor)
-            input_mode_labels = []
-            output_mode_labels = []
-            for q in gate_qubits:
-                input_mode_labels.append(qubits_frontier[q])
-                output_mode_labels.append(next_frontier)
-                qubits_frontier[q] = next_frontier
-                next_frontier += 1
-            mode_labels.append(output_mode_labels + input_mode_labels)
-        return mode_labels, operands
-
-    def op_shape_from_qubits(self, nqubits):
-        """Modify tensor to cuQuantum shape.
-
-        Parameters:
-            nqubits (int): The number of qubits in quantum circuit.
-
-        Returns:
-            (qubit_states,input_output) * nqubits
-        """
-        return (2, 2) * nqubits
-
-    def init_intermediate_circuit(self, circuit):
-        """Initialize the intermediate circuit representation.
-
-        This method initializes the intermediate circuit representation by extracting gate matrices and qubit IDs
-        from the given quantum circuit.
-
-        Parameters:
-            circuit (object): The quantum circuit object.
-        """
-        self.gate_tensors = []
-        gates_qubits = []
-
-        for gate in circuit.queue:
-            gate_qubits = gate.control_qubits + gate.target_qubits
-            gates_qubits.extend(gate_qubits)
-
-            # self.gate_tensors is to extract into a list the gate matrix together with the qubit id that it is acting on
-            # https://github.com/NVIDIA/cuQuantum/blob/6b6339358f859ea930907b79854b90b2db71ab92/python/cuquantum/cutensornet/_internal/circuit_parser_utils_cirq.py#L32
-            required_shape = self.op_shape_from_qubits(len(gate_qubits))
-            self.gate_tensors.append(
-                (
-                    cp.asarray(gate.matrix(), dtype=self.dtype).reshape(required_shape),
-                    gate_qubits,
-                )
-            )
-
-        # self.active_qubits is to identify qubits with at least 1 gate acting on it in the whole circuit.
-        self.active_qubits = np.unique(gates_qubits)
-
-    def init_basis_map(self, backend, dtype):
-        """Initialize the basis map for the quantum circuit.
-
-        This method initializes a basis map for the quantum circuit, which maps binary
-        strings representing qubit states to their corresponding quantum state vectors.
-
-        Parameters:
-            backend (object): The backend object providing the array conversion method.
-            dtype (object): The data type for the quantum state vectors.
-        """
-        asarray = backend.asarray
-        state_0 = asarray([1, 0], dtype=dtype)
-        state_1 = asarray([0, 1], dtype=dtype)
-
-        self.basis_map = {"0": state_0, "1": state_1}
-
-    def init_inverse_circuit(self, circuit):
-        """Initialize the inverse circuit representation.
-
-        This method initializes the inverse circuit representation by extracting gate matrices and qubit IDs
-        from the given quantum circuit.
-
-        Parameters:
-            circuit (object): The quantum circuit object.
-        """
-        self.gate_tensors_inverse = []
-        gates_qubits_inverse = []
-
-        for gate in circuit.queue:
-            gate_qubits = gate.control_qubits + gate.target_qubits
-            gates_qubits_inverse.extend(gate_qubits)
-
-            # self.gate_tensors is to extract into a list the gate matrix together with the qubit id that it is acting on
-            # https://github.com/NVIDIA/cuQuantum/blob/6b6339358f859ea930907b79854b90b2db71ab92/python/cuquantum/cutensornet/_internal/circuit_parser_utils_cirq.py#L32
-            required_shape = self.op_shape_from_qubits(len(gate_qubits))
-            self.gate_tensors_inverse.append(
-                (
-                    cp.asarray(gate.matrix()).reshape(required_shape),
-                    gate_qubits,
-                )
-            )
-
-        # self.active_qubits is to identify qubits with at least 1 gate acting on it in the whole circuit.
-        self.active_qubits_inverse = np.unique(gates_qubits_inverse)
-
-    def get_pauli_gates(self, pauli_map, dtype="complex128", backend=cp):
-        """Populate the gates for all pauli operators.
-
-        Parameters:
-            pauli_map: A dictionary mapping qubits to pauli operators.
-            dtype: Data type for the tensor operands.
-            backend: The package the tensor operands belong to.
-
-        Returns:
-            A sequence of pauli gates.
-        """
-        asarray = backend.asarray
-        pauli_i = asarray([[1, 0], [0, 1]], dtype=dtype)
-        pauli_x = asarray([[0, 1], [1, 0]], dtype=dtype)
-        pauli_y = asarray([[0, -1j], [1j, 0]], dtype=dtype)
-        pauli_z = asarray([[1, 0], [0, -1]], dtype=dtype)
-
-        operand_map = {"I": pauli_i, "X": pauli_x, "Y": pauli_y, "Z": pauli_z}
-        gates = []
-        for qubit, pauli_char in pauli_map.items():
-            operand = operand_map.get(pauli_char)
-            if operand is None:
-                raise ValueError("pauli string character must be one of I/X/Y/Z")
-            gates.append((operand, (qubit,)))
-        return gates
-
-    def expectation_operands(self, ham_gates):
-        """Create the operands for pauli string expectation computation in the
-        interleave format.
-
-        Parameters:
-            ham_gates: A list of gates derived from Qibo hamiltonian object.
-
-        Returns:
-            Operands for the contraction in the interleave format.
-        """
-        input_bitstring = "0" * self.circuit.nqubits
-
-        input_operands = self._get_bitstring_tensors(input_bitstring)
-
-        (
-            mode_labels,
-            qubits_frontier,
-            next_frontier,
-        ) = self._init_mode_labels_from_qubits(range(self.circuit.nqubits))
-
-        gate_mode_labels, gate_operands = self._parse_gates_to_mode_labels_operands(
-            self.gate_tensors, qubits_frontier, next_frontier
-        )
-
-        operands = input_operands + gate_operands
-        mode_labels += gate_mode_labels
-
-        self.init_inverse_circuit(self.circuit.invert())
-
-        next_frontier = max(qubits_frontier.values()) + 1
-
-        gates_inverse = ham_gates + self.gate_tensors_inverse
-
-        (
-            gate_mode_labels_inverse,
-            gate_operands_inverse,
-        ) = self._parse_gates_to_mode_labels_operands(
-            gates_inverse, qubits_frontier, next_frontier
-        )
-        mode_labels = (
-            mode_labels
-            + gate_mode_labels_inverse
-            + [[qubits_frontier[ix]] for ix in range(self.circuit.nqubits)]
-        )
-        operands = operands + gate_operands_inverse + operands[: self.circuit.nqubits]
-
-        operand_exp_interleave = [x for y in zip(operands, mode_labels) for x in y]
-
-        return operand_exp_interleave
--- a/src/qibotn/circuit_to_mps.py
+++ b/src/qibotn/circuit_to_mps.py
@@ -1,47 +0,0 @@
-import cupy as cp
-import cuquantum.bindings.cutensornet as cutn
-import numpy as np
-
-from qibotn.circuit_convertor import QiboCircuitToEinsum
-from qibotn.mps_utils import apply_gate, initial
-
-
-class QiboCircuitToMPS:
-    """A helper class to convert Qibo circuit to MPS.
-
-    Parameters:
-        circ_qibo: The quantum circuit object.
-        gate_algo(dict): Dictionary for SVD and QR settings.
-        datatype (str): Either single ("complex64") or double (complex128) precision.
-        rand_seed(int): Seed for random number generator.
-    """
-
-    def __init__(
-        self,
-        circ_qibo,
-        gate_algo,
-        dtype="complex128",
-        rand_seed=0,
-    ):
-        np.random.seed(rand_seed)
-        cp.random.seed(rand_seed)
-
-        self.num_qubits = circ_qibo.nqubits
-        self.handle = cutn.create()
-        self.dtype = dtype
-        self.mps_tensors = initial(self.num_qubits, dtype=dtype)
-        circuitconvertor = QiboCircuitToEinsum(circ_qibo, dtype=dtype)
-
-        for gate, qubits in circuitconvertor.gate_tensors:
-            # mapping from qubits to qubit indices
-            # apply the gate in-place
-            apply_gate(
-                self.mps_tensors,
-                gate,
-                qubits,
-                algorithm=gate_algo,
-                options={"handle": self.handle},
-            )
-
-    def __del__(self):
-        cutn.destroy(self.handle)
--- a/src/qibotn/contest_cases.py
+++ b/src/qibotn/contest_cases.py
@@ -0,0 +1,241 @@
+"""Shared contest-style circuits and observables for qibotn tools."""
+
+from __future__ import annotations
+
+import math
+from dataclasses import dataclass
+from pathlib import Path
+
+import numpy as np
+from qibo import Circuit, gates, hamiltonians
+from qibo.symbols import X, Y, Z
+from qibotn.backends.quimb import quimb_torch_parallel_opts
+
+
+@dataclass(frozen=True)
+class CaseSpec:
+    circuit_kind: str
+    observables: tuple[str, ...]
+    nqubits: int
+    nlayers: int
+    seed: int
+    target_slices: int | None = None
+
+
+CASES = {
+    "main1": CaseSpec(
+        circuit_kind="rxx_rzz_chain",
+        observables=("ring_xz",),
+        nqubits=37,
+        nlayers=20,
+        seed=31001,
+        target_slices=None,
+    ),
+    "main2": CaseSpec(
+        circuit_kind="scramble_chain",
+        observables=("open_zz", "range2_xx"),
+        nqubits=36,
+        nlayers=18,
+        seed=31002,
+        target_slices=None,
+    ),
+    "strong": CaseSpec(
+        circuit_kind="reversed_cnot",
+        observables=("ring_xz", "long_z_string"),
+        nqubits=40,
+        nlayers=24,
+        seed=41001,
+        target_slices=None,
+    ),
+}
+
+
+def _add_single_qubit_layer(circuit, nqubits, rng, include_rx=False):
+    for qubit in range(nqubits):
+        circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi)))
+        circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi)))
+        if include_rx:
+            circuit.add(gates.RX(qubit, theta=rng.uniform(-math.pi, math.pi)))
+
+
+def _add_brickwall(circuit, nqubits, gate, layer, reverse=False):
+    for qubit in range(0, nqubits - 1, 2):
+        if reverse and layer % 2:
+            circuit.add(gate(qubit + 1, qubit))
+        else:
+            circuit.add(gate(qubit, qubit + 1))
+    for qubit in range(1, nqubits - 1, 2):
+        if reverse and not layer % 2:
+            circuit.add(gate(qubit + 1, qubit))
+        else:
+            circuit.add(gate(qubit, qubit + 1))
+
+
+def build_contest_circuit(kind, nqubits, nlayers, seed):
+    """Build one of the contest-style benchmark circuits."""
+    rng = np.random.default_rng(seed)
+    circuit = Circuit(nqubits)
+
+    if kind == "ghz_ladder":
+        circuit.add(gates.H(0))
+        for qubit in range(nqubits - 1):
+            circuit.add(gates.CNOT(qubit, qubit + 1))
+        return circuit
+
+    for layer in range(nlayers):
+        if kind in {"brickwall_cnot", "reversed_cnot", "shifted_cz"}:
+            _add_single_qubit_layer(circuit, nqubits, rng)
+        elif kind in {"rxx_rzz", "swap_scramble"}:
+            _add_single_qubit_layer(circuit, nqubits, rng, include_rx=True)
+        elif kind in {"rxx_rzz_chain", "scramble_chain", "scramble"}:
+            _add_single_qubit_layer(circuit, nqubits, rng, include_rx=True)
+        else:
+            raise ValueError(f"Unknown circuit kind {kind!r}.")
+
+        if kind == "brickwall_cnot":
+            _add_brickwall(circuit, nqubits, gates.CNOT, layer, reverse=False)
+        elif kind == "reversed_cnot":
+            _add_brickwall(circuit, nqubits, gates.CNOT, layer, reverse=True)
+        elif kind == "shifted_cz":
+            for qubit in range(layer % 2, nqubits - 1, 2):
+                circuit.add(gates.CZ(qubit, qubit + 1))
+        elif kind == "rxx_rzz":
+            for qubit in range(layer % 2, nqubits - 1, 2):
+                circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.7, 0.7)))
+                circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.7, 0.7)))
+        elif kind == "swap_scramble":
+            for qubit in range(layer % 2, nqubits - 1, 2):
+                circuit.add(gates.CZ(qubit, qubit + 1))
+                if layer % 4 == 3:
+                    circuit.add(gates.SWAP(qubit, qubit + 1))
+        elif kind == "rxx_rzz_chain":
+            for qubit in range(layer % 2, nqubits - 1, 2):
+                circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.9, 0.9)))
+                circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.9, 0.9)))
+        elif kind == "scramble_chain":
+            for qubit in range(layer % 2, nqubits - 1, 2):
+                circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.8, 0.8)))
+                circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.8, 0.8)))
+                if layer % 5 == 4:
+                    circuit.add(gates.SWAP(qubit, qubit + 1))
+        elif kind == "scramble":
+            for qubit in range(layer % 2, nqubits - 1, 2):
+                circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.8, 0.8)))
+                circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.8, 0.8)))
+                if layer % 5 == 4:
+                    circuit.add(gates.SWAP(qubit, qubit + 1))
+
+    return circuit
+
+
+def _dense_observable(nqubits, qubits, seed, dim):
+    del nqubits
+    rng = np.random.default_rng(seed)
+    raw = rng.normal(size=(dim, dim)) + 1j * rng.normal(size=(dim, dim))
+    matrix = (raw + raw.conj().T) / 2.0
+    matrix = matrix / np.linalg.norm(matrix)
+    return {"matrix": matrix, "qubits": list(qubits)}
+
+
+def build_contest_observable(kind, nqubits, seed=0):
+    """Build one of the shared contest observables."""
+    q1 = nqubits // 4
+    q2 = nqubits // 2
+    q3 = (3 * nqubits) // 4
+    last = nqubits - 1
+
+    if kind == "ring_xz":
+        form = 0
+        for qubit in range(nqubits):
+            form += 0.5 * X(qubit) * Z((qubit + 1) % nqubits)
+        return hamiltonians.SymbolicHamiltonian(form=form)
+    if kind == "open_zz":
+        form = 0
+        for qubit in range(nqubits - 1):
+            form += (1.0 / max(1, nqubits - 1)) * Z(qubit) * Z(qubit + 1)
+        return hamiltonians.SymbolicHamiltonian(form=form)
+    if kind == "range2_xx":
+        form = 0
+        for qubit in range(nqubits - 2):
+            form += (1.0 / max(1, nqubits - 2)) * X(qubit) * X(qubit + 2)
+        return hamiltonians.SymbolicHamiltonian(form=form)
+    if kind == "mixed_local":
+        form = 0.25 * X(0) - 0.5 * Z(last) + 0.125 * X(q1) * Z(q2) * Y(q3)
+        return hamiltonians.SymbolicHamiltonian(form=form)
+    if kind == "long_z_string":
+        stride = max(1, nqubits // 16)
+        form = None
+        for qubit in range(0, nqubits, stride):
+            form = Z(qubit) if form is None else form * Z(qubit)
+        return hamiltonians.SymbolicHamiltonian(form=form)
+    if kind == "boundary_ZZ_q1":
+        return hamiltonians.SymbolicHamiltonian(form=Z(q1 - 1) * Z(q1))
+    if kind == "boundary_ZZ_q2":
+        return hamiltonians.SymbolicHamiltonian(form=Z(q2 - 1) * Z(q2))
+    if kind == "boundary_ZZ_q3":
+        return hamiltonians.SymbolicHamiltonian(form=Z(q3 - 1) * Z(q3))
+    if kind == "long_Z_5_sites":
+        return hamiltonians.SymbolicHamiltonian(
+            form=Z(0) * Z(q1) * Z(q2) * Z(q3) * Z(last)
+        )
+    if kind == "mixed_XZYZX":
+        return hamiltonians.SymbolicHamiltonian(form=X(0) * Z(q1) * Y(q2) * Z(q3) * X(last))
+    if kind == "complex_iZ0":
+        return hamiltonians.SymbolicHamiltonian(form=1.0j * Z(0))
+    if kind == "dense2_mid":
+        return _dense_observable(nqubits, (q2 - 1, q2), seed + 101, 4)
+    if kind == "dense3_spread":
+        return _dense_observable(nqubits, (q1, q2, q3), seed + 202, 8)
+    raise ValueError(f"Unknown observable kind {kind!r}.")
+
+
+def tree_path(tree_dir, case_name, obs_name, nqubits, nlayers, target_slices, merge_gates=True):
+    slice_label = "auto" if target_slices is None else f"s{target_slices}"
+    merge_label = "merge" if merge_gates else "nomerge"
+    return (
+        Path(tree_dir)
+        / f"{case_name}_{obs_name}_{nqubits}q{nlayers}l_{slice_label}_{merge_label}.pkl"
+    )
+
+
+def selected_observables(args, case):
+    if args.observables:
+        return tuple(args.observables)
+    if args.obs_filter:
+        return tuple(x.strip() for x in args.obs_filter.split(",") if x.strip())
+    return case.observables
+
+
+def apply_case_defaults(args):
+    case = CASES[args.case]
+    if args.nqubits is None:
+        args.nqubits = case.nqubits
+    if args.nlayers is None:
+        args.nlayers = case.nlayers
+    if args.seed is None:
+        args.seed = case.seed
+    if args.tn_target_slices is None:
+        args.tn_target_slices = case.target_slices
+    args.observables = selected_observables(args, case)
+
+
+def build_parallel_opts(args, tree_file=None, search_only=False):
+    return quimb_torch_parallel_opts(
+        target_slices=args.tn_target_slices,
+        target_size=args.tn_target_size,
+        search_workers=args.tn_search_workers,
+        torch_threads=args.torch_threads,
+        search_repeats=args.tn_search_repeats,
+        search_time=args.tn_search_time,
+        search_seed=args.tn_search_seed,
+        merge_gates=args.merge_gates,
+        search_backend=args.tn_search_backend,
+        dask_address=args.dask_address,
+        dask_expected_workers=args.dask_expected_workers,
+        dask_close_workers=args.dask_close_workers,
+        debug_trials=args.tn_debug_trials,
+        search_only=search_only,
+        save_tree_path=str(tree_file) if tree_file is not None else None,
+        load_tree_path=str(tree_file) if tree_file is not None else None,
+        print_stats=False,
+    )
--- a/src/qibotn/csrc/torch_contractor.cpp
+++ b/src/qibotn/csrc/torch_contractor.cpp
--- a/src/qibotn/eval.py
+++ b/src/qibotn/eval.py
@@ -1,89 +1,48 @@
-import cupy as cp
-import cuquantum.bindings.cutensornet as cutn
-from cupy.cuda import nccl
-from cupy.cuda.runtime import getDeviceCount
-from cuquantum.tensornet import Network, contract
 from mpi4py import MPI
-from qibo import hamiltonians
-from qibo.symbols import I, X, Y, Z

-from qibotn.circuit_convertor import QiboCircuitToEinsum
-from qibotn.circuit_to_mps import QiboCircuitToMPS
-from qibotn.mps_contraction_helper import MPSContractionHelper
+from qibotn.backends.cutensornet_helpers import (
+    MPSContractionHelper,
+    QiboCircuitToEinsum,
+    QiboCircuitToMPS,
+)
+from qibotn.observables import (
+    build_observable,
+    check_observable,
+    create_hamiltonian_from_dict,
+    extract_gates_and_qubits,
+)
+
+try:
+    import cupy as cp
+    import cuquantum.bindings.cutensornet as cutn
+    from cupy.cuda import nccl
+    from cupy.cuda.runtime import getDeviceCount
+    from cuquantum.tensornet import Network, contract
+except ImportError:  # pragma: no cover - exercised on CPU-only installations
+    cp = None
+    cutn = None
+    nccl = None
+    getDeviceCount = None
+    Network = None
+    contract = None


-def check_observable(observable, circuit_nqubit):
-    """Checks the type of observable and returns the appropriate Hamiltonian."""
-    if observable is None:
-        return build_observable(circuit_nqubit)
-    elif isinstance(observable, dict):
-        return create_hamiltonian_from_dict(observable, circuit_nqubit)
-    elif isinstance(observable, hamiltonians.SymbolicHamiltonian):
-        # TODO: check if the observable is compatible with the circuit
-        return observable
-    else:
-        raise TypeError("Invalid observable type.")
+def _require_cuquantum():
+    if (
+        cp is None
+        or cutn is None
+        or nccl is None
+        or getDeviceCount is None
+        or Network is None
+        or contract is None
+    ):
+        raise ImportError(
+            "The legacy GPU evaluation helpers require cupy and cuquantum. "
+            "Install the GPU dependencies or use the CPU backend."
+        )


-def build_observable(circuit_nqubit):
-    """Helper function to construct a target observable."""
-    hamiltonian_form = 0
-    for i in range(circuit_nqubit):
-        hamiltonian_form += 0.5 * X(i % circuit_nqubit) * Z((i + 1) % circuit_nqubit)
-
-    hamiltonian = hamiltonians.SymbolicHamiltonian(form=hamiltonian_form)
-    return hamiltonian
-
-
-def create_hamiltonian_from_dict(data, circuit_nqubit):
-    """Create a Qibo SymbolicHamiltonian from a dictionary representation.
-
-    Ensures that each Hamiltonian term explicitly acts on all circuit qubits
-    by adding identity (`I`) gates where needed.
-
-    Args:
-        data (dict): Dictionary containing Hamiltonian terms.
-        circuit_nqubit (int): Total number of qubits in the quantum circuit.
-
-    Returns:
-        hamiltonians.SymbolicHamiltonian: The constructed Hamiltonian.
-    """
-    PAULI_GATES = {"X": X, "Y": Y, "Z": Z}
-
-    terms = []
-
-    for term in data["terms"]:
-        coeff = term["coefficient"]
-        operators = term["operators"]  # List of tuples like [("Z", 0), ("X", 1)]
-
-        # Convert the operator list into a dictionary {qubit_index: gate}
-        operator_dict = {q: PAULI_GATES[g] for g, q in operators}
-
-        # Build the full term ensuring all qubits are covered
-        full_term_expr = [
-            operator_dict[q](q) if q in operator_dict else I(q)
-            for q in range(circuit_nqubit)
-        ]
-
-        # Multiply all operators together to form a single term
-        term_expr = full_term_expr[0]
-        for op in full_term_expr[1:]:
-            term_expr *= op
-
-        # Scale by the coefficient
-        final_term = coeff * term_expr
-        terms.append(final_term)
-
-    if not terms:
-        raise ValueError("No valid Hamiltonian terms were added.")
-
-    # Combine all terms
-    hamiltonian_form = sum(terms)
-
-    return hamiltonians.SymbolicHamiltonian(hamiltonian_form)
-
-
-def get_ham_gates(pauli_map, dtype="complex128", backend=cp):
+def get_ham_gates(pauli_map, dtype="complex128", backend=None):
    """Populate the gates for all pauli operators.

    Parameters:
@@ -94,6 +53,13 @@ def get_ham_gates(pauli_map, dtype="complex128", backend=cp):
    Returns:
        A sequence of pauli gates.
    """
+    if backend is None:
+        backend = cp
+    if backend is None:
+        raise ImportError(
+            "get_ham_gates requires an array backend; cupy is unavailable "
+            "in this CPU-only environment."
+        )
    asarray = backend.asarray
    pauli_i = asarray([[1, 0], [0, 1]], dtype=dtype)
    pauli_x = asarray([[0, 1], [1, 0]], dtype=dtype)
@@ -111,47 +77,9 @@ def get_ham_gates(pauli_map, dtype="complex128", backend=cp):
    return gates


-def extract_gates_and_qubits(hamiltonian):
-    """
-    Extracts the gates and their corresponding qubits from a Qibo Hamiltonian.
-
-    Parameters:
-        hamiltonian (qibo.hamiltonians.Hamiltonian or qibo.hamiltonians.SymbolicHamiltonian):
-            A Qibo Hamiltonian object.
-
-    Returns:
-        list of tuples: [(coefficient, [(gate, qubit), ...]), ...]
-            - coefficient: The prefactor of the term.
-            - list of (gate, qubit): Each term's gates and the qubits they act on.
-    """
-    extracted_terms = []
-
-    if isinstance(hamiltonian, hamiltonians.SymbolicHamiltonian):
-        for term in hamiltonian.terms:
-            coeff = term.coefficient  # Extract coefficient
-            gate_qubit_list = []
-
-            # Extract gate and qubit information
-            for factor in term.factors:
-                gate_name = str(factor)[
-                    0
-                ]  # Extract the gate type (X, Y, Z) from 'X0', 'Z1'
-                qubit = int(str(factor)[1:])  # Extract the qubit index
-                gate_qubit_list.append((qubit, gate_name, coeff))
-                coeff = 1.0
-
-            extracted_terms.append(gate_qubit_list)
-
-    else:
-        raise ValueError(
-            "Unsupported Hamiltonian type. Must be SymbolicHamiltonian or Hamiltonian."
-        )
-
-    return extracted_terms
-
-
 def initialize_mpi():
    """Initialize MPI communication and device selection."""
+    _require_cuquantum()
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
@@ -162,6 +90,7 @@ def initialize_mpi():

 def initialize_nccl(comm_mpi, rank, size):
    """Initialize NCCL communication."""
+    _require_cuquantum()
    nccl_id = nccl.get_unique_id() if rank == 0 else None
    nccl_id = comm_mpi.bcast(nccl_id, root=0)
    return nccl.NcclCommunicator(size, nccl_id, rank)
@@ -179,6 +108,7 @@ def get_operands(qibo_circ, datatype, rank, comm):

 def compute_optimal_path(network, n_samples, size, comm):
    """Compute contraction path and broadcast optimal selection."""
+    _require_cuquantum()
    path, info = network.contract_path(
        optimize={
            "samples": n_samples,
@@ -207,6 +137,8 @@ def compute_slices(info, rank, size):

 def reduce_result(result, comm, method="MPI", root=0):
    """Reduce results across processes."""
+    if method == "NCCL":
+        _require_cuquantum()
    if method == "MPI":
        return comm.reduce(sendobj=result, op=MPI.SUM, root=root)

@@ -254,6 +186,7 @@ def dense_vector_tn_MPI(qibo_circ, datatype, n_samples=8):
    Returns:
        Dense vector of quantum circuit.
    """
+    _require_cuquantum()
    comm, rank, size, device_id = initialize_mpi()
    operands = get_operands(qibo_circ, datatype, rank, comm)
    network = Network(*operands, options={"device_id": device_id})
@@ -285,6 +218,7 @@ def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):
    Returns:
        Dense vector of quantum circuit.
    """
+    _require_cuquantum()
    comm_mpi, rank, size, device_id = initialize_mpi()
    comm_nccl = initialize_nccl(comm_mpi, rank, size)
    operands = get_operands(qibo_circ, datatype, rank, comm_mpi)
@@ -309,6 +243,7 @@ def dense_vector_tn(qibo_circ, datatype):
    Returns:
        Dense vector of quantum circuit.
    """
+    _require_cuquantum()
    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
    return contract(*myconvertor.state_vector_operands())

@@ -337,6 +272,7 @@ def expectation_tn_nccl(qibo_circ, datatype, observable, n_samples=8):
        Expectation of quantum circuit due to pauli string.
    """

+    _require_cuquantum()
    comm_mpi, rank, size, device_id = initialize_mpi()

    comm_nccl = initialize_nccl(comm_mpi, rank, size)
@@ -405,6 +341,7 @@ def expectation_tn_MPI(qibo_circ, datatype, observable, n_samples=8):
    Returns:
        Expectation of quantum circuit due to pauli string.
    """
+    _require_cuquantum()
    # Initialize MPI and device
    comm, rank, size, device_id = initialize_mpi()

@@ -464,6 +401,7 @@ def expectation_tn(qibo_circ, datatype, observable):
    Returns:
        Expectation of quantum circuit due to pauli string.
    """
+    _require_cuquantum()
    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)

    observable = check_observable(observable, qibo_circ.nqubits)
@@ -489,6 +427,7 @@ def dense_vector_mps(qibo_circ, gate_algo, datatype):
    Returns:
        Dense vector of quantum circuit.
    """
+    _require_cuquantum()
    myconvertor = QiboCircuitToMPS(qibo_circ, gate_algo, dtype=datatype)
    mps_helper = MPSContractionHelper(myconvertor.num_qubits)

--- a/src/qibotn/expectation_runner.py
+++ b/src/qibotn/expectation_runner.py
@@ -0,0 +1,322 @@
+"""High-level CPU expectation runner used by CLI scripts."""
+
+from __future__ import annotations
+
+import time
+from dataclasses import dataclass
+
+import numpy as np
+from qibo.backends import construct_backend
+
+from qibotn.benchmark_cases import (
+    CIRCUITS,
+    OBSERVABLES,
+    build_circuit,
+    exact_pauli_sum,
+    observable_terms,
+    parse_names,
+    terms_to_dict,
+)
+from qibotn.observables import check_observable
+
+
+def cpu_runcard(
+    observable=None,
+    *,
+    ansatz: str = "tn",
+    mpi: bool = False,
+    bond: int | None = 1024,
+    cut_ratio: float | None = 1e-12,
+    tensor_module: str = "torch",
+    quimb_backend: str = "torch",
+    dtype: str = "complex128",
+    torch_threads: int | None = 8,
+    parallel_opts: dict | None = None,
+    compile_circuit: bool = False,
+    preprocess: bool = False,
+):
+    """Build the small CPU backend runcard used throughout qibotn."""
+    return {
+        "MPI_enabled": mpi,
+        "MPS_enabled": ansatz.lower() == "mps",
+        "NCCL_enabled": False,
+        "expectation_enabled": observable if observable is not None else False,
+        "max_bond_dimension": bond,
+        "cut_ratio": cut_ratio,
+        "tensor_module": tensor_module,
+        "quimb_backend": quimb_backend,
+        "dtype": dtype,
+        "torch_threads": torch_threads,
+        "parallel_opts": parallel_opts or {},
+        "compile_circuit": compile_circuit,
+        "preprocess": preprocess,
+    }
+
+
+def cpu_backend(**kwargs):
+    """Return a configured qibotn CPU backend.
+
+    Example:
+        ``backend = cpu_backend(ansatz="mps", bond=512, torch_threads=8)``
+    """
+    from qibotn.backends.cpu import CpuTensorNet
+
+    return CpuTensorNet(cpu_runcard(**kwargs))
+
+
+@dataclass
+class ExpectationConfig:
+    ansatz: str = "tn"
+    mpi: bool = False
+    bond: int | None = 1024
+    cut_ratio: float | None = 1e-12
+    tensor_module: str = "torch"
+    quimb_backend: str = "torch"
+    dtype: str = "complex128"
+    torch_threads: int = 8
+    parallel_opts: dict | None = None
+
+
+@dataclass
+class ExpectationResult:
+    value: float
+    seconds: float
+    rank: int = 0
+    parallel_stats: list | None = None
+
+
+@dataclass
+class BenchmarkExpectationRecord:
+    circuit: str
+    observable: str
+    value: float
+    seconds: float
+    exact: float | None = None
+    abs_error: float | None = None
+    rel_error: float | None = None
+    parallel_stats: list | None = None
+
+
+def _config_from_kwargs(**kwargs):
+    fields = ExpectationConfig.__dataclass_fields__
+    config_kwargs = {name: kwargs.pop(name) for name in list(kwargs) if name in fields}
+    if kwargs:
+        unknown = ", ".join(sorted(kwargs))
+        raise TypeError(f"Unknown expectation option(s): {unknown}")
+    return ExpectationConfig(**config_kwargs)
+
+
+def exact_for_observable(circuit, observable, nqubits):
+    if isinstance(observable, dict) and "terms" in observable:
+        terms = [
+            (
+                term["coefficient"],
+                tuple((name, site) for name, site in term["operators"]),
+            )
+            for term in observable["terms"]
+        ]
+        return exact_pauli_sum(circuit, terms, nqubits)
+
+    hamiltonian = check_observable(observable, nqubits)
+    return float(hamiltonian.expectation_from_state(circuit().state(numpy=True)).real)
+
+
+def run_cpu_expectation(circuit, observable, config):
+    runcard = cpu_runcard(
+        observable,
+        ansatz=config.ansatz,
+        mpi=config.mpi,
+        bond=config.bond,
+        cut_ratio=config.cut_ratio,
+        tensor_module=config.tensor_module,
+        quimb_backend=config.quimb_backend,
+        dtype=config.dtype,
+        torch_threads=config.torch_threads,
+        parallel_opts=config.parallel_opts,
+    )
+    backend = construct_backend(
+        backend="qibotn",
+        platform="cpu",
+        runcard=runcard,
+    )
+
+    start = time.perf_counter()
+    value = backend.execute_circuit(circuit)[0]
+    elapsed = time.perf_counter() - start
+
+    rank = getattr(backend, "rank", 0)
+    stats = getattr(backend, "parallel_stats", None)
+    return ExpectationResult(
+        float(np.real(value)),
+        elapsed,
+        rank=rank,
+        parallel_stats=list(stats) if stats is not None else None,
+    )
+
+
+def cpu_expectation(circuit, observable=None, *, return_result=False, **kwargs):
+    """Compute a CPU TN/MPS expectation with concise keyword options.
+
+    This is the preferred API for small scripts.  Common options are
+    ``ansatz="tn" | "mps"``, ``bond``, ``cut_ratio``, ``mpi``,
+    ``torch_threads``, ``quimb_backend`` and ``parallel_opts``.
+    """
+    config = _config_from_kwargs(**kwargs)
+    result = run_cpu_expectation(circuit, observable, config)
+    return result if return_result else result.value
+
+
+def mps_expectation(circuit, observable=None, *, return_result=False, **kwargs):
+    """Compute expectation using the CPU Vidal/MPS path when possible."""
+    kwargs.setdefault("ansatz", "mps")
+    return cpu_expectation(
+        circuit,
+        observable,
+        return_result=return_result,
+        **kwargs,
+    )
+
+
+def cpu_benchmark_parallel_opts(
+    *,
+    target_slices=None,
+    target_size=2**32,
+    search_workers=None,
+    torch_threads=8,
+    search_repeats=128,
+    search_time=60.0,
+    search_backend="dask",
+    dask_address=None,
+    dask_close_workers=False,
+    save_tree_path=None,
+    load_tree_path=None,
+    search_only=False,
+    debug_trials=False,
+    contract_implementation=None,
+    print_stats=True,
+):
+    """Build parallel TN options for the CPU expectation backend."""
+    slicing_opts = {}
+    if target_slices is not None:
+        slicing_opts["target_slices"] = target_slices
+    if target_size is not None:
+        slicing_opts["target_size"] = target_size
+
+    opts = {
+        "slicing_opts": slicing_opts or None,
+        "search_workers": search_workers or torch_threads,
+        "max_repeats": search_repeats,
+        "max_time": search_time,
+        "print_stats": print_stats,
+    }
+    if search_backend is not None:
+        opts["search_backend"] = search_backend
+    if dask_address is not None:
+        opts["dask_address"] = dask_address
+    if save_tree_path is not None:
+        opts["save_tree_path"] = save_tree_path
+    if load_tree_path is not None:
+        opts["load_tree_path"] = load_tree_path
+    if search_only:
+        opts["search_only"] = True
+    if debug_trials:
+        opts["debug_trials"] = True
+    if contract_implementation is not None:
+        opts["contract_implementation"] = contract_implementation
+    if dask_close_workers:
+        opts["dask_close_workers"] = True
+    return opts
+
+
+def run_cpu_benchmark_cases(
+    *,
+    nqubits=40,
+    nlayers=30,
+    bond=1024,
+    cut_ratio=1e-12,
+    seed=42,
+    torch_threads=8,
+    quimb_backend="torch",
+    dtype="complex128",
+    ansatz="tn",
+    mpi=False,
+    exact=False,
+    exact_max_qubits=24,
+    circuits=("brickwall_cnot",),
+    observables=("ring_xz",),
+    pauli_pattern=None,
+    parallel_opts=None,
+):
+    """Run the reusable CPU TN/MPS benchmark cases.
+
+    This is the importable library entrypoint for reusable CPU benchmark cases.
+    """
+    selected_circuits = parse_names(list(circuits), CIRCUITS, "circuits")
+    selected_observables = (
+        []
+        if pauli_pattern
+        else parse_names(list(observables), OBSERVABLES, "observables")
+    )
+
+    rank = 0
+    if mpi:
+        from mpi4py import MPI
+
+        rank = MPI.COMM_WORLD.Get_rank()
+
+    config = ExpectationConfig(
+        ansatz=ansatz,
+        mpi=mpi,
+        bond=bond,
+        cut_ratio=cut_ratio,
+        tensor_module="torch",
+        quimb_backend=quimb_backend,
+        dtype=dtype,
+        torch_threads=torch_threads,
+        parallel_opts=parallel_opts or {},
+    )
+
+    records = []
+    for circuit_kind in selected_circuits:
+        circuit = build_circuit(circuit_kind, nqubits, nlayers, seed)
+        named_observables = (
+            [(f"pattern:{pauli_pattern}", {"pauli_string_pattern": pauli_pattern})]
+            if pauli_pattern
+            else [
+                (obs_kind, terms_to_dict(observable_terms(obs_kind, nqubits)))
+                for obs_kind in selected_observables
+            ]
+        )
+
+        for obs_name, observable in named_observables:
+            exact_value = None
+            if exact and rank == 0:
+                if nqubits > exact_max_qubits:
+                    raise ValueError(
+                        f"exact reference is limited to {exact_max_qubits} qubits."
+                    )
+                exact_value = exact_for_observable(circuit, observable, nqubits)
+
+            result = run_cpu_expectation(circuit, observable, config)
+            if mpi and result.rank != 0:
+                continue
+
+            abs_error = None if exact_value is None else abs(result.value - exact_value)
+            rel_error = (
+                None
+                if exact_value is None
+                else abs_error / max(abs(exact_value), 1e-15)
+            )
+            records.append(
+                BenchmarkExpectationRecord(
+                    circuit=circuit_kind,
+                    observable=obs_name,
+                    value=result.value,
+                    seconds=result.seconds,
+                    exact=exact_value,
+                    abs_error=abs_error,
+                    rel_error=rel_error,
+                    parallel_stats=result.parallel_stats,
+                )
+            )
+    return records
--- a/src/qibotn/mps_contraction_helper.py
+++ b/src/qibotn/mps_contraction_helper.py
@@ -1,118 +0,0 @@
-from cuquantum.tensornet import contract, contract_path
-
-# Reference: https://github.com/NVIDIA/cuQuantum/blob/main/python/samples/cutensornet/tn_algorithms/mps_algorithms.ipynb
-
-
-class MPSContractionHelper:
-    """A helper class to compute various quantities for a given MPS.
-
-    Interleaved format is used to construct the input args for `cuquantum.contract`.
-
-    Reference: https://github.com/NVIDIA/cuQuantum/blob/main/python/samples/cutensornet/tn_algorithms/mps_algorithms.ipynb
-
-    The following compute quantities are supported:
-
-        - the norm of the MPS.
-        - the equivalent state vector from the MPS.
-        - the expectation value for a given operator.
-        - the equivalent state vector after multiplying an MPO to an MPS.
-
-    Parameters:
-        num_qubits: The number of qubits for the MPS.
-    """
-
-    def __init__(self, num_qubits):
-        self.num_qubits = num_qubits
-        self.bra_modes = [(2 * i, 2 * i + 1, 2 * i + 2) for i in range(num_qubits)]
-        offset = 2 * num_qubits + 1
-        self.ket_modes = [
-            (i + offset, 2 * i + 1, i + 1 + offset) for i in range(num_qubits)
-        ]
-
-    def contract_norm(self, mps_tensors, options=None):
-        """Contract the corresponding tensor network to form the norm of the
-        MPS.
-
-        Parameters:
-            mps_tensors: A list of rank-3 ndarray-like tensor objects.
-                The indices of the ith tensor are expected to be bonding index to the i-1 tensor,
-                the physical mode, and then the bonding index to the i+1th tensor.
-            options: Specify the contract and decompose options.
-
-        Returns:
-            The norm of the MPS.
-        """
-        interleaved_inputs = []
-        for i, o in enumerate(mps_tensors):
-            interleaved_inputs.extend(
-                [o, self.bra_modes[i], o.conj(), self.ket_modes[i]]
-            )
-        interleaved_inputs.append([])  # output
-        return self._contract(interleaved_inputs, options=options).real
-
-    def contract_state_vector(self, mps_tensors, options=None):
-        """Contract the corresponding tensor network to form the state vector
-        representation of the MPS.
-
-        Parameters:
-            mps_tensors: A list of rank-3 ndarray-like tensor objects.
-                The indices of the ith tensor are expected to be bonding index to the i-1 tensor,
-                the physical mode, and then the bonding index to the i+1th tensor.
-            options: Specify the contract and decompose options.
-
-        Returns:
-            An ndarray-like object as the state vector.
-        """
-        interleaved_inputs = []
-        for i, o in enumerate(mps_tensors):
-            interleaved_inputs.extend([o, self.bra_modes[i]])
-        output_modes = tuple([bra_modes[1] for bra_modes in self.bra_modes])
-        interleaved_inputs.append(output_modes)  # output
-        return self._contract(interleaved_inputs, options=options)
-
-    def contract_expectation(
-        self, mps_tensors, operator, qubits, options=None, normalize=False
-    ):
-        """Contract the corresponding tensor network to form the expectation of
-        the MPS.
-
-        Parameters:
-            mps_tensors: A list of rank-3 ndarray-like tensor objects.
-                The indices of the ith tensor are expected to be bonding index to the i-1 tensor,
-                the physical mode, and then the bonding index to the i+1th tensor.
-            operator: A ndarray-like tensor object.
-                The modes of the operator are expected to be output qubits followed by input qubits, e.g,
-                ``A, B, a, b`` where `a, b` denotes the inputs and `A, B'` denotes the outputs.
-            qubits: A sequence of integers specifying the qubits that the operator is acting on.
-            options: Specify the contract and decompose options.
-            normalize: Whether to scale the expectation value by the normalization factor.
-
-        Returns:
-            An ndarray-like object as the state vector.
-        """
-
-        interleaved_inputs = []
-        extra_mode = 3 * self.num_qubits + 2
-        operator_modes = [None] * len(qubits) + [self.bra_modes[q][1] for q in qubits]
-        qubits = list(qubits)
-        for i, o in enumerate(mps_tensors):
-            interleaved_inputs.extend([o, self.bra_modes[i]])
-            k_modes = self.ket_modes[i]
-            if i in qubits:
-                k_modes = (k_modes[0], extra_mode, k_modes[2])
-                q = qubits.index(i)
-                operator_modes[q] = extra_mode  # output modes
-                extra_mode += 1
-            interleaved_inputs.extend([o.conj(), k_modes])
-        interleaved_inputs.extend([operator, tuple(operator_modes)])
-        interleaved_inputs.append([])  # output
-        if normalize:
-            norm = self.contract_norm(mps_tensors, options=options)
-        else:
-            norm = 1
-        return self._contract(interleaved_inputs, options=options) / norm
-
-    def _contract(self, interleaved_inputs, options=None):
-        path = contract_path(*interleaved_inputs, options=options)[0]
-
-        return contract(*interleaved_inputs, options=options, optimize={"path": path})
--- a/src/qibotn/mps_utils.py
+++ b/src/qibotn/mps_utils.py
@@ -1,95 +0,0 @@
-import cupy as cp
-from cuquantum.tensornet import contract
-from cuquantum.tensornet.experimental import contract_decompose
-
-
-def initial(num_qubits, dtype):
-    r"""Generate the MPS with an initial state of :math:`\ket{00...00}`
-
-    Parameters:
-        num_qubits: Number of qubits in the Quantum Circuit.
-        dtype: Either single ("complex64") or double (complex128) precision.
-
-    Returns:
-        The initial MPS tensors.
-    """
-    state_tensor = cp.asarray([1, 0], dtype=dtype).reshape(1, 2, 1)
-    mps_tensors = [state_tensor] * num_qubits
-    return mps_tensors
-
-
-def mps_site_right_swap(mps_tensors, i, **kwargs):
-    """Perform the swap operation between the ith and i+1th MPS tensors.
-
-    Parameters:
-        mps_tensors: Tensors representing MPS
-        i (int): index of the tensor to swap
-
-    Returns:
-        The updated MPS tensors.
-    """
-    # contraction followed by QR decomposition
-    a, _, b = contract_decompose(
-        "ipj,jqk->iqj,jpk",
-        *mps_tensors[i : i + 2],
-        algorithm=kwargs.get("algorithm", None),
-        options=kwargs.get("options", None),
-    )
-    mps_tensors[i : i + 2] = (a, b)
-    return mps_tensors
-
-
-def apply_gate(mps_tensors, gate, qubits, **kwargs):
-    """Apply the gate operand to the MPS tensors in-place.
-
-    # Reference: https://github.com/NVIDIA/cuQuantum/blob/main/python/samples/cutensornet/tn_algorithms/mps_algorithms.ipynb
-
-    Parameters:
-        mps_tensors: A list of rank-3 ndarray-like tensor objects.
-            The indices of the ith tensor are expected to be the bonding index to the i-1 tensor,
-            the physical mode, and then the bonding index to the i+1th tensor.
-        gate: A ndarray-like tensor object representing the gate operand.
-            The modes of the gate is expected to be output qubits followed by input qubits, e.g,
-            ``A, B, a, b`` where ``a, b`` denotes the inputs and ``A, B`` denotes the outputs.
-        qubits: A sequence of integers denoting the qubits that the gate is applied onto.
-        algorithm: The contract and decompose algorithm to use for gate application.
-            Can be either a `dict` or a `ContractDecomposeAlgorithm`.
-        options: Specify the contract and decompose options.
-
-    Returns:
-        The updated MPS tensors.
-    """
-
-    n_qubits = len(qubits)
-    if n_qubits == 1:
-        # single-qubit gate
-        i = qubits[0]
-        mps_tensors[i] = contract(
-            "ipj,qp->iqj", mps_tensors[i], gate, options=kwargs.get("options", None)
-        )  # in-place update
-    elif n_qubits == 2:
-        # two-qubit gate
-        i, j = qubits
-        if i > j:
-            # swap qubits order
-            return apply_gate(mps_tensors, gate.transpose(1, 0, 3, 2), (j, i), **kwargs)
-        elif i + 1 == j:
-            # two adjacent qubits
-            a, _, b = contract_decompose(
-                "ipj,jqk,rspq->irj,jsk",
-                *mps_tensors[i : i + 2],
-                gate,
-                algorithm=kwargs.get("algorithm", None),
-                options=kwargs.get("options", None),
-            )
-            mps_tensors[i : i + 2] = (a, b)  # in-place update
-        else:
-            # non-adjacent two-qubit gate
-            # step 1: swap i with i+1
-            mps_site_right_swap(mps_tensors, i, **kwargs)
-            # step 2: apply gate to (i+1, j) pair. This amounts to a recursive swap until the two qubits are adjacent
-            apply_gate(mps_tensors, gate, (i + 1, j), **kwargs)
-            # step 3: swap back i and i+1
-            mps_site_right_swap(mps_tensors, i, **kwargs)
-    else:
-        raise NotImplementedError("Only one- and two-qubit gates supported")
--- a/src/qibotn/observables.py
+++ b/src/qibotn/observables.py
@@ -0,0 +1,155 @@
+"""Observable helpers shared by tensor-network backends and benchmarks."""
+
+from qibo import hamiltonians
+from qibo.symbols import I, X, Y, Z
+
+
+def pauli_pattern(pattern):
+    """Return the compact qibotn representation of a repeated Pauli string."""
+    return {"pauli_string_pattern": pattern}
+
+
+def pauli_sum(*terms):
+    """Return the compact qibotn representation of a Pauli sum.
+
+    Each term is ``(coefficient, operators)`` where operators are pairs like
+    ``("X", 0)``.  Example:
+
+    ``pauli_sum((0.5, [("X", 0), ("Z", 1)]), (-1.0, [("Z", 3)]))``
+    """
+    return {
+        "terms": [
+            {
+                "coefficient": coeff,
+                "operators": [(name, int(site)) for name, site in operators],
+            }
+            for coeff, operators in terms
+        ]
+    }
+
+
+def check_observable(observable, circuit_nqubit):
+    """Checks the type of observable and returns the appropriate Hamiltonian."""
+    if observable is None:
+        return build_observable(circuit_nqubit)
+    if isinstance(observable, dict):
+        return create_hamiltonian_from_dict(observable, circuit_nqubit)
+    if isinstance(observable, hamiltonians.SymbolicHamiltonian):
+        if observable.nqubits == circuit_nqubit:
+            return observable
+        if observable.nqubits > circuit_nqubit:
+            raise ValueError(
+                "Observable has more qubits than the circuit: "
+                f"{observable.nqubits} > {circuit_nqubit}."
+            )
+        return hamiltonians.SymbolicHamiltonian(
+            form=observable.form,
+            nqubits=circuit_nqubit,
+        )
+    try:
+        return hamiltonians.SymbolicHamiltonian(form=observable)
+    except Exception as exc:
+        raise TypeError("Invalid observable type.") from exc
+
+
+def build_observable(circuit_nqubit):
+    """Construct the default benchmark observable used by qibotn."""
+    form = sum(
+        0.5 * X(i) * Z((i + 1) % circuit_nqubit) for i in range(circuit_nqubit)
+    )
+    return hamiltonians.SymbolicHamiltonian(form=form)
+
+
+def create_hamiltonian_from_dict(data, circuit_nqubit):
+    """Create a Qibo SymbolicHamiltonian from the qibotn dict representation."""
+    if "pauli_string_pattern" in data:
+        return create_hamiltonian_from_pauli_pattern(
+            data["pauli_string_pattern"], circuit_nqubit
+        )
+
+    pauli_gates = {"X": X, "Y": Y, "Z": Z}
+    terms = []
+
+    for term in data["terms"]:
+        coeff = term["coefficient"]
+        operators = term["operators"]
+        operator_dict = {q: pauli_gates[g] for g, q in operators}
+
+        full_term_expr = [
+            operator_dict[q](q) if q in operator_dict else I(q)
+            for q in range(circuit_nqubit)
+        ]
+
+        term_expr = full_term_expr[0]
+        for op in full_term_expr[1:]:
+            term_expr *= op
+        terms.append(coeff * term_expr)
+
+    if not terms:
+        raise ValueError("No valid Hamiltonian terms were added.")
+
+    return hamiltonians.SymbolicHamiltonian(sum(terms))
+
+
+def create_hamiltonian_from_pauli_pattern(pattern, circuit_nqubit):
+    """Create a single Pauli-string Hamiltonian by repeating ``pattern``.
+
+    Example: pattern ``"IXZ"`` on 5 qubits becomes ``I0 * X1 * Z2 * I3 * X4``.
+    Identity factors are omitted except for the all-identity case.
+    """
+    if not isinstance(pattern, str) or not pattern:
+        raise ValueError("pauli_string_pattern must be a non-empty string.")
+
+    pauli_gates = {"X": X, "Y": Y, "Z": Z}
+    pattern = pattern.upper()
+    invalid = sorted(set(pattern) - {"I", "X", "Y", "Z"})
+    if invalid:
+        raise ValueError(
+            "pauli_string_pattern characters must be one of I/X/Y/Z; "
+            f"got {''.join(invalid)!r}."
+        )
+
+    expr = None
+    for qubit in range(circuit_nqubit):
+        name = pattern[qubit % len(pattern)]
+        if name == "I":
+            continue
+        factor = pauli_gates[name](qubit)
+        expr = factor if expr is None else expr * factor
+    return hamiltonians.SymbolicHamiltonian(form=expr or I(0))
+
+
+def build_random_circuit(nqubits, nlayers, seed=42):
+    """Build a random circuit with RY+RZ+CNOT layers for benchmarks."""
+    import numpy as np
+    from qibo import Circuit, gates
+
+    rng = np.random.default_rng(seed)
+    c = Circuit(nqubits)
+    for _ in range(nlayers):
+        for q in range(nqubits):
+            c.add(gates.RY(q, theta=rng.uniform(0, 2 * np.pi)))
+            c.add(gates.RZ(q, theta=rng.uniform(0, 2 * np.pi)))
+        for q in range(nqubits):
+            c.add(gates.CNOT(q % nqubits, (q + 1) % nqubits))
+    return c
+
+
+def extract_gates_and_qubits(hamiltonian):
+    """Extract per-term Pauli factors from a Qibo SymbolicHamiltonian.
+
+    Returns list of terms, where each term is (coefficient, [(qubit, gate_name), ...]).
+    """
+    extracted_terms = []
+
+    if not isinstance(hamiltonian, hamiltonians.SymbolicHamiltonian):
+        raise ValueError(
+            "Unsupported Hamiltonian type. Must be SymbolicHamiltonian or Hamiltonian."
+        )
+
+    for term in hamiltonian.terms:
+        coeff = term.coefficient
+        factors = [(int(str(f)[1:]), str(f)[0]) for f in term.factors]
+        extracted_terms.append((coeff, factors))
+
+    return extracted_terms
--- a/src/qibotn/parallel.py
+++ b/src/qibotn/parallel.py
--- a/src/qibotn/result.py
+++ b/src/qibotn/result.py
@@ -32,20 +32,19 @@ class TensorNetworkResult:
    statevector: ndarray

    def __post_init__(self):
-        # TODO: define the general convention when using backends different from qmatchatea
        if self.measured_probabilities is None:
-            self.measured_probabilities = {"default": self.measured_probabilities}
+            self.measured_probabilities = {}

    def probabilities(self):
        """Return calculated probabilities according to the given method."""
-        if self.prob_type == "U":
-            measured_probabilities = deepcopy(self.measured_probabilities)
-            for bitstring, prob in self.measured_probabilities[self.prob_type].items():
-                measured_probabilities[self.prob_type][bitstring] = prob[1] - prob[0]
-            probabilities = measured_probabilities[self.prob_type]
-        else:
-            probabilities = self.measured_probabilities
-        return probabilities
+        if self.prob_type != "U":
+            return self.measured_probabilities
+
+        measured_probabilities = deepcopy(self.measured_probabilities)
+        values = measured_probabilities.get(self.prob_type, {})
+        for bitstring, prob in values.items():
+            values[bitstring] = prob[1] - prob[0]
+        return values

    def frequencies(self):
        """Return frequencies if a certain number of shots has been set."""
@@ -57,10 +56,10 @@ class TensorNetworkResult:
        return self.measures

    def state(self):
-        """Return the statevector if the number of qubits is less than 20."""
-        if self.nqubits < 20:
+        """Return the statevector if the number of qubits is less than 35."""
+        if self.nqubits < 35:
            return self.statevector
        raise_error(
            NotImplementedError,
-            f"Tensor network simulation cannot be used to reconstruct statevector for >= 20 .",
+            f"Tensor network simulation cannot be used to reconstruct statevector for >= 35 .",
        )
--- a/src/qibotn/torch_contractor.py
+++ b/src/qibotn/torch_contractor.py
@@ -0,0 +1,252 @@
+"""Torch C++ contraction backend for cotengra trees.
+
+This module compiles a restricted cotengra contraction tree into a compact
+execution plan, then executes that plan in a C++ torch extension. It is an
+experimental CPU path for reducing Python-level overhead between many
+pairwise contractions.
+"""
+
+from __future__ import annotations
+
+import importlib
+import os
+from functools import lru_cache
+from pathlib import Path
+from collections import defaultdict
+
+
+_EXTENSION = None
+_CONTRACTORS = {}
+SMALL_GEMM_BATCH_FLOPS = 1_000_000
+
+
+def _load_extension():
+    global _EXTENSION
+    if _EXTENSION is not None:
+        return _EXTENSION
+
+    from torch.utils.cpp_extension import load
+
+    source = Path(__file__).resolve().parent / "csrc" / "torch_contractor.cpp"
+    mklroot = os.environ.get("MKLROOT")
+    extra_cflags = ["-O3"]
+    extra_ldflags = []
+    extra_include_paths = []
+    if mklroot:
+        mklroot_path = Path(mklroot)
+        mkl_include = mklroot_path / "include"
+        mkl_lib = mklroot_path / "lib"
+        if (mkl_include / "mkl_cblas.h").exists() and (
+            (mkl_lib / "libmkl_rt.so").exists()
+            or (mkl_lib / "libmkl_rt.so.2").exists()
+        ):
+            extra_cflags.append("-DQIBOTN_USE_MKL")
+            extra_include_paths.append(str(mkl_include))
+            extra_ldflags.extend([f"-L{mkl_lib}", "-lmkl_rt"])
+
+    _EXTENSION = load(
+        name="qibotn_torch_contractor",
+        sources=[str(source)],
+        extra_cflags=extra_cflags,
+        extra_ldflags=extra_ldflags,
+        extra_include_paths=extra_include_paths,
+        verbose=False,
+    )
+    return _EXTENSION
+
+
+def _is_plain_permutation(expr):
+    if expr is None:
+        return None
+    if isinstance(expr, tuple):
+        return tuple(int(i) for i in expr)
+    if not isinstance(expr, str):
+        return None
+    if "," in expr or "->" not in expr:
+        return None
+    source, target = expr.split("->", 1)
+    if len(source) != len(target):
+        return None
+    if len(set(source)) != len(source) or set(source) != set(target):
+        return None
+    return tuple(source.index(ix) for ix in target)
+
+
+def _maybe_tuple(values):
+    return () if values is None else tuple(int(x) for x in values)
+
+
+def _shape_from_inds(tree, node):
+    return tuple(int(tree.size_dict[ix]) for ix in tree.get_inds(node))
+
+
+def _matmul_signature(op):
+    kind = op[3]
+    if kind != 0:
+        return None
+    left_shape = op[5]
+    right_shape = op[7]
+    if len(left_shape) == 2 and len(right_shape) == 2:
+        m, k, n = left_shape[-2], left_shape[-1], right_shape[-1]
+        return ("mm", int(m), int(k), int(n), int(m * k * n))
+    return None
+
+
+def _normalize_node_ids(tree, contractions):
+    leaf_to_id = {
+        frozenset((i,)): i
+        for i in range(tree.N)
+    }
+    next_id = len(leaf_to_id)
+    node_to_id = dict(leaf_to_id)
+    for parent, _left, _right, _tdot, _arg, _perm in contractions:
+        if parent not in node_to_id:
+            node_to_id[parent] = next_id
+            next_id += 1
+
+    return node_to_id, next_id
+
+
+@lru_cache(maxsize=32)
+def compile_torch_plan(tree):
+    """Compile ``tree`` into C++ contractor plan fields.
+
+    The supported subset is the same pairwise matmul lowering used by
+    cotengra for torch CPU. Single-tensor diagonal/sum preprocessing is not
+    supported yet because it appears only in less common trees; callers should
+    fall back to cotengra for those cases.
+    """
+
+    contract_mod = importlib.import_module("cotengra.contract")
+    contractions = contract_mod.extract_contractions(tree)
+    node_to_id, ntemps = _normalize_node_ids(tree, contractions)
+    plan = []
+
+    for parent, left, right, tdot, arg, perm in contractions:
+        if left is None or right is None:
+            raise NotImplementedError(
+                "C++ torch contractor does not support cotengra preprocessing."
+            )
+
+        left_shape = _shape_from_inds(tree, left)
+        right_shape = _shape_from_inds(tree, right)
+        if tdot:
+            parsed = contract_mod._parse_tensordot_axes_to_matmul(
+                arg,
+                left_shape,
+                right_shape,
+            )
+        else:
+            parsed = contract_mod._parse_eq_to_batch_matmul(
+                arg,
+                left_shape,
+                right_shape,
+            )
+
+        (
+            eq_a,
+            eq_b,
+            new_shape_a,
+            new_shape_b,
+            new_shape_ab,
+            perm_ab,
+            pure_multiplication,
+        ) = parsed
+
+        left_perm = _is_plain_permutation(eq_a)
+        right_perm = _is_plain_permutation(eq_b)
+        if left_perm is None and eq_a is not None:
+            raise NotImplementedError(f"Unsupported left preparation: {eq_a!r}")
+        if right_perm is None and eq_b is not None:
+            raise NotImplementedError(f"Unsupported right preparation: {eq_b!r}")
+
+        plan.append(
+            (
+                node_to_id[parent],
+                node_to_id[left],
+                node_to_id[right],
+                1 if pure_multiplication else 0,
+                left_perm or (),
+                _maybe_tuple(new_shape_a),
+                right_perm or (),
+                _maybe_tuple(new_shape_b),
+                _maybe_tuple(new_shape_ab),
+                _maybe_tuple(perm_ab),
+            )
+        )
+
+        if perm is not None:
+            raise NotImplementedError(
+                "C++ torch contractor does not support cotengra tensordot perm."
+            )
+
+    root_id = node_to_id[tree.root]
+    return tuple(plan), int(ntemps), int(root_id)
+
+
+@lru_cache(maxsize=32)
+def compile_batch_groups(tree, max_flops=SMALL_GEMM_BATCH_FLOPS):
+    plan, _ntemps, _root_id = compile_torch_plan(tree)
+    contractions = importlib.import_module("cotengra.contract").extract_contractions(tree)
+    node_to_id, _ntemps = _normalize_node_ids(tree, contractions)
+    depth = {frozenset((i,)): 0 for i in range(tree.N)}
+    tensor_depth = {i: 0 for i in range(tree.N)}
+    groups = defaultdict(list)
+
+    for op_index, (contract_op, contraction) in enumerate(zip(plan, contractions)):
+        parent, left, right, _tdot, _arg, _perm = contraction
+        d = max(depth[left], depth[right]) + 1
+        depth[parent] = d
+        tensor_depth[contract_op[0]] = d
+        sig = _matmul_signature(contract_op)
+        if sig is None:
+            continue
+        kind, m, k, n, flops = sig
+        if flops > max_flops:
+            continue
+        groups[(d, kind, m, k, n)].append(op_index)
+
+    batch_groups = tuple(
+        tuple(items)
+        for _key, items in sorted(groups.items(), key=lambda item: (item[0], item[1][0]))
+        if len(items) >= 2
+    )
+    return batch_groups
+
+
+def batch_group_summary(tree, max_flops=SMALL_GEMM_BATCH_FLOPS):
+    plan, _ntemps, _root_id = compile_torch_plan(tree)
+    groups = compile_batch_groups(tree, max_flops=max_flops)
+    covered = sum(len(group) for group in groups)
+    calls_saved = sum(len(group) - 1 for group in groups)
+    by_shape = []
+    for group in groups:
+        op = plan[group[0]]
+        sig = _matmul_signature(op)
+        by_shape.append((sig[1:4], len(group), group[:8]))
+    return {
+        "groups": len(groups),
+        "covered_ops": covered,
+        "calls_saved": calls_saved,
+        "by_shape": by_shape,
+    }
+
+
+def contract_tree_cpp(tree, arrays):
+    """Contract a cotengra tree using the experimental C++ torch contractor."""
+
+    contractor = prepare_torch_cpp_contractor(tree)
+    return contractor.contract(list(arrays))
+
+
+def prepare_torch_cpp_contractor(tree):
+    """Load the extension and compile ``tree`` without running contraction."""
+
+    ext = _load_extension()
+    key = id(tree)
+    contractor = _CONTRACTORS.get(key)
+    if contractor is None:
+        plan, ntemps, root_id = compile_torch_plan(tree)
+        contractor = ext.Contractor(list(plan), ntemps, root_id)
+        _CONTRACTORS[key] = contractor
+    return contractor
--- a/src/qibotn/torch_utils.py
+++ b/src/qibotn/torch_utils.py
@@ -0,0 +1,90 @@
+"""Shared torch helpers for qibotn CPU tensor-network code."""
+
+from __future__ import annotations
+
+import numpy as np
+
+
+def torch_dtype(dtype):
+    """Return the torch dtype used by qibotn complex CPU contractions."""
+    import torch
+
+    if dtype in ("complex64", "single", np.complex64):
+        return torch.complex64
+    return torch.complex128
+
+
+def numpy_dtype(dtype):
+    """Return the numpy dtype matching qibotn's complex dtype names."""
+    if dtype in ("complex64", "single", np.complex64):
+        return np.complex64
+    return np.complex128
+
+
+def torch_cpu_array(data, dtype=None):
+    """Convert array-like data to a contiguous CPU torch tensor.
+
+    ``torch.from_numpy`` rejects negative strides and read-only arrays in common
+    quimb paths, so this helper normalizes both cases before handing data to
+    torch.
+    """
+    import torch
+
+    if isinstance(data, torch.Tensor):
+        tensor = data
+    else:
+        array = np.asarray(data)
+        if any(stride < 0 for stride in array.strides):
+            array = np.ascontiguousarray(array)
+        elif not array.flags.writeable:
+            array = array.copy()
+        tensor = torch.from_numpy(array)
+
+    if tensor.device.type != "cpu":
+        tensor = tensor.cpu()
+    target_dtype = torch_dtype(dtype) if isinstance(dtype, str) else dtype
+    if target_dtype is not None and tensor.dtype != target_dtype:
+        tensor = tensor.to(target_dtype)
+    if not tensor.is_contiguous():
+        tensor = tensor.contiguous()
+    return tensor
+
+
+def arrays_to_torch(arrays, dtype="complex128"):
+    """Convert an iterable of arrays to CPU torch tensors."""
+    target_dtype = torch_dtype(dtype)
+    return [torch_cpu_array(array, dtype=target_dtype) for array in arrays]
+
+
+def arrays_to_numpy(arrays, dtype="complex128"):
+    """Convert an iterable of arrays to numpy arrays with qibotn dtype names."""
+    target_dtype = numpy_dtype(dtype)
+    return [np.asarray(array, dtype=target_dtype) for array in arrays]
+
+
+def arrays_to_backend(arrays, backend, engine=None, dtype="complex128"):
+    """Convert arrays to the backend representation used by quimb/cotengra."""
+    if backend == "torch":
+        return arrays_to_torch(arrays, dtype=dtype)
+    if engine is not None:
+        return [engine.asarray(array, dtype=numpy_dtype(dtype)) for array in arrays]
+    return arrays_to_numpy(arrays, dtype=dtype)
+
+
+def set_torch_threads(nthreads=None, interop_threads=None):
+    """Set torch CPU thread counts and return the active intra-op thread count."""
+    import torch
+
+    if nthreads is not None:
+        torch.set_num_threads(max(1, int(nthreads)))
+    if interop_threads is not None:
+        try:
+            torch.set_num_interop_threads(max(1, int(interop_threads)))
+        except RuntimeError:
+            pass
+    return torch.get_num_threads()
+
+
+def is_torch_array(value):
+    """Return whether *value* looks like a torch tensor without importing torch."""
+    return type(value).__module__.startswith("torch")
--- a/tests/test_cpu_backend.py
+++ b/tests/test_cpu_backend.py
@@ -0,0 +1,248 @@
+import math
+
+import numpy as np
+from qibo import Circuit, gates, hamiltonians
+from qibo.symbols import X, Z
+
+from qibotn.backends.cpu import CpuTensorNet
+from qibotn.benchmark_cases import (
+    build_circuit as build_benchmark_circuit,
+    exact_pauli_sum,
+)
+from qibotn import cpu_expectation, mps_expectation, pauli_pattern, pauli_sum
+from qibotn.backends.quimb import (
+    build_expectation_tn,
+    contract_tn,
+    search_contraction_tree,
+)
+
+
+def build_circuit(nqubits=6):
+    circuit = Circuit(nqubits)
+    for qubit in range(nqubits):
+        circuit.add(gates.RY(qubit, theta=0.1 * (qubit + 1)))
+        circuit.add(gates.RZ(qubit, theta=-0.05 * (qubit + 1)))
+    for qubit in range(nqubits - 1):
+        circuit.add(gates.CNOT(qubit, qubit + 1))
+    return circuit
+
+
+def build_observable(nqubits):
+    form = 0
+    for qubit in range(nqubits):
+        form += 0.5 * X(qubit) * Z((qubit + 1) % nqubits)
+    return hamiltonians.SymbolicHamiltonian(form=form)
+
+
+def test_cpu_generic_tn_expectation_matches_statevector():
+    circuit = build_circuit()
+    observable = build_observable(circuit.nqubits)
+    exact = observable.expectation_from_state(circuit().state(numpy=True))
+
+    backend = CpuTensorNet(
+        {
+            "MPI_enabled": False,
+            "MPS_enabled": False,
+            "NCCL_enabled": False,
+            "expectation_enabled": observable,
+        }
+    )
+    value = backend.execute_circuit(circuit)[0]
+
+    assert math.isclose(value, exact, abs_tol=1e-12)
+
+
+def test_public_cpu_expectation_api_matches_statevector():
+    circuit = build_circuit()
+    observable = pauli_sum((0.5, [("X", 0), ("Z", 1)]), (-0.25, [("Z", 5)]))
+    exact = exact_pauli_sum(
+        circuit,
+        [(0.5, (("X", 0), ("Z", 1))), (-0.25, (("Z", 5),))],
+        circuit.nqubits,
+    )
+
+    value = cpu_expectation(circuit, observable, torch_threads=1)
+
+    assert math.isclose(value, exact, abs_tol=1e-12)
+
+
+def test_public_quimb_torch_pipeline_matches_statevector():
+    circuit = build_circuit(nqubits=4)
+    observable = hamiltonians.SymbolicHamiltonian(form=X(0) * Z(1))
+    exact = exact_pauli_sum(circuit, [(1.0, (("X", 0), ("Z", 1)))], 4)
+
+    built = build_expectation_tn(
+        circuit,
+        observable,
+        dtype="complex128",
+        merge_1q=True,
+        merge_2q=True,
+    )
+    search = search_contraction_tree(
+        built.tn,
+        method="serial",
+        total_repeats=1,
+        max_time=30,
+        n_workers=1,
+        search_seed=0,
+    )
+    value = built.coeff * complex(contract_tn(built.tn, search.tree))
+
+    assert math.isclose(value.real, exact, abs_tol=1e-12)
+
+
+def test_public_mps_expectation_api_accepts_pauli_pattern():
+    circuit = build_circuit()
+    exact_hamiltonian = hamiltonians.SymbolicHamiltonian(
+        form=X(1) * Z(2) * X(4) * Z(5)
+    )
+    exact = exact_hamiltonian.expectation_from_state(circuit().state(numpy=True))
+
+    value = mps_expectation(
+        circuit,
+        pauli_pattern("IXZ"),
+        bond=64,
+        torch_threads=1,
+    )
+
+    assert math.isclose(value, exact, abs_tol=1e-12)
+
+
+def test_cpu_mps_expectation_matches_statevector():
+    circuit = build_circuit()
+    observable = build_observable(circuit.nqubits)
+    exact = observable.expectation_from_state(circuit().state(numpy=True))
+
+    backend = CpuTensorNet(
+        {
+            "MPI_enabled": False,
+            "MPS_enabled": True,
+            "NCCL_enabled": False,
+            "expectation_enabled": observable,
+            "max_bond_dimension": 64,
+            "tensor_module": "torch",
+            "torch_threads": 1,
+        }
+    )
+    value = backend.execute_circuit(circuit)[0]
+
+    assert math.isclose(value, exact, abs_tol=1e-12)
+
+
+def test_cpu_runcard_pauli_pattern_matches_statevector():
+    circuit = build_circuit()
+    observable = {"pauli_string_pattern": "IXZ"}
+    exact_hamiltonian = hamiltonians.SymbolicHamiltonian(
+        form=X(1) * Z(2) * X(4) * Z(5)
+    )
+    exact = exact_hamiltonian.expectation_from_state(circuit().state(numpy=True))
+
+    for mps_enabled in (False, True):
+        backend = CpuTensorNet(
+            {
+                "MPI_enabled": False,
+                "MPS_enabled": mps_enabled,
+                "NCCL_enabled": False,
+                "expectation_enabled": observable,
+                "max_bond_dimension": 64,
+                "tensor_module": "torch",
+                "torch_threads": 1,
+            }
+        )
+        value = backend.execute_circuit(circuit)[0]
+
+        assert math.isclose(value, exact, abs_tol=1e-12)
+
+
+def test_cpu_mps_sampling_uses_nshots():
+    circuit = Circuit(4)
+    circuit.add(gates.H(0))
+    for qubit in range(3):
+        circuit.add(gates.CNOT(qubit, qubit + 1))
+
+    backend = CpuTensorNet(
+        {
+            "MPI_enabled": False,
+            "MPS_enabled": True,
+            "NCCL_enabled": False,
+            "expectation_enabled": False,
+        }
+    )
+    result = backend.execute_circuit(circuit, nshots=100)
+
+    assert sum(result.frequencies().values()) == 100
+    assert set(result.frequencies()) <= {"0000", "1111"}
+
+
+def test_cpu_mps_mpo_expectation_matches_statevector():
+    circuit = build_circuit(nqubits=4)
+    x = np.array([[0, 1], [1, 0]], dtype=complex)
+    z = np.array([[1, 0], [0, -1]], dtype=complex)
+    i2 = np.eye(2, dtype=complex)
+    mpo = [
+        x.reshape(1, 2, 2, 1),
+        z.reshape(1, 2, 2, 1),
+        i2.reshape(1, 2, 2, 1),
+        i2.reshape(1, 2, 2, 1),
+    ]
+    exact = exact_pauli_sum(circuit, [(1.0, (("X", 0), ("Z", 1)))], 4)
+
+    backend = CpuTensorNet(
+        {
+            "MPI_enabled": False,
+            "MPS_enabled": True,
+            "NCCL_enabled": False,
+            "expectation_enabled": {"mpo_tensors": mpo},
+            "max_bond_dimension": 64,
+            "tensor_module": "torch",
+            "torch_threads": 1,
+        }
+    )
+    value = backend.execute_circuit(circuit)[0]
+
+    assert math.isclose(value, exact, abs_tol=1e-12)
+
+
+def test_cpu_mps_dense_observable_dict_matches_known_value():
+    circuit = Circuit(2)
+    circuit.add(gates.H(0))
+    circuit.add(gates.CNOT(0, 1))
+
+    bell = np.zeros((4, 4), dtype=complex)
+    bell[0, 0] = bell[0, 3] = bell[3, 0] = bell[3, 3] = 0.5
+
+    backend = CpuTensorNet(
+        {
+            "MPI_enabled": False,
+            "MPS_enabled": True,
+            "NCCL_enabled": False,
+            "expectation_enabled": {"matrix": bell, "qubits": [0, 1]},
+            "max_bond_dimension": 16,
+            "tensor_module": "torch",
+            "torch_threads": 1,
+        }
+    )
+    value = backend.execute_circuit(circuit)[0]
+
+    assert math.isclose(value, 1.0, abs_tol=1e-12)
+
+
+def test_cpu_generic_tn_long_pauli_string_matches_statevector():
+    circuit = build_benchmark_circuit("rxx_rzz", 10, 2, 42)
+    observable = {"pauli_string_pattern": "XZ"}
+    exact_hamiltonian = hamiltonians.SymbolicHamiltonian(
+        form=X(0) * Z(1) * X(2) * Z(3) * X(4) * Z(5) * X(6) * Z(7) * X(8) * Z(9)
+    )
+    exact = exact_hamiltonian.expectation_from_state(circuit().state(numpy=True))
+
+    backend = CpuTensorNet(
+        {
+            "MPI_enabled": False,
+            "MPS_enabled": False,
+            "NCCL_enabled": False,
+            "expectation_enabled": observable,
+        }
+    )
+    value = backend.execute_circuit(circuit)[0]
+
+    assert math.isclose(value, exact, abs_tol=1e-12)
--- a/tests/test_expectation.py
+++ b/tests/test_expectation.py
@@ -35,7 +35,7 @@ def test_observable_expval(backend, nqubits):
    numpy_backend = construct_backend("numpy")
    ham, ham_form = build_observable(nqubits)
    circ = build_circuit(nqubits=nqubits, nlayers=1)
-
+    
    exact_expval = numpy_backend.calculate_expectation_state(
        hamiltonian=ham,
        state=circ().state(),
--- a/tests/test_parallel.py
+++ b/tests/test_parallel.py
@@ -0,0 +1,46 @@
+import numpy as np
+
+from qibotn.parallel import _split_repeats, contract_tree_slices, mpi_slice_plan
+
+
+def test_mpi_slice_plan_block_balances_contiguous_ranges():
+    plans = [mpi_slice_plan(10, rank, 4, assignment="block") for rank in range(4)]
+
+    assert [plan.indices for plan in plans] == [
+        (0, 1, 2),
+        (3, 4, 5),
+        (6, 7),
+        (8, 9),
+    ]
+
+
+def test_mpi_slice_plan_cyclic_balances_round_robin():
+    plans = [mpi_slice_plan(10, rank, 4, assignment="cyclic") for rank in range(4)]
+
+    assert [plan.indices for plan in plans] == [
+        (0, 4, 8),
+        (1, 5, 9),
+        (2, 6),
+        (3, 7),
+    ]
+
+
+class DummyTree:
+    def contract_slice(self, arrays, i, backend=None):
+        return arrays[0] * (i + 1)
+
+
+def test_contract_tree_slices_sums_numpy_slices():
+    result = contract_tree_slices(
+        DummyTree(),
+        [np.asarray([2.0 + 0.0j])],
+        (0, 2, 3),
+        backend="numpy",
+    )
+
+    np.testing.assert_allclose(result, np.asarray([16.0 + 0.0j]))
+
+
+def test_split_repeats_balances_workers():
+    assert _split_repeats(10, 4) == [3, 3, 2, 2]
+    assert _split_repeats(2, 4) == [1, 1]
--- a/tests/test_vidal_backend.py
+++ b/tests/test_vidal_backend.py
@@ -0,0 +1,400 @@
+import math
+
+import numpy as np
+from qibo import Circuit, gates, hamiltonians
+from qibo.symbols import Symbol, X, Y, Z
+
+from qibotn.benchmark_cases import exact_pauli_sum
+from qibotn.backends.vidal import (
+    VidalBackend,
+    _can_route_non_adjacent,
+    _unsupported_reason,
+    _operator_terms_to_mpo,
+    _symbolic_hamiltonian_to_operator_terms,
+)
+from qibotn.backends.vidal_tebd import (
+    VidalTEBDExecutor,
+    _route_non_adjacent_gates,
+    _gate_sites,
+)
+
+
+def build_local_circuit(nqubits=8, nlayers=3, seed=42):
+    rng = np.random.default_rng(seed)
+    circuit = Circuit(nqubits)
+    for layer in range(nlayers):
+        for q in range(nqubits):
+            circuit.add(gates.RY(q, theta=rng.uniform(-math.pi, math.pi)))
+            circuit.add(gates.RZ(q, theta=rng.uniform(-math.pi, math.pi)))
+        for q in range(layer % 2, nqubits - 1, 2):
+            circuit.add(gates.CNOT(q, q + 1))
+    return circuit
+
+
+def test_vidal_backend_expectation_matches_statevector():
+    circuit = build_local_circuit()
+    observable = hamiltonians.SymbolicHamiltonian(
+        form=0.5 * X(0) * Z(1) + 0.25 * Y(2) * Y(3) - 0.7 * Z(7)
+    )
+    exact = observable.expectation_from_state(circuit().state(numpy=True))
+
+    backend = VidalBackend()
+    backend.configure_tn_simulation(max_bond_dimension=128, tensor_module="torch")
+    value = backend.expectation(circuit, observable)
+
+    np.testing.assert_allclose(value, exact, atol=1e-12)
+
+
+def test_vidal_backend_accepts_unlimited_bond_and_no_cutoff():
+    circuit = build_local_circuit(nqubits=6, nlayers=2)
+    observable = hamiltonians.SymbolicHamiltonian(
+        form=0.5 * X(0) * Z(1) - 0.7 * Z(5)
+    )
+    exact = observable.expectation_from_state(circuit().state(numpy=True))
+
+    backend = VidalBackend()
+    backend.configure_tn_simulation(
+        max_bond_dimension=None,
+        cut_ratio=None,
+        tensor_module="torch",
+        fallback=False,
+    )
+    value = backend.expectation(circuit, observable, preprocess=False)
+
+    np.testing.assert_allclose(value, exact, atol=1e-12)
+
+
+def test_vidal_backend_fallback_for_non_adjacent_gate():
+    """compile_circuit=False (default) → falls back to qmatchatea for non-adjacent."""
+    circuit = Circuit(4)
+    circuit.add(gates.H(0))
+    circuit.add(gates.CNOT(0, 3))
+    observable = hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(3))
+
+    backend = VidalBackend()
+    backend.configure_tn_simulation(max_bond_dimension=32, tensor_module="torch")
+    value = backend.expectation(circuit, observable)
+
+    exact = observable.expectation_from_state(circuit().state(numpy=True))
+    np.testing.assert_allclose(value, exact, atol=1e-12)
+
+
+def test_vidal_backend_routes_non_adjacent_with_compile():
+    """Non-adjacent gate with compile_circuit=True goes through Vidal SWAP routing."""
+    circuit = Circuit(4)
+    circuit.add(gates.H(0))
+    circuit.add(gates.CNOT(0, 3))
+
+    observable = hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(3))
+
+    backend = VidalBackend()
+    backend.configure_tn_simulation(
+        max_bond_dimension=32, tensor_module="torch", compile_circuit=True,
+    )
+    value = backend.expectation(circuit, observable)
+
+    exact = observable.expectation_from_state(circuit().state(numpy=True))
+    np.testing.assert_allclose(value, exact, atol=1e-12)
+
+
+def test_can_route_non_adjacent():
+    """_can_route_non_adjacent correctly identifies routable circuits."""
+    circuit = Circuit(4)
+    circuit.add(gates.H(0))
+    circuit.add(gates.CNOT(0, 3))
+    assert _can_route_non_adjacent(circuit)
+
+    circuit.add(gates.CNOT(0, 1))
+    assert _can_route_non_adjacent(circuit)
+
+
+def test_cannot_route_multi_qubit():
+    """Circuits with 3+ qubit gates cannot be routed."""
+    circuit = Circuit(3)
+    circuit.add(gates.TOFFOLI(0, 1, 2))
+    assert not _can_route_non_adjacent(circuit)
+
+
+def test_routing_preserves_adjacent_gates():
+    """_route_non_adjacent_gates leaves adjacent gates unchanged."""
+    circuit = build_local_circuit(nqubits=4, nlayers=2)
+    original = list(circuit.queue)
+    routed = _route_non_adjacent_gates(original, 4)
+
+    # Count 2Q gates — should be more due to inserted SWAPs, so just
+    # check that all 2-site gates ARE adjacent.
+    for gate in routed:
+        sites = _gate_sites(gate)
+        if len(sites) == 2:
+            diff = abs(sites[0] - sites[1])
+            assert diff == 1, f"Non-adjacent gate after routing: {gate.name} on {sites}"
+
+
+def test_routing_non_adjacent_cnot():
+    """Manually verify SWAP+CNOT+unSWAP for CNOT(0,3)."""
+    circuit = Circuit(4)
+    circuit.add(gates.H(0))
+    circuit.add(gates.H(3))
+    circuit.add(gates.CNOT(0, 3))
+
+    routed = _route_non_adjacent_gates(list(circuit.queue), 4)
+
+    # Expected: H(0), H(3), SWAP(2,3), SWAP(1,2), routed CNOT on (0,1), SWAP(1,2), SWAP(2,3)
+    names = [getattr(g, "name", g.__class__.__name__) for g in routed]
+    assert names == ["h", "h", "swap", "swap", "routed_two_qubit", "swap", "swap"], f"Got {names}"
+
+    # Verify expectation through full pipeline
+    observable = hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(3))
+    exact = observable.expectation_from_state(circuit().state(numpy=True))
+
+    backend = VidalBackend()
+    backend.configure_tn_simulation(
+        max_bond_dimension=32, tensor_module="torch", compile_circuit=True,
+    )
+    value = backend.expectation(circuit, observable)
+    np.testing.assert_allclose(value, exact, atol=1e-12)
+
+
+def test_routing_preserves_reversed_non_adjacent_gate_order():
+    circuit = Circuit(6)
+    circuit.add(gates.X(5))
+    circuit.add(gates.H(0))
+    circuit.add(gates.CNOT(5, 0))
+
+    observable = hamiltonians.SymbolicHamiltonian(form=X(0) + Z(5) + Z(0) * Z(5))
+    exact = observable.expectation_from_state(circuit().state(numpy=True))
+
+    backend = VidalBackend()
+    backend.configure_tn_simulation(
+        max_bond_dimension=64,
+        tensor_module="torch",
+        compile_circuit=True,
+        fallback=False,
+    )
+    value = backend.expectation(circuit, observable, preprocess=False)
+
+    np.testing.assert_allclose(value, exact, atol=1e-12)
+
+
+def test_vidal_backend_preprocesses_non_adjacent_circuit():
+    circuit = Circuit(4)
+    circuit.add(gates.H(0))
+    circuit.add(gates.CNOT(0, 3))
+    observable = hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(3))
+    exact = observable.expectation_from_state(circuit().state(numpy=True))
+
+    backend = VidalBackend()
+    backend.configure_tn_simulation(
+        max_bond_dimension=64,
+        tensor_module="torch",
+        compile_circuit=True,
+        fallback=False,
+    )
+    value = backend.expectation(circuit, observable, preprocess=True)
+
+    np.testing.assert_allclose(value, exact, atol=1e-12)
+
+
+def test_vidal_backend_preprocesses_toffoli_locally():
+    circuit = Circuit(4)
+    circuit.add(gates.H(0))
+    circuit.add(gates.H(1))
+    circuit.add(gates.TOFFOLI(0, 1, 3))
+    observable = hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(3))
+    exact = observable.expectation_from_state(circuit().state(numpy=True))
+
+    backend = VidalBackend()
+    backend.configure_tn_simulation(
+        max_bond_dimension=128,
+        tensor_module="torch",
+        compile_circuit=True,
+        fallback=False,
+    )
+    value = backend.expectation(circuit, observable, preprocess=True)
+
+    np.testing.assert_allclose(value, exact, atol=1e-12)
+
+
+def test_vidal_expectation_preserves_complex_coefficients():
+    circuit = Circuit(1)
+    observable = hamiltonians.SymbolicHamiltonian(form=(1.0 + 2.0j) * Z(0))
+
+    backend = VidalBackend()
+    backend.configure_tn_simulation(
+        max_bond_dimension=8,
+        tensor_module="torch",
+        fallback=False,
+    )
+    value = backend.expectation(circuit, observable, preprocess=False)
+
+    np.testing.assert_allclose(value, 1.0 + 2.0j, atol=1e-12)
+
+
+def test_vidal_expectation_supports_custom_local_symbols():
+    circuit = build_local_circuit(nqubits=4, nlayers=2)
+    a0 = Symbol(0, np.array([[0.2, 1.0], [1.0, -0.3]], dtype=complex), name="A")
+    b2 = Symbol(2, np.array([[0.7, -0.4j], [0.4j, 0.1]], dtype=complex), name="B")
+    a3 = Symbol(3, np.array([[0.5, 0.2], [0.2, -0.8]], dtype=complex), name="A")
+    observable = hamiltonians.SymbolicHamiltonian(form=0.7 * a0 * b2 - 0.4 * a3)
+    exact = observable.expectation_from_state(circuit().state(numpy=True))
+
+    backend = VidalBackend()
+    backend.configure_tn_simulation(
+        max_bond_dimension=64,
+        tensor_module="torch",
+        fallback=False,
+    )
+    value = backend.expectation(circuit, observable, preprocess=False)
+
+    np.testing.assert_allclose(value, exact, atol=1e-12)
+
+
+def test_vidal_executor_mpo_expectation_matches_pauli_sum():
+    circuit = build_local_circuit(nqubits=4, nlayers=2)
+    executor = VidalTEBDExecutor(
+        nqubits=circuit.nqubits,
+        max_bond=64,
+        tensor_module="torch",
+    )
+    executor.run_circuit(circuit)
+
+    x = np.array([[0, 1], [1, 0]], dtype=complex)
+    z = np.array([[1, 0], [0, -1]], dtype=complex)
+    i2 = np.eye(2, dtype=complex)
+    mpo = [
+        x.reshape(1, 2, 2, 1),
+        z.reshape(1, 2, 2, 1),
+        i2.reshape(1, 2, 2, 1),
+        i2.reshape(1, 2, 2, 1),
+    ]
+    mpo_value = executor.expectation_mpo(mpo)
+    pauli_value = executor.expectation_pauli_sum([(1.0, (("X", 0), ("Z", 1)))])
+
+    np.testing.assert_allclose(mpo_value, pauli_value, atol=1e-12)
+
+
+def test_vidal_backend_accepts_mpo_observable_dict():
+    circuit = build_local_circuit(nqubits=4, nlayers=2)
+    x = np.array([[0, 1], [1, 0]], dtype=complex)
+    z = np.array([[1, 0], [0, -1]], dtype=complex)
+    i2 = np.eye(2, dtype=complex)
+    mpo = [
+        x.reshape(1, 2, 2, 1),
+        z.reshape(1, 2, 2, 1),
+        i2.reshape(1, 2, 2, 1),
+        i2.reshape(1, 2, 2, 1),
+    ]
+    exact = exact_pauli_sum(circuit, [(1.0, (("X", 0), ("Z", 1)))], 4)
+
+    backend = VidalBackend()
+    backend.configure_tn_simulation(
+        max_bond_dimension=64,
+        tensor_module="torch",
+        fallback=False,
+    )
+    value = backend.expectation(circuit, {"mpo_tensors": mpo}, preprocess=False)
+
+    np.testing.assert_allclose(value, exact, atol=1e-12)
+
+
+def test_vidal_symbolic_hamiltonian_auto_mpo_matches_operator_sum():
+    circuit = build_local_circuit(nqubits=5, nlayers=2)
+    observable = hamiltonians.SymbolicHamiltonian(
+        form=0.3 * X(0) * Z(1) - 0.2j * Y(2) + 0.7 * Z(3) * X(4)
+    )
+
+    executor = VidalTEBDExecutor(
+        nqubits=circuit.nqubits,
+        max_bond=64,
+        tensor_module="torch",
+    )
+    executor.run_circuit(circuit)
+    terms = _symbolic_hamiltonian_to_operator_terms(observable)
+
+    term_value = executor.expectation_operator_sum(terms)
+    mpo_value = executor.expectation_mpo(_operator_terms_to_mpo(terms, circuit.nqubits))
+
+    np.testing.assert_allclose(mpo_value, term_value, atol=1e-12)
+
+
+def test_vidal_backend_accepts_dense_two_qubit_observable():
+    circuit = Circuit(2)
+    circuit.add(gates.H(0))
+    circuit.add(gates.CNOT(0, 1))
+
+    bell = np.zeros((4, 4), dtype=complex)
+    bell[0, 0] = bell[0, 3] = bell[3, 0] = bell[3, 3] = 0.5
+    observable = {"matrix": bell, "qubits": [0, 1]}
+
+    backend = VidalBackend()
+    backend.configure_tn_simulation(
+        max_bond_dimension=16,
+        tensor_module="torch",
+        fallback=False,
+    )
+    value = backend.expectation(circuit, observable, preprocess=False)
+
+    np.testing.assert_allclose(value, 1.0, atol=1e-12)
+
+
+def test_vidal_backend_dense_observable_preserves_complex_value():
+    circuit = Circuit(2)
+    circuit.add(gates.H(0))
+    circuit.add(gates.H(1))
+
+    op = np.zeros((4, 4), dtype=complex)
+    op[0, 3] = 1.0
+    observable = {"coefficient": 1.0j, "matrix": op, "qubits": [0, 1]}
+
+    backend = VidalBackend()
+    backend.configure_tn_simulation(
+        max_bond_dimension=16,
+        tensor_module="torch",
+        fallback=False,
+    )
+    value = backend.expectation(circuit, observable, preprocess=False)
+
+    np.testing.assert_allclose(value, 0.25j, atol=1e-12)
+
+
+def test_truncation_error_no_truncation():
+    """With large bond, truncation error should be essentially zero."""
+    circuit = build_local_circuit(nqubits=6, nlayers=2)
+    observable = hamiltonians.SymbolicHamiltonian(form=0.5 * X(0) * Z(1))
+
+    backend = VidalBackend()
+    backend.configure_tn_simulation(max_bond_dimension=256, tensor_module="torch")
+    value = backend.expectation(circuit, observable)
+    _ = value  # ensure computation runs
+
+    assert backend.last_truncation_error < 1e-14, (
+        f"Expected near-zero truncation error, got {backend.last_truncation_error}"
+    )
+    assert backend.last_max_truncation_error < 1e-14, (
+        "Expected near-zero max truncation error, got "
+        f"{backend.last_max_truncation_error}"
+    )
+
+
+def test_vidal_backend_matches_statevector_multiterm():
+    """Multi-term observable with non-adjacent gates, compile_circuit=True."""
+    circuit = Circuit(5)
+    for q in range(5):
+        circuit.add(gates.RY(q, theta=0.7))
+        circuit.add(gates.RZ(q, theta=0.3))
+    circuit.add(gates.CNOT(0, 2))
+    circuit.add(gates.CNOT(1, 4))
+
+    observable = hamiltonians.SymbolicHamiltonian(
+        form=(0.3 * X(0) * Z(2) + 0.7 * Y(1) * Y(4) - 0.5 * Z(0) * X(4))
+    )
+
+    exact_state = circuit().state(numpy=True)
+    exact = observable.expectation_from_state(exact_state)
+
+    backend = VidalBackend()
+    backend.configure_tn_simulation(
+        max_bond_dimension=64, tensor_module="torch", compile_circuit=True,
+    )
+    value = backend.expectation(circuit, observable)
+    np.testing.assert_allclose(value, exact, atol=1e-10)
Author	SHA1	Message	Date
jaunatisblue	f93c95b3a1	代码封装 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-05-18 22:58:57 +08:00
jaunatisblue	eed42dcfa9	修改库 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-05-17 23:02:14 +08:00
jaunatisblue	28080dff1d	简化代码；加入.venv下内容 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-05-18 02:47:40 +08:00
jaunatisblue	ef3d7e9ee6	决赛现场脚本 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-05-18 01:37:19 +08:00
jaunatisblue	4c7a10d026	补充 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-05-15 11:11:20 +08:00
jaunatisblue	915c24dc7b	赛前稳定版 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-05-15 09:32:26 +08:00
jaunatisblue	72f95599bb	完善mps的vidal机制,多节点并行;补充tn搜索时dask集群搜索的方式 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-05-12 15:44:19 +08:00
jaunatisblue	aa122964b4	numpy换为torch；修复torch后端计算方式不支持的问题，加速svd；发现并行化err，尝试修复；当前加速比11x,但是最新的并行方式不稳定，可能有精度问题 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-05-10 22:33:46 +08:00
jaunatisblue	fea8e5abc0	更新脚本 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-05-09 21:15:19 +08:00
jaunatisblue	ff96e36cfc	更新脚本和后端 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-05-09 18:36:23 +08:00
jaunatisblue	7cebbb0820	更新baseline Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-05-09 15:21:56 +08:00
jaunatisblue	57d5fbcbb0	mps基础脚本 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-05-09 13:59:39 +08:00
jaunatisblue	5479574502	优化 torch CPU 张量网络收缩路径 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details - torch CPU 收缩默认走 cotengra matmul lowering - 复用 mm/bmm/matmul 输出缓冲区，降低中间张量分配压力 - 仅回收 contiguous tensor，避免非连续 view 进入 workspace - 调整 cotengra 中间节点 index 顺序，减少 reshape 触发 clone/copy - qibotn MPI 分片收缩显式使用 backend=torch - rank 内分片结果先在 torch 中累加，最后再转 numpy 做 Reduce - 统一 quimb 后端 torch 数组转换为 CPU contiguous complex128	2026-05-08 11:57:18 +08:00
jaunatisblue	cec0ba272a	完善脚本功能，添加计时估计	2026-05-08 10:20:03 +08:00
jaunatisblue	8b71ff96c8	误传profiling删除 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-05-08 00:16:28 +08:00
jaunatisblue	49b27a5840	完善时间剪枝功能 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-05-08 00:12:32 +08:00
jaunatisblue	c818ac7a6e	mpi完善 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-05-07 23:37:15 +08:00
jaunatisblue	0a96553bd8	多机测试 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-05-07 23:35:23 +08:00
jaunatisblue	2f5c863952	并行化支持完善 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-05-07 23:26:53 +08:00
jaunatisblue	fbae48eb3d	期望值计算支持；更新后端调用 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-05-07 11:19:45 +08:00
jaunatisblue	f776fbb04f	修复时间剪枝功能 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-05-05 23:31:23 +08:00
jaunatisblue	5a692033a6	添加MPI并行TN benchmark及辅助脚本，移除旧benchmark Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>	2026-05-05 19:04:09 +08:00
jaunatisblue	a3f39a1d67	删除tn脚本implementation	2026-05-03 19:06:21 +08:00
jaunatisblue	dd222587b7	tn脚本更新 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-05-03 18:54:05 +08:00
jaunatisblue	740828872e	添加tn脚本 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-04-28 23:07:39 +08:00
jaunatisblue	80d9c1de5a	benchmark测试，发现瓶颈：路径搜索 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-04-27 18:59:54 +08:00
jaunatisblue	2c54840e7b	1.完成mps态脚本，与原始qibojit结果比对确定bond demension和cut off值；2.更新了官方库；3.新大陆 Some checks failed Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled Details Tests / check (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled Details Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled Details	2026-04-27 11:03:57 +08:00