Merge pull request #74 from qiboteam/update_tn

Update tn
2025-09-08 10:13:43 +08:00
parent 9ba74b58a0 a5b266ed46
commit 0bfe912844
6 changed files with 854 additions and 371 deletions
--- a/src/qibotn/backends/cutensornet.py
+++ b/src/qibotn/backends/cutensornet.py
@@ -1,21 +1,28 @@
 import numpy as np
+from qibo import hamiltonians
 from qibo.backends import NumpyBackend
 from qibo.config import raise_error
-from qibo.result import QuantumState

 from qibotn.backends.abstract import QibotnBackend
-
-CUDA_TYPES = {}
+from qibotn.result import TensorNetworkResult


 class CuTensorNet(QibotnBackend, NumpyBackend):  # pragma: no cover
    # CI does not test for GPU
    """Creates CuQuantum backend for QiboTN."""

-    def __init__(self, runcard):
+    def __init__(self, runcard=None):
        super().__init__()
-        from cuquantum import cutensornet as cutn  # pylint: disable=import-error
+        from cuquantum import __version__  # pylint: disable=import-error

+        self.name = "qibotn"
+        self.platform = "cutensornet"
+        self.versions["cuquantum"] = __version__
+        self.supports_multigpu = True
+        self.configure_tn_simulation(runcard)
+
+    def configure_tn_simulation(self, runcard):
+        self.rank = None
        if runcard is not None:
            self.MPI_enabled = runcard.get("MPI_enabled", False)
            self.NCCL_enabled = runcard.get("NCCL_enabled", False)
@@ -23,15 +30,17 @@ class CuTensorNet(QibotnBackend, NumpyBackend):  # pragma: no cover
            expectation_enabled_value = runcard.get("expectation_enabled")
            if expectation_enabled_value is True:
                self.expectation_enabled = True
-                self.pauli_string_pattern = "XXXZ"
+                self.observable = None
            elif expectation_enabled_value is False:
                self.expectation_enabled = False
            elif isinstance(expectation_enabled_value, dict):
                self.expectation_enabled = True
-                expectation_enabled_dict = runcard.get("expectation_enabled", {})
-                self.pauli_string_pattern = expectation_enabled_dict.get(
-                    "pauli_string_pattern", None
-                )
+                self.observable = runcard.get("expectation_enabled", {})
+            elif isinstance(
+                expectation_enabled_value, hamiltonians.SymbolicHamiltonian
+            ):
+                self.expectation_enabled = True
+                self.observable = expectation_enabled_value
            else:
                raise TypeError("expectation_enabled has an unexpected type")

@@ -59,44 +68,6 @@ class CuTensorNet(QibotnBackend, NumpyBackend):  # pragma: no cover
            self.NCCL_enabled = False
            self.expectation_enabled = False

-        self.name = "qibotn"
-        self.cuquantum = cuquantum
-        self.cutn = cutn
-        self.platform = "cutensornet"
-        self.versions["cuquantum"] = self.cuquantum.__version__
-        self.supports_multigpu = True
-        self.handle = self.cutn.create()
-
-        global CUDA_TYPES
-        CUDA_TYPES = {
-            "complex64": (
-                self.cuquantum.cudaDataType.CUDA_C_32F,
-                self.cuquantum.ComputeType.COMPUTE_32F,
-            ),
-            "complex128": (
-                self.cuquantum.cudaDataType.CUDA_C_64F,
-                self.cuquantum.ComputeType.COMPUTE_64F,
-            ),
-        }
-
-    def __del__(self):
-        if hasattr(self, "cutn"):
-            self.cutn.destroy(self.handle)
-
-    def cuda_type(self, dtype="complex64"):
-        """Get CUDA Type.
-
-        Parameters:
-            dtype (str, optional): Either single ("complex64") or double (complex128) precision. Defaults to "complex64".
-
-        Returns:
-            CUDA Type: tuple of cuquantum.cudaDataType and cuquantum.ComputeType
-        """
-        if dtype in CUDA_TYPES:
-            return CUDA_TYPES[dtype]
-        else:
-            raise TypeError("Type can be either complex64 or complex128")
-
    def execute_circuit(
        self, circuit, initial_state=None, nshots=None, return_array=False
    ):  # pragma: no cover
@@ -136,8 +107,8 @@ class CuTensorNet(QibotnBackend, NumpyBackend):  # pragma: no cover
            and self.NCCL_enabled == False
            and self.expectation_enabled == False
        ):
-            state, rank = eval.dense_vector_tn_MPI(circuit, self.dtype, 32)
-            if rank > 0:
+            state, self.rank = eval.dense_vector_tn_MPI(circuit, self.dtype, 32)
+            if self.rank > 0:
                state = np.array(0)
        elif (
            self.MPI_enabled == False
@@ -145,8 +116,8 @@ class CuTensorNet(QibotnBackend, NumpyBackend):  # pragma: no cover
            and self.NCCL_enabled == True
            and self.expectation_enabled == False
        ):
-            state, rank = eval.dense_vector_tn_nccl(circuit, self.dtype, 32)
-            if rank > 0:
+            state, self.rank = eval.dense_vector_tn_nccl(circuit, self.dtype, 32)
+            if self.rank > 0:
                state = np.array(0)
        elif (
            self.MPI_enabled == False
@@ -154,19 +125,17 @@ class CuTensorNet(QibotnBackend, NumpyBackend):  # pragma: no cover
            and self.NCCL_enabled == False
            and self.expectation_enabled == True
        ):
-            state = eval.expectation_pauli_tn(
-                circuit, self.dtype, self.pauli_string_pattern
-            )
+            state = eval.expectation_tn(circuit, self.dtype, self.observable)
        elif (
            self.MPI_enabled == True
            and self.MPS_enabled == False
            and self.NCCL_enabled == False
            and self.expectation_enabled == True
        ):
-            state, rank = eval.expectation_pauli_tn_MPI(
-                circuit, self.dtype, self.pauli_string_pattern, 32
+            state, self.rank = eval.expectation_tn_MPI(
+                circuit, self.dtype, self.observable, 32
            )
-            if rank > 0:
+            if self.rank > 0:
                state = np.array(0)
        elif (
            self.MPI_enabled == False
@@ -174,15 +143,27 @@ class CuTensorNet(QibotnBackend, NumpyBackend):  # pragma: no cover
            and self.NCCL_enabled == True
            and self.expectation_enabled == True
        ):
-            state, rank = eval.expectation_pauli_tn_nccl(
-                circuit, self.dtype, self.pauli_string_pattern, 32
+            state, self.rank = eval.expectation_tn_nccl(
+                circuit, self.dtype, self.observable, 32
            )
-            if rank > 0:
+            if self.rank > 0:
                state = np.array(0)
        else:
            raise_error(NotImplementedError, "Compute type not supported.")

-        if return_array:
-            return state.flatten()
+        if self.expectation_enabled:
+            return state.flatten().real
        else:
-            return QuantumState(state.flatten())
+            if return_array:
+                statevector = state.flatten()
+            else:
+                statevector = state
+
+            return TensorNetworkResult(
+                nqubits=circuit.nqubits,
+                backend=self,
+                measures=None,
+                measured_probabilities=None,
+                prob_type=None,
+                statevector=statevector,
+            )
--- a/src/qibotn/circuit_convertor.py
+++ b/src/qibotn/circuit_convertor.py
@@ -195,12 +195,12 @@ class QiboCircuitToEinsum:
            gates.append((operand, (qubit,)))
        return gates

-    def expectation_operands(self, pauli_string):
+    def expectation_operands(self, ham_gates):
        """Create the operands for pauli string expectation computation in the
        interleave format.

        Parameters:
-            pauli_string: A string representating the list of pauli gates.
+            ham_gates: A list of gates derived from Qibo hamiltonian object.

        Returns:
            Operands for the contraction in the interleave format.
@@ -208,8 +208,6 @@ class QiboCircuitToEinsum:
        input_bitstring = "0" * self.circuit.nqubits

        input_operands = self._get_bitstring_tensors(input_bitstring)
-        pauli_string = dict(zip(range(self.circuit.nqubits), pauli_string))
-        pauli_map = pauli_string

        (
            mode_labels,
@@ -228,11 +226,7 @@ class QiboCircuitToEinsum:

        next_frontier = max(qubits_frontier.values()) + 1

-        pauli_gates = self.get_pauli_gates(
-            pauli_map, dtype=self.dtype, backend=self.backend
-        )
-
-        gates_inverse = pauli_gates + self.gate_tensors_inverse
+        gates_inverse = ham_gates + self.gate_tensors_inverse

        (
            gate_mode_labels_inverse,
--- a/src/qibotn/eval.py
+++ b/src/qibotn/eval.py
@@ -1,45 +1,238 @@
 import cupy as cp
+import cuquantum.cutensornet as cutn
+from cupy.cuda import nccl
 from cupy.cuda.runtime import getDeviceCount
-from cuquantum import contract
+from cuquantum import Network, contract
+from mpi4py import MPI
+from qibo import hamiltonians
+from qibo.symbols import I, X, Y, Z

 from qibotn.circuit_convertor import QiboCircuitToEinsum
 from qibotn.circuit_to_mps import QiboCircuitToMPS
 from qibotn.mps_contraction_helper import MPSContractionHelper


-def dense_vector_tn(qibo_circ, datatype):
-    """Convert qibo circuit to tensornet (TN) format and perform contraction to
-    dense vector.
+def check_observable(observable, circuit_nqubit):
+    """Checks the type of observable and returns the appropriate Hamiltonian."""
+    if observable is None:
+        return build_observable(circuit_nqubit)
+    elif isinstance(observable, dict):
+        return create_hamiltonian_from_dict(observable, circuit_nqubit)
+    elif isinstance(observable, hamiltonians.SymbolicHamiltonian):
+        # TODO: check if the observable is compatible with the circuit
+        return observable
+    else:
+        raise TypeError("Invalid observable type.")

-    Parameters:
-        qibo_circ: The quantum circuit object.
-        datatype (str): Either single ("complex64") or double (complex128) precision.
+
+def build_observable(circuit_nqubit):
+    """Helper function to construct a target observable."""
+    hamiltonian_form = 0
+    for i in range(circuit_nqubit):
+        hamiltonian_form += 0.5 * X(i % circuit_nqubit) * Z((i + 1) % circuit_nqubit)
+
+    hamiltonian = hamiltonians.SymbolicHamiltonian(form=hamiltonian_form)
+    return hamiltonian
+
+
+def create_hamiltonian_from_dict(data, circuit_nqubit):
+    """Create a Qibo SymbolicHamiltonian from a dictionary representation.
+
+    Ensures that each Hamiltonian term explicitly acts on all circuit qubits
+    by adding identity (`I`) gates where needed.
+
+    Args:
+        data (dict): Dictionary containing Hamiltonian terms.
+        circuit_nqubit (int): Total number of qubits in the quantum circuit.

    Returns:
-        Dense vector of quantum circuit.
+        hamiltonians.SymbolicHamiltonian: The constructed Hamiltonian.
    """
-    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-    return contract(*myconvertor.state_vector_operands())
+    PAULI_GATES = {"X": X, "Y": Y, "Z": Z}
+
+    terms = []
+
+    for term in data["terms"]:
+        coeff = term["coefficient"]
+        operators = term["operators"]  # List of tuples like [("Z", 0), ("X", 1)]
+
+        # Convert the operator list into a dictionary {qubit_index: gate}
+        operator_dict = {q: PAULI_GATES[g] for g, q in operators}
+
+        # Build the full term ensuring all qubits are covered
+        full_term_expr = [
+            operator_dict[q](q) if q in operator_dict else I(q)
+            for q in range(circuit_nqubit)
+        ]
+
+        # Multiply all operators together to form a single term
+        term_expr = full_term_expr[0]
+        for op in full_term_expr[1:]:
+            term_expr *= op
+
+        # Scale by the coefficient
+        final_term = coeff * term_expr
+        terms.append(final_term)
+
+    if not terms:
+        raise ValueError("No valid Hamiltonian terms were added.")
+
+    # Combine all terms
+    hamiltonian_form = sum(terms)
+
+    return hamiltonians.SymbolicHamiltonian(hamiltonian_form)


-def expectation_pauli_tn(qibo_circ, datatype, pauli_string_pattern):
-    """Convert qibo circuit to tensornet (TN) format and perform contraction to
-    expectation of given Pauli string.
+def get_ham_gates(pauli_map, dtype="complex128", backend=cp):
+    """Populate the gates for all pauli operators.

    Parameters:
-        qibo_circ: The quantum circuit object.
-        datatype (str): Either single ("complex64") or double (complex128) precision.
-        pauli_string_pattern(str): pauli string pattern.
+        pauli_map: A dictionary mapping qubits to pauli operators.
+        dtype: Data type for the tensor operands.
+        backend: The package the tensor operands belong to.

    Returns:
-        Expectation of quantum circuit due to pauli string.
+        A sequence of pauli gates.
    """
-    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-    return contract(
-        *myconvertor.expectation_operands(
-            pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
+    asarray = backend.asarray
+    pauli_i = asarray([[1, 0], [0, 1]], dtype=dtype)
+    pauli_x = asarray([[0, 1], [1, 0]], dtype=dtype)
+    pauli_y = asarray([[0, -1j], [1j, 0]], dtype=dtype)
+    pauli_z = asarray([[1, 0], [0, -1]], dtype=dtype)
+
+    operand_map = {"I": pauli_i, "X": pauli_x, "Y": pauli_y, "Z": pauli_z}
+    gates = []
+    for qubit, pauli_char, coeff in pauli_map:
+        operand = operand_map.get(pauli_char)
+        if operand is None:
+            raise ValueError("pauli string character must be one of I/X/Y/Z")
+        operand = coeff * operand
+        gates.append((operand, (qubit,)))
+    return gates
+
+
+def extract_gates_and_qubits(hamiltonian):
+    """
+    Extracts the gates and their corresponding qubits from a Qibo Hamiltonian.
+
+    Parameters:
+        hamiltonian (qibo.hamiltonians.Hamiltonian or qibo.hamiltonians.SymbolicHamiltonian):
+            A Qibo Hamiltonian object.
+
+    Returns:
+        list of tuples: [(coefficient, [(gate, qubit), ...]), ...]
+            - coefficient: The prefactor of the term.
+            - list of (gate, qubit): Each term's gates and the qubits they act on.
+    """
+    extracted_terms = []
+
+    if isinstance(hamiltonian, hamiltonians.SymbolicHamiltonian):
+        for term in hamiltonian.terms:
+            coeff = term.coefficient  # Extract coefficient
+            gate_qubit_list = []
+
+            # Extract gate and qubit information
+            for factor in term.factors:
+                gate_name = str(factor)[
+                    0
+                ]  # Extract the gate type (X, Y, Z) from 'X0', 'Z1'
+                qubit = int(str(factor)[1:])  # Extract the qubit index
+                gate_qubit_list.append((qubit, gate_name, coeff))
+                coeff = 1.0
+
+            extracted_terms.append(gate_qubit_list)
+
+    else:
+        raise ValueError(
+            "Unsupported Hamiltonian type. Must be SymbolicHamiltonian or Hamiltonian."
        )
+
+    return extracted_terms
+
+
+def initialize_mpi():
+    """Initialize MPI communication and device selection."""
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    size = comm.Get_size()
+    device_id = rank % getDeviceCount()
+    cp.cuda.Device(device_id).use()
+    return comm, rank, size, device_id
+
+
+def initialize_nccl(comm_mpi, rank, size):
+    """Initialize NCCL communication."""
+    nccl_id = nccl.get_unique_id() if rank == 0 else None
+    nccl_id = comm_mpi.bcast(nccl_id, root=0)
+    return nccl.NcclCommunicator(size, nccl_id, rank)
+
+
+def get_operands(qibo_circ, datatype, rank, comm):
+    """Perform circuit conversion and broadcast operands."""
+    if rank == 0:
+        myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
+        operands = myconvertor.state_vector_operands()
+    else:
+        operands = None
+    return comm.bcast(operands, root=0)
+
+
+def compute_optimal_path(network, n_samples, size, comm):
+    """Compute contraction path and broadcast optimal selection."""
+    path, info = network.contract_path(
+        optimize={
+            "samples": n_samples,
+            "slicing": {
+                "min_slices": max(32, size),
+                "memory_model": cutn.MemoryModel.CUTENSOR,
+            },
+        }
    )
+    opt_cost, sender = comm.allreduce(
+        sendobj=(info.opt_cost, comm.Get_rank()), op=MPI.MINLOC
+    )
+    return comm.bcast(info, sender)
+
+
+def compute_slices(info, rank, size):
+    """Determine the slice range each process should compute."""
+    num_slices = info.num_slices
+    chunk, extra = num_slices // size, num_slices % size
+    slice_begin = rank * chunk + min(rank, extra)
+    slice_end = (
+        num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
+    )
+    return range(slice_begin, slice_end)
+
+
+def reduce_result(result, comm, method="MPI", root=0):
+    """Reduce results across processes."""
+    if method == "MPI":
+        return comm.reduce(sendobj=result, op=MPI.SUM, root=root)
+
+    elif method == "NCCL":
+        stream_ptr = cp.cuda.get_current_stream().ptr
+        if result.dtype == cp.complex128:
+            count = result.size * 2  # complex128 has 2 float64 numbers
+            nccl_type = nccl.NCCL_FLOAT64
+        elif result.dtype == cp.complex64:
+            count = result.size * 2  # complex64 has 2 float32 numbers
+            nccl_type = nccl.NCCL_FLOAT32
+        else:
+            raise TypeError(f"Unsupported dtype for NCCL reduce: {result.dtype}")
+
+        comm.reduce(
+            result.data.ptr,
+            result.data.ptr,
+            count,
+            nccl_type,
+            nccl.NCCL_SUM,
+            root,
+            stream_ptr,
+        )
+        return result
+    else:
+        raise ValueError(f"Unknown reduce method: {method}")


 def dense_vector_tn_MPI(qibo_circ, datatype, n_samples=8):
@@ -61,60 +254,16 @@ def dense_vector_tn_MPI(qibo_circ, datatype, n_samples=8):
    Returns:
        Dense vector of quantum circuit.
    """
-
-    from cuquantum import Network
-    from mpi4py import MPI
-
-    root = 0
-    comm = MPI.COMM_WORLD
-    rank = comm.Get_rank()
-    size = comm.Get_size()
-
-    device_id = rank % getDeviceCount()
-
-    # Perform circuit conversion
-    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-
-    operands = myconvertor.state_vector_operands()
-
-    # Assign the device for each process.
-    device_id = rank % getDeviceCount()
-
-    # Create network object.
+    comm, rank, size, device_id = initialize_mpi()
+    operands = get_operands(qibo_circ, datatype, rank, comm)
    network = Network(*operands, options={"device_id": device_id})
-
-    # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
-    path, info = network.contract_path(
-        optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
-    )
-
-    # Select the best path from all ranks.
-    opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
-
-    # Broadcast info from the sender to all other ranks.
-    info = comm.bcast(info, sender)
-
-    # Set path and slices.
+    info = compute_optimal_path(network, n_samples, size, comm)
    path, info = network.contract_path(
        optimize={"path": info.path, "slicing": info.slices}
    )
-
-    # Calculate this process's share of the slices.
-    num_slices = info.num_slices
-    chunk, extra = num_slices // size, num_slices % size
-    slice_begin = rank * chunk + min(rank, extra)
-    slice_end = (
-        num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
-    )
-    slices = range(slice_begin, slice_end)
-
-    # Contract the group of slices the process is responsible for.
+    slices = compute_slices(info, rank, size)
    result = network.contract(slices=slices)
-
-    # Sum the partial contribution from each process on root.
-    result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)
-
-    return result, rank
+    return reduce_result(result, comm, method="MPI"), rank


 def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):
@@ -136,74 +285,35 @@ def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):
    Returns:
        Dense vector of quantum circuit.
    """
-    from cupy.cuda import nccl
-    from cuquantum import Network
-    from mpi4py import MPI
-
-    root = 0
-    comm_mpi = MPI.COMM_WORLD
-    rank = comm_mpi.Get_rank()
-    size = comm_mpi.Get_size()
-
-    device_id = rank % getDeviceCount()
-
-    cp.cuda.Device(device_id).use()
-
-    # Set up the NCCL communicator.
-    nccl_id = nccl.get_unique_id() if rank == root else None
-    nccl_id = comm_mpi.bcast(nccl_id, root)
-    comm_nccl = nccl.NcclCommunicator(size, nccl_id, rank)
-
-    # Perform circuit conversion
-    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-    operands = myconvertor.state_vector_operands()
-
+    comm_mpi, rank, size, device_id = initialize_mpi()
+    comm_nccl = initialize_nccl(comm_mpi, rank, size)
+    operands = get_operands(qibo_circ, datatype, rank, comm_mpi)
    network = Network(*operands)
-
-    # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
-    path, info = network.contract_path(
-        optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
-    )
-
-    # Select the best path from all ranks.
-    opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
-
-    # Broadcast info from the sender to all other ranks.
-    info = comm_mpi.bcast(info, sender)
-
-    # Set path and slices.
+    info = compute_optimal_path(network, n_samples, size, comm_mpi)
    path, info = network.contract_path(
        optimize={"path": info.path, "slicing": info.slices}
    )
-
-    # Calculate this process's share of the slices.
-    num_slices = info.num_slices
-    chunk, extra = num_slices // size, num_slices % size
-    slice_begin = rank * chunk + min(rank, extra)
-    slice_end = (
-        num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
-    )
-    slices = range(slice_begin, slice_end)
-
-    # Contract the group of slices the process is responsible for.
+    slices = compute_slices(info, rank, size)
    result = network.contract(slices=slices)
-
-    # Sum the partial contribution from each process on root.
-    stream_ptr = cp.cuda.get_current_stream().ptr
-    comm_nccl.reduce(
-        result.data.ptr,
-        result.data.ptr,
-        result.size,
-        nccl.NCCL_FLOAT64,
-        nccl.NCCL_SUM,
-        root,
-        stream_ptr,
-    )
-
-    return result, rank
+    return reduce_result(result, comm_nccl, method="NCCL"), rank


-def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_samples=8):
+def dense_vector_tn(qibo_circ, datatype):
+    """Convert qibo circuit to tensornet (TN) format and perform contraction to
+    dense vector.
+
+    Parameters:
+        qibo_circ: The quantum circuit object.
+        datatype (str): Either single ("complex64") or double (complex128) precision.
+
+    Returns:
+        Dense vector of quantum circuit.
+    """
+    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
+    return contract(*myconvertor.state_vector_operands())
+
+
+def expectation_tn_nccl(qibo_circ, datatype, observable, n_samples=8):
    """Convert qibo circuit to tensornet (TN) format and perform contraction to
    expectation of given Pauli string using multi node and multi GPU through
    NCCL.
@@ -226,76 +336,53 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_sampl
    Returns:
        Expectation of quantum circuit due to pauli string.
    """
-    from cupy.cuda import nccl
-    from cuquantum import Network
-    from mpi4py import MPI

-    root = 0
-    comm_mpi = MPI.COMM_WORLD
-    rank = comm_mpi.Get_rank()
-    size = comm_mpi.Get_size()
+    comm_mpi, rank, size, device_id = initialize_mpi()

-    device_id = rank % getDeviceCount()
+    comm_nccl = initialize_nccl(comm_mpi, rank, size)

-    cp.cuda.Device(device_id).use()
+    observable = check_observable(observable, qibo_circ.nqubits)

-    # Set up the NCCL communicator.
-    nccl_id = nccl.get_unique_id() if rank == root else None
-    nccl_id = comm_mpi.bcast(nccl_id, root)
-    comm_nccl = nccl.NcclCommunicator(size, nccl_id, rank)
+    ham_gate_map = extract_gates_and_qubits(observable)

-    # Perform circuit conversion
-    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-    operands = myconvertor.expectation_operands(
-        pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
-    )
+    if rank == 0:
+        myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)

-    network = Network(*operands)
+    exp = 0
+    for each_ham in ham_gate_map:
+        ham_gates = get_ham_gates(each_ham)
+        # Perform circuit conversion
+        if rank == 0:
+            operands = myconvertor.expectation_operands(ham_gates)
+        else:
+            operands = None

-    # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
-    path, info = network.contract_path(
-        optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
-    )
+        operands = comm_mpi.bcast(operands, root=0)

-    # Select the best path from all ranks.
-    opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
+        network = Network(*operands)

-    # Broadcast info from the sender to all other ranks.
-    info = comm_mpi.bcast(info, sender)
+        # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
+        info = compute_optimal_path(network, n_samples, size, comm_mpi)

-    # Set path and slices.
-    path, info = network.contract_path(
-        optimize={"path": info.path, "slicing": info.slices}
-    )
+        # Recompute path with the selected optimal settings
+        path, info = network.contract_path(
+            optimize={"path": info.path, "slicing": info.slices}
+        )

-    # Calculate this process's share of the slices.
-    num_slices = info.num_slices
-    chunk, extra = num_slices // size, num_slices % size
-    slice_begin = rank * chunk + min(rank, extra)
-    slice_end = (
-        num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
-    )
-    slices = range(slice_begin, slice_end)
+        slices = compute_slices(info, rank, size)

-    # Contract the group of slices the process is responsible for.
-    result = network.contract(slices=slices)
+        # Contract the group of slices the process is responsible for.
+        result = network.contract(slices=slices)

-    # Sum the partial contribution from each process on root.
-    stream_ptr = cp.cuda.get_current_stream().ptr
-    comm_nccl.reduce(
-        result.data.ptr,
-        result.data.ptr,
-        result.size,
-        nccl.NCCL_FLOAT64,
-        nccl.NCCL_SUM,
-        root,
-        stream_ptr,
-    )
+        # Sum the partial contribution from each process on root.
+        result = reduce_result(result, comm_nccl, method="NCCL", root=0)

-    return result, rank
+        exp += result
+
+    return exp, rank


-def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string_pattern, n_samples=8):
+def expectation_tn_MPI(qibo_circ, datatype, observable, n_samples=8):
    """Convert qibo circuit to tensornet (TN) format and perform contraction to
    expectation of given Pauli string using multi node and multi GPU through
    MPI.
@@ -318,61 +405,76 @@ def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string_pattern, n_sample
    Returns:
        Expectation of quantum circuit due to pauli string.
    """
-    from cuquantum import Network
-    from mpi4py import MPI  # this line initializes MPI
+    # Initialize MPI and device
+    comm, rank, size, device_id = initialize_mpi()

-    root = 0
-    comm = MPI.COMM_WORLD
-    rank = comm.Get_rank()
-    size = comm.Get_size()
+    observable = check_observable(observable, qibo_circ.nqubits)

-    device_id = rank % getDeviceCount()
+    ham_gate_map = extract_gates_and_qubits(observable)

-    # Perform circuit conversion
+    if rank == 0:
+        myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
+    exp = 0
+    for each_ham in ham_gate_map:
+        ham_gates = get_ham_gates(each_ham)
+        # Perform circuit conversion
+        # Perform circuit conversion
+        if rank == 0:
+            operands = myconvertor.expectation_operands(ham_gates)
+        else:
+            operands = None
+
+        operands = comm.bcast(operands, root=0)
+
+        # Create network object.
+        network = Network(*operands, options={"device_id": device_id})
+
+        # Compute optimal contraction path
+        info = compute_optimal_path(network, n_samples, size, comm)
+
+        # Set path and slices.
+        path, info = network.contract_path(
+            optimize={"path": info.path, "slicing": info.slices}
+        )
+
+        # Compute slice range for each rank
+        slices = compute_slices(info, rank, size)
+
+        # Perform contraction
+        result = network.contract(slices=slices)
+
+        # Sum the partial contribution from each process on root.
+        result = reduce_result(result, comm, method="MPI", root=0)
+
+        if rank == 0:
+            exp += result
+
+    return exp, rank
+
+
+def expectation_tn(qibo_circ, datatype, observable):
+    """Convert qibo circuit to tensornet (TN) format and perform contraction to
+    expectation of given Pauli string.
+
+    Parameters:
+        qibo_circ: The quantum circuit object.
+        datatype (str): Either single ("complex64") or double (complex128) precision.
+        pauli_string_pattern(str): pauli string pattern.
+
+    Returns:
+        Expectation of quantum circuit due to pauli string.
+    """
    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)

-    operands = myconvertor.expectation_operands(
-        pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
-    )
+    observable = check_observable(observable, qibo_circ.nqubits)

-    # Assign the device for each process.
-    device_id = rank % getDeviceCount()
-
-    # Create network object.
-    network = Network(*operands, options={"device_id": device_id})
-
-    # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
-    path, info = network.contract_path(
-        optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
-    )
-
-    # Select the best path from all ranks.
-    opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
-
-    # Broadcast info from the sender to all other ranks.
-    info = comm.bcast(info, sender)
-
-    # Set path and slices.
-    path, info = network.contract_path(
-        optimize={"path": info.path, "slicing": info.slices}
-    )
-
-    # Calculate this process's share of the slices.
-    num_slices = info.num_slices
-    chunk, extra = num_slices // size, num_slices % size
-    slice_begin = rank * chunk + min(rank, extra)
-    slice_end = (
-        num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
-    )
-    slices = range(slice_begin, slice_end)
-
-    # Contract the group of slices the process is responsible for.
-    result = network.contract(slices=slices)
-
-    # Sum the partial contribution from each process on root.
-    result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)
-
-    return result, rank
+    ham_gate_map = extract_gates_and_qubits(observable)
+    exp = 0
+    for each_ham in ham_gate_map:
+        ham_gates = get_ham_gates(each_ham)
+        expectation_operands = myconvertor.expectation_operands(ham_gates)
+        exp += contract(*expectation_operands)
+    return exp


 def dense_vector_mps(qibo_circ, gate_algo, datatype):
@@ -393,27 +495,3 @@ def dense_vector_mps(qibo_circ, gate_algo, datatype):
    return mps_helper.contract_state_vector(
        myconvertor.mps_tensors, {"handle": myconvertor.handle}
    )
-
-
-def pauli_string_gen(nqubits, pauli_string_pattern):
-    """Used internally to generate the string based on given pattern and number
-    of qubit.
-
-    Parameters:
-        nqubits(int): Number of qubits of Quantum Circuit
-        pauli_string_pattern(str): Strings representing sequence of pauli gates.
-
-    Returns:
-        String representation of the actual pauli string from the pattern.
-
-    Example: pattern: "XZ", number of qubit: 7, output = XZXZXZX
-    """
-    if nqubits <= 0:
-        return "Invalid input. N should be a positive integer."
-
-    result = ""
-
-    for i in range(nqubits):
-        char_to_add = pauli_string_pattern[i % len(pauli_string_pattern)]
-        result += char_to_add
-    return result
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -9,16 +9,16 @@ import pytest

 # backends to be tested
 # TODO: add cutensornet and quimb here as well
-BACKENDS = ["qmatchatea"]
+BACKENDS = ["cutensornet"]
+# BACKENDS = ["qmatchatea"]


 def get_backend(backend_name):

+    from qibotn.backends.cutensornet import CuTensorNet
    from qibotn.backends.qmatchatea import QMatchaTeaBackend

-    NAME2BACKEND = {
-        "qmatchatea": QMatchaTeaBackend,
-    }
+    NAME2BACKEND = {"qmatchatea": QMatchaTeaBackend, "cutensornet": CuTensorNet}

    return NAME2BACKEND[backend_name]()

--- a/tests/test_cuquantum_cutensor_backend.py
+++ b/tests/test_cuquantum_cutensor_backend.py
@@ -1,11 +1,13 @@
-from timeit import default_timer as timer
+import math

-import config
 import cupy as cp
-import numpy as np
 import pytest
 import qibo
+from qibo import construct_backend, hamiltonians
 from qibo.models import QFT
+from qibo.symbols import X, Z
+
+ABS_TOL = 1e-7


 def qibo_qft(nqubits, swaps):
@@ -14,37 +16,73 @@ def qibo_qft(nqubits, swaps):
    return circ_qibo, state_vec


-def time(func):
-    start = timer()
-    res = func()
-    end = timer()
-    time = end - start
-    return time, res
+def build_observable(nqubits):
+    """Helper function to construct a target observable."""
+    hamiltonian_form = 0
+    for i in range(nqubits):
+        hamiltonian_form += 0.5 * X(i % nqubits) * Z((i + 1) % nqubits)
+
+    hamiltonian = hamiltonians.SymbolicHamiltonian(form=hamiltonian_form)
+    return hamiltonian, hamiltonian_form
+
+
+def build_observable_dict(nqubits):
+    """Construct a target observable as a dictionary representation.
+
+    Returns a dictionary suitable for `create_hamiltonian_from_dict`.
+    """
+    terms = []
+
+    for i in range(nqubits):
+        term = {
+            "coefficient": 0.5,
+            "operators": [("X", i % nqubits), ("Z", (i + 1) % nqubits)],
+        }
+        terms.append(term)
+
+    return {"terms": terms}


@pytest.mark.gpu
@pytest.mark.parametrize("nqubits", [1, 2, 5, 10])
 def test_eval(nqubits: int, dtype="complex128"):
-    """Evaluate QASM with cuQuantum.
-
+    """
    Args:
        nqubits (int): Total number of qubits in the system.
        dtype (str): The data type for precision, 'complex64' for single,
            'complex128' for double.
    """
-    import qibotn.eval
-
    # Test qibo
-    qibo.set_backend(backend=config.qibo.backend, platform=config.qibo.platform)
-    qibo_time, (qibo_circ, result_sv) = time(lambda: qibo_qft(nqubits, swaps=True))
+    qibo.set_backend(backend="numpy")
+    qibo_circ, result_sv = qibo_qft(nqubits, swaps=True)
+    result_sv_cp = cp.asarray(result_sv)

-    # Test Cuquantum
-    cutn_time, result_tn = time(
-        lambda: qibotn.eval.dense_vector_tn(qibo_circ, dtype).flatten()
+    # Test cutensornet
+    backend = construct_backend(backend="qibotn", platform="cutensornet")
+    # Test with no settings specified. Default is dense vector calculation without MPI or NCCL.
+    result_tn = backend.execute_circuit(circuit=qibo_circ)
+    print(
+        f"State vector difference: {abs(result_tn.statevector.flatten() - result_sv_cp).max():0.3e}"
    )
+    assert cp.allclose(
+        result_sv_cp, result_tn.statevector.flatten()
+    ), "Resulting dense vectors do not match"

-    assert 1e-2 * qibo_time < cutn_time < 1e2 * qibo_time
-    assert np.allclose(result_sv, result_tn), "Resulting dense vectors do not match"
+    # Test with explicit settings specified.
+    comp_set_w_bool = {
+        "MPI_enabled": False,
+        "MPS_enabled": False,
+        "NCCL_enabled": False,
+        "expectation_enabled": False,
+    }
+    backend.configure_tn_simulation(comp_set_w_bool)
+    result_tn = backend.execute_circuit(circuit=qibo_circ)
+    print(
+        f"State vector difference: {abs(result_tn.statevector.flatten() - result_sv_cp).max():0.3e}"
+    )
+    assert cp.allclose(
+        result_sv_cp, result_tn.statevector.flatten()
+    ), "Resulting dense vectors do not match"


@pytest.mark.gpu
@@ -57,28 +95,105 @@ def test_mps(nqubits: int, dtype="complex128"):
        dtype (str): The data type for precision, 'complex64' for single,
            'complex128' for double.
    """
-    import qibotn.eval

    # Test qibo
-    qibo.set_backend(backend=config.qibo.backend, platform=config.qibo.platform)
-
-    qibo_time, (circ_qibo, result_sv) = time(lambda: qibo_qft(nqubits, swaps=True))
-
+    qibo.set_backend(backend="numpy")
+    qibo_circ, result_sv = qibo_qft(nqubits, swaps=True)
    result_sv_cp = cp.asarray(result_sv)

-    # Test of MPS
-    gate_algo = {
-        "qr_method": False,
-        "svd_method": {
-            "partition": "UV",
-            "abs_cutoff": 1e-12,
-        },
+    # Test cutensornet
+    backend = construct_backend(backend="qibotn", platform="cutensornet")
+    # Test with simple MPS settings specified using bool. Uses the default MPS parameters.
+    comp_set_w_bool = {
+        "MPI_enabled": False,
+        "MPS_enabled": True,
+        "NCCL_enabled": False,
+        "expectation_enabled": False,
    }
+    backend.configure_tn_simulation(comp_set_w_bool)
+    result_tn = backend.execute_circuit(circuit=qibo_circ)
+    print(
+        f"State vector difference: {abs(result_tn.statevector.flatten() - result_sv_cp).max():0.3e}"
+    )
+    assert cp.allclose(
+        result_tn.statevector.flatten(), result_sv_cp
+    ), "Resulting dense vectors do not match"

-    cutn_time, result_tn = time(
-        lambda: qibotn.eval.dense_vector_mps(circ_qibo, gate_algo, dtype).flatten()
+    # Test with explicit MPS computation settings specified using Dict. Users able to specify parameters like qr_method etc.
+    comp_set_w_MPS_config_para = {
+        "MPI_enabled": False,
+        "MPS_enabled": {
+            "qr_method": False,
+            "svd_method": {
+                "partition": "UV",
+                "abs_cutoff": 1e-12,
+            },
+        },
+        "NCCL_enabled": False,
+        "expectation_enabled": False,
+    }
+    backend.configure_tn_simulation(comp_set_w_MPS_config_para)
+    result_tn = backend.execute_circuit(circuit=qibo_circ)
+    print(
+        f"State vector difference: {abs(result_tn.statevector.flatten() - result_sv_cp).max():0.3e}"
+    )
+    assert cp.allclose(
+        result_tn.statevector.flatten(), result_sv_cp
+    ), "Resulting dense vectors do not match"
+
+
+@pytest.mark.parametrize("nqubits", [2, 5, 10])
+def test_expectation(nqubits: int, dtype="complex128"):
+
+    # Test qibo
+    qibo_circ, state_vec_qibo = qibo_qft(nqubits, swaps=True)
+    ham, ham_form = build_observable(nqubits)
+    numpy_backend = construct_backend("numpy")
+    exact_expval = numpy_backend.calculate_expectation_state(
+        hamiltonian=ham,
+        state=state_vec_qibo,
+        normalize=False,
    )

-    print(f"State vector difference: {abs(result_tn - result_sv_cp).max():0.3e}")
+    # Test cutensornet
+    backend = construct_backend(backend="qibotn", platform="cutensornet")

-    assert cp.allclose(result_tn, result_sv_cp)
+    # Test with simple settings using bool. Uses default Hamilitonian for expectation calculation.
+    comp_set_w_bool = {
+        "MPI_enabled": False,
+        "MPS_enabled": False,
+        "NCCL_enabled": False,
+        "expectation_enabled": True,
+    }
+    backend.configure_tn_simulation(comp_set_w_bool)
+    result_tn = backend.execute_circuit(circuit=qibo_circ)
+    assert math.isclose(
+        exact_expval.item(), result_tn.real.get().item(), abs_tol=ABS_TOL
+    )
+
+    # Test with user defined hamiltonian using "hamiltonians.SymbolicHamiltonian" object.
+    comp_set_w_hamiltonian_obj = {
+        "MPI_enabled": False,
+        "MPS_enabled": False,
+        "NCCL_enabled": False,
+        "expectation_enabled": ham,
+    }
+    backend.configure_tn_simulation(comp_set_w_hamiltonian_obj)
+    result_tn = backend.execute_circuit(circuit=qibo_circ)
+    assert math.isclose(
+        exact_expval.item(), result_tn.real.get().item(), abs_tol=ABS_TOL
+    )
+
+    # Test with user defined hamiltonian using Dictionary object form of hamiltonian.
+    ham_dict = build_observable_dict(nqubits)
+    comp_set_w_hamiltonian_dict = {
+        "MPI_enabled": False,
+        "MPS_enabled": False,
+        "NCCL_enabled": False,
+        "expectation_enabled": ham_dict,
+    }
+    backend.configure_tn_simulation(comp_set_w_hamiltonian_dict)
+    result_tn = backend.execute_circuit(circuit=qibo_circ)
+    assert math.isclose(
+        exact_expval.item(), result_tn.real.get().item(), abs_tol=ABS_TOL
+    )
--- a/tests/test_cuquantum_cutensor_mpi_backend.py
+++ b/tests/test_cuquantum_cutensor_mpi_backend.py
@@ -0,0 +1,315 @@
+# mpirun --allow-run-as-root -np 2 python -m pytest --with-mpi test_cuquantum_cutensor_mpi_backend.py
+
+import math
+
+import cupy as cp
+import numpy as np
+import pytest
+import qibo
+from qibo import construct_backend, hamiltonians
+from qibo.models import QFT
+from qibo.symbols import X, Z
+
+ABS_TOL = 1e-7
+
+
+def qibo_qft(nqubits, swaps):
+    circ_qibo = QFT(nqubits, swaps)
+    state_vec = circ_qibo().state(numpy=True)
+    return circ_qibo, state_vec
+
+
+def build_observable(nqubits):
+    """Helper function to construct a target observable."""
+    hamiltonian_form = 0
+    for i in range(nqubits):
+        hamiltonian_form += 0.5 * X(i % nqubits) * Z((i + 1) % nqubits)
+
+    hamiltonian = hamiltonians.SymbolicHamiltonian(form=hamiltonian_form)
+    return hamiltonian, hamiltonian_form
+
+
+def build_observable_dict(nqubits):
+    """Construct a target observable as a dictionary representation.
+
+    Returns a dictionary suitable for `create_hamiltonian_from_dict`.
+    """
+    terms = []
+
+    for i in range(nqubits):
+        term = {
+            "coefficient": 0.5,
+            "operators": [("X", i % nqubits), ("Z", (i + 1) % nqubits)],
+        }
+        terms.append(term)
+
+    return {"terms": terms}
+
+
+@pytest.mark.gpu
+@pytest.mark.mpi
+@pytest.mark.parametrize("nqubits", [1, 2, 5, 7, 10])
+def test_eval_mpi(nqubits: int, dtype="complex128"):
+    """
+    Args:
+        nqubits (int): Total number of qubits in the system.
+        dtype (str): The data type for precision, 'complex64' for single,
+            'complex128' for double.
+    """
+    # Test qibo
+    qibo.set_backend(backend="numpy")
+    qibo_circ, result_sv = qibo_qft(nqubits, swaps=True)
+    result_sv_cp = cp.asarray(result_sv)
+
+    # Test cutensornet
+    backend = construct_backend(backend="qibotn", platform="cutensornet")
+
+    # Test with explicit settings specified.
+    comp_set_w_bool = {
+        "MPI_enabled": True,
+        "MPS_enabled": False,
+        "NCCL_enabled": False,
+        "expectation_enabled": False,
+    }
+    backend.configure_tn_simulation(comp_set_w_bool)
+    result_tn = backend.execute_circuit(circuit=qibo_circ)
+    result_tn_cp = cp.asarray(result_tn.statevector.flatten())
+
+    print(f"State vector difference: {abs(result_tn_cp - result_sv_cp).max():0.3e}")
+
+    if backend.rank == 0:
+
+        assert cp.allclose(
+            result_sv_cp, result_tn_cp
+        ), "Resulting dense vectors do not match"
+    else:
+        assert (
+            isinstance(result_tn_cp, cp.ndarray)
+            and result_tn_cp.size == 1
+            and result_tn_cp.item() == 0
+        ), f"Rank {backend.rank}: result_tn_cp should be scalar/array with 0, got {result_tn_cp}"
+
+
+@pytest.mark.gpu
+@pytest.mark.mpi
+@pytest.mark.parametrize("nqubits", [1, 2, 5, 7, 10])
+def test_expectation_mpi(nqubits: int, dtype="complex128"):
+
+    # Test qibo
+    qibo_circ, state_vec_qibo = qibo_qft(nqubits, swaps=True)
+    ham, ham_form = build_observable(nqubits)
+    numpy_backend = construct_backend("numpy")
+    exact_expval = numpy_backend.calculate_expectation_state(
+        hamiltonian=ham,
+        state=state_vec_qibo,
+        normalize=False,
+    )
+
+    # Test cutensornet
+    backend = construct_backend(backend="qibotn", platform="cutensornet")
+
+    # Test with simple settings using bool. Uses default Hamilitonian for expectation calculation.
+    comp_set_w_bool = {
+        "MPI_enabled": True,
+        "MPS_enabled": False,
+        "NCCL_enabled": False,
+        "expectation_enabled": True,
+    }
+    backend.configure_tn_simulation(comp_set_w_bool)
+    result_tn = backend.execute_circuit(circuit=qibo_circ)
+    if backend.rank == 0:
+        # Compare numerical values
+        assert math.isclose(
+            exact_expval.item(), float(result_tn[0]), abs_tol=ABS_TOL
+        ), f"Rank {backend.rank}: mismatch, expected {exact_expval}, got {result_tn}"
+
+    else:
+        # Rank > 0: must be hardcoded [0] (int)
+        assert (
+            isinstance(result_tn, (np.ndarray, cp.ndarray))
+            and result_tn.size == 1
+            and np.issubdtype(result_tn.dtype, np.integer)
+            and result_tn.item() == 0
+        ), f"Rank {backend.rank}: expected int array [0], got {result_tn}"
+
+    # Test with user defined hamiltonian using "hamiltonians.SymbolicHamiltonian" object.
+    comp_set_w_hamiltonian_obj = {
+        "MPI_enabled": True,
+        "MPS_enabled": False,
+        "NCCL_enabled": False,
+        "expectation_enabled": ham,
+    }
+    backend.configure_tn_simulation(comp_set_w_hamiltonian_obj)
+    result_tn = backend.execute_circuit(circuit=qibo_circ)
+    if backend.rank == 0:
+        # Compare numerical values
+        assert math.isclose(
+            exact_expval.item(), float(result_tn[0]), abs_tol=ABS_TOL
+        ), f"Rank {backend.rank}: mismatch, expected {exact_expval}, got {result_tn}"
+
+    else:
+        # Rank > 0: must be hardcoded [0] (int)
+        assert (
+            isinstance(result_tn, (np.ndarray, cp.ndarray))
+            and result_tn.size == 1
+            and np.issubdtype(result_tn.dtype, np.integer)
+            and result_tn.item() == 0
+        ), f"Rank {backend.rank}: expected int array [0], got {result_tn}"
+
+    # Test with user defined hamiltonian using Dictionary object form of hamiltonian.
+    ham_dict = build_observable_dict(nqubits)
+    comp_set_w_hamiltonian_dict = {
+        "MPI_enabled": True,
+        "MPS_enabled": False,
+        "NCCL_enabled": False,
+        "expectation_enabled": ham_dict,
+    }
+    backend.configure_tn_simulation(comp_set_w_hamiltonian_dict)
+    result_tn = backend.execute_circuit(circuit=qibo_circ)
+    if backend.rank == 0:
+        # Compare numerical values
+        assert math.isclose(
+            exact_expval.item(), float(result_tn[0]), abs_tol=ABS_TOL
+        ), f"Rank {backend.rank}: mismatch, expected {exact_expval}, got {result_tn}"
+
+    else:
+        # Rank > 0: must be hardcoded [0] (int)
+        assert (
+            isinstance(result_tn, (np.ndarray, cp.ndarray))
+            and result_tn.size == 1
+            and np.issubdtype(result_tn.dtype, np.integer)
+            and result_tn.item() == 0
+        ), f"Rank {backend.rank}: expected int array [0], got {result_tn}"
+
+
+@pytest.mark.gpu
+@pytest.mark.mpi
+@pytest.mark.parametrize("nqubits", [1, 2, 5, 7, 10])
+def test_eval_nccl(nqubits: int, dtype="complex128"):
+    """
+    Args:
+        nqubits (int): Total number of qubits in the system.
+        dtype (str): The data type for precision, 'complex64' for single,
+            'complex128' for double.
+    """
+    # Test qibo
+    qibo.set_backend(backend="numpy")
+    qibo_circ, result_sv = qibo_qft(nqubits, swaps=True)
+    result_sv_cp = cp.asarray(result_sv)
+
+    # Test cutensornet
+    backend = construct_backend(backend="qibotn", platform="cutensornet")
+
+    # Test with explicit settings specified.
+    comp_set_w_bool = {
+        "MPI_enabled": False,
+        "MPS_enabled": False,
+        "NCCL_enabled": True,
+        "expectation_enabled": False,
+    }
+    backend.configure_tn_simulation(comp_set_w_bool)
+    result_tn = backend.execute_circuit(circuit=qibo_circ)
+    result_tn_cp = cp.asarray(result_tn.statevector.flatten())
+
+    if backend.rank == 0:
+        assert cp.allclose(
+            result_sv_cp, result_tn_cp
+        ), "Resulting dense vectors do not match"
+    else:
+        assert (
+            isinstance(result_tn_cp, cp.ndarray)
+            and result_tn_cp.size == 1
+            and result_tn_cp.item() == 0
+        ), f"Rank {backend.rank}: result_tn_cp should be scalar/array with 0, got {result_tn_cp}"
+
+
+@pytest.mark.gpu
+@pytest.mark.mpi
+@pytest.mark.parametrize("nqubits", [1, 2, 5, 7, 10])
+def test_expectation_NCCL(nqubits: int, dtype="complex128"):
+
+    # Test qibo
+    qibo_circ, state_vec_qibo = qibo_qft(nqubits, swaps=True)
+    ham, ham_form = build_observable(nqubits)
+    numpy_backend = construct_backend("numpy")
+    exact_expval = numpy_backend.calculate_expectation_state(
+        hamiltonian=ham,
+        state=state_vec_qibo,
+        normalize=False,
+    )
+
+    # Test cutensornet
+    backend = construct_backend(backend="qibotn", platform="cutensornet")
+
+    # Test with simple settings using bool. Uses default Hamilitonian for expectation calculation.
+    comp_set_w_bool = {
+        "MPI_enabled": False,
+        "MPS_enabled": False,
+        "NCCL_enabled": True,
+        "expectation_enabled": True,
+    }
+    backend.configure_tn_simulation(comp_set_w_bool)
+    result_tn = backend.execute_circuit(circuit=qibo_circ)
+    if backend.rank == 0:
+        # Compare numerical values
+        assert math.isclose(
+            exact_expval.item(), float(result_tn[0]), abs_tol=ABS_TOL
+        ), f"Rank {backend.rank}: mismatch, expected {exact_expval}, got {result_tn}"
+
+    else:
+        # Rank > 0: must be hardcoded [0] (int)
+        assert (
+            isinstance(result_tn, (np.ndarray, cp.ndarray))
+            and result_tn.size == 1
+            and np.issubdtype(result_tn.dtype, np.integer)
+            and result_tn.item() == 0
+        ), f"Rank {backend.rank}: expected int array [0], got {result_tn}"
+
+    # Test with user defined hamiltonian using "hamiltonians.SymbolicHamiltonian" object.
+    comp_set_w_hamiltonian_obj = {
+        "MPI_enabled": False,
+        "MPS_enabled": False,
+        "NCCL_enabled": True,
+        "expectation_enabled": ham,
+    }
+    backend.configure_tn_simulation(comp_set_w_hamiltonian_obj)
+    result_tn = backend.execute_circuit(circuit=qibo_circ)
+    if backend.rank == 0:
+        # Compare numerical values
+        assert math.isclose(
+            exact_expval.item(), float(result_tn[0]), abs_tol=ABS_TOL
+        ), f"Rank {backend.rank}: mismatch, expected {exact_expval}, got {result_tn}"
+
+    else:
+        # Rank > 0: must be hardcoded [0] (int)
+        assert (
+            isinstance(result_tn, (np.ndarray, cp.ndarray))
+            and result_tn.size == 1
+            and np.issubdtype(result_tn.dtype, np.integer)
+            and result_tn.item() == 0
+        ), f"Rank {backend.rank}: expected int array [0], got {result_tn}"
+
+    # Test with user defined hamiltonian using Dictionary object form of hamiltonian.
+    ham_dict = build_observable_dict(nqubits)
+    comp_set_w_hamiltonian_dict = {
+        "MPI_enabled": False,
+        "MPS_enabled": False,
+        "NCCL_enabled": True,
+        "expectation_enabled": ham_dict,
+    }
+    backend.configure_tn_simulation(comp_set_w_hamiltonian_dict)
+    result_tn = backend.execute_circuit(circuit=qibo_circ)
+    if backend.rank == 0:
+        # Compare numerical values
+        assert math.isclose(
+            exact_expval.item(), float(result_tn[0]), abs_tol=ABS_TOL
+        ), f"Rank {backend.rank}: mismatch, expected {exact_expval}, got {result_tn}"
+
+    else:
+        # Rank > 0: must be hardcoded [0] (int)
+        assert (
+            isinstance(result_tn, (np.ndarray, cp.ndarray))
+            and result_tn.size == 1
+            and np.issubdtype(result_tn.dtype, np.integer)
+            and result_tn.item() == 0
+        ), f"Rank {backend.rank}: expected int array [0], got {result_tn}"