From ac396a35dbf2f1b31a0f8eef64ebf6a07e4e5dcd Mon Sep 17 00:00:00 2001
From: tankya2 <tankya2@ihpc.a-star.edu.sg>
Date: Tue, 18 Feb 2025 11:34:55 +0800
Subject: [PATCH] Refactor to reduce repeating codes

---
 src/qibotn/eval.py | 390 ++++++++++++++++-----------------------------
 1 file changed, 139 insertions(+), 251 deletions(-)

diff --git a/src/qibotn/eval.py b/src/qibotn/eval.py
index ebef103..618bdf7 100644
--- a/src/qibotn/eval.py
+++ b/src/qibotn/eval.py
@@ -6,40 +6,88 @@ from qibotn.circuit_convertor import QiboCircuitToEinsum
 from qibotn.circuit_to_mps import QiboCircuitToMPS
 from qibotn.mps_contraction_helper import MPSContractionHelper
 
-
-def dense_vector_tn(qibo_circ, datatype):
-    """Convert qibo circuit to tensornet (TN) format and perform contraction to
-    dense vector.
-
-    Parameters:
-        qibo_circ: The quantum circuit object.
-        datatype (str): Either single ("complex64") or double (complex128) precision.
-
-    Returns:
-        Dense vector of quantum circuit.
-    """
-    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-    return contract(*myconvertor.state_vector_operands())
+import cuquantum.cutensornet as cutn
+from cuquantum import Network
+from mpi4py import MPI
+from cupy.cuda import nccl
 
 
-def expectation_pauli_tn(qibo_circ, datatype, pauli_string_pattern):
-    """Convert qibo circuit to tensornet (TN) format and perform contraction to
-    expectation of given Pauli string.
+def initialize_mpi():
+    """Initialize MPI communication and device selection."""
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    size = comm.Get_size()
+    device_id = rank % getDeviceCount()
+    cp.cuda.Device(device_id).use()
+    return comm, rank, size, device_id
 
-    Parameters:
-        qibo_circ: The quantum circuit object.
-        datatype (str): Either single ("complex64") or double (complex128) precision.
-        pauli_string_pattern(str): pauli string pattern.
 
-    Returns:
-        Expectation of quantum circuit due to pauli string.
-    """
-    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-    return contract(
-        *myconvertor.expectation_operands(
-            pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
-        )
+def initialize_nccl(comm_mpi, rank, size):
+    """Initialize NCCL communication."""
+    nccl_id = nccl.get_unique_id() if rank == 0 else None
+    nccl_id = comm_mpi.bcast(nccl_id, root=0)
+    return nccl.NcclCommunicator(size, nccl_id, rank)
+
+
+def get_operands(qibo_circ, datatype, rank, comm):
+    """Perform circuit conversion and broadcast operands."""
+    if rank == 0:
+        myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
+        operands = myconvertor.state_vector_operands()
+    else:
+        operands = None
+    return comm.bcast(operands, root=0)
+
+
+def compute_optimal_path(network, n_samples, size, comm):
+    """Compute contraction path and broadcast optimal selection."""
+    path, info = network.contract_path(
+        optimize={
+            "samples": n_samples,
+            "slicing": {
+                "min_slices": max(32, size),
+                "memory_model": cutn.MemoryModel.CUTENSOR,
+            },
+        }
     )
+    opt_cost, sender = comm.allreduce(
+        sendobj=(info.opt_cost, comm.Get_rank()), op=MPI.MINLOC
+    )
+    return comm.bcast(info, sender)
+
+
+def compute_contraction(network, slices):
+    """Perform tensor contraction."""
+    return network.contract(slices=slices)
+
+
+def compute_slices(info, rank, size):
+    """Determine the slice range each process should compute."""
+    num_slices = info.num_slices
+    chunk, extra = num_slices // size, num_slices % size
+    slice_begin = rank * chunk + min(rank, extra)
+    slice_end = (
+        num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
+    )
+    return range(slice_begin, slice_end)
+
+
+def reduce_result(result, comm, method="MPI", root=0):
+    """Reduce results across processes."""
+    if method == "MPI":
+        return comm.reduce(sendobj=result, op=MPI.SUM, root=root)
+    elif method == "NCCL":
+        stream_ptr = cp.cuda.get_current_stream().ptr
+        comm.reduce(
+            result.data.ptr,
+            result.data.ptr,
+            result.size,
+            nccl.NCCL_FLOAT64,
+            nccl.NCCL_SUM,
+            root,
+            stream_ptr,
+        )
+        return result
 
 
 def dense_vector_tn_MPI(qibo_circ, datatype, n_samples=8):
@@ -61,70 +109,16 @@ def dense_vector_tn_MPI(qibo_circ, datatype, n_samples=8):
     Returns:
         Dense vector of quantum circuit.
     """
-
-    import cuquantum.cutensornet as cutn
-    from cuquantum import Network
-    from mpi4py import MPI
-
-    root = 0
-    comm = MPI.COMM_WORLD
-    rank = comm.Get_rank()
-    size = comm.Get_size()
-
-    device_id = rank % getDeviceCount()
-    cp.cuda.Device(device_id).use()
-
-    # Perform circuit conversion
-    if rank == 0:
-        myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-
-        operands = myconvertor.state_vector_operands()
-    else:
-        operands = None
-
-    operands = comm.bcast(operands, root)
-
-    # Create network object.
+    comm, rank, size, device_id = initialize_mpi()
+    operands = get_operands(qibo_circ, datatype, rank, comm)
     network = Network(*operands, options={"device_id": device_id})
-
-    # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
-    path, info = network.contract_path(
-        optimize={
-            "samples": n_samples,
-            "slicing": {
-                "min_slices": max(32, size),
-                "memory_model": cutn.MemoryModel.CUTENSOR,
-            },
-        }
-    )
-
-    # Select the best path from all ranks.
-    opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
-
-    # Broadcast info from the sender to all other ranks.
-    info = comm.bcast(info, sender)
-
-    # Set path and slices.
+    info = compute_optimal_path(network, n_samples, size, comm)
     path, info = network.contract_path(
         optimize={"path": info.path, "slicing": info.slices}
     )
-
-    # Calculate this process's share of the slices.
-    num_slices = info.num_slices
-    chunk, extra = num_slices // size, num_slices % size
-    slice_begin = rank * chunk + min(rank, extra)
-    slice_end = (
-        num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
-    )
-    slices = range(slice_begin, slice_end)
-
-    # Contract the group of slices the process is responsible for.
-    result = network.contract(slices=slices)
-
-    # Sum the partial contribution from each process on root.
-    result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)
-
-    return result, rank
+    slices = compute_slices(info, rank, size)
+    result = compute_contraction(network, slices)
+    return reduce_result(result, comm, method="MPI"), rank
 
 
 def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):
@@ -146,83 +140,32 @@ def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):
     Returns:
         Dense vector of quantum circuit.
     """
-    import cuquantum.cutensornet as cutn
-    from cupy.cuda import nccl
-    from cuquantum import Network
-    from mpi4py import MPI
-
-    root = 0
-    comm_mpi = MPI.COMM_WORLD
-    rank = comm_mpi.Get_rank()
-    size = comm_mpi.Get_size()
-
-    device_id = rank % getDeviceCount()
-
-    cp.cuda.Device(device_id).use()
-
-    # Set up the NCCL communicator.
-    nccl_id = nccl.get_unique_id() if rank == root else None
-    nccl_id = comm_mpi.bcast(nccl_id, root)
-    comm_nccl = nccl.NcclCommunicator(size, nccl_id, rank)
-
-    # Perform circuit conversion
-    if rank == 0:
-        myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-        operands = myconvertor.state_vector_operands()
-    else:
-        operands = None
-
-    operands = comm_mpi.bcast(operands, root)
-
+    comm_mpi, rank, size, device_id = initialize_mpi()
+    comm_nccl = initialize_nccl(comm_mpi, rank, size)
+    operands = get_operands(qibo_circ, datatype, rank, comm_mpi)
     network = Network(*operands)
-
-    # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
-    path, info = network.contract_path(
-        optimize={
-            "samples": n_samples,
-            "slicing": {
-                "min_slices": max(32, size),
-                "memory_model": cutn.MemoryModel.CUTENSOR,
-            },
-        }
-    )
-
-    # Select the best path from all ranks.
-    opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
-
-    # Broadcast info from the sender to all other ranks.
-    info = comm_mpi.bcast(info, sender)
-
-    # Set path and slices.
+    info = compute_optimal_path(network, n_samples, size, comm_mpi)
     path, info = network.contract_path(
         optimize={"path": info.path, "slicing": info.slices}
     )
+    slices = compute_slices(info, rank, size)
+    result = compute_contraction(network, slices)
+    return reduce_result(result, comm_nccl, method="NCCL"), rank
 
-    # Calculate this process's share of the slices.
-    num_slices = info.num_slices
-    chunk, extra = num_slices // size, num_slices % size
-    slice_begin = rank * chunk + min(rank, extra)
-    slice_end = (
-        num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
-    )
-    slices = range(slice_begin, slice_end)
 
-    # Contract the group of slices the process is responsible for.
-    result = network.contract(slices=slices)
+def dense_vector_tn(qibo_circ, datatype):
+    """Convert qibo circuit to tensornet (TN) format and perform contraction to
+    dense vector.
 
-    # Sum the partial contribution from each process on root.
-    stream_ptr = cp.cuda.get_current_stream().ptr
-    comm_nccl.reduce(
-        result.data.ptr,
-        result.data.ptr,
-        result.size,
-        nccl.NCCL_FLOAT64,
-        nccl.NCCL_SUM,
-        root,
-        stream_ptr,
-    )
+    Parameters:
+        qibo_circ: The quantum circuit object.
+        datatype (str): Either single ("complex64") or double (complex128) precision.
 
-    return result, rank
+    Returns:
+        Dense vector of quantum circuit.
+    """
+    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
+    return contract(*myconvertor.state_vector_operands())
 
 
 def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_samples=8):
@@ -248,28 +191,13 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_sampl
     Returns:
         Expectation of quantum circuit due to pauli string.
     """
-    import cuquantum.cutensornet as cutn
-    from cupy.cuda import nccl
-    from cuquantum import Network
-    from mpi4py import MPI
 
-    root = 0
-    comm_mpi = MPI.COMM_WORLD
-    rank = comm_mpi.Get_rank()
-    size = comm_mpi.Get_size()
+    comm_mpi, rank, size, device_id = initialize_mpi()
 
-    device_id = rank % getDeviceCount()
-
-    cp.cuda.Device(device_id).use()
-
-    # Set up the NCCL communicator.
-    nccl_id = nccl.get_unique_id() if rank == root else None
-    nccl_id = comm_mpi.bcast(nccl_id, root)
-    comm_nccl = nccl.NcclCommunicator(size, nccl_id, rank)
+    comm_nccl = initialize_nccl(comm_mpi, rank, size)
 
     # Perform circuit conversion
     if rank == 0:
-
         myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
         operands = myconvertor.expectation_operands(
             pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
@@ -277,55 +205,25 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_sampl
     else:
         operands = None
 
-    operands = comm_mpi.bcast(operands, root)
+    operands = comm_mpi.bcast(operands, root=0)
 
     network = Network(*operands)
 
     # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
-    path, info = network.contract_path(
-        optimize={
-            "samples": n_samples,
-            "slicing": {
-                "min_slices": max(32, size),
-                "memory_model": cutn.MemoryModel.CUTENSOR,
-            },
-        }
-    )
+    info = compute_optimal_path(network, n_samples, size, comm_mpi)
 
-    # Select the best path from all ranks.
-    opt_cost, sender = comm_mpi.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
-
-    # Broadcast info from the sender to all other ranks.
-    info = comm_mpi.bcast(info, sender)
-
-    # Set path and slices.
+    # Recompute path with the selected optimal settings
     path, info = network.contract_path(
         optimize={"path": info.path, "slicing": info.slices}
     )
 
-    # Calculate this process's share of the slices.
-    num_slices = info.num_slices
-    chunk, extra = num_slices // size, num_slices % size
-    slice_begin = rank * chunk + min(rank, extra)
-    slice_end = (
-        num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
-    )
-    slices = range(slice_begin, slice_end)
+    slices = compute_slices(info, rank, size)
 
     # Contract the group of slices the process is responsible for.
-    result = network.contract(slices=slices)
+    result = compute_contraction(network, slices)
 
     # Sum the partial contribution from each process on root.
-    stream_ptr = cp.cuda.get_current_stream().ptr
-    comm_nccl.reduce(
-        result.data.ptr,
-        result.data.ptr,
-        result.size,
-        nccl.NCCL_FLOAT64,
-        nccl.NCCL_SUM,
-        root,
-        stream_ptr,
-    )
+    result = reduce_result(result, comm_nccl, method="NCCL", root=0)
 
     return result, rank
 
@@ -353,18 +251,8 @@ def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string_pattern, n_sample
     Returns:
         Expectation of quantum circuit due to pauli string.
     """
-    import cuquantum.cutensornet as cutn
-    from cuquantum import Network
-    from mpi4py import MPI  # this line initializes MPI
-
-    root = 0
-    comm = MPI.COMM_WORLD
-    rank = comm.Get_rank()
-    size = comm.Get_size()
-
-    # Assign the device for each process.
-    device_id = rank % getDeviceCount()
-    cp.cuda.Device(device_id).use()
+    # Initialize MPI and device
+    comm, rank, size, device_id = initialize_mpi()
 
     # Perform circuit conversion
     if rank == 0:
@@ -376,51 +264,51 @@ def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string_pattern, n_sample
     else:
         operands = None
 
-    operands = comm.bcast(operands, root)
+    operands = comm.bcast(operands, root=0)
 
     # Create network object.
     network = Network(*operands, options={"device_id": device_id})
 
-    # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
-    path, info = network.contract_path(
-        optimize={
-            "samples": n_samples,
-            "slicing": {
-                "min_slices": max(32, size),
-                "memory_model": cutn.MemoryModel.CUTENSOR,
-            },
-        }
-    )
-
-    # Select the best path from all ranks.
-    opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
-
-    # Broadcast info from the sender to all other ranks.
-    info = comm.bcast(info, sender)
+    # Compute optimal contraction path
+    info = compute_optimal_path(network, n_samples, size, comm)
 
     # Set path and slices.
     path, info = network.contract_path(
         optimize={"path": info.path, "slicing": info.slices}
     )
 
-    # Calculate this process's share of the slices.
-    num_slices = info.num_slices
-    chunk, extra = num_slices // size, num_slices % size
-    slice_begin = rank * chunk + min(rank, extra)
-    slice_end = (
-        num_slices if rank == size - 1 else (rank + 1) * chunk + min(rank + 1, extra)
-    )
-    slices = range(slice_begin, slice_end)
+    # Compute slice range for each rank
+    slices = compute_slices(info, rank, size)
 
-    # Contract the group of slices the process is responsible for.
-    result = network.contract(slices=slices)
+    # Perform contraction
+    result = compute_contraction(network, slices)
 
     # Sum the partial contribution from each process on root.
-    result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)
+    result = reduce_result(result, comm, method="MPI", root=0)
 
     return result, rank
 
 
+def expectation_pauli_tn(qibo_circ, datatype, pauli_string_pattern):
+    """Convert qibo circuit to tensornet (TN) format and perform contraction to
+    expectation of given Pauli string.
+
+    Parameters:
+        qibo_circ: The quantum circuit object.
+        datatype (str): Either single ("complex64") or double (complex128) precision.
+        pauli_string_pattern(str): pauli string pattern.
+
+    Returns:
+        Expectation of quantum circuit due to pauli string.
+    """
+    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
+    return contract(
+        *myconvertor.expectation_operands(
+            pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
+        )
+    )
+
+
 def dense_vector_mps(qibo_circ, gate_algo, datatype):
     """Convert qibo circuit to matrix product state (MPS) format and perform
     contraction to dense vector.