Update dense_vector_tn_MPI

2024-10-04 14:33:30 +08:00
parent ff034eb355
commit 4cc59564cf
1 changed files with 51 additions and 97 deletions
--- a/src/qibotn/eval.py
+++ b/src/qibotn/eval.py
@@ -64,6 +64,7 @@ def dense_vector_tn_MPI(qibo_circ, datatype, n_samples=8):

    from cuquantum import Network
    from mpi4py import MPI
+    import cuquantum.cutensornet as cutn

    root = 0
    comm = MPI.COMM_WORLD
@@ -71,21 +72,31 @@ def dense_vector_tn_MPI(qibo_circ, datatype, n_samples=8):
    size = comm.Get_size()

    device_id = rank % getDeviceCount()
-
+    cp.cuda.Device(device_id).use()
+    mempool = cp.get_default_memory_pool()
+    
    # Perform circuit conversion
-    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
+    if rank == 0:
+        myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)

-    operands = myconvertor.state_vector_operands()
+        operands = myconvertor.state_vector_operands()
+    else:
+        operands = None

-    # Assign the device for each process.
-    device_id = rank % getDeviceCount()
+    operands = comm.bcast(operands, root)

    # Create network object.
    network = Network(*operands, options={"device_id": device_id})

    # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
    path, info = network.contract_path(
-        optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
+        optimize={
+            "samples": n_samples,
+            "slicing": {
+                "min_slices": max(32, size),
+                "memory_model": cutn.MemoryModel.CUTENSOR,
+            },
+        }
    )

    # Select the best path from all ranks.
@@ -114,6 +125,9 @@ def dense_vector_tn_MPI(qibo_circ, datatype, n_samples=8):
    # Sum the partial contribution from each process on root.
    result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)

+    del network
+    mempool.free_all_blocks()
+    
    return result, rank


@@ -139,6 +153,7 @@ def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):
    from cupy.cuda import nccl
    from cuquantum import Network
    from mpi4py import MPI
+    import cuquantum.cutensornet as cutn

    root = 0
    comm_mpi = MPI.COMM_WORLD
@@ -162,7 +177,13 @@ def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):

    # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
    path, info = network.contract_path(
-        optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
+        optimize={
+            "samples": n_samples,
+            "slicing": {
+                "min_slices": max(32, size),
+                "memory_model": cutn.MemoryModel.CUTENSOR,
+            },
+        }
    )

    # Select the best path from all ranks.
@@ -247,7 +268,7 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_sampl
    comm_nccl = nccl.NcclCommunicator(size, nccl_id, rank)

    # Perform circuit conversion
-    if rank==0:
+    if rank == 0:

        myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
        operands = myconvertor.expectation_operands(
@@ -255,14 +276,20 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_sampl
        )
    else:
        operands = None
-    
+
    operands = comm_mpi.bcast(operands, root)

    network = Network(*operands)

    # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
    path, info = network.contract_path(
-        optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size),"memory_model":cutn.MemoryModel.CUTENSOR}}
+        optimize={
+            "samples": n_samples,
+            "slicing": {
+                "min_slices": max(32, size),
+                "memory_model": cutn.MemoryModel.CUTENSOR,
+            },
+        }
    )

    # Select the best path from all ranks.
@@ -299,10 +326,10 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_sampl
        root,
        stream_ptr,
    )
-    
+
    del network
    mempool.free_all_blocks()
-    
+
    return result, rank


@@ -337,14 +364,14 @@ def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string_pattern, n_sample
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()
-    
+
    # Assign the device for each process.
    device_id = rank % getDeviceCount()
    cp.cuda.Device(device_id).use()
    mempool = cp.get_default_memory_pool()

    # Perform circuit conversion
-    if rank==0:
+    if rank == 0:
        myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)

        operands = myconvertor.expectation_operands(
@@ -352,15 +379,21 @@ def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string_pattern, n_sample
        )
    else:
        operands = None
-    
+
    operands = comm.bcast(operands, root)
-   
+
    # Create network object.
    network = Network(*operands, options={"device_id": device_id})

    # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
    path, info = network.contract_path(
-        optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size),"memory_model":cutn.MemoryModel.CUTENSOR}}
+        optimize={
+            "samples": n_samples,
+            "slicing": {
+                "min_slices": max(32, size),
+                "memory_model": cutn.MemoryModel.CUTENSOR,
+            },
+        }
    )

    # Select the best path from all ranks.
@@ -388,7 +421,7 @@ def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string_pattern, n_sample

    # Sum the partial contribution from each process on root.
    result = comm.reduce(sendobj=result, op=MPI.SUM, root=root)
-    
+
    del network
    mempool.free_all_blocks()

@@ -437,82 +470,3 @@ def pauli_string_gen(nqubits, pauli_string_pattern):
        char_to_add = pauli_string_pattern[i % len(pauli_string_pattern)]
        result += char_to_add
    return result
-
-def expectation_pauli_tn_MPI_pathfinding(qibo_circ, datatype, pauli_string_pattern, n_samples=8):
-    """Convert qibo circuit to tensornet (TN) format and perform contraction to
-    expectation of given Pauli string using multi node and multi GPU through
-    MPI.
-
-    The conversion is performed by QiboCircuitToEinsum(), after which it
-    goes through 2 steps: pathfinder and execution. The
-    pauli_string_pattern is used to generate the pauli string
-    corresponding to the number of qubits of the system. The pathfinder
-    looks at user defined number of samples (n_samples) iteratively to
-    select the least costly contraction path. This is sped up with multi
-    thread. After pathfinding the optimal path is used in the actual
-    contraction to give an expectation value.
-
-    Parameters:
-        qibo_circ: The quantum circuit object.
-        datatype (str): Either single ("complex64") or double (complex128) precision.
-        pauli_string_pattern(str): pauli string pattern.
-        n_samples(int): Number of samples for pathfinding.
-
-    Returns:
-        Expectation of quantum circuit due to pauli string.
-    """
-    from cuquantum import Network
-    from mpi4py import MPI  # this line initializes MPI
-    import cuquantum.cutensornet as cutn
-    import time
-    import numpy as np
-    
-    root = 0
-    comm = MPI.COMM_WORLD
-    rank = comm.Get_rank()
-    size = comm.Get_size()
-    
-    # Assign the device for each process.
-    device_id = rank % getDeviceCount()
-    cp.cuda.Device(device_id).use()
-    mempool = cp.get_default_memory_pool()
-
-    # Perform circuit conversion
-    if rank==0:
-        myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-
-        operands = myconvertor.expectation_operands(
-            pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
-        )
-    else:
-        operands = None
-    
-    operands = comm.bcast(operands, root)
-   
-    # Create network object.
-    network = Network(*operands, options={"device_id": device_id})
-    start_time = time.time()
-    # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
-    path, info = network.contract_path(
-        optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size),"memory_model":cutn.MemoryModel.CUTENSOR}}
-    )
-    end_time = time.time()
-
-    # print("Andy rank",rank,"info",info, info.num_slices, info.opt_cost, info.largest_intermediate, end_time-start_time)
-    local_data = np.array([info.num_slices, info.opt_cost, info.largest_intermediate, end_time-start_time])
-
-
-    # Initialize a list to store the gathered data on rank 0
-    if rank == 0:
-        gathered_data = np.zeros((size, 4))
-
-    else:
-        gathered_data = None
-
-    # Gather data from all ranks to rank 0
-    comm.Gather(local_data, gathered_data, root=0)
-    # print("Andy rank",rank,"gathered data",gathered_data)
-    del network
-    mempool.free_all_blocks()
-
-    return gathered_data, rank