Update NCCL

2024-07-29 14:52:01 +08:00
parent a5640a9d45
commit ff034eb355
1 changed files with 96 additions and 6 deletions
--- a/src/qibotn/eval.py
+++ b/src/qibotn/eval.py
@@ -229,6 +229,7 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_sampl
    from cupy.cuda import nccl
    from cuquantum import Network
    from mpi4py import MPI
+    import cuquantum.cutensornet as cutn

    root = 0
    comm_mpi = MPI.COMM_WORLD
@@ -238,6 +239,7 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_sampl
    device_id = rank % getDeviceCount()

    cp.cuda.Device(device_id).use()
+    mempool = cp.get_default_memory_pool()

    # Set up the NCCL communicator.
    nccl_id = nccl.get_unique_id() if rank == root else None
@@ -245,16 +247,22 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_sampl
    comm_nccl = nccl.NcclCommunicator(size, nccl_id, rank)

    # Perform circuit conversion
-    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
-    operands = myconvertor.expectation_operands(
-        pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
-    )
+    if rank==0:
+
+        myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
+        operands = myconvertor.expectation_operands(
+            pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
+        )
+    else:
+        operands = None
+    
+    operands = comm_mpi.bcast(operands, root)

    network = Network(*operands)

    # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
    path, info = network.contract_path(
-        optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
+        optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size),"memory_model":cutn.MemoryModel.CUTENSOR}}
    )

    # Select the best path from all ranks.
@@ -291,7 +299,10 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_sampl
        root,
        stream_ptr,
    )
-
+    
+    del network
+    mempool.free_all_blocks()
+    
    return result, rank


@@ -426,3 +437,82 @@ def pauli_string_gen(nqubits, pauli_string_pattern):
        char_to_add = pauli_string_pattern[i % len(pauli_string_pattern)]
        result += char_to_add
    return result
+
+def expectation_pauli_tn_MPI_pathfinding(qibo_circ, datatype, pauli_string_pattern, n_samples=8):
+    """Benchmark MPI-parallel pathfinding for a Pauli-string expectation network.
+
+    The root rank converts the circuit with QiboCircuitToEinsum() into the
+    operands of the expectation tensor network for the Pauli string built
+    by pauli_string_gen(), then broadcasts them to every rank. Each rank
+    builds its own Network on its assigned GPU and runs the pathfinder with
+    n_samples hyperoptimizer samples while timing the search. No
+    contraction is performed: only the pathfinder statistics are collected
+    and gathered on the root rank.
+
+    Parameters:
+        qibo_circ: The quantum circuit object.
+        datatype (str): Either single ("complex64") or double ("complex128") precision.
+        pauli_string_pattern(str): pauli string pattern.
+        n_samples(int): Number of samples for pathfinding.
+
+    Returns:
+        Tuple (gathered_data, rank). On the root rank gathered_data is a
+        (size, 4) float64 array with one row per MPI rank holding
+        [num_slices, opt_cost, largest_intermediate, pathfinding_seconds];
+        on every other rank it is None. rank is the calling process's rank.
+    """
+    import time
+
+    import cuquantum.cutensornet as cutn
+    import numpy as np
+    from cuquantum import Network
+    from mpi4py import MPI  # this line initializes MPI
+
+    root = 0
+    comm = MPI.COMM_WORLD
+    rank = comm.Get_rank()
+    size = comm.Get_size()
+
+    # Assign the device for each process.
+    device_id = rank % getDeviceCount()
+    cp.cuda.Device(device_id).use()
+    mempool = cp.get_default_memory_pool()
+
+    # Perform the circuit conversion once, on the root rank only.
+    if rank == root:
+        myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
+        operands = myconvertor.expectation_operands(
+            pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
+        )
+    else:
+        operands = None
+
+    # Every rank receives the operands produced by the root rank.
+    operands = comm.bcast(operands, root)
+
+    # Create network object.
+    network = Network(*operands, options={"device_id": device_id})
+
+    # Time the path search on every rank. Slicing is forced by requesting
+    # at least max(32, size) slices.
+    slicing = {"min_slices": max(32, size), "memory_model": cutn.MemoryModel.CUTENSOR}
+    start_time = time.perf_counter()  # monotonic clock for interval timing
+    _, info = network.contract_path(optimize={"samples": n_samples, "slicing": slicing})
+    elapsed = time.perf_counter() - start_time
+
+    # Explicit float64 keeps the send and receive buffers the same datatype.
+    local_data = np.array(
+        [info.num_slices, info.opt_cost, info.largest_intermediate, elapsed],
+        dtype=np.float64,
+    )
+
+    # Only the root needs a receive buffer: one row of statistics per rank.
+    gathered_data = np.zeros((size, 4), dtype=np.float64) if rank == root else None
+
+    # Gather data from all ranks to the root rank.
+    comm.Gather(local_data, gathered_data, root=root)
+
+    # Drop the network, then release unused blocks held by the CuPy pool.
+    del network
+    mempool.free_all_blocks()
+    return gathered_data, rank