Update NCCL
This commit is contained in:
@@ -229,6 +229,7 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_sampl
|
||||
from cupy.cuda import nccl
|
||||
from cuquantum import Network
|
||||
from mpi4py import MPI
|
||||
import cuquantum.cutensornet as cutn
|
||||
|
||||
root = 0
|
||||
comm_mpi = MPI.COMM_WORLD
|
||||
@@ -238,6 +239,7 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_sampl
|
||||
device_id = rank % getDeviceCount()
|
||||
|
||||
cp.cuda.Device(device_id).use()
|
||||
mempool = cp.get_default_memory_pool()
|
||||
|
||||
# Set up the NCCL communicator.
|
||||
nccl_id = nccl.get_unique_id() if rank == root else None
|
||||
@@ -245,16 +247,22 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_sampl
|
||||
comm_nccl = nccl.NcclCommunicator(size, nccl_id, rank)
|
||||
|
||||
# Perform circuit conversion
|
||||
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
|
||||
operands = myconvertor.expectation_operands(
|
||||
pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
|
||||
)
|
||||
if rank==0:
|
||||
|
||||
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
|
||||
operands = myconvertor.expectation_operands(
|
||||
pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
|
||||
)
|
||||
else:
|
||||
operands = None
|
||||
|
||||
operands = comm_mpi.bcast(operands, root)
|
||||
|
||||
network = Network(*operands)
|
||||
|
||||
# Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
|
||||
path, info = network.contract_path(
|
||||
optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
|
||||
optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size),"memory_model":cutn.MemoryModel.CUTENSOR}}
|
||||
)
|
||||
|
||||
# Select the best path from all ranks.
|
||||
@@ -291,7 +299,10 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_sampl
|
||||
root,
|
||||
stream_ptr,
|
||||
)
|
||||
|
||||
|
||||
del network
|
||||
mempool.free_all_blocks()
|
||||
|
||||
return result, rank
|
||||
|
||||
|
||||
@@ -426,3 +437,82 @@ def pauli_string_gen(nqubits, pauli_string_pattern):
|
||||
char_to_add = pauli_string_pattern[i % len(pauli_string_pattern)]
|
||||
result += char_to_add
|
||||
return result
|
||||
|
||||
def expectation_pauli_tn_MPI_pathfinding(qibo_circ, datatype, pauli_string_pattern, n_samples=8):
|
||||
"""Convert qibo circuit to tensornet (TN) format and perform contraction to
|
||||
expectation of given Pauli string using multi node and multi GPU through
|
||||
MPI.
|
||||
|
||||
The conversion is performed by QiboCircuitToEinsum(), after which it
|
||||
goes through 2 steps: pathfinder and execution. The
|
||||
pauli_string_pattern is used to generate the pauli string
|
||||
corresponding to the number of qubits of the system. The pathfinder
|
||||
looks at user defined number of samples (n_samples) iteratively to
|
||||
select the least costly contraction path. This is sped up with multi
|
||||
thread. After pathfinding the optimal path is used in the actual
|
||||
contraction to give an expectation value.
|
||||
|
||||
Parameters:
|
||||
qibo_circ: The quantum circuit object.
|
||||
datatype (str): Either single ("complex64") or double (complex128) precision.
|
||||
pauli_string_pattern(str): pauli string pattern.
|
||||
n_samples(int): Number of samples for pathfinding.
|
||||
|
||||
Returns:
|
||||
Expectation of quantum circuit due to pauli string.
|
||||
"""
|
||||
from cuquantum import Network
|
||||
from mpi4py import MPI # this line initializes MPI
|
||||
import cuquantum.cutensornet as cutn
|
||||
import time
|
||||
import numpy as np
|
||||
|
||||
root = 0
|
||||
comm = MPI.COMM_WORLD
|
||||
rank = comm.Get_rank()
|
||||
size = comm.Get_size()
|
||||
|
||||
# Assign the device for each process.
|
||||
device_id = rank % getDeviceCount()
|
||||
cp.cuda.Device(device_id).use()
|
||||
mempool = cp.get_default_memory_pool()
|
||||
|
||||
# Perform circuit conversion
|
||||
if rank==0:
|
||||
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
|
||||
|
||||
operands = myconvertor.expectation_operands(
|
||||
pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
|
||||
)
|
||||
else:
|
||||
operands = None
|
||||
|
||||
operands = comm.bcast(operands, root)
|
||||
|
||||
# Create network object.
|
||||
network = Network(*operands, options={"device_id": device_id})
|
||||
start_time = time.time()
|
||||
# Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
|
||||
path, info = network.contract_path(
|
||||
optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size),"memory_model":cutn.MemoryModel.CUTENSOR}}
|
||||
)
|
||||
end_time = time.time()
|
||||
|
||||
# print("Andy rank",rank,"info",info, info.num_slices, info.opt_cost, info.largest_intermediate, end_time-start_time)
|
||||
local_data = np.array([info.num_slices, info.opt_cost, info.largest_intermediate, end_time-start_time])
|
||||
|
||||
|
||||
# Initialize a list to store the gathered data on rank 0
|
||||
if rank == 0:
|
||||
gathered_data = np.zeros((size, 4))
|
||||
|
||||
else:
|
||||
gathered_data = None
|
||||
|
||||
# Gather data from all ranks to rank 0
|
||||
comm.Gather(local_data, gathered_data, root=0)
|
||||
# print("Andy rank",rank,"gathered data",gathered_data)
|
||||
del network
|
||||
mempool.free_all_blocks()
|
||||
|
||||
return gathered_data, rank
|
||||
|
||||
Reference in New Issue
Block a user