Update NCCL
This commit is contained in:
@@ -229,6 +229,7 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_sampl
|
|||||||
from cupy.cuda import nccl
|
from cupy.cuda import nccl
|
||||||
from cuquantum import Network
|
from cuquantum import Network
|
||||||
from mpi4py import MPI
|
from mpi4py import MPI
|
||||||
|
import cuquantum.cutensornet as cutn
|
||||||
|
|
||||||
root = 0
|
root = 0
|
||||||
comm_mpi = MPI.COMM_WORLD
|
comm_mpi = MPI.COMM_WORLD
|
||||||
@@ -238,6 +239,7 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_sampl
|
|||||||
device_id = rank % getDeviceCount()
|
device_id = rank % getDeviceCount()
|
||||||
|
|
||||||
cp.cuda.Device(device_id).use()
|
cp.cuda.Device(device_id).use()
|
||||||
|
mempool = cp.get_default_memory_pool()
|
||||||
|
|
||||||
# Set up the NCCL communicator.
|
# Set up the NCCL communicator.
|
||||||
nccl_id = nccl.get_unique_id() if rank == root else None
|
nccl_id = nccl.get_unique_id() if rank == root else None
|
||||||
@@ -245,16 +247,22 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_sampl
|
|||||||
comm_nccl = nccl.NcclCommunicator(size, nccl_id, rank)
|
comm_nccl = nccl.NcclCommunicator(size, nccl_id, rank)
|
||||||
|
|
||||||
# Perform circuit conversion
|
# Perform circuit conversion
|
||||||
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
|
if rank==0:
|
||||||
operands = myconvertor.expectation_operands(
|
|
||||||
pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
|
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
|
||||||
)
|
operands = myconvertor.expectation_operands(
|
||||||
|
pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
operands = None
|
||||||
|
|
||||||
|
operands = comm_mpi.bcast(operands, root)
|
||||||
|
|
||||||
network = Network(*operands)
|
network = Network(*operands)
|
||||||
|
|
||||||
# Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
|
# Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
|
||||||
path, info = network.contract_path(
|
path, info = network.contract_path(
|
||||||
optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
|
optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size),"memory_model":cutn.MemoryModel.CUTENSOR}}
|
||||||
)
|
)
|
||||||
|
|
||||||
# Select the best path from all ranks.
|
# Select the best path from all ranks.
|
||||||
@@ -292,6 +300,9 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_sampl
|
|||||||
stream_ptr,
|
stream_ptr,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
del network
|
||||||
|
mempool.free_all_blocks()
|
||||||
|
|
||||||
return result, rank
|
return result, rank
|
||||||
|
|
||||||
|
|
||||||
@@ -426,3 +437,82 @@ def pauli_string_gen(nqubits, pauli_string_pattern):
|
|||||||
char_to_add = pauli_string_pattern[i % len(pauli_string_pattern)]
|
char_to_add = pauli_string_pattern[i % len(pauli_string_pattern)]
|
||||||
result += char_to_add
|
result += char_to_add
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
def expectation_pauli_tn_MPI_pathfinding(qibo_circ, datatype, pauli_string_pattern, n_samples=8):
    """Benchmark contraction pathfinding for a Pauli expectation network over MPI.

    The qibo circuit is converted to tensor network (TN) operands by
    QiboCircuitToEinsum() on the root rank, using the Pauli string that
    pauli_string_gen() builds from pauli_string_pattern for the number of
    qubits of the circuit. The operands are broadcast to every rank, each
    rank runs the cuQuantum pathfinder independently on its own device, and
    the per-rank pathfinding statistics are gathered on the root rank.

    No contraction is performed: this function only measures the pathfinder.

    Parameters:
        qibo_circ: The quantum circuit object.
        datatype (str): Either single ("complex64") or double (complex128) precision.
        pauli_string_pattern(str): pauli string pattern.
        n_samples(int): Number of samples for pathfinding.

    Returns:
        Tuple ``(gathered_data, rank)``. On the root rank ``gathered_data`` is
        an array of shape ``(size, 4)`` with one row per MPI rank holding
        ``[num_slices, opt_cost, largest_intermediate, elapsed_seconds]``;
        on every other rank it is ``None``. ``rank`` is the MPI rank of the
        calling process.
    """
    from cuquantum import Network
    from mpi4py import MPI  # this line initializes MPI
    import cuquantum.cutensornet as cutn
    import time
    import numpy as np

    root = 0
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()

    # Assign the device for each process.
    device_id = rank % getDeviceCount()
    cp.cuda.Device(device_id).use()
    mempool = cp.get_default_memory_pool()

    # Perform circuit conversion on the root rank only, then share the
    # operands so every rank searches over the same network.
    if rank == root:
        myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
        operands = myconvertor.expectation_operands(
            pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern)
        )
    else:
        operands = None

    operands = comm.bcast(operands, root)

    # Use the network as a context manager so its cuTensorNet resources are
    # released explicitly instead of relying on garbage collection.
    with Network(*operands, options={"device_id": device_id}) as network:
        # Compute the path on every rank with n_samples hyperoptimizer
        # samples. Force slicing (at least max(32, size) slices) as would be
        # needed for a parallel contraction.
        start_time = time.perf_counter()
        _, info = network.contract_path(
            optimize={
                "samples": n_samples,
                "slicing": {
                    "min_slices": max(32, size),
                    "memory_model": cutn.MemoryModel.CUTENSOR,
                },
            }
        )
        elapsed = time.perf_counter() - start_time

    # float64 on both sides so the send buffer matches the receive buffer.
    local_data = np.array(
        [info.num_slices, info.opt_cost, info.largest_intermediate, elapsed],
        dtype=np.float64,
    )

    # Only the root rank needs a receive buffer for the gathered statistics.
    gathered_data = np.zeros((size, 4), dtype=np.float64) if rank == root else None

    # Gather data from all ranks to the root rank.
    comm.Gather(local_data, gathered_data, root=root)

    # Return the pooled device memory that is no longer referenced.
    mempool.free_all_blocks()

    return gathered_data, rank
|
||||||
|
|||||||
Reference in New Issue
Block a user