From 72f95599bb628d185bb6a809319cd911a0e62f94 Mon Sep 17 00:00:00 2001 From: jaunatisblue Date: Tue, 12 May 2026 15:44:19 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AE=8C=E5=96=84mps=E7=9A=84vidal=E6=9C=BA?= =?UTF-8?q?=E5=88=B6,=E5=A4=9A=E8=8A=82=E7=82=B9=E5=B9=B6=E8=A1=8C;?= =?UTF-8?q?=E8=A1=A5=E5=85=85tn=E6=90=9C=E7=B4=A2=E6=97=B6dask=E9=9B=86?= =?UTF-8?q?=E7=BE=A4=E6=90=9C=E7=B4=A2=E7=9A=84=E6=96=B9=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 15 +- README.md | 14 +- benchmark_cpu_expectation.py | 145 ++++++ pyproject.toml | 2 + run_vidal_mps_cases.sh | 134 +++++ src/qibotn/backends/__init__.py | 9 +- src/qibotn/backends/cpu.py | 415 ++++++++++++++++ src/qibotn/backends/quimb.py | 125 ++++- src/qibotn/backends/vidal.py | 250 ++++++++++ src/qibotn/backends/vidal_mpi_segment.py | 338 +++++++++++++ src/qibotn/backends/vidal_tebd.py | 466 +++++++++++------- src/qibotn/benchmark_cases.py | 151 ++++++ src/qibotn/expectation_runner.py | 71 +++ src/qibotn/observables.py | 37 ++ src/qibotn/parallel.py | 429 +++++++++++++--- tests/test_cpu_backend.py | 130 +++++ tests/test_parallel.py | 46 ++ tests/test_vidal_backend.py | 167 +++++++ tools/README.md | 18 + .../baseline_mps_expectation.py | 131 ++--- .../benchmark_contract_sliced.py | 0 .../benchmark_search.py | 0 .../benchmark_slice.py | 0 .../benchmark_tn_mpi.py | 0 check_tree.py => tools/check_tree.py | 0 tools/compare_vidal_backend_qmatchatea.py | 137 +++++ tools/profile_vidal_chrome.py | 72 +++ .../qibojit_reference_expectation.py | 0 tools/run_cpu_large_cases.sh | 127 +++++ tools/run_cpu_single_cases.sh | 148 ++++++ tools/run_vidal_segment_mpi_scan.sh | 70 +++ tools/validate_vidal_mpi_correctness.py | 202 ++++++++ 32 files changed, 3529 insertions(+), 320 deletions(-) create mode 100644 benchmark_cpu_expectation.py create mode 100755 run_vidal_mps_cases.sh create mode 100644 src/qibotn/backends/cpu.py create mode 100644 src/qibotn/backends/vidal.py create mode 100644 src/qibotn/backends/vidal_mpi_segment.py create mode 100644 src/qibotn/benchmark_cases.py create mode 100644 src/qibotn/expectation_runner.py create mode 100644 tests/test_cpu_backend.py create mode 100644 tests/test_parallel.py create mode 100644 tests/test_vidal_backend.py create mode 100644 tools/README.md rename baseline_mps_expectation.py => tools/baseline_mps_expectation.py (54%) rename benchmark_contract_sliced.py => tools/benchmark_contract_sliced.py (100%) rename benchmark_search.py => tools/benchmark_search.py (100%) rename benchmark_slice.py => tools/benchmark_slice.py (100%) rename benchmark_tn_mpi.py => tools/benchmark_tn_mpi.py (100%) rename check_tree.py => tools/check_tree.py (100%) create mode 100644 tools/compare_vidal_backend_qmatchatea.py create mode 100644 tools/profile_vidal_chrome.py rename qibojit_reference_expectation.py => tools/qibojit_reference_expectation.py (100%) create mode 100755 tools/run_cpu_large_cases.sh create mode 100755 tools/run_cpu_single_cases.sh create mode 100755 tools/run_vidal_segment_mpi_scan.sh create mode 100644 tools/validate_vidal_mpi_correctness.py diff --git a/.gitignore b/.gitignore index 2705f4d..20f223d 100644 --- a/.gitignore +++ b/.gitignore @@ -5,11 +5,6 @@ __pycache__/ data/ # C extensions *.so -bak/ -path/ -profiles/ -vtune_expval/ -perf* # Distribution / packaging .Python build/ @@ -164,3 +159,13 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ .devenv + + +# yx +bak/ +path/ +profiles/ +vtune_expval/ +perf* +experiments/ +references/ \ No newline at end of file diff --git a/README.md b/README.md index d587293..150b8e8 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ Tensor Network Types: Tensor Network contractions to: - dense vectors -- expecation values of given Pauli string +- expectation values of given Pauli strings or Pauli-sum observables The supported HPC configurations are: @@ -26,6 +26,18 @@ Currently, the supported tensor network libraries are: - [cuQuantum](https://github.com/NVIDIA/cuQuantum), an NVIDIA SDK of optimized libraries and tools for accelerating quantum computing workflows. - [quimb](https://quimb.readthedocs.io/en/latest/), an easy but fast python library for ‘quantum information many-body’ calculations, focusing primarily on tensor networks. +## CPU expectation benchmarks + +The current CPU expectation entrypoint is: + +```sh +python -u benchmark_cpu_expectation.py --ansatz mps --nqubits 40 --nlayers 10 --bond 2048 --circuits brickwall_cnot --observables ring_xz +``` + +Use `--ansatz tn` for the generic TN path and `--mpi` under `mpiexec` for MPI runs. +Reusable circuit and observable builders live in `src/qibotn/benchmark_cases.py`; execution logic lives in `src/qibotn/expectation_runner.py`. +For Vidal/MPS 1D-chain scale tests, use `run_vidal_mps_cases.sh`. + ## Installation To get started: diff --git a/benchmark_cpu_expectation.py b/benchmark_cpu_expectation.py new file mode 100644 index 0000000..9141849 --- /dev/null +++ b/benchmark_cpu_expectation.py @@ -0,0 +1,145 @@ +"""CLI for CPU TN/MPS expectation benchmarks.""" + +from __future__ import annotations + +import argparse + +from qibotn.benchmark_cases import ( + CIRCUITS, + OBSERVABLES, + build_circuit, + observable_terms, + parse_names, + terms_to_dict, +) +from qibotn.expectation_runner import ( + ExpectationConfig, + exact_for_observable, + run_cpu_expectation, +) + + +def build_parallel_opts(args): + slicing_opts = {} + if args.tn_target_slices is not None: + slicing_opts["target_slices"] = args.tn_target_slices + if args.tn_target_size is not None: + slicing_opts["target_size"] = args.tn_target_size + + opts = { + "slicing_opts": slicing_opts or None, + "search_workers": args.tn_search_workers or args.torch_threads, + "max_repeats": args.tn_search_repeats, + "max_time": args.tn_search_time, + } + if args.tn_search_backend is not None: + opts["search_backend"] = args.tn_search_backend + if args.dask_address is not None: + opts["dask_address"] = args.dask_address + return opts + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--nqubits", type=int, default=40) + parser.add_argument("--nlayers", type=int, default=30) + parser.add_argument("--bond", "--bonds", dest="bond", type=int, default=1024) + parser.add_argument("--cut-ratio", type=float, default=1e-12) + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--torch-threads", type=int, default=8) + parser.add_argument("--ansatz", choices=("tn", "mps"), default=None) + parser.add_argument("--mps", action="store_true") + parser.add_argument("--mpi", action="store_true") + parser.add_argument("--exact", action="store_true") + parser.add_argument("--exact-max-qubits", type=int, default=24) + parser.add_argument("--circuits", nargs="+", default=["brickwall_cnot"]) + parser.add_argument("--observables", nargs="+", default=["ring_xz"]) + parser.add_argument("--pauli-pattern") + parser.add_argument("--tn-target-slices", type=int) + parser.add_argument("--tn-target-size", type=int) + parser.add_argument("--tn-search-workers", type=int) + parser.add_argument("--tn-search-repeats", type=int, default=128) + parser.add_argument("--tn-search-time", type=float, default=60.0) + parser.add_argument( + "--tn-search-backend", + choices=("processpool", "dask"), + help="Path-search backend. In MPI mode, dask search runs only on rank 0 and broadcasts the tree.", + ) + parser.add_argument( + "--dask-address", + help="Dask scheduler address, for example tcp://host:8786. If omitted with dask search, a local cluster is created.", + ) + args = parser.parse_args() + + ansatz = "mps" if args.mps else (args.ansatz or "tn") + circuits = parse_names(args.circuits, CIRCUITS, "circuits") + observables = [] if args.pauli_pattern else parse_names( + args.observables, OBSERVABLES, "observables" + ) + + rank = 0 + if args.mpi: + from mpi4py import MPI + + rank = MPI.COMM_WORLD.Get_rank() + + config = ExpectationConfig( + ansatz=ansatz, + mpi=args.mpi, + bond=args.bond, + cut_ratio=args.cut_ratio, + tensor_module="torch", + torch_threads=args.torch_threads, + parallel_opts=build_parallel_opts(args), + ) + + if rank == 0: + mode = "MPI" if args.mpi else "serial" + print( + f"backend=cpu ansatz={ansatz.upper()} mode={mode} " + f"nqubits={args.nqubits} nlayers={args.nlayers} " + f"bond={args.bond} cut_ratio={args.cut_ratio:g} seed={args.seed} " + f"torch_threads={args.torch_threads} " + f"tn_search_backend={args.tn_search_backend or 'processpool'}" + ) + print("circuit observable exact value abs_error rel_error seconds") + + for circuit_kind in circuits: + circuit = build_circuit(circuit_kind, args.nqubits, args.nlayers, args.seed) + named_observables = ( + [(f"pattern:{args.pauli_pattern}", {"pauli_string_pattern": args.pauli_pattern})] + if args.pauli_pattern + else [ + (obs_kind, terms_to_dict(observable_terms(obs_kind, args.nqubits))) + for obs_kind in observables + ] + ) + + for obs_name, observable in named_observables: + exact = None + if args.exact and rank == 0: + if args.nqubits > args.exact_max_qubits: + raise ValueError( + f"--exact is limited to {args.exact_max_qubits} qubits by default." + ) + exact = exact_for_observable(circuit, observable, args.nqubits) + + result = run_cpu_expectation(circuit, observable, config) + if args.mpi and result.rank != 0: + continue + + abs_error = float("nan") if exact is None else abs(result.value - exact) + rel_error = ( + float("nan") + if exact is None + else abs_error / max(abs(exact), 1e-15) + ) + exact_text = "nan" if exact is None else f"{exact:.16e}" + print( + f"{circuit_kind} {obs_name} {exact_text} {result.value:.16e} " + f"{abs_error:.6e} {rel_error:.6e} {result.seconds:.3f}" + ) + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index f631f18..95e3de9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,11 +31,13 @@ cuquantum-python-cu12 = { version = "^25.9.1", optional = true } qmatchatea = { version = "^1.4.3", optional = true } qiskit = { version = "^1.4.0", optional = true } qtealeaves = { version = "^1.5.20", optional = true } +distributed = { version = ">=2024", optional = true } [tool.poetry.extras] cuda = ["cupy-cuda12x", "cuda-toolkit", "nvidia-nccl-cu12", "cuquantum-python-cu12", "mpi4py"] qmatchatea = ["qmatchatea"] +dask = ["distributed"] [tool.poetry.group.docs] optional = true diff --git a/run_vidal_mps_cases.sh b/run_vidal_mps_cases.sh new file mode 100755 index 0000000..66db610 --- /dev/null +++ b/run_vidal_mps_cases.sh @@ -0,0 +1,134 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Focused Vidal/MPS expectation test cases for 1D chain circuits. +# +# These cases intentionally avoid qmatchatea and generic TN paths. They target +# the current supported scope: one-qubit gates, adjacent two-qubit gates, and +# Pauli-sum expectation values on a 1D chain. + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$ROOT_DIR" + +PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}" +MPIEXEC="${MPIEXEC:-mpiexec}" +HOSTFILE="${HOSTFILE:-hostfile}" + +THREADS="${THREADS:-32}" +MPI_RANKS="${MPI_RANKS:-16}" +MPI_THREADS="${MPI_THREADS:-12}" + +export OMP_NUM_THREADS="${OMP_NUM_THREADS:-1}" +export MKL_NUM_THREADS="${MKL_NUM_THREADS:-1}" + +run() { + echo + echo "--------------------------------------------------------------------------------" + echo "$*" + echo "--------------------------------------------------------------------------------" + "$@" +} + +case "${1:-help}" in + smoke) + # Short correctness-oriented run. Useful before starting long jobs. + run "$PYTHON_BIN" -u benchmark_cpu_expectation.py \ + --mps \ + --nqubits 40 \ + --nlayers 10 \ + --bond 2048 \ + --torch-threads "$THREADS" \ + --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \ + --observables ring_xz open_zz range2_xx long_z_string + ;; + + convergence) + # Same circuit/observable, increasing bond. Check value convergence. + for bond in ${BONDS:-4096 16384 65536}; do + run "$PYTHON_BIN" -u benchmark_cpu_expectation.py \ + --mps \ + --nqubits "${NQ:-80}" \ + --nlayers "${LAYERS:-16}" \ + --bond "$bond" \ + --torch-threads "$THREADS" \ + --circuits "${CIRCUIT:-brickwall_cnot}" \ + --observables "${OBSERVABLE:-ring_xz}" + done + ;; + + single-long) + # Single long Vidal run. On node-3, a similar n=40,l=30,bond=2048 case + # took about 9 minutes for one expectation. This one is meant to be longer. + run "$PYTHON_BIN" -u benchmark_cpu_expectation.py \ + --mps \ + --nqubits "${NQ:-80}" \ + --nlayers "${LAYERS:-16}" \ + --bond "${BOND:-65536}" \ + --torch-threads "$THREADS" \ + --circuits "${CIRCUIT:-brickwall_cnot}" \ + --observables "${OBSERVABLE:-ring_xz}" + ;; + + suite-long) + # Application-style multi-circuit, multi-observable MPS run. + # This is intentionally multi-term and should run much longer than single-long. + run "$PYTHON_BIN" -u benchmark_cpu_expectation.py \ + --mps \ + --nqubits "${NQ:-80}" \ + --nlayers "${LAYERS:-16}" \ + --bond "${BOND:-65536}" \ + --torch-threads "$THREADS" \ + --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \ + --observables ring_xz open_zz mixed_local range2_xx long_z_string + ;; + + mpi-long) + # Multi-node Vidal segmented MPS run. Uses HOSTFILE. + run "$MPIEXEC" -hostfile "$HOSTFILE" -n "$MPI_RANKS" "$PYTHON_BIN" -u benchmark_cpu_expectation.py \ + --mpi --mps \ + --nqubits "${NQ:-80}" \ + --nlayers "${LAYERS:-16}" \ + --bond "${BOND:-65536}" \ + --torch-threads "$MPI_THREADS" \ + --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \ + --observables ring_xz open_zz mixed_local range2_xx long_z_string + ;; + + stress) + # Heavier entanglement. Start only after single-long is stable. + run "$PYTHON_BIN" -u benchmark_cpu_expectation.py \ + --mps \ + --nqubits "${NQ:-80}" \ + --nlayers "${LAYERS:-18}" \ + --bond "${BOND:-262144}" \ + --torch-threads "${THREADS:-48}" \ + --circuits "${CIRCUIT:-rxx_rzz}" \ + --observables ring_xz open_zz range2_xx + ;; + + help|*) + cat <<'EOF' +Usage: ./run_vidal_mps_cases.sh [smoke|convergence|single-long|suite-long|mpi-long|stress] + +Common overrides: + PYTHON_BIN=.venv/bin/python + THREADS=32 + OMP_NUM_THREADS=1 MKL_NUM_THREADS=1 + +Single-node scale overrides: + NQ=80 LAYERS=16 BOND=65536 + CIRCUIT=brickwall_cnot + OBSERVABLE=ring_xz + BONDS="4096 16384 65536" # for convergence mode + +Multi-node overrides: + HOSTFILE=hostfile + MPI_RANKS=16 MPI_THREADS=12 + +Recommended first runs: + ./run_vidal_mps_cases.sh smoke + ./run_vidal_mps_cases.sh convergence + ./run_vidal_mps_cases.sh single-long +EOF + ;; +esac diff --git a/src/qibotn/backends/__init__.py b/src/qibotn/backends/__init__.py index c330d1f..f2c1189 100644 --- a/src/qibotn/backends/__init__.py +++ b/src/qibotn/backends/__init__.py @@ -3,9 +3,10 @@ from typing import Union from qibo.config import raise_error from qibotn.backends.abstract import QibotnBackend +from qibotn.backends.cpu import CpuTensorNet from qibotn.backends.cutensornet import CuTensorNet # pylint: disable=E0401 -PLATFORMS = ("cutensornet", "quimb", "qmatchatea") +PLATFORMS = ("cutensornet", "cpu", "quimb", "qmatchatea", "vidal") class MetaBackend: @@ -24,6 +25,8 @@ class MetaBackend: if platform == "cutensornet": # pragma: no cover return CuTensorNet(runcard) + elif platform == "cpu": + return CpuTensorNet(runcard) elif platform == "quimb": # pragma: no cover import qibotn.backends.quimb as qmb @@ -36,6 +39,10 @@ class MetaBackend: from qibotn.backends.qmatchatea import QMatchaTeaBackend return QMatchaTeaBackend() + elif platform == "vidal": + from qibotn.backends.vidal import VidalBackend + + return VidalBackend() else: raise_error( NotImplementedError, diff --git a/src/qibotn/backends/cpu.py b/src/qibotn/backends/cpu.py new file mode 100644 index 0000000..0abe8f7 --- /dev/null +++ b/src/qibotn/backends/cpu.py @@ -0,0 +1,415 @@ +"""CPU tensor-network backend with cutensornet-style runcard support.""" + +from __future__ import annotations + +import os + +import numpy as np +from qibo import hamiltonians +from qibo.backends import NumpyBackend +from qibo.config import raise_error + +from qibotn.backends.abstract import QibotnBackend +from qibotn.backends.vidal import _symbolic_hamiltonian_to_pauli_terms, _unsupported_reason +from qibotn.backends.vidal_mpi_segment import SegmentVidalMPIExecutor +from qibotn.backends.vidal_tebd import VidalTEBDExecutor +from qibotn.observables import check_observable +from qibotn.result import TensorNetworkResult + + +def _as_bool_or_dict(value, name): + if isinstance(value, (bool, dict)): + return value + raise TypeError(f"{name} has an unexpected type") + + +def _parse_cpu_list(text): + cpus = set() + for item in text.split(","): + item = item.strip() + if not item: + continue + if "-" in item: + start, stop = item.split("-", 1) + cpus.update(range(int(start), int(stop) + 1)) + else: + cpus.add(int(item)) + return cpus + + +class CpuTensorNet(QibotnBackend, NumpyBackend): + """CPU replacement for the cutensornet runcard execution surface. + + The backend preserves the high-level runcard knobs used by the GPU backend: + ``MPI_enabled``, ``MPS_enabled`` and ``expectation_enabled``. Generic TN + work is delegated to quimb on CPU; MPS expectation uses the Vidal fast path + when the circuit is nearest-neighbor and falls back to quimb otherwise. + """ + + def __init__(self, runcard=None): + super().__init__() + self.name = "qibotn" + self.platform = "cpu" + self.precision = "double" + self.configure_tn_simulation(runcard) + + def configure_tn_simulation(self, runcard=None): + runcard = {} if runcard is None else runcard + self.rank = 0 + self.MPI_enabled = bool(runcard.get("MPI_enabled", False)) + self.NCCL_enabled = bool(runcard.get("NCCL_enabled", False)) + if self.NCCL_enabled: + raise_error(NotImplementedError, "NCCL is only available for GPU backends.") + + expectation = runcard.get("expectation_enabled", False) + if expectation is True: + self.expectation_enabled = True + self.observable = None + elif expectation is False: + self.expectation_enabled = False + self.observable = None + elif isinstance(expectation, (dict, hamiltonians.SymbolicHamiltonian)): + self.expectation_enabled = True + self.observable = expectation + else: + raise TypeError("expectation_enabled has an unexpected type") + + mps = _as_bool_or_dict(runcard.get("MPS_enabled", False), "MPS_enabled") + self.MPS_enabled = bool(mps) + self.mps_options = mps if isinstance(mps, dict) else {} + + self.max_bond_dimension = runcard.get( + "max_bond_dimension", + self.mps_options.get("max_bond_dimension", 512), + ) + self.cut_ratio = runcard.get( + "cut_ratio", + self.mps_options.get( + "cut_ratio", + self.mps_options.get("svd_method", {}).get("abs_cutoff", 1e-12), + ), + ) + self.tensor_module = runcard.get("tensor_module", "torch") + self.torch_threads = runcard.get("torch_threads", None) + self.quimb_backend = runcard.get("quimb_backend", "numpy") + self.contraction_optimizer = runcard.get("contraction_optimizer", "auto-hq") + self.parallel_opts = runcard.get("parallel_opts", {}) + self.parallel_stats = [] + + def execute_circuit( + self, + circuit, + initial_state=None, + nshots=None, + prob_type=None, + return_array=False, + **prob_kwargs, + ): + if initial_state is not None: + raise_error(NotImplementedError, "QiboTN CPU backend does not support initial state.") + + if self.torch_threads is not None and self.tensor_module == "torch": + import torch + + torch.set_num_threads(self.torch_threads) + + if self.expectation_enabled: + value = self.expectation(circuit, self.observable) + if self.MPI_enabled and self.rank > 0: + return np.asarray([0], dtype=np.int64) + return np.asarray([value], dtype=np.float64) + + backend = self._quimb_backend() + backend.configure_tn_simulation( + ansatz="mps" if self.MPS_enabled else None, + max_bond_dimension=self.max_bond_dimension if self.MPS_enabled else None, + svd_cutoff=self.cut_ratio, + ) + return backend.execute_circuit( + circuit=circuit, + nshots=nshots, + return_array=return_array, + ) + + def expectation(self, circuit, observable=None, preprocess=True, compile_circuit=None): + observable = check_observable(observable, circuit.nqubits) + + if self.MPS_enabled: + reason = _unsupported_reason(circuit) + if reason is None: + return self._vidal_expectation(circuit, observable) + + backend = self._quimb_backend() + backend.configure_tn_simulation( + ansatz="mps" if self.MPS_enabled else None, + max_bond_dimension=self.max_bond_dimension if self.MPS_enabled else None, + svd_cutoff=self.cut_ratio, + ) + if self.MPI_enabled: + return self._quimb_expectation_mpi(backend, circuit, observable) + return self._quimb_expectation_processpool(backend, circuit, observable) + + def _vidal_expectation(self, circuit, observable): + terms = _symbolic_hamiltonian_to_pauli_terms(observable) + if self.MPI_enabled: + from mpi4py import MPI + + comm = MPI.COMM_WORLD + self.rank = comm.Get_rank() + executor = SegmentVidalMPIExecutor( + nqubits=circuit.nqubits, + max_bond=self.max_bond_dimension, + cut_ratio=self.cut_ratio, + tensor_module=self.tensor_module, + comm=comm, + ) + executor.run_circuit(circuit) + value = executor.expectation_pauli_sum_root(terms) + return np.nan if self.rank != 0 else float(np.real(value)) + + executor = VidalTEBDExecutor( + nqubits=circuit.nqubits, + max_bond=self.max_bond_dimension, + cut_ratio=self.cut_ratio, + tensor_module=self.tensor_module, + ) + executor.run_circuit(circuit) + return float(np.real(executor.expectation_pauli_sum(terms))) + + def _quimb_backend(self): + import qibotn.backends.quimb as qmb + + return qmb.BACKENDS[self.quimb_backend]( + quimb_backend=self.quimb_backend, + contraction_optimizer=self.contraction_optimizer, + ) + + def _bind_rank_to_numa_domain(self, rank): + domain = rank % 2 + cpulist = f"/sys/devices/system/node/node{domain}/cpulist" + try: + cpus = _parse_cpu_list(open(cpulist, encoding="utf-8").read().strip()) + if cpus: + os.sched_setaffinity(0, cpus) + except (FileNotFoundError, OSError): + pass + + try: + import ctypes + + libnuma = ctypes.CDLL("libnuma.so.1") + if libnuma.numa_available() >= 0: + libnuma.numa_run_on_node(ctypes.c_int(domain)) + libnuma.numa_set_preferred(ctypes.c_int(domain)) + except Exception: + pass + + self.numa_domain = domain + + def _default_search_workers(self, nranks=1): + if self.torch_threads: + return max(1, int(self.torch_threads)) + return max(1, (os.cpu_count() or 1) // max(1, nranks)) + + def _quimb_expectation_processpool(self, backend, circuit, observable): + return self._quimb_expectation_search( + backend, + circuit, + observable, + method="processpool", + comm=None, + ) + + def _quimb_expectation_mpi(self, backend, circuit, observable): + from mpi4py import MPI + + comm = MPI.COMM_WORLD + self.rank = comm.Get_rank() + self._bind_rank_to_numa_domain(self.rank) + + return self._quimb_expectation_search( + backend, + circuit, + observable, + method="mpi", + comm=comm, + ) + + def _quimb_expectation_search(self, backend, circuit, observable, method, comm=None): + rank = comm.Get_rank() if comm is not None else 0 + size = comm.Get_size() if comm is not None else 1 + self.rank = rank + + from qibotn.observables import extract_gates_and_qubits + from qibotn.parallel import ( + contraction_tree_costs, + parallel_contract, + parallel_path_search, + ) + from qibotn.backends.quimb import ( + PAULI_DENSE_MAX_QUBITS, + _pauli_term_to_dense_operator, + pauli_product_expectation_tn, + ) + + opts = dict(self.parallel_opts) + user_slicing_opts = opts.get("slicing_opts") + search_workers = opts.get("search_workers", self._default_search_workers(size)) + search_repeats = opts.get("max_repeats", 128) + search_time = opts.get("max_time", 60) + search_backend = opts.get("search_backend") + dask_address = opts.get("dask_address") + + qc = backend._qibo_circuit_to_quimb( + circuit, + quimb_circuit_type=backend.circuit_ansatz, + gate_opts={ + "max_bond": self.max_bond_dimension, + "cutoff": self.cut_ratio, + }, + ) + + total_value = 0.0 + 0.0j + terms = extract_gates_and_qubits(observable) + for coeff, factors in terms: + if not factors: + if self.rank == 0: + total_value += coeff + continue + + if len(factors) > PAULI_DENSE_MAX_QUBITS: + tn = pauli_product_expectation_tn( + qc, + factors, + simplify_sequence="ADCRS", + simplify_atol=1e-12, + ) + else: + op, where = _pauli_term_to_dense_operator(factors) + tn = qc.local_expectation( + op, + where, + rehearse="tn", + simplify_sequence="ADCRS", + simplify_atol=1e-12, + ) + slicing_opts = self._mpi_slicing_opts( + user_slicing_opts, + ) + + tree = parallel_path_search( + tn, + tn.outer_inds(), + method="dask" if method != "mpi" and search_backend == "dask" else method, + total_repeats=search_repeats, + max_time=search_time, + n_workers=search_workers, + slicing_opts=slicing_opts, + trial_timeout=opts.get("trial_timeout"), + search_backend=search_backend, + dask_address=dask_address, + ) + if tree is None: + raise RuntimeError("Failed to find a contraction tree for CPU TN MPI.") + + if int(getattr(tree, "multiplicity", 1)) <= 1: + if self.rank == 0: + value = self._contract_term_unsliced(tn, tree, backend) + self.parallel_stats.append( + { + "term_factors": tuple(factors), + "path_cost": contraction_tree_costs(tree), + "tree_slices": 1, + "slice_assignment": "root", + "rank_slices": [1] + [0] * (size - 1), + "search_workers": search_workers, + "search_repeats": search_repeats, + "search_time": search_time, + "search_backend": search_backend or method, + "dask_address": dask_address, + "numa_domain": getattr(self, "numa_domain", None), + } + ) + total_value += coeff * complex(value) + continue + + if comm is None: + value = self._contract_term_unsliced(tn, tree, backend) + self.parallel_stats.append( + { + "term_factors": tuple(factors), + "path_cost": contraction_tree_costs(tree), + "tree_slices": int(getattr(tree, "multiplicity", 1)), + "slice_assignment": "local", + "rank_slices": [int(getattr(tree, "multiplicity", 1))], + "search_workers": search_workers, + "search_repeats": search_repeats, + "search_time": search_time, + "search_backend": search_backend or method, + "dask_address": dask_address, + "numa_domain": getattr(self, "numa_domain", None), + } + ) + total_value += coeff * complex(np.asarray(value).reshape(-1)[0]) + continue + + arrays = self._term_arrays(tn, backend) + value, stats = parallel_contract( + tree, + arrays, + method="mpi", + comm=comm, + return_stats=True, + ) + gathered_stats = comm.gather(stats, root=0) + if rank == 0: + self.parallel_stats.append( + { + "term_factors": tuple(factors), + "path_cost": contraction_tree_costs(tree), + "tree_slices": stats.nslices, + "slice_assignment": stats.assignment, + "rank_slices": [ + item.local_slices for item in gathered_stats + ], + "search_workers": search_workers, + "search_repeats": search_repeats, + "search_time": search_time, + "search_backend": search_backend or method, + "dask_address": dask_address, + "numa_domain": getattr(self, "numa_domain", None), + } + ) + total_value += coeff * complex(np.asarray(value).reshape(-1)[0]) + + return np.nan if rank != 0 else float(np.real(total_value)) + + def _contract_term_unsliced(self, tn, tree, backend): + if backend.backend == "torch": + import torch + from qibotn.backends.quimb import _torch_cpu_array + + for tensor in tn.tensors: + tensor._data = _torch_cpu_array(tensor._data, dtype=torch.complex128) + return tn.contract(all, output_inds=(), optimize=tree, backend="torch") + + return tn.contract( + all, + output_inds=(), + optimize=tree, + backend=backend.backend, + ) + + def _mpi_slicing_opts(self, user_slicing_opts): + return None if user_slicing_opts is None else dict(user_slicing_opts) + + def _term_arrays(self, tn, backend): + if backend.backend == "torch": + import torch + from qibotn.backends.quimb import _torch_cpu_array + + return [ + _torch_cpu_array(array, dtype=torch.complex128) + for array in tn.arrays + ] + return [backend.engine.asarray(array) for array in tn.arrays] diff --git a/src/qibotn/backends/quimb.py b/src/qibotn/backends/quimb.py index 5d81606..ea894d9 100644 --- a/src/qibotn/backends/quimb.py +++ b/src/qibotn/backends/quimb.py @@ -37,6 +37,8 @@ GATE_MAP = { "measure": "measure", } +PAULI_DENSE_MAX_QUBITS = 8 + def _torch_cpu_array(data, dtype=None): """Convert array-like data to a contiguous CPU torch tensor.""" @@ -68,6 +70,81 @@ def _arrays_to_backend(arrays, backend, engine): return [engine.asarray(array) for array in arrays] +def _pauli_term_to_dense_operator(factors): + op = None + where = [] + for qubit, gate_name in factors: + pauli = qu.pauli(gate_name.lower()) + op = pauli if op is None else op & pauli + where.append(qubit) + return op, tuple(where) + + +def pauli_product_expectation_tn( + quimb_circuit, + factors, + simplify_sequence="ADCRS", + simplify_atol=1e-12, + simplify_equalize_norms=True, +): + """Build the scalar TN for ```` without dense Pauli strings.""" + import numpy as np + + op_by_site = { + int(qubit): qu.pauli(str(gate_name).lower()) + for qubit, gate_name in factors + if str(gate_name).upper() != "I" + } + ket = quimb_circuit.get_psi_simplified( + seq=simplify_sequence, + atol=simplify_atol, + equalize_norms=simplify_equalize_norms, + ) + bra = ket.conj().reindex( + { + quimb_circuit.ket_site_ind(qubit): quimb_circuit.bra_site_ind(qubit) + for qubit in range(quimb_circuit.N) + } + ) + + tn = bra | ket + identity = np.eye(2, dtype=complex) + for qubit in range(quimb_circuit.N): + data = op_by_site.get(qubit, identity) + tn |= qtn.Tensor( + data=data, + inds=( + quimb_circuit.bra_site_ind(qubit), + quimb_circuit.ket_site_ind(qubit), + ), + ) + + tn.full_simplify_( + output_inds=(), + seq=simplify_sequence, + atol=simplify_atol, + equalize_norms=simplify_equalize_norms, + ) + return tn + + +def pauli_product_expectation( + quimb_circuit, + factors, + backend, + optimize, + simplify_sequence="ADCRS", + simplify_atol=1e-12, +): + tn = pauli_product_expectation_tn( + quimb_circuit, + factors, + simplify_sequence=simplify_sequence, + simplify_atol=simplify_atol, + ) + return tn.contract(all, output_inds=(), optimize=optimize, backend=backend) + + def __init__(self, quimb_backend="numpy", contraction_optimizer="auto-hq"): super(self.__class__, self).__init__() @@ -340,6 +417,9 @@ def _qibo_circuit_to_quimb( circ.apply_gate("CNOT", c, t) continue if quimb_gate_name is None: + if hasattr(gate, "matrix"): + circ.apply_gate_raw(gate.matrix(), getattr(gate, "qubits", ())) + continue raise_error(ValueError, f"Gate {gate_name} not supported in Quimb backend.") params = getattr(gate, "parameters", ()) @@ -426,20 +506,24 @@ def expectation(self, circuit, observable, parallel=None, parallel_opts=None): exp_val = 0.0 for coeff, factors in all_terms: - op = None - where = [] - for qubit, gate_name in factors: - p = qu.pauli(gate_name.lower()) - op = p if op is None else op & p - where.append(qubit) - - val = qc.local_expectation( - op, tuple(where), - backend=self.backend, - optimize=self.contractions_optimizer, - simplify_sequence="ADCRS", - simplify_atol=1e-12, - ) + if len(factors) > PAULI_DENSE_MAX_QUBITS: + val = pauli_product_expectation( + qc, + factors, + backend=self.backend, + optimize=self.contractions_optimizer, + simplify_sequence="ADCRS", + simplify_atol=1e-12, + ) + else: + op, where = _pauli_term_to_dense_operator(factors) + val = qc.local_expectation( + op, where, + backend=self.backend, + optimize=self.contractions_optimizer, + simplify_sequence="ADCRS", + simplify_atol=1e-12, + ) exp_val += coeff * val return self.real(exp_val) @@ -487,14 +571,11 @@ def _expectation_parallel(self, circuit, observable, method, opts): my_exp = 0.0 for coeff, factors in my_terms: - op = None - where = [] - for qubit, gate_name in factors: - p = qu.pauli(gate_name.lower()) - op = p if op is None else op & p - where.append(qubit) - - tn = qc.local_expectation(op, tuple(where), rehearse='tn') + if len(factors) > PAULI_DENSE_MAX_QUBITS: + tn = pauli_product_expectation_tn(qc, factors) + else: + op, where = _pauli_term_to_dense_operator(factors) + tn = qc.local_expectation(op, where, rehearse='tn') tree = parallel_path_search( tn, tn.outer_inds(), diff --git a/src/qibotn/backends/vidal.py b/src/qibotn/backends/vidal.py new file mode 100644 index 0000000..02c8a93 --- /dev/null +++ b/src/qibotn/backends/vidal.py @@ -0,0 +1,250 @@ +"""Vidal/TEBD fast-path backend with qmatchatea fallback. + +This backend targets MPS-friendly one-dimensional circuits: one-qubit gates and +adjacent two-qubit gates, measured with Pauli-sum expectation values. Unsupported +features fall back to the qmatchatea backend so the public behavior remains +usable while the fast path is expanded. +""" + +from __future__ import annotations + +import re +from dataclasses import dataclass + +import numpy as np +from qibo.backends import NumpyBackend + +from qibotn.backends.abstract import QibotnBackend +from qibotn.backends.qmatchatea import QMatchaTeaBackend +from qibotn.backends.vidal_mpi_segment import SegmentVidalMPIExecutor +from qibotn.backends.vidal_tebd import VidalTEBDExecutor, _gate_sites +from qibotn.observables import check_observable + + +def _symbolic_hamiltonian_to_pauli_terms(hamiltonian): + terms = [] + factor_pattern = re.compile(r"([^\d]+)(\d+)") + for term in hamiltonian.terms: + ops = [] + for factor in term.factors: + match = factor_pattern.match(str(factor)) + if match is None: + raise ValueError(f"Unsupported observable factor {factor!r}.") + name = match.group(1).upper() + if name not in ("I", "X", "Y", "Z"): + raise ValueError(f"Unsupported observable operator {name!r}.") + if name != "I": + ops.append((name, int(match.group(2)))) + terms.append((complex(term.coefficient), tuple(ops))) + return terms + + +def _unsupported_reason(circuit): + for gate in circuit.queue: + name = getattr(gate, "name", gate.__class__.__name__) + sites = _gate_sites(gate) + if not sites: + return f"gate {name} has no target qubits" + if len(sites) > 2: + return f"gate {name} acts on {len(sites)} qubits" + if len(sites) == 2 and abs(sites[0] - sites[1]) != 1: + return f"gate {name} is non-adjacent on qubits {sites}" + if not hasattr(gate, "matrix"): + return f"gate {name} does not expose a matrix" + return None + + +def _can_route_non_adjacent(circuit): + """True if the circuit's only unsupported feature is non-adjacent 2Q gates. + + SWAP routing can fix non-adjacent gates at compile time. Multi-qubit + gates and matrix-less gates are truly unsupported. + """ + for gate in circuit.queue: + sites = _gate_sites(gate) + if not sites: + return False + if len(sites) > 2: + return False + if not hasattr(gate, "matrix"): + return False + return True + + +@dataclass +class VidalBackend(QibotnBackend, NumpyBackend): + """QiboTN backend using Vidal/TEBD when possible. + + The fast path supports: + - one-qubit gates with ``gate.matrix()``; + - adjacent two-qubit gates with ``gate.matrix()``; + - Qibo ``SymbolicHamiltonian`` / qibotn dict Pauli-sum expectation values; + - MPI chain segmentation through ``mpi_approach="CT"``. + + Unsupported operations are delegated to qmatchatea. + """ + + def __init__(self): + super().__init__() + self.name = "qibotn" + self.platform = "vidal" + self.precision = "double" + self.last_truncation_error = 0.0 + self.configure_tn_simulation() + + def configure_tn_simulation( + self, + ansatz: str = "MPS", + max_bond_dimension: int = 10, + cut_ratio: float = 1e-9, + trunc_tracking_mode: str = "C", + svd_control: str = "E!", + ini_bond_dimension: int = 1, + tensor_module: str = "torch", + compile_circuit: bool = False, + cache_gate_tensors: bool = True, + track_memory: bool = False, + mpi_approach: str = "SR", + mpi_num_procs: int = 1, + mpi_where_barriers: int = -1, + mpi_isometrization: int = -1, + fallback: bool = True, + ): + self.ansatz = ansatz + self.max_bond_dimension = max_bond_dimension + self.cut_ratio = cut_ratio + self.trunc_tracking_mode = trunc_tracking_mode + self.svd_control = svd_control + self.ini_bond_dimension = ini_bond_dimension + self.tensor_module = tensor_module + self.compile_circuit = compile_circuit + self.cache_gate_tensors = cache_gate_tensors + self.track_memory = track_memory + self.mpi_approach = mpi_approach.upper() + self.mpi_num_procs = mpi_num_procs + self.mpi_where_barriers = mpi_where_barriers + self.mpi_isometrization = mpi_isometrization + self.fallback = fallback + self._fallback_backend = None + + def _setup_backend_specifics(self): + return None + + def _qmatchatea_fallback(self): + if self._fallback_backend is None: + backend = QMatchaTeaBackend() + backend.configure_tn_simulation( + ansatz=self.ansatz, + max_bond_dimension=self.max_bond_dimension, + cut_ratio=self.cut_ratio, + trunc_tracking_mode=self.trunc_tracking_mode, + svd_control=self.svd_control, + ini_bond_dimension=self.ini_bond_dimension, + tensor_module=self.tensor_module, + compile_circuit=self.compile_circuit, + cache_gate_tensors=self.cache_gate_tensors, + track_memory=self.track_memory, + mpi_approach=self.mpi_approach, + mpi_num_procs=self.mpi_num_procs, + mpi_where_barriers=self.mpi_where_barriers, + mpi_isometrization=self.mpi_isometrization, + ) + self._fallback_backend = backend + return self._fallback_backend + + def _fallback_or_raise(self, reason): + if not self.fallback: + raise NotImplementedError(reason) + return self._qmatchatea_fallback() + + def _run_fast_executor(self, circuit, compile_circuit=True): + if self.mpi_approach == "CT": + from mpi4py import MPI + + executor = SegmentVidalMPIExecutor( + nqubits=circuit.nqubits, + max_bond=self.max_bond_dimension, + cut_ratio=self.cut_ratio, + tensor_module=self.tensor_module, + comm=MPI.COMM_WORLD, + ) + else: + executor = VidalTEBDExecutor( + nqubits=circuit.nqubits, + max_bond=self.max_bond_dimension, + cut_ratio=self.cut_ratio, + tensor_module=self.tensor_module, + ) + executor.run_circuit(circuit, compile_circuit=compile_circuit) + return executor + + def expectation(self, circuit, observable, preprocess=True, compile_circuit=None): + if self.ansatz.upper() != "MPS": + backend = self._fallback_or_raise("VidalBackend supports only MPS.") + return backend.expectation(circuit, observable, preprocess, compile_circuit) + + if compile_circuit is None: + compile_circuit = self.compile_circuit + + reason = _unsupported_reason(circuit) + if reason is not None: + # Non-adjacent gates can be routed at compile time + if compile_circuit and _can_route_non_adjacent(circuit): + pass # proceed with Vidal + SWAP routing + else: + backend = self._fallback_or_raise(reason) + return backend.expectation(circuit, observable, preprocess, compile_circuit) + + # preprocess=True requests qmatchatea-style transpilation to MPS topology. + # The fast path validates MPS-compatibility via _unsupported_reason above; + # if the circuit is already MPS-compatible, preprocessing is a no-op. + # If it isn't, _unsupported_reason already routed to fallback. + # So the fast path skips preprocessing — just warn once. + if preprocess: + import warnings + + warnings.warn( + "VidalBackend fast path does not preprocess circuits. " + "If the circuit passes _unsupported_reason (all gates are " + "MPS-compatible), fast-path execution proceeds directly.", + stacklevel=2, + ) + + hamiltonian = check_observable(observable, circuit.nqubits) + try: + terms = _symbolic_hamiltonian_to_pauli_terms(hamiltonian) + except ValueError as exc: + backend = self._fallback_or_raise(str(exc)) + return backend.expectation(circuit, observable, preprocess, compile_circuit) + + executor = self._run_fast_executor(circuit, compile_circuit=compile_circuit) + self.last_truncation_error = float(executor.truncation_error) + if self.mpi_approach == "CT": + value = executor.expectation_pauli_sum_root(terms) + from qtealeaves.tooling.mpisupport import MPI + + if MPI is not None and MPI.COMM_WORLD.Get_rank() != 0: + return np.nan + return np.real(value) + return np.real(executor.expectation_pauli_sum(terms)) + + def execute_circuit( + self, + circuit, + initial_state=None, + nshots=None, + prob_type=None, + return_array=False, + **prob_kwargs, + ): + backend = self._fallback_or_raise( + "VidalBackend.execute_circuit is delegated to qmatchatea." + ) + return backend.execute_circuit( + circuit, + initial_state=initial_state, + nshots=nshots, + prob_type=prob_type, + return_array=return_array, + **prob_kwargs, + ) diff --git a/src/qibotn/backends/vidal_mpi_segment.py b/src/qibotn/backends/vidal_mpi_segment.py new file mode 100644 index 0000000..c0742c6 --- /dev/null +++ b/src/qibotn/backends/vidal_mpi_segment.py @@ -0,0 +1,338 @@ +"""Segmented MPI Vidal/TEBD executor. + +Each rank owns a contiguous interval of sites. Gates fully inside an interval +are applied locally. Only two-site gates crossing a rank boundary communicate +the neighboring edge tensor and the resulting boundary update. +""" + +from __future__ import annotations + +import time +from dataclasses import dataclass + +import numpy as np +from mpi4py import MPI + +from qibotn.backends.vidal_tebd import ( + _asarray, + _backend_module, + _build_theta_svd_matrix, + _disjoint_batches, + _fuse_one_site_blocks, + _gate_sites, + _is_two_qubit_batch, + _make_two_site_update, + _ones, + _route_non_adjacent_gates, + _svd, + _tensor_update_from_numpy, + _tensor_update_to_numpy, + _to_float, + _to_numpy, + _transpose, + VidalTEBDExecutor, +) + +_EDGE_TAG = 1701 +_UPDATE_TAG = 1702 + + +def _partition_sites(nsites, nranks): + base = nsites // nranks + rem = nsites % nranks + starts = [0] + for rank in range(nranks): + starts.append(starts[-1] + base + int(rank < rem)) + return starts + + +@dataclass +class SegmentVidalMPIExecutor: + nqubits: int + max_bond: int + comm: object + cut_ratio: float = 1e-12 + tensor_module: str = "torch" + + def __post_init__(self): + self.rank = self.comm.Get_rank() + self.size = self.comm.Get_size() + self.starts = _partition_sites(self.nqubits, self.size) + self.start = self.starts[self.rank] + self.end = self.starts[self.rank + 1] + if self.start == self.end: + raise ValueError("SegmentVidalMPIExecutor requires at least one site per rank.") + + self.xp = _backend_module(self.tensor_module) + if self.xp is np: + self.dtype = np.complex128 + self.device = None + else: + self.dtype = self.xp.complex128 + self.device = self.xp.device("cpu") + + self.gammas = {} + for site in range(self.start, self.end): + self.gammas[site] = _asarray( + self.xp, [[[1.0 + 0.0j], [0.0 + 0.0j]]], self.dtype + ) + + self.lambdas = { + bond: _ones(self.xp, 1, self.dtype, self.device) + for bond in range(self.start, self.end + 1) + } + self._accumulated_truncation_error = 0.0 + + @property + def truncation_error(self): + return self._accumulated_truncation_error + + def owns_site(self, site): + return self.start <= site < self.end + + def owner_of(self, site): + return int(np.searchsorted(self.starts, site, side="right") - 1) + + def run_circuit(self, circuit, compile_circuit=True): + timings = { + "local_compute": 0.0, + "edge_exchange": 0.0, + "boundary_compute": 0.0, + "boundary_update": 0.0, + "one_site": 0.0, + "gather": 0.0, + } + + gates = circuit.queue + if compile_circuit: + gates = _route_non_adjacent_gates(gates, circuit.nqubits) + gates = _fuse_one_site_blocks(gates) + for batch in _disjoint_batches(gates): + if _is_two_qubit_batch(batch): + self._apply_two_site_batch(batch, timings) + else: + tic = time.perf_counter() + for gate in batch: + sites = _gate_sites(gate) + if len(sites) == 1 and self.owns_site(sites[0]): + op = _asarray(self.xp, gate.matrix(), self.dtype) + self.apply_one_site(op, sites[0]) + elif len(sites) == 2: + self._apply_two_site_batch([gate], timings) + elif len(sites) > 2: + raise NotImplementedError("Only one- and two-qubit gates are supported.") + timings["one_site"] += time.perf_counter() - tic + + return timings + + def apply_one_site(self, op, pos): + self.gammas[pos] = self.xp.einsum("st,atb->asb", op, self.gammas[pos]) + + def _apply_two_site_batch(self, batch, timings): + local_gates = [] + boundary_specs = [] + recv_left_update = False + for gate in batch: + sites = _gate_sites(gate) + if abs(sites[0] - sites[1]) != 1: + raise NotImplementedError("Segment Vidal supports adjacent two-qubit gates only.") + left, right = sorted(sites) + left_owner = self.owner_of(left) + right_owner = self.owner_of(right) + if left_owner == self.rank and right_owner == self.rank: + local_gates.append(gate) + elif left_owner == self.rank: + boundary_specs.append((gate, left, right)) + elif right_owner == self.rank: + recv_left_update = True + + tic = time.perf_counter() + edge_send_req = None + if recv_left_update: + edge_send_req = self.comm.isend( + self._edge_payload(), dest=self.rank - 1, tag=_EDGE_TAG + ) + right_edge = ( + self.comm.recv(source=self.rank + 1, tag=_EDGE_TAG) + if boundary_specs + else None + ) + timings["edge_exchange"] += time.perf_counter() - tic + + boundary_update = None + tic = time.perf_counter() + for gate, left, right in boundary_specs: + boundary_update = self._compute_boundary_update( + gate, left, right, right_edge + ) + timings["boundary_compute"] += time.perf_counter() - tic + + tic = time.perf_counter() + update_send_req = None + if boundary_update is not None: + update_send_req = self.comm.isend( + boundary_update, dest=self.rank + 1, tag=_UPDATE_TAG + ) + timings["boundary_update"] += time.perf_counter() - tic + + tic = time.perf_counter() + local_items = [ + self._compute_owned_two_site_update(gate) + for gate in local_gates + ] + timings["local_compute"] += time.perf_counter() - tic + + tic = time.perf_counter() + left_boundary_update = ( + self.comm.recv(source=self.rank - 1, tag=_UPDATE_TAG) + if recv_left_update + else None + ) + if update_send_req is not None: + update_send_req.wait() + if edge_send_req is not None: + edge_send_req.wait() + timings["boundary_update"] += time.perf_counter() - tic + + for update in local_items: + self._install_update(update) + if boundary_update is not None: + self._install_update(boundary_update) + if left_boundary_update is not None: + self._install_update(left_boundary_update) + + def _edge_payload(self): + return { + "start": self.start, + "end": self.end, + "gamma_start": _to_numpy(self.gammas[self.start]), + "lambda_after_start": _to_numpy(self.lambdas[self.start + 1]), + } + + def _compute_owned_two_site_update(self, gate): + sites = _gate_sites(gate) + op = _asarray(self.xp, gate.matrix(), self.dtype) + left, right = sites + if left > right: + left, right = right, left + op = _transpose(self.xp, op.reshape(2, 2, 2, 2), (1, 0, 3, 2)).reshape(4, 4) + item = self._build_item( + left, + op, + self.lambdas[left], + self.lambdas[left + 1], + self.lambdas[left + 2], + self.gammas[left], + self.gammas[right], + ) + split = _svd(self.xp, item["matrix"]) + return _make_two_site_update( + item, *split, self.max_bond, self.cut_ratio, self.xp + ) + + def _compute_boundary_update(self, gate, left, right, remote): + op = _asarray(self.xp, gate.matrix(), self.dtype) + sites = _gate_sites(gate) + if sites[0] > sites[1]: + op = _transpose(self.xp, op.reshape(2, 2, 2, 2), (1, 0, 3, 2)).reshape(4, 4) + + gamma_right = _asarray(self.xp, remote["gamma_start"], self.dtype) + lam_right = _asarray( + self.xp, + remote["lambda_after_start"], + self.xp.float64 if self.xp is not np else np.float64, + ) + item = self._build_item( + left, + op, + self.lambdas[left], + self.lambdas[left + 1], + lam_right, + self.gammas[left], + gamma_right, + ) + split = _svd(self.xp, item["matrix"]) + return _tensor_update_to_numpy( + _make_two_site_update( + item, *split, self.max_bond, self.cut_ratio, self.xp + ) + ) + + def _build_item(self, site, op, lam_left, lam_mid, lam_right, gamma_left, gamma_right): + result = _build_theta_svd_matrix( + op, self.xp, lam_left, lam_mid, lam_right, gamma_left, gamma_right + ) + result["site"] = site + result["lam_left"] = lam_left + result["lam_right"] = lam_right + return result + + def _install_update(self, update): + if isinstance(update["left"], np.ndarray): + update = _tensor_update_from_numpy(self.xp, update, self.dtype) + self._accumulated_truncation_error += update.get("truncation_error", 0.0) + site = update["site"] + if self.owns_site(site): + self.gammas[site] = update["left"] + if self.owns_site(site + 1): + self.gammas[site + 1] = update["right"] + if self.start <= site + 1 <= self.end: + self.lambdas[site + 1] = update["lambda"] + + def gather_full_state(self): + payload = { + "start": self.start, + "end": self.end, + "gammas": {site: _to_numpy(tensor) for site, tensor in self.gammas.items()}, + "lambdas": {bond: _to_numpy(tensor) for bond, tensor in self.lambdas.items()}, + } + return self.comm.gather(payload, root=0) + + def expectation_pauli_sum_root(self, terms): + payloads = self.gather_full_state() + if self.rank != 0: + return None + full = VidalTEBDExecutor( + self.nqubits, + self.max_bond, + cut_ratio=self.cut_ratio, + tensor_module=self.tensor_module, + ) + for payload in payloads: + for site, tensor in payload["gammas"].items(): + full.gammas[int(site)] = _asarray(self.xp, tensor, self.dtype) + for bond, tensor in payload["lambdas"].items(): + full.lambdas[int(bond)] = _asarray( + self.xp, + tensor, + self.xp.float64 if self.xp is not np else np.float64, + ) + return full.expectation_pauli_sum(terms) + + def expectation_ring_xz_root(self): + terms = [ + (0.5, (("X", site), ("Z", (site + 1) % self.nqubits))) + for site in range(self.nqubits) + ] + return self.expectation_pauli_sum_root(terms) + + +def run_segment_vidal_mpi_ring_xz( + circuit, + max_bond, + comm, + cut_ratio=1e-12, + tensor_module="torch", +): + executor = SegmentVidalMPIExecutor( + nqubits=circuit.nqubits, + max_bond=max_bond, + cut_ratio=cut_ratio, + tensor_module=tensor_module, + comm=comm, + ) + timings = executor.run_circuit(circuit) + tic = time.perf_counter() + value = executor.expectation_ring_xz_root() + timings["gather"] = time.perf_counter() - tic + return value, timings diff --git a/src/qibotn/backends/vidal_tebd.py b/src/qibotn/backends/vidal_tebd.py index fe324a9..e2b2640 100644 --- a/src/qibotn/backends/vidal_tebd.py +++ b/src/qibotn/backends/vidal_tebd.py @@ -62,6 +62,41 @@ def _to_float(x): return float(x) +def _to_numpy(tensor): + if hasattr(tensor, "detach"): + return tensor.detach().cpu().numpy() + return np.asarray(tensor) + + +def _tensor_update_to_numpy(update): + result = { + "site": int(update["site"]), + "left": _to_numpy(update["left"]), + "right": _to_numpy(update["right"]), + "lambda": _to_numpy(update["lambda"]), + } + if "truncation_error" in update: + result["truncation_error"] = float(update["truncation_error"]) + return result + + +def _tensor_update_from_numpy(xp, update, dtype): + if xp is np: + return update + result = { + "site": update["site"], + "left": _asarray(xp, update["left"], dtype), + "right": _asarray(xp, update["right"], dtype), + "lambda": xp.as_tensor( + update["lambda"], + dtype=xp.float64 if dtype == xp.complex128 else xp.float32, + ), + } + if "truncation_error" in update: + result["truncation_error"] = float(update["truncation_error"]) + return result + + def _svd(xp, matrix): return _svd_eigh(xp, matrix) @@ -92,32 +127,6 @@ def _svd_eigh(xp, matrix): return umat, singvals, _conj(xp, eigvecs).T -def _batched_svd_eigh(xp, matrices): - """Batched EVD SVD for a stack of same-shape torch matrices.""" - - m_dim, n_dim = matrices.shape[-2:] - if m_dim <= n_dim: - grams = matrices @ _conj(xp, matrices).transpose(-1, -2) - eigvals, eigvecs = xp.linalg.eigh(grams) - idx = xp.argsort(eigvals, dim=-1, descending=True) - eigvals = xp.gather(eigvals, -1, idx) - eigvecs = xp.gather(eigvecs, -1, idx.unsqueeze(-2).expand_as(eigvecs)) - singvals = _sqrt_clamped(xp, eigvals) - inv_s = _safe_inverse(xp, singvals) - vhs = (eigvecs.conj().transpose(-1, -2) @ matrices) * inv_s.unsqueeze(-1) - return eigvecs, singvals, vhs - - grams = _conj(xp, matrices).transpose(-1, -2) @ matrices - eigvals, eigvecs = xp.linalg.eigh(grams) - idx = xp.argsort(eigvals, dim=-1, descending=True) - eigvals = xp.gather(eigvals, -1, idx) - eigvecs = xp.gather(eigvecs, -1, idx.unsqueeze(-2).expand_as(eigvecs)) - singvals = _sqrt_clamped(xp, eigvals) - inv_s = _safe_inverse(xp, singvals) - umats = (matrices @ eigvecs) * inv_s.unsqueeze(-2) - return umats, singvals, eigvecs.conj().transpose(-1, -2) - - def _eigh(xp, matrix): if xp is np: return np.linalg.eigh(matrix) @@ -126,10 +135,8 @@ def _eigh(xp, matrix): def _sort_eigh_desc(xp, eigvals, eigvecs): if xp is np: - idx = np.argsort(eigvals)[::-1].copy() - return eigvals[idx], eigvecs[:, idx] - idx = xp.argsort(eigvals, descending=True) - return eigvals[idx], eigvecs[:, idx] + return eigvals[::-1].copy(), eigvecs[:, ::-1].copy() + return xp.flip(eigvals, dims=(0,)), xp.flip(eigvecs, dims=(1,)) def _sqrt_clamped(xp, eigvals): @@ -150,8 +157,6 @@ class VidalTEBDExecutor: max_bond: int cut_ratio: float = 1e-12 tensor_module: str = "torch" - workers: int = 1 - use_batched: bool = False def __post_init__(self): self.xp = _backend_module(self.tensor_module) @@ -169,19 +174,20 @@ class VidalTEBDExecutor: self.lambdas = [ _ones(self.xp, 1, self.dtype, self.device) for _ in range(self.nqubits + 1) ] + self._accumulated_truncation_error = 0.0 - def run_circuit(self, circuit): - for batch in _disjoint_batches(circuit.queue): - if ( - self.use_batched - and _is_two_qubit_batch(batch) - and self.workers > 1 - and len(batch) > 1 - ): - self.apply_two_site_batch(batch) - else: - for gate in batch: - self._apply_gate(gate) + def run_circuit(self, circuit, compile_circuit=True): + gates = circuit.queue + if compile_circuit: + gates = _route_non_adjacent_gates(gates, circuit.nqubits) + gates = _fuse_one_site_blocks(gates) + for batch in _disjoint_batches(gates): + for gate in batch: + self._apply_gate(gate) + + @property + def truncation_error(self): + return self._accumulated_truncation_error def _apply_gate(self, gate): sites = _gate_sites(gate) @@ -204,40 +210,6 @@ class VidalTEBDExecutor: umat, singvals, vh = _svd(self.xp, item["matrix"]) self._install_two_site_split(item, umat, singvals, vh) - def apply_two_site_batch(self, batch): - items = [] - for gate in batch: - sites = _gate_sites(gate) - op = _asarray(self.xp, gate.matrix(), self.dtype) - items.append(self._build_two_site_matrix(op, sites[0], sites[1])) - - if self.xp is not np: - grouped = {} - for idx, item in enumerate(items): - grouped.setdefault(tuple(item["matrix"].shape), []).append(idx) - - for indices in grouped.values(): - if len(indices) == 1: - item = items[indices[0]] - umat, singvals, vh = _svd(self.xp, item["matrix"]) - item["split"] = (umat, singvals, vh) - continue - - matrices = self.xp.stack([items[idx]["matrix"] for idx in indices]) - umats, singvals, vhs = _batched_svd_eigh(self.xp, matrices) - for out_idx, item_idx in enumerate(indices): - items[item_idx]["split"] = ( - umats[out_idx], - singvals[out_idx], - vhs[out_idx], - ) - else: - for item in items: - item["split"] = _svd(self.xp, item["matrix"]) - - for item in items: - self._install_two_site_split(item, *item["split"]) - def _build_two_site_matrix(self, op, left_pos, right_pos): if left_pos > right_pos: left_pos, right_pos = right_pos, left_pos @@ -246,101 +218,69 @@ class VidalTEBDExecutor: ) i = left_pos - lam_left = self.lambdas[i] - lam_mid = self.lambdas[i + 1] - lam_right = self.lambdas[i + 2] - gamma_left = self.gammas[i] - gamma_right = self.gammas[i + 1] - - theta = self.xp.einsum( - "a,asb,b,btc,c->astc", - lam_left, - gamma_left, - lam_mid, - gamma_right, - lam_right, + result = _build_theta_svd_matrix( + op, self.xp, + self.lambdas[i], self.lambdas[i + 1], self.lambdas[i + 2], + self.gammas[i], self.gammas[i + 1], ) - gate = op.reshape(2, 2, 2, 2) - theta = self.xp.einsum("uvst,astc->auvc", gate, theta) - - chi_left = theta.shape[0] - chi_right = theta.shape[3] - matrix = theta.reshape(chi_left * 2, 2 * chi_right) - return { - "site": i, - "chi_left": chi_left, - "chi_right": chi_right, - "lam_left": lam_left, - "lam_right": lam_right, - "matrix": matrix, - } + result["site"] = i + result["lam_left"] = self.lambdas[i] + result["lam_right"] = self.lambdas[i + 2] + return result def _install_two_site_split(self, item, umat, singvals, vh): - keep = self._choose_bond(singvals) - umat = umat[:, :keep] - kept = singvals[:keep] - cut = singvals[keep:] - vh = vh[:keep, :] - - if cut.shape[0] > 0: - norm_kept = (kept * kept).sum() - norm_cut = (cut * cut).sum() - kept = kept / self.xp.sqrt(norm_kept / (norm_kept + norm_cut)) - - new_left = umat.reshape(item["chi_left"], 2, keep) - new_right = vh.reshape(keep, 2, item["chi_right"]) - new_left = self._divide_left_lambda(new_left, item["lam_left"]) - new_right = self._divide_right_lambda(new_right, item["lam_right"]) - - i = item["site"] - self.gammas[i] = new_left - self.gammas[i + 1] = new_right - self.lambdas[i + 1] = kept - - def _choose_bond(self, singvals): - max_possible = int(singvals.shape[0]) - keep = min(max_possible, self.max_bond) - if self.cut_ratio > 0 and max_possible > 0: - threshold = singvals[0] * self.cut_ratio - if self.xp is np: - ratio_keep = int(np.count_nonzero(singvals > threshold)) - else: - ratio_keep = int((singvals > threshold).sum().detach().cpu().item()) - keep = min(keep, max(1, ratio_keep)) - return keep - - def _divide_left_lambda(self, tensor, lambdas): - if self.xp is np: - safe = np.where(np.abs(lambdas) > 1e-300, lambdas, 1.0) - else: - safe = self.xp.where( - self.xp.abs(lambdas) > 1e-300, - lambdas, - self.xp.ones_like(lambdas), - ) - return tensor / safe.reshape(-1, 1, 1) - - def _divide_right_lambda(self, tensor, lambdas): - if self.xp is np: - safe = np.where(np.abs(lambdas) > 1e-300, lambdas, 1.0) - else: - safe = self.xp.where( - self.xp.abs(lambdas) > 1e-300, - lambdas, - self.xp.ones_like(lambdas), - ) - return tensor / safe.reshape(1, 1, -1) + update = _make_two_site_update(item, umat, singvals, vh, + self.max_bond, self.cut_ratio, self.xp) + self._accumulated_truncation_error += update["truncation_error"] + i = update["site"] + self.gammas[i] = update["left"] + self.gammas[i + 1] = update["right"] + self.lambdas[i + 1] = update["lambda"] def expectation_ring_xz(self): - xmat = _asarray(self.xp, [[0, 1], [1, 0]], self.dtype) - zmat = _asarray(self.xp, [[1, 0], [0, -1]], self.dtype) - value = 0.0 - for site in range(self.nqubits - 1): - value += 0.5 * _to_float(self._expect_adjacent(site, xmat, zmat)) - value += 0.5 * _to_float( - self.expect_product_operators({self.nqubits - 1: xmat, 0: zmat}) + return self.expectation_pauli_sum( + [ + (0.5, (("X", site), ("Z", (site + 1) % self.nqubits))) + for site in range(self.nqubits) + ] ) - return value / self.norm() + + def expectation_pauli_sum(self, terms): + paulis = { + "I": _eye(self.xp, 2, self.dtype, self.device), + "X": _asarray(self.xp, [[0, 1], [1, 0]], self.dtype), + "Y": _asarray(self.xp, [[0, -1j], [1j, 0]], self.dtype), + "Z": _asarray(self.xp, [[1, 0], [0, -1]], self.dtype), + } + value = 0.0 + norm = self.norm() + for coeff, ops in terms: + ops = tuple((name.upper(), site) for name, site in ops) + if len(ops) == 0: + term_value = norm + elif len(ops) == 1: + name, site = ops[0] + term_value = _to_float(self._expect_one_site(site, paulis[name])) + elif len(ops) == 2 and abs(ops[0][1] - ops[1][1]) == 1: + (name0, site0), (name1, site1) = sorted(ops, key=lambda item: item[1]) + term_value = _to_float( + self._expect_adjacent(site0, paulis[name0], paulis[name1]) + ) + else: + operators = {site: paulis[name] for name, site in ops} + term_value = _to_float(self.expect_product_operators(operators)) + value += float(np.real(coeff)) * term_value + return value / norm + + def _expect_one_site(self, site, op): + theta = self.xp.einsum( + "a,asb,b->asb", + self.lambdas[site], + self.gammas[site], + self.lambdas[site + 1], + ) + op_theta = self.xp.einsum("us,asb->aub", op, theta) + return _vdot_real(self.xp, theta, op_theta) def _expect_adjacent(self, site, op_left, op_right): theta = self.xp.einsum( @@ -369,6 +309,79 @@ class VidalTEBDExecutor: return _to_float(self.expect_product_operators({})) +def _build_theta_svd_matrix(op, xp, lam_left, lam_mid, lam_right, gamma_left, gamma_right): + """Merge and apply a two-site gate, returning the SVD-ready matrix.""" + theta = xp.einsum( + "a,asb,b,btc,c->astc", + lam_left, gamma_left, lam_mid, gamma_right, lam_right, + ) + gate = op.reshape(2, 2, 2, 2) + theta = xp.einsum("uvst,astc->auvc", gate, theta) + chi_left = theta.shape[0] + chi_right = theta.shape[3] + return { + "chi_left": chi_left, + "chi_right": chi_right, + "matrix": theta.reshape(chi_left * 2, 2 * chi_right), + } + + +def _choose_bond(singvals, max_bond, cut_ratio, xp): + max_possible = int(singvals.shape[0]) + keep = min(max_possible, max_bond) + if cut_ratio > 0 and max_possible > 0: + threshold = singvals[0] * cut_ratio + if xp is np: + ratio_keep = int(np.count_nonzero(singvals > threshold)) + else: + ratio_keep = int((singvals > threshold).sum().detach().cpu().item()) + keep = min(keep, max(1, ratio_keep)) + return keep + + +def _divide_left_lambda(tensor, lambdas, xp): + if xp is np: + safe = np.where(np.abs(lambdas) > 1e-300, lambdas, 1.0) + else: + safe = xp.where(xp.abs(lambdas) > 1e-300, lambdas, xp.ones_like(lambdas)) + return tensor / safe.reshape(-1, 1, 1) + + +def _divide_right_lambda(tensor, lambdas, xp): + if xp is np: + safe = np.where(np.abs(lambdas) > 1e-300, lambdas, 1.0) + else: + safe = xp.where(xp.abs(lambdas) > 1e-300, lambdas, xp.ones_like(lambdas)) + return tensor / safe.reshape(1, 1, -1) + + +def _make_two_site_update(item, umat, singvals, vh, max_bond, cut_ratio, xp): + keep = _choose_bond(singvals, max_bond, cut_ratio, xp) + umat = umat[:, :keep] + kept = singvals[:keep] + cut = singvals[keep:] + vh = vh[:keep, :] + + discarded_weight = 0.0 + if cut.shape[0] > 0: + norm_kept = (kept * kept).sum() + norm_cut = (cut * cut).sum() + discarded_weight = float(_to_float(norm_cut)) + kept = kept / xp.sqrt(norm_kept / (norm_kept + norm_cut)) + + new_left = umat.reshape(item["chi_left"], 2, keep) + new_right = vh.reshape(keep, 2, item["chi_right"]) + new_left = _divide_left_lambda(new_left, item["lam_left"], xp) + new_right = _divide_right_lambda(new_right, item["lam_right"], xp) + return { + "site": item["site"], + "left": new_left, + "right": new_right, + "lambda": kept, + "truncation_error": discarded_weight, + } + + def _gate_sites(gate): controls = tuple(getattr(gate, "control_qubits", ())) targets = tuple(getattr(gate, "target_qubits", ())) @@ -377,19 +390,90 @@ def _gate_sites(gate): return targets +# ── SWAP routing for non-adjacent two-qubit gates ────────────────────── + +class _SWAPGate: + """Minimal SWAP gate wrapper for routing non-adjacent gates.""" + name = "swap" + control_qubits = () + + def __init__(self, left, right): + self.target_qubits = (left, right) + + def matrix(self): + return np.array( + [[1, 0, 0, 0], + [0, 0, 1, 0], + [0, 1, 0, 0], + [0, 0, 0, 1]], + dtype=complex, + ) + + +class _RoutedTwoQubitGate: + """Wraps a two-qubit gate with remapped physical sites after SWAP routing.""" + name = "routed_two_qubit" + control_qubits = () + + def __init__(self, original_gate, left_phys, right_phys): + self.target_qubits = (left_phys, right_phys) + self._matrix = original_gate.matrix() + + def matrix(self): + return self._matrix + + +def _route_non_adjacent_gates(gates, nqubits): + """Insert SWAP networks to make all two-qubit gates adjacent. + + For each non-adjacent two-qubit gate, inserts SWAP gates to bring the + farther qubit adjacent, applies the original gate, then inserts reverse + SWAPs to restore the qubit ordering. The resulting gate sequence + contains only adjacent two-qubit gates and is safe for VidalTEBDExecutor. + """ + routed = [] + for gate in gates: + sites = _gate_sites(gate) + if len(sites) <= 1: + routed.append(gate) + continue + + left, right = sorted(sites) + if right - left == 1: + routed.append(gate) + continue + + # Move qubit 'right' leftwards to sit at left+1 + for pos in range(right - 1, left, -1): + routed.append(_SWAPGate(pos, pos + 1)) + + # Apply the original gate at the adjacent physical positions + routed.append(_RoutedTwoQubitGate(gate, left, left + 1)) + + # Reverse SWAPs to restore original ordering + for pos in range(left + 1, right): + routed.append(_SWAPGate(pos, pos + 1)) + + return routed + + def _disjoint_batches(gates): batches = [] current = [] touched = set() + current_arity = None for gate in gates: sites = _gate_sites(gate) + arity = len(sites) site_set = set(sites) - if current and touched & site_set: + if current and (current_arity != arity or touched & site_set): batches.append(current) current = [] touched = set() + current_arity = None current.append(gate) touched |= site_set + current_arity = arity if current: batches.append(current) return batches @@ -399,21 +483,59 @@ def _is_two_qubit_batch(batch): return batch and all(len(_gate_sites(gate)) == 2 for gate in batch) +class _FusedOneSiteGate: + name = "fused_one_site" + + def __init__(self, site, matrix): + self.target_qubits = (site,) + self.control_qubits = () + self._matrix = matrix + + def matrix(self): + return self._matrix + + +def _fuse_one_site_blocks(gates): + fused = [] + block = [] + + def flush_block(): + nonlocal block + if not block: + return + per_site = {} + for gate in block: + site = _gate_sites(gate)[0] + mat = gate.matrix() + if site in per_site: + per_site[site] = mat @ per_site[site] + else: + per_site[site] = mat + for site in sorted(per_site): + fused.append(_FusedOneSiteGate(site, per_site[site])) + block = [] + + for gate in gates: + if len(_gate_sites(gate)) == 1: + block.append(gate) + continue + flush_block() + fused.append(gate) + flush_block() + return fused + + def run_vidal_ring_xz( circuit, max_bond, cut_ratio=1e-12, tensor_module="torch", - workers=1, - use_batched=False, ): executor = VidalTEBDExecutor( nqubits=circuit.nqubits, max_bond=max_bond, cut_ratio=cut_ratio, tensor_module=tensor_module, - workers=workers, - use_batched=use_batched, ) executor.run_circuit(circuit) return executor.expectation_ring_xz() diff --git a/src/qibotn/benchmark_cases.py b/src/qibotn/benchmark_cases.py new file mode 100644 index 0000000..cee08dc --- /dev/null +++ b/src/qibotn/benchmark_cases.py @@ -0,0 +1,151 @@ +"""Reusable benchmark circuits and observables for expectation runs.""" + +from __future__ import annotations + +import math + +import numpy as np +from qibo import Circuit, gates + + +CIRCUITS = ( + "brickwall_cnot", + "reversed_cnot", + "shifted_cz", + "rxx_rzz", + "swap_scramble", + "ghz_ladder", +) + +OBSERVABLES = ( + "ring_xz", + "open_zz", + "mixed_local", + "range2_xx", + "long_z_string", +) + + +def parse_names(raw, valid, label): + if raw == ["all"]: + return list(valid) + unknown = sorted(set(raw) - set(valid)) + if unknown: + raise ValueError(f"Unknown {label}: {', '.join(unknown)}") + return raw + + +def build_circuit(kind, nqubits, nlayers, seed): + rng = np.random.default_rng(seed) + circuit = Circuit(nqubits) + + if kind == "ghz_ladder": + circuit.add(gates.H(0)) + for qubit in range(nqubits - 1): + circuit.add(gates.CNOT(qubit, qubit + 1)) + return circuit + + for layer in range(nlayers): + for qubit in range(nqubits): + circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi))) + circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi))) + if kind in ("rxx_rzz", "swap_scramble"): + circuit.add(gates.RX(qubit, theta=rng.uniform(-math.pi, math.pi))) + + if kind == "brickwall_cnot": + add_brickwall(circuit, nqubits, gates.CNOT, layer, reverse=False) + elif kind == "reversed_cnot": + add_brickwall(circuit, nqubits, gates.CNOT, layer, reverse=True) + elif kind == "shifted_cz": + for qubit in range(layer % 2, nqubits - 1, 2): + circuit.add(gates.CZ(qubit, qubit + 1)) + elif kind == "rxx_rzz": + for qubit in range(layer % 2, nqubits - 1, 2): + circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.7, 0.7))) + circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.7, 0.7))) + elif kind == "swap_scramble": + for qubit in range(layer % 2, nqubits - 1, 2): + circuit.add(gates.CZ(qubit, qubit + 1)) + if layer % 4 == 3: + circuit.add(gates.SWAP(qubit, qubit + 1)) + else: + raise ValueError(f"Unknown circuit kind {kind!r}.") + + return circuit + + +def add_brickwall(circuit, nqubits, gate, layer, reverse): + for qubit in range(0, nqubits - 1, 2): + if reverse and layer % 2: + circuit.add(gate(qubit + 1, qubit)) + else: + circuit.add(gate(qubit, qubit + 1)) + for qubit in range(1, nqubits - 1, 2): + if reverse and not layer % 2: + circuit.add(gate(qubit + 1, qubit)) + else: + circuit.add(gate(qubit, qubit + 1)) + + +def observable_terms(kind, nqubits): + if kind == "ring_xz": + return [ + (0.5, (("X", site), ("Z", (site + 1) % nqubits))) + for site in range(nqubits) + ] + if kind == "open_zz": + return [ + (1.0 / (nqubits - 1), (("Z", site), ("Z", site + 1))) + for site in range(nqubits - 1) + ] + if kind == "mixed_local": + terms = [(0.25, (("X", 0),)), (-0.5, (("Z", nqubits - 1),))] + terms += [ + (0.125, (("Y", site), ("Y", site + 1))) + for site in range(0, nqubits - 1, 3) + ] + return terms + if kind == "range2_xx": + return [ + (1.0 / max(1, nqubits - 2), (("X", site), ("X", site + 2))) + for site in range(nqubits - 2) + ] + if kind == "long_z_string": + stride = max(1, nqubits // 16) + return [(1.0, tuple(("Z", site) for site in range(0, nqubits, stride)))] + raise ValueError(f"Unknown observable kind {kind!r}.") + + +def terms_to_dict(terms): + return { + "terms": [ + { + "coefficient": float(np.real(coeff)), + "operators": [(name, int(site)) for name, site in ops], + } + for coeff, ops in terms + ] + } + + +def exact_pauli_sum(circuit, terms, nqubits): + state = circuit().state(numpy=True).reshape(-1) + indices = np.arange(state.size, dtype=np.int64) + value = 0.0 + 0.0j + for coeff, ops in terms: + flipped = indices.copy() + phase = np.ones(state.size, dtype=np.complex128) + for name, site in ops: + shift = nqubits - 1 - site + bit = (indices >> shift) & 1 + if name == "X": + flipped ^= 1 << shift + elif name == "Y": + flipped ^= 1 << shift + phase *= 1j * (1 - 2 * bit) + elif name == "Z": + phase *= 1 - 2 * bit + elif name != "I": + raise ValueError(f"Unsupported Pauli {name!r}.") + value += coeff * np.vdot(state[flipped], phase * state) + return float(value.real) diff --git a/src/qibotn/expectation_runner.py b/src/qibotn/expectation_runner.py new file mode 100644 index 0000000..22c2f55 --- /dev/null +++ b/src/qibotn/expectation_runner.py @@ -0,0 +1,71 @@ +"""High-level CPU expectation runner used by CLI scripts.""" + +from __future__ import annotations + +import time +from dataclasses import dataclass + +import numpy as np +from qibo.backends import construct_backend + +from qibotn.benchmark_cases import exact_pauli_sum +from qibotn.observables import check_observable + + +@dataclass +class ExpectationConfig: + ansatz: str = "tn" + mpi: bool = False + bond: int = 1024 + cut_ratio: float = 1e-12 + tensor_module: str = "torch" + torch_threads: int = 8 + parallel_opts: dict | None = None + + +@dataclass +class ExpectationResult: + value: float + seconds: float + rank: int = 0 + + +def exact_for_observable(circuit, observable, nqubits): + if isinstance(observable, dict) and "terms" in observable: + terms = [ + ( + term["coefficient"], + tuple((name, site) for name, site in term["operators"]), + ) + for term in observable["terms"] + ] + return exact_pauli_sum(circuit, terms, nqubits) + + hamiltonian = check_observable(observable, nqubits) + return float(hamiltonian.expectation_from_state(circuit().state(numpy=True)).real) + + +def run_cpu_expectation(circuit, observable, config): + runcard = { + "MPI_enabled": config.mpi, + "MPS_enabled": config.ansatz.lower() == "mps", + "NCCL_enabled": False, + "expectation_enabled": observable, + "max_bond_dimension": config.bond, + "cut_ratio": config.cut_ratio, + "tensor_module": config.tensor_module, + "torch_threads": config.torch_threads, + "parallel_opts": config.parallel_opts or {}, + } + backend = construct_backend( + backend="qibotn", + platform="cpu", + runcard=runcard, + ) + + start = time.perf_counter() + value = backend.execute_circuit(circuit)[0] + elapsed = time.perf_counter() - start + + rank = getattr(backend, "rank", 0) + return ExpectationResult(float(np.real(value)), elapsed, rank=rank) diff --git a/src/qibotn/observables.py b/src/qibotn/observables.py index 56b3953..5fe3c3b 100644 --- a/src/qibotn/observables.py +++ b/src/qibotn/observables.py @@ -29,6 +29,11 @@ def build_observable(circuit_nqubit): def create_hamiltonian_from_dict(data, circuit_nqubit): """Create a Qibo SymbolicHamiltonian from the qibotn dict representation.""" + if "pauli_string_pattern" in data: + return create_hamiltonian_from_pauli_pattern( + data["pauli_string_pattern"], circuit_nqubit + ) + pauli_gates = {"X": X, "Y": Y, "Z": Z} terms = [] @@ -54,6 +59,38 @@ def create_hamiltonian_from_dict(data, circuit_nqubit): return hamiltonians.SymbolicHamiltonian(sum(terms)) +def create_hamiltonian_from_pauli_pattern(pattern, circuit_nqubit): + """Create a single Pauli-string Hamiltonian by repeating ``pattern``. + + Example: pattern ``"IXZ"`` on 5 qubits becomes ``I0 * X1 * Z2 * I3 * X4``. + Identity factors are omitted except for the all-identity case. + """ + if not isinstance(pattern, str) or not pattern: + raise ValueError("pauli_string_pattern must be a non-empty string.") + + pauli_gates = {"X": X, "Y": Y, "Z": Z} + pattern = pattern.upper() + invalid = sorted(set(pattern) - {"I", "X", "Y", "Z"}) + if invalid: + raise ValueError( + "pauli_string_pattern characters must be one of I/X/Y/Z; " + f"got {''.join(invalid)!r}." + ) + + expr = None + for qubit in range(circuit_nqubit): + name = pattern[qubit % len(pattern)] + if name == "I": + continue + factor = pauli_gates[name](qubit) + expr = factor if expr is None else expr * factor + + if expr is None: + expr = I(0) + + return hamiltonians.SymbolicHamiltonian(form=expr) + + def build_random_circuit(nqubits, nlayers, seed=42): """Build a random circuit with RY+RZ+CNOT layers for benchmarks.""" import numpy as np diff --git a/src/qibotn/parallel.py b/src/qibotn/parallel.py index 5eee1f1..9445c97 100644 --- a/src/qibotn/parallel.py +++ b/src/qibotn/parallel.py @@ -2,8 +2,11 @@ import os import pickle import signal +import time +from math import log2, log10 import numpy as np -from concurrent.futures import ProcessPoolExecutor, as_completed +from dataclasses import dataclass +from concurrent.futures import ProcessPoolExecutor, TimeoutError, as_completed try: from mpi4py import MPI @@ -13,25 +16,59 @@ except ImportError: MPI = None -def _run_single_trial(tn_bytes, output_inds, seed, slicing_opts): +SEARCH_METHODS = ("greedy", "kahypar", "kahypar-agglom", "spinglass") + + +def _search_chunk( + tn_bytes, + output_inds, + repeats, + seed, + max_time, + slicing_opts, + optlib=None, +): import random, cotengra as ctg + random.seed(seed) tn = pickle.loads(tn_bytes) + kwargs = {} + if optlib is not None: + kwargs["optlib"] = optlib opt = ctg.HyperOptimizer( - methods=["kahypar", "kahypar-agglom", "spinglass"], - max_repeats=1, + methods=SEARCH_METHODS, + max_repeats=repeats, + max_time=max_time, parallel=False, minimize="combo-256", - optlib="random", slicing_opts=slicing_opts, progbar=False, + **kwargs, ) tree = tn.contraction_tree(optimize=opt, output_inds=output_inds) return tree.combo_cost(factor=256), tree +def _run_single_trial(tn_bytes, output_inds, seed, slicing_opts): + return _search_chunk( + tn_bytes, + output_inds, + repeats=1, + seed=seed, + max_time=None, + slicing_opts=slicing_opts, + optlib="random", + ) + + def _kill_pool(pool): - for pid in list(pool._processes.keys()): + processes = getattr(pool, "_processes", None) + if processes: + pids = list(processes.keys()) + else: + pids = [] + + for pid in pids: try: os.kill(pid, signal.SIGKILL) except ProcessLookupError: @@ -43,21 +80,14 @@ def _serial_search(tn_bytes, output_inds, repeats, seed, max_time, slicing_opts= import time if trial_timeout is None: - import random, cotengra as ctg - random.seed(seed) - tn = pickle.loads(tn_bytes) - opt = ctg.HyperOptimizer( - methods=["kahypar", "kahypar-agglom", "spinglass"], - max_repeats=repeats, - parallel=False, - minimize="combo-256", + return _search_chunk( + tn_bytes, + output_inds, + repeats=repeats, + seed=seed, max_time=max_time, - optlib="random", slicing_opts=slicing_opts, - progbar=False, ) - tree = tn.contraction_tree(optimize=opt, output_inds=output_inds) - return tree.combo_cost(factor=256), tree deadline = time.time() + max_time best_cost, best_tree = float("inf"), None @@ -80,17 +110,36 @@ def _serial_search(tn_bytes, output_inds, repeats, seed, max_time, slicing_opts= return best_cost, best_tree +def _split_repeats(total_repeats, n_workers): + n_workers = max(1, int(n_workers)) + total_repeats = max(1, int(total_repeats)) + chunk, extra = divmod(total_repeats, n_workers) + return [chunk + (1 if i < extra else 0) for i in range(n_workers) if chunk + (1 if i < extra else 0) > 0] + + def _processpool_search(tn, output_inds, total_repeats, n_workers, max_time, slicing_opts=None, trial_timeout=None): tn_bytes = pickle.dumps(tn) - repeats_per = max(1, total_repeats // n_workers) - pool = ProcessPoolExecutor(max_workers=n_workers) - futures = [ - pool.submit(_serial_search, tn_bytes, output_inds, repeats_per, seed, max_time, slicing_opts, trial_timeout) - for seed in range(n_workers) - ] + repeat_chunks = _split_repeats(total_repeats, n_workers) + pool = ProcessPoolExecutor(max_workers=len(repeat_chunks)) + futures = [] + for seed, repeats in enumerate(repeat_chunks): + futures.append( + pool.submit( + _serial_search, + tn_bytes, + output_inds, + repeats, + seed, + max_time, + slicing_opts, + trial_timeout, + ) + ) best_cost, best_tree = float("inf"), None + deadline = time.monotonic() + max_time if max_time is not None else None try: - for fut in as_completed(futures, timeout=max_time + 5): + timeout = None if deadline is None else max(0.0, deadline - time.monotonic()) + for fut in as_completed(futures, timeout=timeout): try: cost, tree = fut.result() if cost < best_cost: @@ -106,21 +155,133 @@ def _processpool_search(tn, output_inds, total_repeats, n_workers, max_time, sli return best_tree -def _mpi_search(tn, output_inds, total_repeats, max_time, n_workers=None, slicing_opts=None, trial_timeout=None): +def _dask_search( + tn, + output_inds, + total_repeats, + max_time, + slicing_opts=None, + dask_address=None, + n_workers=None, + optlib=None, +): + """Run one centralized cotengra hyper-optimizer over a dask pool. + + With ``dask_address`` this connects to an external distributed scheduler. + Without it, a local dask cluster is created for single-node smoke testing. + """ + try: + from distributed import Client, LocalCluster, get_client + except ImportError as exc: + raise ImportError( + "Dask search requires `distributed`. Install it with " + "`pip install distributed` or the package extra that provides it." + ) from exc + + import cotengra as ctg + + close_client = False + close_cluster = False + cluster = None + + if dask_address: + client = Client(dask_address) + close_client = True + else: + try: + client = get_client() + except ValueError: + cluster = LocalCluster( + n_workers=max(1, int(n_workers or os.cpu_count() or 1)), + threads_per_worker=1, + processes=True, + memory_limit=0, + ) + client = Client(cluster) + close_client = True + close_cluster = True + + kwargs = {} + if optlib is not None: + kwargs["optlib"] = optlib + + try: + opt = ctg.HyperOptimizer( + methods=SEARCH_METHODS, + max_repeats=total_repeats, + max_time=max_time, + parallel=client, + minimize="combo-256", + slicing_opts=slicing_opts, + progbar=False, + **kwargs, + ) + return tn.contraction_tree(optimize=opt, output_inds=output_inds) + finally: + if close_client: + client.close() + if close_cluster: + cluster.close() + + +def _mpi_search( + tn, + output_inds, + total_repeats, + max_time, + n_workers=None, + slicing_opts=None, + trial_timeout=None, + search_backend="processpool", + dask_address=None, +): comm = MPI.COMM_WORLD rank, size = comm.Get_rank(), comm.Get_size() - tn_bytes = pickle.dumps(tn) + search_backend = search_backend or "processpool" + + if search_backend == "dask": + if not dask_address: + raise ValueError( + "MPI + dask search requires an external dask scheduler. Start " + "dask-scheduler/dask-worker outside mpiexec and pass " + "`--dask-address tcp://host:8786`." + ) + + payload = None + if rank == 0: + try: + tree = _dask_search( + tn, + output_inds, + total_repeats, + max_time, + slicing_opts=slicing_opts, + dask_address=dask_address, + n_workers=n_workers, + ) + payload = ("ok", tree) + except Exception as exc: + payload = ("error", repr(exc)) + + status, value = comm.bcast(payload, root=0) + if status == "error": + raise RuntimeError(f"Dask path search failed on rank 0: {value}") + return value + repeats_per = max(1, total_repeats // size) - if n_workers and n_workers > 1: - local_tree = _processpool_search( - tn, output_inds, repeats_per, n_workers, max_time, slicing_opts, trial_timeout - ) - local_cost = local_tree.combo_cost(factor=256) if local_tree else float("inf") - else: - local_cost, local_tree = _serial_search( - tn_bytes, output_inds, repeats_per, rank, max_time, slicing_opts, trial_timeout - ) + # Run search work in child processes even when n_workers == 1, so the parent + # MPI rank can enforce the global timeout by killing active trials. + local_tree = _processpool_search( + tn, + output_inds, + repeats_per, + max(1, n_workers or 1), + max_time, + slicing_opts, + trial_timeout, + ) + local_cost = local_tree.combo_cost(factor=256) if local_tree else float("inf") all_results = comm.gather((local_cost, local_tree), root=0) best_tree = None @@ -133,11 +294,13 @@ def _mpi_search(tn, output_inds, total_repeats, max_time, n_workers=None, slicin def parallel_path_search(tn, output_inds, method='processpool', total_repeats=1024, - max_time=300, n_workers=48, slicing_opts=None, trial_timeout=None): + max_time=300, n_workers=48, slicing_opts=None, + trial_timeout=None, search_backend=None, + dask_address=None): """Parallel contraction path search. Args: - method: 'processpool' | 'mpi' | 'serial' + method: 'processpool' | 'dask' | 'mpi' | 'serial' total_repeats: Total optimization repeats across all workers max_time: Global timeout per worker (seconds) n_workers: Workers per MPI rank (or total for processpool) @@ -151,45 +314,185 @@ def parallel_path_search(tn, output_inds, method='processpool', total_repeats=10 elif method == 'mpi': if not _HAVE_MPI: raise ImportError("mpi4py not available") - return _mpi_search(tn, output_inds, total_repeats, max_time, n_workers, slicing_opts, trial_timeout) + return _mpi_search( + tn, + output_inds, + total_repeats, + max_time, + n_workers, + slicing_opts, + trial_timeout, + search_backend=search_backend, + dask_address=dask_address, + ) elif method == 'processpool': return _processpool_search(tn, output_inds, total_repeats, n_workers, max_time, slicing_opts, trial_timeout) + elif method == 'dask': + return _dask_search( + tn, + output_inds, + total_repeats, + max_time, + slicing_opts=slicing_opts, + dask_address=dask_address, + n_workers=n_workers, + ) else: raise ValueError(f"Unknown method: {method}") -def parallel_contract(tree, arrays, method='mpi', comm=None): +def contraction_tree_costs(tree, dtype_bytes=16, combo_factor=256): + """Return comparable cost estimates for a cotengra contraction tree. + + These values are estimates, not profiling results. They are the right first + signal for path quality: lower ``combo`` usually means lower CPU contraction + time, while ``peak_memory_gib`` estimates the largest intermediate tensor. + """ + stats = tree.contract_stats() + flops = float(stats["flops"]) + write = float(stats["write"]) + size = float(stats["size"]) + combo = float(tree.combo_cost(factor=combo_factor)) + nslices = int(getattr(tree, "multiplicity", 1)) + original_flops = float(stats.get("original_flops", flops)) + + return { + "flops": flops, + "write": write, + "size": size, + "combo": combo, + "log10_flops": log10(flops) if flops > 0 else float("-inf"), + "log10_write": log10(write) if write > 0 else float("-inf"), + "log2_size": log2(size) if size > 0 else float("-inf"), + "log10_combo": log10(combo) if combo > 0 else float("-inf"), + "nslices": nslices, + "slicing_overhead": flops / original_flops if original_flops > 0 else float("nan"), + "peak_memory_gib": size * dtype_bytes / 1024**3, + } + + +@dataclass(frozen=True) +class SlicePlan: + """Slice ownership for one MPI rank.""" + + rank: int + size: int + nslices: int + indices: tuple + assignment: str = "block" + + +@dataclass(frozen=True) +class SlicedContractStats: + """Diagnostics for one sliced contraction.""" + + rank: int + size: int + nslices: int + local_slices: int + assignment: str + + +def mpi_slice_plan(nslices, rank, size, assignment="block"): + """Return the contraction slice ids assigned to one MPI rank. + + ``block`` gives each rank a contiguous range, mirroring cutensornet's + slice-range style. ``cyclic`` gives rank ``r`` slices ``r, r + size, ...``, + which can balance better if individual slice costs vary. + """ + if nslices < 0: + raise ValueError("nslices must be non-negative.") + if size <= 0: + raise ValueError("size must be positive.") + if not 0 <= rank < size: + raise ValueError("rank must satisfy 0 <= rank < size.") + + if assignment == "block": + chunk, extra = divmod(nslices, size) + start = rank * chunk + min(rank, extra) + stop = start + chunk + (1 if rank < extra else 0) + indices = tuple(range(start, stop)) + elif assignment == "cyclic": + indices = tuple(range(rank, nslices, size)) + else: + raise ValueError("assignment must be 'block' or 'cyclic'.") + + return SlicePlan(rank, size, nslices, indices, assignment) + + +def _array_backend(arrays): + return "torch" if type(arrays[0]).__module__.startswith("torch") else "numpy" + + +def _to_numpy_vector(value, is_torch): + if is_torch: + return value.detach().cpu().numpy().reshape(-1) + return np.asarray(value).reshape(-1) + + +def _zero_vector_like(arrays): + return np.zeros(1, dtype=np.complex128) + + +def contract_tree_slices(tree, arrays, slice_indices, backend=None): + """Contract a subset of cotengra slices and return their local sum.""" + backend = backend or _array_backend(arrays) + is_torch = backend == "torch" + local = None + + for slice_id in slice_indices: + value = tree.contract_slice(arrays, slice_id, backend=backend) + value = _to_numpy_vector(value, is_torch) + local = value if local is None else local + value + + return _zero_vector_like(arrays) if local is None else local + + +def parallel_contract( + tree, + arrays, + method='mpi', + comm=None, + assignment="block", + return_stats=False, +): if method == 'mpi': if not _HAVE_MPI or comm is None: raise ValueError("MPI method requires mpi4py and comm") - return _contract_mpi(tree, arrays, comm) + return _contract_mpi( + tree, + arrays, + comm, + assignment=assignment, + return_stats=return_stats, + ) raise ValueError(f"Unknown method: {method}") -def _contract_mpi(tree, arrays, comm, root=0): +def _contract_mpi(tree, arrays, comm, root=0, assignment="block", return_stats=False): rank, size = comm.Get_rank(), comm.Get_size() - is_torch = type(arrays[0]).__module__.startswith("torch") + backend = _array_backend(arrays) + is_torch = backend == "torch" + nslices = int(getattr(tree, "multiplicity", 1)) + stats = SlicedContractStats(rank, size, nslices, 0, assignment) - if is_torch: - result_torch = None - for i in range(rank, tree.multiplicity, size): - x = tree.contract_slice(arrays, i, backend="torch").reshape(-1) - result_torch = x if result_torch is None else result_torch + x + if not set(getattr(tree, "sliced_inds", ())).isdisjoint(set(getattr(tree, "output", ()))): + raise NotImplementedError( + "MPI sliced contraction currently requires sliced indices not to " + "appear in the output." + ) - if result_torch is None: - result_np = np.zeros(1, dtype=np.complex128) - else: - result_np = result_torch.detach().cpu().numpy() - else: - result_np = None - for i in range(rank, tree.multiplicity, size): - x = tree.contract_slice(arrays, i) - x_np = np.asarray(x).reshape(-1) - result_np = x_np if result_np is None else result_np + x_np + if nslices <= 1: + if rank != root: + return (None, stats) if return_stats else None + result = tree.contract(arrays, backend=backend) + result = _to_numpy_vector(result, is_torch) + return (result, stats) if return_stats else result - if result_np is None: - result_np = np.zeros(1, dtype=np.complex128) + plan = mpi_slice_plan(nslices, rank, size, assignment=assignment) + local = contract_tree_slices(tree, arrays, plan.indices, backend=backend) + stats = SlicedContractStats(rank, size, nslices, len(plan.indices), assignment) - result = np.zeros_like(result_np) if rank == root else None - comm.Reduce(result_np, result, root=root) - return result + result = np.zeros_like(local) if rank == root else None + comm.Reduce(local, result, root=root) + return (result, stats) if return_stats else result diff --git a/tests/test_cpu_backend.py b/tests/test_cpu_backend.py new file mode 100644 index 0000000..300c6fe --- /dev/null +++ b/tests/test_cpu_backend.py @@ -0,0 +1,130 @@ +import math + +import numpy as np +from qibo import Circuit, gates, hamiltonians +from qibo.symbols import X, Z + +from qibotn.backends.cpu import CpuTensorNet +from qibotn.benchmark_cases import build_circuit as build_benchmark_circuit + + +def build_circuit(nqubits=6): + circuit = Circuit(nqubits) + for qubit in range(nqubits): + circuit.add(gates.RY(qubit, theta=0.1 * (qubit + 1))) + circuit.add(gates.RZ(qubit, theta=-0.05 * (qubit + 1))) + for qubit in range(nqubits - 1): + circuit.add(gates.CNOT(qubit, qubit + 1)) + return circuit + + +def build_observable(nqubits): + form = 0 + for qubit in range(nqubits): + form += 0.5 * X(qubit) * Z((qubit + 1) % nqubits) + return hamiltonians.SymbolicHamiltonian(form=form) + + +def test_cpu_generic_tn_expectation_matches_statevector(): + circuit = build_circuit() + observable = build_observable(circuit.nqubits) + exact = observable.expectation_from_state(circuit().state(numpy=True)) + + backend = CpuTensorNet( + { + "MPI_enabled": False, + "MPS_enabled": False, + "NCCL_enabled": False, + "expectation_enabled": observable, + } + ) + value = backend.execute_circuit(circuit)[0] + + assert math.isclose(value, exact, abs_tol=1e-12) + + +def test_cpu_mps_expectation_matches_statevector(): + circuit = build_circuit() + observable = build_observable(circuit.nqubits) + exact = observable.expectation_from_state(circuit().state(numpy=True)) + + backend = CpuTensorNet( + { + "MPI_enabled": False, + "MPS_enabled": True, + "NCCL_enabled": False, + "expectation_enabled": observable, + "max_bond_dimension": 64, + "tensor_module": "torch", + "torch_threads": 1, + } + ) + value = backend.execute_circuit(circuit)[0] + + assert math.isclose(value, exact, abs_tol=1e-12) + + +def test_cpu_runcard_pauli_pattern_matches_statevector(): + circuit = build_circuit() + observable = {"pauli_string_pattern": "IXZ"} + exact_hamiltonian = hamiltonians.SymbolicHamiltonian( + form=X(1) * Z(2) * X(4) * Z(5) + ) + exact = exact_hamiltonian.expectation_from_state(circuit().state(numpy=True)) + + for mps_enabled in (False, True): + backend = CpuTensorNet( + { + "MPI_enabled": False, + "MPS_enabled": mps_enabled, + "NCCL_enabled": False, + "expectation_enabled": observable, + "max_bond_dimension": 64, + "tensor_module": "torch", + "torch_threads": 1, + } + ) + value = backend.execute_circuit(circuit)[0] + + assert math.isclose(value, exact, abs_tol=1e-12) + + +def test_cpu_mps_sampling_uses_nshots(): + circuit = Circuit(4) + circuit.add(gates.H(0)) + for qubit in range(3): + circuit.add(gates.CNOT(qubit, qubit + 1)) + + backend = CpuTensorNet( + { + "MPI_enabled": False, + "MPS_enabled": True, + "NCCL_enabled": False, + "expectation_enabled": False, + } + ) + result = backend.execute_circuit(circuit, nshots=100) + + assert sum(result.frequencies().values()) == 100 + assert set(result.frequencies()) <= {"0000", "1111"} + + +def test_cpu_generic_tn_long_pauli_string_matches_statevector(): + circuit = build_benchmark_circuit("rxx_rzz", 10, 2, 42) + observable = {"pauli_string_pattern": "XZ"} + exact_hamiltonian = hamiltonians.SymbolicHamiltonian( + form=X(0) * Z(1) * X(2) * Z(3) * X(4) * Z(5) * X(6) * Z(7) * X(8) * Z(9) + ) + exact = exact_hamiltonian.expectation_from_state(circuit().state(numpy=True)) + + backend = CpuTensorNet( + { + "MPI_enabled": False, + "MPS_enabled": False, + "NCCL_enabled": False, + "expectation_enabled": observable, + } + ) + value = backend.execute_circuit(circuit)[0] + + assert math.isclose(value, exact, abs_tol=1e-12) diff --git a/tests/test_parallel.py b/tests/test_parallel.py new file mode 100644 index 0000000..5decf7f --- /dev/null +++ b/tests/test_parallel.py @@ -0,0 +1,46 @@ +import numpy as np + +from qibotn.parallel import _split_repeats, contract_tree_slices, mpi_slice_plan + + +def test_mpi_slice_plan_block_balances_contiguous_ranges(): + plans = [mpi_slice_plan(10, rank, 4, assignment="block") for rank in range(4)] + + assert [plan.indices for plan in plans] == [ + (0, 1, 2), + (3, 4, 5), + (6, 7), + (8, 9), + ] + + +def test_mpi_slice_plan_cyclic_balances_round_robin(): + plans = [mpi_slice_plan(10, rank, 4, assignment="cyclic") for rank in range(4)] + + assert [plan.indices for plan in plans] == [ + (0, 4, 8), + (1, 5, 9), + (2, 6), + (3, 7), + ] + + +class DummyTree: + def contract_slice(self, arrays, i, backend=None): + return arrays[0] * (i + 1) + + +def test_contract_tree_slices_sums_numpy_slices(): + result = contract_tree_slices( + DummyTree(), + [np.asarray([2.0 + 0.0j])], + (0, 2, 3), + backend="numpy", + ) + + np.testing.assert_allclose(result, np.asarray([16.0 + 0.0j])) + + +def test_split_repeats_balances_workers(): + assert _split_repeats(10, 4) == [3, 3, 2, 2] + assert _split_repeats(2, 4) == [1, 1] diff --git a/tests/test_vidal_backend.py b/tests/test_vidal_backend.py new file mode 100644 index 0000000..149cd0c --- /dev/null +++ b/tests/test_vidal_backend.py @@ -0,0 +1,167 @@ +import math + +import numpy as np +from qibo import Circuit, gates, hamiltonians +from qibo.symbols import X, Y, Z + +from qibotn.backends.vidal import VidalBackend, _can_route_non_adjacent, _unsupported_reason +from qibotn.backends.vidal_tebd import ( + _route_non_adjacent_gates, + _gate_sites, +) + + +def build_local_circuit(nqubits=8, nlayers=3, seed=42): + rng = np.random.default_rng(seed) + circuit = Circuit(nqubits) + for layer in range(nlayers): + for q in range(nqubits): + circuit.add(gates.RY(q, theta=rng.uniform(-math.pi, math.pi))) + circuit.add(gates.RZ(q, theta=rng.uniform(-math.pi, math.pi))) + for q in range(layer % 2, nqubits - 1, 2): + circuit.add(gates.CNOT(q, q + 1)) + return circuit + + +def test_vidal_backend_expectation_matches_statevector(): + circuit = build_local_circuit() + observable = hamiltonians.SymbolicHamiltonian( + form=0.5 * X(0) * Z(1) + 0.25 * Y(2) * Y(3) - 0.7 * Z(7) + ) + exact = observable.expectation_from_state(circuit().state(numpy=True)) + + backend = VidalBackend() + backend.configure_tn_simulation(max_bond_dimension=128, tensor_module="torch") + value = backend.expectation(circuit, observable) + + np.testing.assert_allclose(value, exact, atol=1e-12) + + +def test_vidal_backend_fallback_for_non_adjacent_gate(): + """compile_circuit=False (default) → falls back to qmatchatea for non-adjacent.""" + circuit = Circuit(4) + circuit.add(gates.H(0)) + circuit.add(gates.CNOT(0, 3)) + observable = hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(3)) + + backend = VidalBackend() + backend.configure_tn_simulation(max_bond_dimension=32, tensor_module="torch") + value = backend.expectation(circuit, observable) + + exact = observable.expectation_from_state(circuit().state(numpy=True)) + np.testing.assert_allclose(value, exact, atol=1e-12) + + +def test_vidal_backend_routes_non_adjacent_with_compile(): + """Non-adjacent gate with compile_circuit=True goes through Vidal SWAP routing.""" + circuit = Circuit(4) + circuit.add(gates.H(0)) + circuit.add(gates.CNOT(0, 3)) + + observable = hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(3)) + + backend = VidalBackend() + backend.configure_tn_simulation( + max_bond_dimension=32, tensor_module="torch", compile_circuit=True, + ) + value = backend.expectation(circuit, observable) + + exact = observable.expectation_from_state(circuit().state(numpy=True)) + np.testing.assert_allclose(value, exact, atol=1e-12) + + +def test_can_route_non_adjacent(): + """_can_route_non_adjacent correctly identifies routable circuits.""" + circuit = Circuit(4) + circuit.add(gates.H(0)) + circuit.add(gates.CNOT(0, 3)) + assert _can_route_non_adjacent(circuit) + + circuit.add(gates.CNOT(0, 1)) + assert _can_route_non_adjacent(circuit) + + +def test_cannot_route_multi_qubit(): + """Circuits with 3+ qubit gates cannot be routed.""" + circuit = Circuit(3) + circuit.add(gates.TOFFOLI(0, 1, 2)) + assert not _can_route_non_adjacent(circuit) + + +def test_routing_preserves_adjacent_gates(): + """_route_non_adjacent_gates leaves adjacent gates unchanged.""" + circuit = build_local_circuit(nqubits=4, nlayers=2) + original = list(circuit.queue) + routed = _route_non_adjacent_gates(original, 4) + + # Count 2Q gates — should be more due to inserted SWAPs, so just + # check that all 2-site gates ARE adjacent. + for gate in routed: + sites = _gate_sites(gate) + if len(sites) == 2: + diff = abs(sites[0] - sites[1]) + assert diff == 1, f"Non-adjacent gate after routing: {gate.name} on {sites}" + + +def test_routing_non_adjacent_cnot(): + """Manually verify SWAP+CNOT+unSWAP for CNOT(0,3).""" + circuit = Circuit(4) + circuit.add(gates.H(0)) + circuit.add(gates.H(3)) + circuit.add(gates.CNOT(0, 3)) + + routed = _route_non_adjacent_gates(list(circuit.queue), 4) + + # Expected: H(0), H(3), SWAP(2,3), SWAP(1,2), routed CNOT on (0,1), SWAP(1,2), SWAP(2,3) + names = [getattr(g, "name", g.__class__.__name__) for g in routed] + assert names == ["h", "h", "swap", "swap", "routed_two_qubit", "swap", "swap"], f"Got {names}" + + # Verify expectation through full pipeline + observable = hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(3)) + exact = observable.expectation_from_state(circuit().state(numpy=True)) + + backend = VidalBackend() + backend.configure_tn_simulation( + max_bond_dimension=32, tensor_module="torch", compile_circuit=True, + ) + value = backend.expectation(circuit, observable) + np.testing.assert_allclose(value, exact, atol=1e-12) + + +def test_truncation_error_no_truncation(): + """With large bond, truncation error should be essentially zero.""" + circuit = build_local_circuit(nqubits=6, nlayers=2) + observable = hamiltonians.SymbolicHamiltonian(form=0.5 * X(0) * Z(1)) + + backend = VidalBackend() + backend.configure_tn_simulation(max_bond_dimension=256, tensor_module="torch") + value = backend.expectation(circuit, observable) + _ = value # ensure computation runs + + assert backend.last_truncation_error < 1e-14, ( + f"Expected near-zero truncation error, got {backend.last_truncation_error}" + ) + + +def test_vidal_backend_matches_statevector_multiterm(): + """Multi-term observable with non-adjacent gates, compile_circuit=True.""" + circuit = Circuit(5) + for q in range(5): + circuit.add(gates.RY(q, theta=0.7)) + circuit.add(gates.RZ(q, theta=0.3)) + circuit.add(gates.CNOT(0, 2)) + circuit.add(gates.CNOT(1, 4)) + + observable = hamiltonians.SymbolicHamiltonian( + form=(0.3 * X(0) * Z(2) + 0.7 * Y(1) * Y(4) - 0.5 * Z(0) * X(4)) + ) + + exact_state = circuit().state(numpy=True) + exact = observable.expectation_from_state(exact_state) + + backend = VidalBackend() + backend.configure_tn_simulation( + max_bond_dimension=64, tensor_module="torch", compile_circuit=True, + ) + value = backend.expectation(circuit, observable) + np.testing.assert_allclose(value, exact, atol=1e-10) diff --git a/tools/README.md b/tools/README.md new file mode 100644 index 0000000..db62da6 --- /dev/null +++ b/tools/README.md @@ -0,0 +1,18 @@ +# Tools + +Auxiliary scripts for profiling, legacy comparisons, and scale probes. + +The main CPU expectation entrypoint is `../benchmark_cpu_expectation.py`. +For the current Vidal/MPS 1D-chain tests, prefer `../run_vidal_mps_cases.sh`. + +Files here are intentionally secondary: + +- `compare_vidal_backend_qmatchatea.py`: diagnostic comparison against QMatchaTea. +- `profile_vidal_chrome.py`: PyTorch CPU profiler for the Vidal path. +- `run_cpu_single_cases.sh`: single-node scale probes. +- `run_cpu_large_cases.sh`: two-node MPI scale probes. +- `run_vidal_segment_mpi_scan.sh`: rank/thread scaling scan for Vidal segmented MPI. +- `baseline_mps_expectation.py`: legacy MPS comparison CLI kept for old commands. +- `benchmark_tn_mpi.py`, `benchmark_search.py`, `benchmark_slice.py`, `benchmark_contract_sliced.py`, `check_tree.py`: old TN path-search/slicing experiments. +- `qibojit_reference_expectation.py`: state-vector reference helper. +- `validate_vidal_mpi_correctness.py`: focused Vidal MPI correctness helper. diff --git a/baseline_mps_expectation.py b/tools/baseline_mps_expectation.py similarity index 54% rename from baseline_mps_expectation.py rename to tools/baseline_mps_expectation.py index 8179a31..311cef7 100644 --- a/baseline_mps_expectation.py +++ b/tools/baseline_mps_expectation.py @@ -1,62 +1,34 @@ -"""Baseline MPS expectation scan with the qmatchatea backend.""" +"""MPS expectation benchmark for qmatchatea and Vidal backends.""" import argparse import json import logging -import math +import os +import socket import time import numpy as np -from qibo import Circuit, gates, hamiltonians -from qibo.symbols import X, Z +from qibotn.benchmark_cases import ( + build_circuit as build_benchmark_circuit, + exact_pauli_sum, + observable_terms, + terms_to_dict, +) from qibotn.backends.qmatchatea import QMatchaTeaBackend from qibotn.backends.vidal_tebd import run_vidal_ring_xz def build_circuit(nqubits, nlayers, seed): - import numpy as np - - rng = np.random.default_rng(seed) - circuit = Circuit(nqubits) - for _ in range(nlayers): - for qubit in range(nqubits): - circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi))) - circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi))) - for qubit in range(0, nqubits - 1, 2): - circuit.add(gates.CNOT(qubit, qubit + 1)) - for qubit in range(1, nqubits - 1, 2): - circuit.add(gates.CNOT(qubit, qubit + 1)) - return circuit + return build_benchmark_circuit("brickwall_cnot", nqubits, nlayers, seed) def build_observable(nqubits): - form = 0 - for qubit in range(nqubits): - form += 0.5 * X(qubit) * Z((qubit + 1) % nqubits) - return hamiltonians.SymbolicHamiltonian(form=form) + return terms_to_dict(observable_terms("ring_xz", nqubits)) def exact_expectation(circuit, nqubits): - import numpy as np - - state = circuit().state(numpy=True).reshape(-1) - - value = 0.0 - chunk_size = 1 << 20 - for qubit in range(nqubits): - next_qubit = (qubit + 1) % nqubits - x_flip = 1 << (nqubits - 1 - qubit) - z_shift = nqubits - 1 - next_qubit - term = 0.0 - for start in range(0, state.size, chunk_size): - stop = min(start + chunk_size, state.size) - indices = np.arange(start, stop, dtype=np.int64) - z_bit = (indices >> z_shift) & 1 - z_phase = 1 - 2 * z_bit - term += np.vdot(state[indices ^ x_flip], z_phase * state[start:stop]).real - value += 0.5 * term - return float(value) + return exact_pauli_sum(circuit, observable_terms("ring_xz", nqubits), nqubits) def main(): @@ -68,16 +40,21 @@ def main(): parser.add_argument("--tensor-module", choices=("numpy", "torch"), default="torch") parser.add_argument("--torch-threads", type=int, default=32) parser.add_argument( - "--executor", choices=("qmatchatea", "vidal"), default="qmatchatea" + "--executor", + choices=("qmatchatea", "vidal", "vidal-mpi"), + default="qmatchatea", ) - parser.add_argument("--vidal-workers", type=int, default=1) - parser.add_argument("--vidal-batched", action="store_true") parser.add_argument("--mpi-ct", action="store_true") parser.add_argument("--mpi-barriers", type=int, default=-1) parser.add_argument("--mpi-isometrization", type=int, default=-1) parser.add_argument("--exact", action="store_true") parser.add_argument("--exact-max-qubits", type=int, default=24) parser.add_argument("--reference-file") + parser.add_argument( + "--mpi-rank-map", + action="store_true", + help="Print MPI rank, host, pid, and torch thread placement metadata.", + ) args = parser.parse_args() logging.getLogger("qibo.config").setLevel(logging.ERROR) logging.getLogger("qtealeaves").setLevel(logging.ERROR) @@ -91,6 +68,26 @@ def main(): rank = MPI.COMM_WORLD.Get_rank() size = MPI.COMM_WORLD.Get_size() + if args.mpi_rank_map: + rank_info = { + "rank": rank, + "size": size, + "host": socket.gethostname(), + "pid": os.getpid(), + "torch_threads": args.torch_threads, + "omp_num_threads": os.environ.get("OMP_NUM_THREADS", ""), + "mkl_num_threads": os.environ.get("MKL_NUM_THREADS", ""), + } + rank_infos = MPI.COMM_WORLD.gather(rank_info, root=0) + if rank == 0: + print("mpi_rank_map") + for item in sorted(rank_infos, key=lambda row: row["rank"]): + print( + "rank={rank} size={size} host={host} pid={pid} " + "torch_threads={torch_threads} " + "OMP_NUM_THREADS={omp_num_threads} " + "MKL_NUM_THREADS={mkl_num_threads}".format(**item) + ) circuit = build_circuit(args.nqubits, args.nlayers, args.seed) observable = build_observable(args.nqubits) @@ -106,30 +103,42 @@ def main(): exact = exact_expectation(circuit, args.nqubits) if rank == 0: - mpi_label = f"MPIMPS/{size}" if args.mpi_ct else "SR" + if args.mpi_ct and args.executor in ("vidal", "vidal-mpi"): + mpi_label = f"VidalSegment/{size}" + else: + mpi_label = f"MPIMPS/{size}" if args.mpi_ct else "SR" print( f"nqubits={args.nqubits} nlayers={args.nlayers} " f"bond={args.bond} seed={args.seed} " f"tensor_module={args.tensor_module} svd_control=E! " - f"compile_circuit=True mpi={mpi_label} executor={args.executor} " - f"vidal_workers={args.vidal_workers} vidal_batched={args.vidal_batched}" + f"compile_circuit=True mpi={mpi_label} executor={args.executor}" ) if exact is not None: print(f"exact={exact:.16e}") print("expval abs_error rel_error seconds") start = time.perf_counter() - if args.executor == "vidal": + timings = None + if args.executor in ("vidal", "vidal-mpi"): + if args.executor == "vidal-mpi" and not args.mpi_ct: + raise ValueError("--executor vidal-mpi requires --mpi-ct.") if args.mpi_ct: - raise ValueError("--executor vidal is a single-process executor.") - value = run_vidal_ring_xz( - circuit, - max_bond=args.bond, - cut_ratio=1e-12, - tensor_module=args.tensor_module, - workers=args.vidal_workers, - use_batched=args.vidal_batched, - ) + from qibotn.backends.vidal_mpi_segment import run_segment_vidal_mpi_ring_xz + + value, timings = run_segment_vidal_mpi_ring_xz( + circuit, + max_bond=args.bond, + cut_ratio=1e-12, + tensor_module=args.tensor_module, + comm=MPI.COMM_WORLD, + ) + else: + value = run_vidal_ring_xz( + circuit, + max_bond=args.bond, + cut_ratio=1e-12, + tensor_module=args.tensor_module, + ) else: backend = QMatchaTeaBackend() backend.configure_tn_simulation( @@ -151,6 +160,12 @@ def main(): preprocess=False, compile_circuit=True, ) + max_timings = None + if timings: + max_timings = { + key: MPI.COMM_WORLD.reduce(local_value, op=MPI.MAX, root=0) + for key, local_value in timings.items() + } if rank != 0: return value = float(np.real(value)) @@ -158,6 +173,10 @@ def main(): abs_error = float("nan") if exact is None else abs(value - exact) rel_error = float("nan") if exact is None else abs_error / max(abs(exact), 1e-15) print(f"{value:.16e} {abs_error:.6e} {rel_error:.6e} {elapsed:.3f}") + if max_timings: + print("timing_section max_seconds") + for key, max_value in max_timings.items(): + print(f"{key} {max_value:.6f}") if __name__ == "__main__": diff --git a/benchmark_contract_sliced.py b/tools/benchmark_contract_sliced.py similarity index 100% rename from benchmark_contract_sliced.py rename to tools/benchmark_contract_sliced.py diff --git a/benchmark_search.py b/tools/benchmark_search.py similarity index 100% rename from benchmark_search.py rename to tools/benchmark_search.py diff --git a/benchmark_slice.py b/tools/benchmark_slice.py similarity index 100% rename from benchmark_slice.py rename to tools/benchmark_slice.py diff --git a/benchmark_tn_mpi.py b/tools/benchmark_tn_mpi.py similarity index 100% rename from benchmark_tn_mpi.py rename to tools/benchmark_tn_mpi.py diff --git a/check_tree.py b/tools/check_tree.py similarity index 100% rename from check_tree.py rename to tools/check_tree.py diff --git a/tools/compare_vidal_backend_qmatchatea.py b/tools/compare_vidal_backend_qmatchatea.py new file mode 100644 index 0000000..b5050cf --- /dev/null +++ b/tools/compare_vidal_backend_qmatchatea.py @@ -0,0 +1,137 @@ +"""Compare QMatchaTeaBackend with the VidalBackend fast path.""" + +from __future__ import annotations + +import argparse +import json +import math +import time + +import numpy as np +import torch +from qibo import Circuit, gates, hamiltonians +from qibo.symbols import X, Y, Z + +from qibotn.backends.qmatchatea import QMatchaTeaBackend +from qibotn.backends.vidal import VidalBackend + + +def build_circuit(nqubits, nlayers, seed, kind): + rng = np.random.default_rng(seed) + circuit = Circuit(nqubits) + for layer in range(nlayers): + for q in range(nqubits): + circuit.add(gates.RY(q, theta=rng.uniform(-math.pi, math.pi))) + circuit.add(gates.RZ(q, theta=rng.uniform(-math.pi, math.pi))) + if kind == "brickwall": + for q in range(0, nqubits - 1, 2): + circuit.add(gates.CNOT(q, q + 1)) + for q in range(1, nqubits - 1, 2): + circuit.add(gates.CNOT(q, q + 1)) + elif kind == "shifted-cz": + for q in range(layer % 2, nqubits - 1, 2): + circuit.add(gates.CZ(q, q + 1)) + elif kind == "reversed-cnot": + for q in range(0, nqubits - 1, 2): + circuit.add(gates.CNOT(q + 1, q)) + for q in range(1, nqubits - 1, 2): + circuit.add(gates.CNOT(q, q + 1)) + else: + raise ValueError(f"Unknown circuit kind {kind!r}.") + return circuit + + +def build_observable(nqubits, kind): + form = 0 + if kind == "ring-xz": + for q in range(nqubits): + form += 0.5 * X(q) * Z((q + 1) % nqubits) + elif kind == "open-zz": + for q in range(nqubits - 1): + form += Z(q) * Z(q + 1) / (nqubits - 1) + elif kind == "mixed": + form += 0.25 * X(0) - 0.5 * Z(nqubits - 1) + for q in range(0, nqubits - 1, 3): + form += 0.125 * Y(q) * Y(q + 1) + else: + raise ValueError(f"Unknown observable kind {kind!r}.") + return hamiltonians.SymbolicHamiltonian(form=form) + + +def run_backend(backend, circuit, observable): + start = time.perf_counter() + value = backend.expectation(circuit, observable, preprocess=False, compile_circuit=True) + return float(np.real(value)), time.perf_counter() - start + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--nqubits", type=int, default=34) + parser.add_argument("--nlayers", type=int, default=20) + parser.add_argument("--bond", "--bonds", dest="bond", type=int, default=512) + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--tensor-module", choices=("torch", "numpy"), default="torch") + parser.add_argument("--torch-threads", type=int, default=32) + parser.add_argument( + "--circuit-kind", + choices=("brickwall", "shifted-cz", "reversed-cnot"), + default="brickwall", + ) + parser.add_argument( + "--observable-kind", + choices=("ring-xz", "open-zz", "mixed"), + default="ring-xz", + ) + parser.add_argument("--reference-file") + parser.add_argument("--skip-qmatchatea", action="store_true") + args = parser.parse_args() + + torch.set_num_threads(args.torch_threads) + circuit = build_circuit(args.nqubits, args.nlayers, args.seed, args.circuit_kind) + observable = build_observable(args.nqubits, args.observable_kind) + + exact = None + if args.reference_file: + with open(args.reference_file, "r", encoding="utf-8") as f: + exact = float(json.load(f)["expectation"]) + + print( + f"nqubits={args.nqubits} nlayers={args.nlayers} bond={args.bond} " + f"circuit={args.circuit_kind} observable={args.observable_kind} " + f"tensor_module={args.tensor_module} torch_threads={args.torch_threads}" + ) + if exact is not None: + print(f"exact={exact:.16e}") + print("backend value abs_error seconds") + + if not args.skip_qmatchatea: + qmt = QMatchaTeaBackend() + qmt.configure_tn_simulation( + ansatz="MPS", + max_bond_dimension=args.bond, + cut_ratio=1e-12, + svd_control="E!", + tensor_module=args.tensor_module, + compile_circuit=True, + track_memory=False, + ) + value, seconds = run_backend(qmt, circuit, observable) + error = float("nan") if exact is None else abs(value - exact) + print(f"qmatchatea {value:.16e} {error:.6e} {seconds:.3f}") + + vidal = VidalBackend() + vidal.configure_tn_simulation( + ansatz="MPS", + max_bond_dimension=args.bond, + cut_ratio=1e-12, + tensor_module=args.tensor_module, + compile_circuit=True, + fallback=True, + ) + value, seconds = run_backend(vidal, circuit, observable) + error = float("nan") if exact is None else abs(value - exact) + print(f"vidal {value:.16e} {error:.6e} {seconds:.3f}") + + +if __name__ == "__main__": + main() diff --git a/tools/profile_vidal_chrome.py b/tools/profile_vidal_chrome.py new file mode 100644 index 0000000..bf22276 --- /dev/null +++ b/tools/profile_vidal_chrome.py @@ -0,0 +1,72 @@ +"""Chrome trace profiler for the VidalBackend fast path.""" + +from __future__ import annotations + +import argparse +from pathlib import Path + +import torch +from torch.profiler import ProfilerActivity, profile + +from qibotn.benchmark_cases import build_circuit, terms_to_dict, observable_terms +from qibotn.expectation_runner import ExpectationConfig, run_cpu_expectation + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--nqubits", type=int, default=34) + parser.add_argument("--nlayers", type=int, default=20) + parser.add_argument("--bond", type=int, default=512) + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--torch-threads", type=int, default=32) + parser.add_argument("--cut-ratio", type=float, default=1e-12) + parser.add_argument("--profile-memory", action="store_true") + parser.add_argument("--rows", type=int, default=60) + args = parser.parse_args() + + torch.set_num_threads(args.torch_threads) + + prefix = f"profiles/vidal_n{args.nqubits}_l{args.nlayers}_b{args.bond}_t{args.torch_threads}" + trace_path = Path(f"{prefix}.json") + table_path = Path(f"{prefix}.txt") + trace_path.parent.mkdir(parents=True, exist_ok=True) + + circuit = build_circuit("brickwall_cnot", args.nqubits, args.nlayers, args.seed) + observable = terms_to_dict(observable_terms("ring_xz", args.nqubits)) + config = ExpectationConfig( + ansatz="mps", + bond=args.bond, + cut_ratio=args.cut_ratio, + tensor_module="torch", + torch_threads=args.torch_threads, + ) + + print( + f"profile vidal nqubits={args.nqubits} nlayers={args.nlayers} " + f"bond={args.bond} threads={args.torch_threads}" + ) + + with profile( + activities=[ProfilerActivity.CPU], + record_shapes=args.profile_memory, + profile_memory=args.profile_memory, + with_stack=args.profile_memory, + ) as prof: + result = run_cpu_expectation(circuit, observable, config) + + table = ( + f"expval={result.value:.16e}\n\n" + f"# sorted by self_cpu_time_total\n" + f"{prof.key_averages().table(sort_by='self_cpu_time_total', row_limit=args.rows)}\n\n" + f"# sorted by cpu_time_total\n" + f"{prof.key_averages().table(sort_by='cpu_time_total', row_limit=args.rows)}\n" + ) + + print(table, end="") + table_path.write_text(table, encoding="utf-8") + prof.export_chrome_trace(str(trace_path)) + print(f"trace={trace_path}\ntable={table_path}") + + +if __name__ == "__main__": + main() diff --git a/qibojit_reference_expectation.py b/tools/qibojit_reference_expectation.py similarity index 100% rename from qibojit_reference_expectation.py rename to tools/qibojit_reference_expectation.py diff --git a/tools/run_cpu_large_cases.sh b/tools/run_cpu_large_cases.sh new file mode 100755 index 0000000..ba02363 --- /dev/null +++ b/tools/run_cpu_large_cases.sh @@ -0,0 +1,127 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Large CPU expectation benchmarks for two-server runs. +# +# Defaults assume two Intel Xeon Platinum 8558P servers with about 500 GiB RAM +# each. Override HOSTFILE, PYTHON_BIN, MPIEXEC, or the per-case knobs below as +# needed. + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +cd "$ROOT_DIR" + +PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}" +MPIEXEC="${MPIEXEC:-mpiexec}" +HOSTFILE="${HOSTFILE:-hostfile}" + +MPS_RANKS="${MPS_RANKS:-8}" +MPS_THREADS="${MPS_THREADS:-12}" +TN_RANKS="${TN_RANKS:-12}" +TN_THREADS="${TN_THREADS:-8}" + +export OMP_NUM_THREADS="${OMP_NUM_THREADS:-1}" +export MKL_NUM_THREADS="${MKL_NUM_THREADS:-1}" + +run_mpi() { + local ranks="$1" + shift + "$MPIEXEC" -hostfile "$HOSTFILE" -n "$ranks" "$PYTHON_BIN" "$@" +} + +run_case() { + local title="$1" + shift + echo + echo "================================================================================" + echo "$title" + echo "================================================================================" + echo "HOSTFILE=$HOSTFILE PYTHON_BIN=$PYTHON_BIN MPIEXEC=$MPIEXEC" + echo "OMP_NUM_THREADS=$OMP_NUM_THREADS MKL_NUM_THREADS=$MKL_NUM_THREADS" + echo "$*" + "$@" +} + +case "${1:-help}" in + smoke) + run_case "MPS MPI smoke: n=40 layers=30 bond=2048" \ + run_mpi "$MPS_RANKS" benchmark_cpu_expectation.py \ + --mpi --mps \ + --nqubits "${MPS_SMOKE_NQ:-40}" \ + --nlayers "${MPS_SMOKE_LAYERS:-30}" \ + --bond "${MPS_SMOKE_BOND:-2048}" \ + --torch-threads "$MPS_THREADS" \ + --circuits brickwall_cnot reversed_cnot shifted_cz \ + --observables ring_xz open_zz range2_xx + + run_case "TN MPI smoke: n=32 layers=16 target_slices=12" \ + run_mpi "$TN_RANKS" benchmark_cpu_expectation.py \ + --mpi \ + --nqubits "${TN_SMOKE_NQ:-32}" \ + --nlayers "${TN_SMOKE_LAYERS:-16}" \ + --torch-threads "$TN_THREADS" \ + --circuits brickwall_cnot shifted_cz rxx_rzz \ + --observables ring_xz open_zz range2_xx \ + --tn-target-slices "${TN_SMOKE_SLICES:-12}" + ;; + + mps-long) + run_case "MPS MPI long: n=64 layers=48 bond=4096" \ + run_mpi "$MPS_RANKS" benchmark_cpu_expectation.py \ + --mpi --mps \ + --nqubits "${MPS_LONG_NQ:-64}" \ + --nlayers "${MPS_LONG_LAYERS:-48}" \ + --bond "${MPS_LONG_BOND:-4096}" \ + --torch-threads "$MPS_THREADS" \ + --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \ + --observables ring_xz open_zz mixed_local range2_xx + ;; + + mps-pressure) + run_case "MPS MPI pressure: n=80 layers=64 bond=4096" \ + run_mpi "$MPS_RANKS" benchmark_cpu_expectation.py \ + --mpi --mps \ + --nqubits "${MPS_PRESSURE_NQ:-80}" \ + --nlayers "${MPS_PRESSURE_LAYERS:-64}" \ + --bond "${MPS_PRESSURE_BOND:-4096}" \ + --torch-threads "$MPS_THREADS" \ + --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz swap_scramble \ + --observables ring_xz open_zz mixed_local range2_xx long_z_string + ;; + + tn-long) + run_case "TN MPI long: n=36 layers=20 target_slices=24" \ + run_mpi "$TN_RANKS" benchmark_cpu_expectation.py \ + --mpi \ + --nqubits "${TN_LONG_NQ:-36}" \ + --nlayers "${TN_LONG_LAYERS:-20}" \ + --torch-threads "$TN_THREADS" \ + --circuits brickwall_cnot shifted_cz rxx_rzz \ + --observables ring_xz open_zz range2_xx \ + --tn-target-slices "${TN_LONG_SLICES:-24}" + ;; + + all) + "$0" smoke + "$0" mps-long + "$0" tn-long + ;; + + help|*) + cat >&2 <<'EOF' +Usage: tools/run_cpu_large_cases.sh [smoke|mps-long|mps-pressure|tn-long|all] + +Common overrides: + HOSTFILE=hostfile + PYTHON_BIN=.venv/bin/python + MPIEXEC=mpiexec + MPS_RANKS=8 MPS_THREADS=12 + TN_RANKS=12 TN_THREADS=8 + +Scale overrides: + MPS_LONG_NQ=64 MPS_LONG_LAYERS=48 MPS_LONG_BOND=4096 + MPS_PRESSURE_NQ=80 MPS_PRESSURE_LAYERS=64 MPS_PRESSURE_BOND=4096 + TN_LONG_NQ=36 TN_LONG_LAYERS=20 TN_LONG_SLICES=24 +EOF + exit 2 + ;; +esac diff --git a/tools/run_cpu_single_cases.sh b/tools/run_cpu_single_cases.sh new file mode 100755 index 0000000..720dbc9 --- /dev/null +++ b/tools/run_cpu_single_cases.sh @@ -0,0 +1,148 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Single-node CPU scale probes for expectation benchmarks. +# +# Intended for one 96-core / ~500 GiB RAM node. The default "probe" mode runs +# moderate MPS and TN cases first. Larger modes are available after checking +# runtime and memory from the probe output. + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +cd "$ROOT_DIR" + +PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}" +PYTHON_FLAGS="${PYTHON_FLAGS:--u}" +MPIEXEC="${MPIEXEC:-mpiexec}" +TIME_BIN="${TIME_BIN:-/usr/bin/time}" + +MPS_RANKS="${MPS_RANKS:-8}" +MPS_THREADS="${MPS_THREADS:-12}" +TN_RANKS="${TN_RANKS:-8}" +TN_THREADS="${TN_THREADS:-12}" + +export OMP_NUM_THREADS="${OMP_NUM_THREADS:-1}" +export MKL_NUM_THREADS="${MKL_NUM_THREADS:-1}" + +estimate_mps_memory() { + local nqubits="$1" + local bond="$2" + "$PYTHON_BIN" - "$nqubits" "$bond" "$MPS_RANKS" <<'PY' +import sys +n = int(sys.argv[1]) +chi = int(sys.argv[2]) +ranks = int(sys.argv[3]) +resident = n * 2 * chi * chi * 16 +per_rank = resident / ranks +print( + "MPS rough resident memory: " + f"total={resident / 1024**3:.1f} GiB " + f"per_rank={per_rank / 1024**3:.1f} GiB " + "(temporary eig/SVD workspaces are additional)" +) +PY +} + +run_timed() { + echo + echo "--------------------------------------------------------------------------------" + echo "$*" + echo "--------------------------------------------------------------------------------" + "$TIME_BIN" -v "$@" +} + +run_mps_case() { + local label="$1" + local nqubits="$2" + local nlayers="$3" + local bond="$4" + shift 4 + echo + echo "================================================================================" + echo "$label" + echo "================================================================================" + echo "PYTHON_BIN=$PYTHON_BIN MPIEXEC=$MPIEXEC" + echo "MPS_RANKS=$MPS_RANKS MPS_THREADS=$MPS_THREADS" + echo "OMP_NUM_THREADS=$OMP_NUM_THREADS MKL_NUM_THREADS=$MKL_NUM_THREADS" + estimate_mps_memory "$nqubits" "$bond" + run_timed "$MPIEXEC" -n "$MPS_RANKS" "$PYTHON_BIN" $PYTHON_FLAGS benchmark_cpu_expectation.py \ + --mpi --mps \ + --nqubits "$nqubits" \ + --nlayers "$nlayers" \ + --bond "$bond" \ + --torch-threads "$MPS_THREADS" \ + "$@" +} + +run_tn_case() { + local label="$1" + local nqubits="$2" + local nlayers="$3" + shift 3 + echo + echo "================================================================================" + echo "$label" + echo "================================================================================" + echo "PYTHON_BIN=$PYTHON_BIN MPIEXEC=$MPIEXEC" + echo "TN_RANKS=$TN_RANKS TN_THREADS=$TN_THREADS" + echo "OMP_NUM_THREADS=$OMP_NUM_THREADS MKL_NUM_THREADS=$MKL_NUM_THREADS" + echo "TN memory is contraction-tree dependent; increase --tn-target-slices if RSS is high." + run_timed "$MPIEXEC" -n "$TN_RANKS" "$PYTHON_BIN" $PYTHON_FLAGS benchmark_cpu_expectation.py \ + --mpi \ + --nqubits "$nqubits" \ + --nlayers "$nlayers" \ + --torch-threads "$TN_THREADS" \ + "$@" +} + +case "${1:-help}" in + probe) + run_mps_case "MPS probe: n=40 layers=30 bond=2048" 40 30 2048 \ + --circuits brickwall_cnot \ + --observables ring_xz + + run_tn_case "TN probe: n=28 layers=12 target_slices=8" 28 12 \ + --circuits brickwall_cnot \ + --observables ring_xz \ + --tn-target-slices 8 + ;; + + mps-medium) + run_mps_case "MPS medium: n=56 layers=40 bond=3072" 56 40 3072 \ + --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \ + --observables ring_xz open_zz mixed_local range2_xx + ;; + + mps-long) + run_mps_case "MPS long: n=64 layers=48 bond=4096" 64 48 4096 \ + --circuits brickwall_cnot reversed_cnot shifted_cz rxx_rzz \ + --observables ring_xz open_zz mixed_local range2_xx + ;; + + tn-medium) + run_tn_case "TN medium: n=32 layers=16 target_slices=16" 32 16 \ + --circuits brickwall_cnot shifted_cz rxx_rzz \ + --observables ring_xz open_zz range2_xx \ + --tn-target-slices 16 + ;; + + tn-long) + run_tn_case "TN long: n=36 layers=20 target_slices=32" 36 20 \ + --circuits brickwall_cnot shifted_cz rxx_rzz \ + --observables ring_xz open_zz range2_xx \ + --tn-target-slices 32 + ;; + + help|*) + cat >&2 <<'EOF' +Usage: tools/run_cpu_single_cases.sh [probe|mps-medium|mps-long|tn-medium|tn-long] + +Common overrides: + PYTHON_BIN=.venv/bin/python + MPIEXEC=mpiexec + MPS_RANKS=8 MPS_THREADS=12 + TN_RANKS=8 TN_THREADS=12 + OMP_NUM_THREADS=1 MKL_NUM_THREADS=1 +EOF + exit 2 + ;; +esac diff --git a/tools/run_vidal_segment_mpi_scan.sh b/tools/run_vidal_segment_mpi_scan.sh new file mode 100755 index 0000000..49dc138 --- /dev/null +++ b/tools/run_vidal_segment_mpi_scan.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +set -euo pipefail + +NQ="${NQ:-34}" +LAYERS="${LAYERS:-20}" +BOND="${BOND:-512}" +SEED="${SEED:-42}" +RANKS="${RANKS:-1 2 4}" +THREADS="${THREADS:-32 32 16}" +PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}" +MPIEXEC="${MPIEXEC:-mpiexec}" +CIRCUIT="${CIRCUIT:-brickwall_cnot}" +OBSERVABLE="${OBSERVABLE:-ring_xz}" +EXACT="${EXACT:-0}" + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +cd "$ROOT_DIR" + +if [[ "${1:-help}" != "run" ]]; then + cat >&2 <<'EOF' +Usage: tools/run_vidal_segment_mpi_scan.sh run + +Overrides: + NQ=34 LAYERS=20 BOND=512 SEED=42 + RANKS="1 2 4" THREADS="32 32 16" + CIRCUIT=brickwall_cnot OBSERVABLE=ring_xz + EXACT=1 + PYTHON_BIN=.venv/bin/python MPIEXEC=mpiexec +EOF + if [[ "${1:-help}" == "help" ]]; then + exit 0 + fi + exit 2 +fi + +read -r -a ranks <<< "$RANKS" +read -r -a threads <<< "$THREADS" + +if [[ "${#ranks[@]}" != "${#threads[@]}" ]]; then + echo "RANKS and THREADS must have the same number of entries." >&2 + exit 2 +fi + +common=( + --nqubits "$NQ" + --nlayers "$LAYERS" + --bond "$BOND" + --seed "$SEED" + --mps + --circuits "$CIRCUIT" + --observables "$OBSERVABLE" +) + +if [[ "$EXACT" == "1" ]]; then + common+=(--exact) +fi + +for idx in "${!ranks[@]}"; do + nrank="${ranks[$idx]}" + nthr="${threads[$idx]}" + if [[ "$nrank" == "1" ]]; then + echo "== Vidal serial ranks=1 torch_threads=$nthr ==" + "$PYTHON_BIN" -u benchmark_cpu_expectation.py \ + "${common[@]}" --torch-threads "$nthr" + else + echo "== Vidal segmented MPI ranks=$nrank torch_threads=$nthr ==" + "$MPIEXEC" -n "$nrank" "$PYTHON_BIN" -u benchmark_cpu_expectation.py \ + "${common[@]}" --torch-threads "$nthr" --mpi + fi +done diff --git a/tools/validate_vidal_mpi_correctness.py b/tools/validate_vidal_mpi_correctness.py new file mode 100644 index 0000000..bce8e2d --- /dev/null +++ b/tools/validate_vidal_mpi_correctness.py @@ -0,0 +1,202 @@ +"""Correctness checks for the Vidal/TEBD MPS fast path. + +The cases here intentionally cover more than the benchmark ring-XZ observable: +different nearest-neighbor gate orientations and several Pauli-sum observables. +Run serially to compare qibojit/statevector vs Vidal, or under MPI to compare +the segmented Vidal executor. +""" + +from __future__ import annotations + +import argparse +import math +import time + +import numpy as np +import torch +from qibo import Circuit, gates + +from qibotn.backends.vidal_mpi_segment import SegmentVidalMPIExecutor +from qibotn.backends.vidal_tebd import VidalTEBDExecutor + + +def build_circuit(kind, nqubits, nlayers, seed): + rng = np.random.default_rng(seed) + circuit = Circuit(nqubits) + for layer in range(nlayers): + for q in range(nqubits): + circuit.add(gates.RY(q, theta=rng.uniform(-math.pi, math.pi))) + circuit.add(gates.RZ(q, theta=rng.uniform(-math.pi, math.pi))) + if kind == "rx_ry_cz": + circuit.add(gates.RX(q, theta=rng.uniform(-math.pi, math.pi))) + + if kind in ("brickwall", "reversed_cnot"): + for q in range(0, nqubits - 1, 2): + if kind == "reversed_cnot" and (layer % 2): + circuit.add(gates.CNOT(q + 1, q)) + else: + circuit.add(gates.CNOT(q, q + 1)) + for q in range(1, nqubits - 1, 2): + if kind == "reversed_cnot" and not (layer % 2): + circuit.add(gates.CNOT(q + 1, q)) + else: + circuit.add(gates.CNOT(q, q + 1)) + elif kind == "rx_ry_cz": + for q in range(layer % 2, nqubits - 1, 2): + circuit.add(gates.CZ(q, q + 1)) + else: + raise ValueError(f"Unknown circuit kind {kind!r}.") + return circuit + + +def observable_terms(kind, nqubits): + if kind == "ring_xz": + return [ + (0.5, (("X", site), ("Z", (site + 1) % nqubits))) + for site in range(nqubits) + ] + if kind == "open_zz": + return [ + (1.0 / (nqubits - 1), (("Z", site), ("Z", site + 1))) + for site in range(nqubits - 1) + ] + if kind == "mixed_local": + terms = [(0.25, (("X", 0),)), (-0.5, (("Z", nqubits - 1),))] + terms += [ + (0.125, (("Y", site), ("Y", site + 1))) + for site in range(0, nqubits - 1, 3) + ] + return terms + raise ValueError(f"Unknown observable kind {kind!r}.") + + +def exact_pauli_sum(circuit, terms, nqubits): + state = circuit().state(numpy=True).reshape(-1) + indices = np.arange(state.size, dtype=np.int64) + value = 0.0 + 0.0j + for coeff, ops in terms: + flipped = indices.copy() + phase = np.ones(state.size, dtype=np.complex128) + for name, site in ops: + shift = nqubits - 1 - site + bit = (indices >> shift) & 1 + name = name.upper() + if name == "X": + flipped ^= 1 << shift + elif name == "Y": + flipped ^= 1 << shift + phase *= 1j * (1 - 2 * bit) + elif name == "Z": + phase *= 1 - 2 * bit + elif name != "I": + raise ValueError(f"Unsupported Pauli {name!r}.") + value += coeff * np.vdot(state[flipped], phase * state) + return float(value.real) + + +def run_vidal(circuit, terms, nqubits, bond, tensor_module): + executor = VidalTEBDExecutor( + nqubits=nqubits, + max_bond=bond, + cut_ratio=1e-12, + tensor_module=tensor_module, + ) + executor.run_circuit(circuit) + return float(executor.expectation_pauli_sum(terms)) + + +def run_segment_mpi(circuit, terms, nqubits, bond, tensor_module, comm): + executor = SegmentVidalMPIExecutor( + nqubits=nqubits, + max_bond=bond, + cut_ratio=1e-12, + tensor_module=tensor_module, + comm=comm, + ) + executor.run_circuit(circuit) + return executor.expectation_pauli_sum_root(terms) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--nqubits", type=int, default=16) + parser.add_argument("--nlayers", type=int, default=6) + parser.add_argument("--bond", "--bonds", dest="bond", type=int, default=512) + parser.add_argument("--seed", type=int, default=42) + parser.add_argument("--tensor-module", choices=("torch", "numpy"), default="torch") + parser.add_argument("--torch-threads", type=int, default=32) + parser.add_argument("--mpi", action="store_true") + parser.add_argument( + "--circuits", + nargs="+", + default=("brickwall", "reversed_cnot", "rx_ry_cz"), + ) + parser.add_argument( + "--observables", + nargs="+", + default=("ring_xz", "open_zz", "mixed_local"), + ) + args = parser.parse_args() + + torch.set_num_threads(args.torch_threads) + comm = None + rank = 0 + size = 1 + if args.mpi: + from mpi4py import MPI + + comm = MPI.COMM_WORLD + rank = comm.Get_rank() + size = comm.Get_size() + + if rank == 0: + mode = f"vidal-segment-mpi/{size}" if args.mpi else "vidal" + print( + f"mode={mode} nqubits={args.nqubits} nlayers={args.nlayers} " + f"bond={args.bond} tensor_module={args.tensor_module}" + ) + print("circuit observable exact value abs_error seconds") + + for circuit_kind in args.circuits: + circuit = build_circuit(circuit_kind, args.nqubits, args.nlayers, args.seed) + exact = None + if rank == 0: + exact_values = { + obs: exact_pauli_sum( + circuit, observable_terms(obs, args.nqubits), args.nqubits + ) + for obs in args.observables + } + else: + exact_values = None + if comm is not None: + exact_values = comm.bcast(exact_values, root=0) + + for obs_kind in args.observables: + terms = observable_terms(obs_kind, args.nqubits) + start = time.perf_counter() + if args.mpi: + value = run_segment_mpi( + circuit, + terms, + args.nqubits, + args.bond, + args.tensor_module, + comm, + ) + else: + value = run_vidal( + circuit, terms, args.nqubits, args.bond, args.tensor_module + ) + if rank != 0: + continue + elapsed = time.perf_counter() - start + exact = exact_values[obs_kind] + print( + f"{circuit_kind} {obs_kind} {exact:.16e} {value:.16e} " + f"{abs(value - exact):.6e} {elapsed:.3f}" + ) + + +if __name__ == "__main__": + main()