赛前稳定版
Some checks failed
Build wheels / build (ubuntu-latest, 3.11) (push) Has been cancelled
Build wheels / build (ubuntu-latest, 3.12) (push) Has been cancelled
Build wheels / build (ubuntu-latest, 3.13) (push) Has been cancelled
Tests / check (push) Has been cancelled
Tests / build (ubuntu-latest, 3.11) (push) Has been cancelled
Tests / build (ubuntu-latest, 3.12) (push) Has been cancelled
Tests / build (ubuntu-latest, 3.13) (push) Has been cancelled

This commit is contained in:
2026-05-15 09:32:26 +08:00
parent 72f95599bb
commit 915c24dc7b
40 changed files with 5542 additions and 224 deletions

View File

@@ -3,6 +3,10 @@
from __future__ import annotations
import argparse
import os
import subprocess
from pathlib import Path
from urllib.parse import urlparse
from qibotn.benchmark_cases import (
CIRCUITS,
@@ -19,6 +23,51 @@ from qibotn.expectation_runner import (
)
def optional_int(text):
if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
return None
return int(text)
def optional_float(text):
if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
return None
return float(text)
def format_optional(value, fmt="g"):
return "None" if value is None else format(value, fmt)
def should_stop_dask(args):
return (
not args.keep_dask
and args.tn_search_backend == "dask"
and args.dask_address is not None
and args.tn_load_tree is None
)
def stop_dask_cluster(args, rank):
if rank != 0 or not should_stop_dask(args):
return
script = Path(__file__).resolve().parent / "tools" / "manage_tn_dask_cluster.sh"
if not script.exists():
print(f"dask_stop_skipped reason=missing_script path={script}", flush=True)
return
env = os.environ.copy()
parsed = urlparse(args.dask_address)
if parsed.hostname:
env.setdefault("SCHEDULER_HOST", parsed.hostname)
if parsed.port:
env.setdefault("SCHEDULER_PORT", str(parsed.port))
print("dask_stop_after_search start", flush=True)
subprocess.run([str(script), "stop"], cwd=str(script.parent.parent), env=env, check=False)
print("dask_stop_after_search done", flush=True)
def build_parallel_opts(args):
slicing_opts = {}
if args.tn_target_slices is not None:
@@ -31,11 +80,24 @@ def build_parallel_opts(args):
"search_workers": args.tn_search_workers or args.torch_threads,
"max_repeats": args.tn_search_repeats,
"max_time": args.tn_search_time,
"print_stats": not args.no_tn_stats,
}
if args.tn_search_backend is not None:
opts["search_backend"] = args.tn_search_backend
if args.dask_address is not None:
opts["dask_address"] = args.dask_address
if args.tn_save_tree is not None:
opts["save_tree_path"] = args.tn_save_tree
if args.tn_load_tree is not None:
opts["load_tree_path"] = args.tn_load_tree
if args.tn_search_only:
opts["search_only"] = True
if args.tn_debug_trials:
opts["debug_trials"] = True
if args.tn_contract_implementation is not None:
opts["contract_implementation"] = args.tn_contract_implementation
if args.dask_close_workers:
opts["dask_close_workers"] = True
return opts
@@ -43,10 +105,16 @@ def main():
parser = argparse.ArgumentParser()
parser.add_argument("--nqubits", type=int, default=40)
parser.add_argument("--nlayers", type=int, default=30)
parser.add_argument("--bond", "--bonds", dest="bond", type=int, default=1024)
parser.add_argument("--cut-ratio", type=float, default=1e-12)
parser.add_argument("--bond", "--bonds", dest="bond", type=optional_int, default=1024)
parser.add_argument("--cut-ratio", type=optional_float, default=1e-12)
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--torch-threads", type=int, default=8)
parser.add_argument("--quimb-backend", choices=("numpy", "torch"), default="torch")
parser.add_argument(
"--dtype",
choices=("complex128", "complex64"),
default="complex128",
)
parser.add_argument("--ansatz", choices=("tn", "mps"), default=None)
parser.add_argument("--mps", action="store_true")
parser.add_argument("--mpi", action="store_true")
@@ -56,19 +124,62 @@ def main():
parser.add_argument("--observables", nargs="+", default=["ring_xz"])
parser.add_argument("--pauli-pattern")
parser.add_argument("--tn-target-slices", type=int)
parser.add_argument("--tn-target-size", type=int)
parser.add_argument("--tn-target-size", type=int,default=2**32)
parser.add_argument("--tn-search-workers", type=int)
parser.add_argument("--tn-search-repeats", type=int, default=128)
parser.add_argument("--tn-search-time", type=float, default=60.0)
parser.add_argument(
"--no-tn-stats",
action="store_true",
help="Do not print per-term TN search/contraction diagnostics.",
)
parser.add_argument(
"--tn-search-backend",
choices=("processpool", "dask"),
default="dask",
help="Path-search backend. In MPI mode, dask search runs only on rank 0 and broadcasts the tree.",
)
parser.add_argument(
"--dask-address",
help="Dask scheduler address, for example tcp://host:8786. If omitted with dask search, a local cluster is created.",
)
parser.add_argument(
"--dask-close-workers",
action="store_true",
help="After dask path search, ask the scheduler to close all currently connected workers.",
)
parser.add_argument(
"--keep-dask",
action="store_true",
help=(
"Keep an external dask cluster running after search. By default, "
"tools/manage_tn_dask_cluster.sh stop is called after search when "
"--dask-address is used."
),
)
parser.add_argument(
"--tn-save-tree",
help="Save searched cotengra contraction tree(s) to this pickle file.",
)
parser.add_argument(
"--tn-load-tree",
help="Load cotengra contraction tree(s) from this pickle file and skip path search.",
)
parser.add_argument(
"--tn-search-only",
action="store_true",
help="Only run path search and optional --tn-save-tree; skip contraction.",
)
parser.add_argument(
"--tn-debug-trials",
action="store_true",
help="Print dask worker summary and per-trial worker start/done logs.",
)
parser.add_argument(
"--tn-contract-implementation",
choices=("auto", "cotengra", "autoray", "cpp"),
help="cotengra contraction implementation for TN contraction.",
)
args = parser.parse_args()
ansatz = "mps" if args.mps else (args.ansatz or "tn")
@@ -89,6 +200,8 @@ def main():
bond=args.bond,
cut_ratio=args.cut_ratio,
tensor_module="torch",
quimb_backend=args.quimb_backend,
dtype=args.dtype,
torch_threads=args.torch_threads,
parallel_opts=build_parallel_opts(args),
)
@@ -98,12 +211,15 @@ def main():
print(
f"backend=cpu ansatz={ansatz.upper()} mode={mode} "
f"nqubits={args.nqubits} nlayers={args.nlayers} "
f"bond={args.bond} cut_ratio={args.cut_ratio:g} seed={args.seed} "
f"bond={format_optional(args.bond)} "
f"cut_ratio={format_optional(args.cut_ratio)} seed={args.seed} "
f"quimb_backend={args.quimb_backend} dtype={args.dtype} "
f"torch_threads={args.torch_threads} "
f"tn_search_backend={args.tn_search_backend or 'processpool'}"
f"tn_search_backend={args.tn_search_backend}"
)
print("circuit observable exact value abs_error rel_error seconds")
try:
for circuit_kind in circuits:
circuit = build_circuit(circuit_kind, args.nqubits, args.nlayers, args.seed)
named_observables = (
@@ -139,6 +255,30 @@ def main():
f"{circuit_kind} {obs_name} {exact_text} {result.value:.16e} "
f"{abs_error:.6e} {rel_error:.6e} {result.seconds:.3f}"
)
for stat in result.parallel_stats or ():
cost = stat["path_cost"]
search_stats = stat.get("search_stats", {})
print(
"tn_term_summary "
f"term={stat.get('term_index', 0)} "
f"search_seconds={stat.get('search_seconds', float('nan')):.3f} "
f"contract_seconds={stat.get('contract_seconds', float('nan')):.3f} "
f"completed_trials={search_stats.get('completed_trials', 'na')} "
f"finite_trials={search_stats.get('finite_trials', 'na')} "
f"failed_trials={search_stats.get('failed_trials', 'na')} "
f"requested_trials={search_stats.get('requested_trials', 'na')} "
f"best_score={search_stats.get('best_score', float('nan')):.6g} "
f"slices={cost['nslices']} "
f"log10_flops={cost['log10_flops']:.3f} "
f"log10_write={cost['log10_write']:.3f} "
f"log2_size={cost['log2_size']:.3f} "
f"log10_combo={cost['log10_combo']:.3f} "
f"peak_memory_gib={cost['peak_memory_gib']:.6g} "
f"slicing_overhead={cost['slicing_overhead']:.6g} "
f"rank_slices={stat.get('rank_slices', 'na')}"
)
finally:
stop_dask_cluster(args, rank)
if __name__ == "__main__":

254
docs/contest_runners.md Normal file
View File

@@ -0,0 +1,254 @@
# Contest Runners
This directory contains two self-contained contest entrypoints:
- `tools/tn_contest_runner.py`: general tensor-network path search and contraction.
- `tools/mps_contest_runner.py`: Vidal/MPS multi-node expectation runner.
Both scripts keep circuit and observable definitions inside the script so a
contest case can be edited in one place.
## Environment
Run commands from the repository root:
```bash
cd /home/yx/qibotn
```
For Intel MPI on two nodes, use the known working style:
```bash
mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2 ...
```
Set `TCM_ENABLE=1` for CPU runs:
```bash
export TCM_ENABLE=1
```
## TN Workflow
List built-in TN contest cases:
```bash
python -u tools/tn_contest_runner.py list
```
TN path search uses dask by default. Without `--dask-address`, the script starts
a local dask cluster. For multiple servers, start one scheduler and workers
with the helper script, then pass the scheduler address to the search command.
Start the default two-node dask cluster:
```bash
cd /home/yx/qibotn
tools/manage_tn_dask_cluster.sh start
```
Check status:
```bash
cd /home/yx/qibotn
tools/manage_tn_dask_cluster.sh status
```
Stop the cluster:
```bash
cd /home/yx/qibotn
tools/manage_tn_dask_cluster.sh stop
```
The helper defaults are:
```bash
SCHEDULER_HOST=10.20.1.103
WORKER_HOSTS="10.20.1.103 10.20.1.102"
NWORKERS=48
NTHREADS=1
ROOT_DIR=/home/yx/qibotn
PYTHON_BIN=.venv/bin/python
DASK_WORKER_TTL="24 hours"
DASK_TICK_LIMIT="30 minutes"
DASK_LOST_WORKER_TIMEOUT="30 minutes"
```
Override them inline if needed:
```bash
WORKER_HOSTS="10.20.1.103 10.20.1.102" NWORKERS=48 \
tools/manage_tn_dask_cluster.sh restart
```
Check that both nodes are connected by adding `--tn-debug-trials` to a small
search. The output should include `qibotn_dask_workers` with both hosts.
`tools/tn_contest_runner.py search` stops the external dask cluster after the
search phase by default. Pass `--keep-dask` if you want to reuse the same dask
cluster for several searches.
Use enough trials to fill the cluster. With the default two-node setup there are
96 worker slots, so `--tn-search-repeats` should be at least 96. The contest
runner default is 2048.
Cotengra trials are CPU-bound and can hold the Python GIL long enough for dask
to report `Event loop was unresponsive`. Dask defaults are much more aggressive:
`scheduler.worker-ttl=5 minutes`, `admin.tick.limit=3s`, and
`deploy.lost-worker-timeout=15s`. The helper script raises these limits so
workers are not killed by dask during search. The intended timeout is
`--tn-search-time`; after that, the runner stops the external dask cluster.
Small correctness check against statevector:
```bash
python -u tools/tn_contest_runner.py validate \
--case main1 \
--nqubits 8 \
--nlayers 2 \
--torch-threads 4 \
--tn-search-repeats 8 \
--tn-search-time 5
```
Search and save contraction trees:
```bash
TCM_ENABLE=1 python -u tools/tn_contest_runner.py search \
--case main1 \
--torch-threads 48 \
--dtype complex64 \
--dask-address tcp://10.20.1.103:8786 \
--tn-search-repeats 2048 \
--tn-search-time 300
```
Contract using the saved tree on one node:
```bash
TCM_ENABLE=1 mpirun -np 2 python -u tools/tn_contest_runner.py contract \
--mpi \
--case main1 \
--torch-threads 48 \
--dtype complex64
```
Contract using the saved tree on two nodes:
```bash
TCM_ENABLE=1 mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2 \
python -u tools/tn_contest_runner.py contract \
--mpi \
--case main1 \
--torch-threads 48 \
--dtype complex64
```
Run search and contract in one command:
```bash
TCM_ENABLE=1 python -u tools/tn_contest_runner.py all \
--case main1 \
--torch-threads 48 \
--dtype complex64 \
--dask-address tcp://10.20.1.103:8786 \
--tn-search-repeats 2048 \
--tn-search-time 300
```
Run only selected observables:
```bash
python -u tools/tn_contest_runner.py search \
--case main2 \
--observables open_zz
```
Tree files are written to `trees/contest_tn/` by default. The tree filename
contains case, observable, qubit count, layer count, and target slice count.
If any of these change, search again.
Edit TN contest cases in `tools/tn_contest_runner.py`:
- `CASES`: case name, circuit kind, observable list, default scale.
- `build_circuit`: circuit definitions.
- `pauli_sum_observable`: observable definitions.
## MPS Workflow
List built-in Vidal/MPS contest cases:
```bash
python -u tools/mps_contest_runner.py list
```
Small correctness check against statevector:
```bash
mpirun -np 2 python -u tools/mps_contest_runner.py validate \
--case main1 \
--nqubits 8 \
--nlayers 2 \
--bond 64 \
--torch-threads 4
```
Run one MPS case on one node:
```bash
TCM_ENABLE=1 mpirun -np 2 python -u tools/mps_contest_runner.py run \
--case main1 \
--torch-threads 48
```
Run one MPS case on two nodes:
```bash
TCM_ENABLE=1 mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2 \
python -u tools/mps_contest_runner.py run \
--case main1 \
--torch-threads 48
```
Run only one observable:
```bash
TCM_ENABLE=1 mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2 \
python -u tools/mps_contest_runner.py run \
--case main1 \
--observables ring_xz \
--torch-threads 48
```
Override scale:
```bash
TCM_ENABLE=1 mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2 \
python -u tools/mps_contest_runner.py run \
--case main1 \
--nqubits 128 \
--nlayers 24 \
--bond 1024 \
--torch-threads 48
```
Edit MPS contest cases in `tools/mps_contest_runner.py`:
- `CASES`: case name, circuit kind, observable list, default scale and bond.
- `build_circuit`: circuit definitions.
- `observable`: observable definitions, including dense local terms.
## Notes
- TN uses path search plus contraction. Reuse tree files only for the exact same
circuit, observable, qubit count, layer count, seed, and slicing setup.
- TN path search defaults to dask. Use `--tn-search-backend processpool` only
for fallback/debugging.
- Prefer the default `--tn-target-size 4294967296` memory target. Do not force
`--tn-target-slices` unless you have already verified that cotengra can find
valid trees for that exact setting.
- MPS/Vidal does not use contraction-tree search. It runs the circuit directly
and reports `trunc_sum` and `trunc_max`.
- Default TN contraction is the stable torch/quimb path. Do not pass
`--tn-contract-implementation cpp` for contest runs.

View File

@@ -1,2 +1,2 @@
10.20.6.74
#10.20.6.102
10.20.1.103:2
10.20.1.102:2

View File

@@ -30,7 +30,7 @@ class MetaBackend:
elif platform == "quimb": # pragma: no cover
import qibotn.backends.quimb as qmb
quimb_backend = kwargs.get("quimb_backend", "numpy")
quimb_backend = kwargs.get("quimb_backend", "torch")
contraction_optimizer = kwargs.get("contraction_optimizer", "auto-hq")
return qmb.BACKENDS[quimb_backend](
quimb_backend=quimb_backend, contraction_optimizer=contraction_optimizer

View File

@@ -3,6 +3,9 @@
from __future__ import annotations
import os
import pickle
import time
from pathlib import Path
import numpy as np
from qibo import hamiltonians
@@ -10,7 +13,12 @@ from qibo.backends import NumpyBackend
from qibo.config import raise_error
from qibotn.backends.abstract import QibotnBackend
from qibotn.backends.vidal import _symbolic_hamiltonian_to_pauli_terms, _unsupported_reason
from qibotn.backends.vidal import (
_observable_mpo_tensors,
_operator_terms_to_mpo,
_symbolic_hamiltonian_to_operator_terms,
_unsupported_reason,
)
from qibotn.backends.vidal_mpi_segment import SegmentVidalMPIExecutor
from qibotn.backends.vidal_tebd import VidalTEBDExecutor
from qibotn.observables import check_observable
@@ -23,6 +31,51 @@ def _as_bool_or_dict(value, name):
raise TypeError(f"{name} has an unexpected type")
def _bind_numa_node(rank):
"""Bind the calling process (or thread) to the NUMA node for *rank*.
The MPI rank is converted to a local (per-node) rank through the
environment variables commonly set by Open MPI, MVAPICH, and Slurm.
The process CPU affinity and NUMA memory policy are set accordingly.
Returns the NUMA domain that was selected, or ``None`` if the binding
could not be determined.
"""
local_rank = rank
for name in (
"OMPI_COMM_WORLD_LOCAL_RANK",
"MV2_COMM_WORLD_LOCAL_RANK",
"MPI_LOCALRANKID",
"SLURM_LOCALID",
):
try:
local_rank = int(os.environ[name])
break
except (KeyError, ValueError):
pass
domain = local_rank % 2
cpulist = f"/sys/devices/system/node/node{domain}/cpulist"
try:
cpus = _parse_cpu_list(open(cpulist, encoding="utf-8").read().strip())
if cpus:
os.sched_setaffinity(0, cpus)
except (FileNotFoundError, OSError):
pass
try:
import ctypes
libnuma = ctypes.CDLL("libnuma.so.1")
if libnuma.numa_available() >= 0:
libnuma.numa_run_on_node(ctypes.c_int(domain))
libnuma.numa_set_preferred(ctypes.c_int(domain))
except Exception:
pass
return domain
def _parse_cpu_list(text):
cpus = set()
for item in text.split(","):
@@ -90,8 +143,15 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
),
)
self.tensor_module = runcard.get("tensor_module", "torch")
self.dtype = runcard.get("dtype", "complex128")
self.compile_circuit = bool(runcard.get("compile_circuit", False))
self.preprocess = bool(runcard.get("preprocess", False))
self.mpi_term_batch_size = runcard.get(
"mpi_term_batch_size",
runcard.get("term_batch_size", None),
)
self.torch_threads = runcard.get("torch_threads", None)
self.quimb_backend = runcard.get("quimb_backend", "numpy")
self.quimb_backend = runcard.get("quimb_backend", "torch")
self.contraction_optimizer = runcard.get("contraction_optimizer", "auto-hq")
self.parallel_opts = runcard.get("parallel_opts", {})
self.parallel_stats = []
@@ -117,7 +177,8 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
value = self.expectation(circuit, self.observable)
if self.MPI_enabled and self.rank > 0:
return np.asarray([0], dtype=np.int64)
return np.asarray([value], dtype=np.float64)
dtype = np.complex128 if np.iscomplexobj(value) else np.float64
return np.asarray([value], dtype=dtype)
backend = self._quimb_backend()
backend.configure_tn_simulation(
@@ -131,13 +192,26 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
return_array=return_array,
)
def expectation(self, circuit, observable=None, preprocess=True, compile_circuit=None):
def expectation(self, circuit, observable=None, preprocess=None, compile_circuit=None):
mpo_tensors = _observable_mpo_tensors(observable, circuit.nqubits)
if mpo_tensors is None:
observable = check_observable(observable, circuit.nqubits)
use_preprocess = self.preprocess if preprocess is None else preprocess
if mpo_tensors is not None and not self.MPS_enabled:
raise_error(
NotImplementedError,
"MPO expectation is currently supported only by the Vidal MPS path.",
)
if self.MPS_enabled:
reason = _unsupported_reason(circuit)
if reason is None:
return self._vidal_expectation(circuit, observable)
if reason is None or self.compile_circuit or use_preprocess:
return self._vidal_expectation(
circuit,
observable,
preprocess=use_preprocess,
compile_circuit=compile_circuit,
)
backend = self._quimb_backend()
backend.configure_tn_simulation(
@@ -149,8 +223,45 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
return self._quimb_expectation_mpi(backend, circuit, observable)
return self._quimb_expectation_processpool(backend, circuit, observable)
def _vidal_expectation(self, circuit, observable):
terms = _symbolic_hamiltonian_to_pauli_terms(observable)
def _vidal_expectation(
self, circuit, observable, preprocess=False, compile_circuit=None
):
if compile_circuit is None:
compile_circuit = self.compile_circuit
if preprocess:
if self.MPI_enabled:
from mpi4py import MPI
self.rank = MPI.COMM_WORLD.Get_rank()
from qibotn.backends.vidal import VidalBackend
backend = VidalBackend()
backend.configure_tn_simulation(
max_bond_dimension=self.max_bond_dimension,
cut_ratio=self.cut_ratio,
tensor_module=self.tensor_module,
compile_circuit=compile_circuit,
mpi_approach="CT" if self.MPI_enabled else "SR",
mpi_term_batch_size=self.mpi_term_batch_size,
fallback=False,
)
value = backend.expectation(
circuit,
observable,
preprocess=True,
compile_circuit=compile_circuit,
)
self.rank = getattr(backend, "rank", self.rank)
self.last_truncation_error = getattr(
backend, "last_truncation_error", np.nan
)
self.last_max_truncation_error = getattr(
backend, "last_max_truncation_error", np.nan
)
return value
mpo_tensors = _observable_mpo_tensors(observable, circuit.nqubits)
if self.MPI_enabled:
from mpi4py import MPI
@@ -164,8 +275,18 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
comm=comm,
)
executor.run_circuit(circuit)
value = executor.expectation_pauli_sum_root(terms)
return np.nan if self.rank != 0 else float(np.real(value))
self.last_truncation_error = float(executor.global_truncation_error())
self.last_max_truncation_error = float(
executor.global_max_truncation_error()
)
if mpo_tensors is not None:
value = executor.expectation_mpo_root(mpo_tensors)
else:
terms = _symbolic_hamiltonian_to_operator_terms(observable)
value = executor.expectation_mpo_root(
_operator_terms_to_mpo(terms, circuit.nqubits)
)
return np.nan if self.rank != 0 else value
executor = VidalTEBDExecutor(
nqubits=circuit.nqubits,
@@ -174,7 +295,12 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
tensor_module=self.tensor_module,
)
executor.run_circuit(circuit)
return float(np.real(executor.expectation_pauli_sum(terms)))
self.last_truncation_error = float(executor.truncation_error)
self.last_max_truncation_error = float(executor.max_truncation_error)
if mpo_tensors is not None:
return executor.expectation_mpo(mpo_tensors)
terms = _symbolic_hamiltonian_to_operator_terms(observable)
return executor.expectation_mpo(_operator_terms_to_mpo(terms, circuit.nqubits))
def _quimb_backend(self):
import qibotn.backends.quimb as qmb
@@ -185,26 +311,7 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
)
def _bind_rank_to_numa_domain(self, rank):
domain = rank % 2
cpulist = f"/sys/devices/system/node/node{domain}/cpulist"
try:
cpus = _parse_cpu_list(open(cpulist, encoding="utf-8").read().strip())
if cpus:
os.sched_setaffinity(0, cpus)
except (FileNotFoundError, OSError):
pass
try:
import ctypes
libnuma = ctypes.CDLL("libnuma.so.1")
if libnuma.numa_available() >= 0:
libnuma.numa_run_on_node(ctypes.c_int(domain))
libnuma.numa_set_preferred(ctypes.c_int(domain))
except Exception:
pass
self.numa_domain = domain
self.numa_domain = _bind_numa_node(rank)
def _default_search_workers(self, nranks=1):
if self.torch_threads:
@@ -259,6 +366,22 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
search_time = opts.get("max_time", 60)
search_backend = opts.get("search_backend")
dask_address = opts.get("dask_address")
dask_close_workers = bool(opts.get("dask_close_workers", False))
print_stats = bool(opts.get("print_stats", False))
debug_trials = bool(opts.get("debug_trials", False))
search_only = bool(opts.get("search_only", False))
save_tree_path = opts.get("save_tree_path")
load_tree_path = opts.get("load_tree_path")
loaded_trees = None
saved_trees = []
saved_costs = []
if load_tree_path:
with Path(load_tree_path).open("rb") as f:
payload = pickle.load(f)
loaded_trees = payload["trees"] if isinstance(payload, dict) else payload
if not isinstance(loaded_trees, (list, tuple)):
loaded_trees = [loaded_trees]
qc = backend._qibo_circuit_to_quimb(
circuit,
@@ -271,7 +394,7 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
total_value = 0.0 + 0.0j
terms = extract_gates_and_qubits(observable)
for coeff, factors in terms:
for term_index, (coeff, factors) in enumerate(terms):
if not factors:
if self.rank == 0:
total_value += coeff
@@ -297,6 +420,21 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
user_slicing_opts,
)
if loaded_trees is not None:
if term_index >= len(loaded_trees):
raise ValueError(
f"Loaded tree file has {len(loaded_trees)} tree(s), "
f"but term {term_index} was requested."
)
tree = loaded_trees[term_index]
search_seconds = 0.0
if self.rank == 0 and print_stats:
print(
f"tn_tree_loaded term={term_index} path={load_tree_path}",
flush=True,
)
else:
search_start = time.perf_counter()
tree = parallel_path_search(
tn,
tn.outer_inds(),
@@ -308,20 +446,86 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
trial_timeout=opts.get("trial_timeout"),
search_backend=search_backend,
dask_address=dask_address,
debug_trials=debug_trials,
dask_close_workers=dask_close_workers,
)
search_seconds = time.perf_counter() - search_start
if tree is None:
raise RuntimeError("Failed to find a contraction tree for CPU TN MPI.")
if self.parallel_opts.get("contract_implementation") == "cpp":
from qibotn.torch_contractor import prepare_torch_cpp_contractor
if int(getattr(tree, "multiplicity", 1)) <= 1:
if self.rank == 0:
value = self._contract_term_unsliced(tn, tree, backend)
prepare_torch_cpp_contractor(tree)
path_cost = contraction_tree_costs(tree)
search_stats = getattr(tree, "qibotn_search_stats", {})
if save_tree_path and loaded_trees is None:
saved_trees.append(tree)
saved_costs.append(path_cost)
if self.rank == 0 and print_stats:
print(
"tn_search_done "
f"term={term_index} "
f"search_seconds={search_seconds:.3f} "
f"completed_trials={search_stats.get('completed_trials', 'na')} "
f"finite_trials={search_stats.get('finite_trials', 'na')} "
f"failed_trials={search_stats.get('failed_trials', 'na')} "
f"requested_trials={search_stats.get('requested_trials', search_repeats)} "
f"best_score={search_stats.get('best_score', float('nan')):.6g} "
f"slices={path_cost['nslices']} "
f"log10_flops={path_cost['log10_flops']:.3f} "
f"log10_write={path_cost['log10_write']:.3f} "
f"log2_size={path_cost['log2_size']:.3f} "
f"log10_combo={path_cost['log10_combo']:.3f} "
f"peak_memory_gib={path_cost['peak_memory_gib']:.6g}",
flush=True,
)
if search_only:
self.parallel_stats.append(
{
"term_index": term_index,
"term_factors": tuple(factors),
"path_cost": contraction_tree_costs(tree),
"path_cost": path_cost,
"search_stats": search_stats,
"tree_slices": int(getattr(tree, "multiplicity", 1)),
"slice_assignment": "search_only",
"rank_slices": [],
"search_seconds": search_seconds,
"contract_seconds": 0.0,
"search_workers": search_workers,
"search_repeats": search_repeats,
"search_time": search_time,
"search_backend": search_backend or method,
"dask_address": dask_address,
"numa_domain": getattr(self, "numa_domain", None),
}
)
continue
if comm is None and int(getattr(tree, "multiplicity", 1)) <= 1:
if self.rank == 0:
contract_start = time.perf_counter()
value = self._contract_term_unsliced(tn, tree, backend)
contract_seconds = time.perf_counter() - contract_start
if print_stats:
print(
"tn_contract_done "
f"term={term_index} "
f"contract_seconds={contract_seconds:.3f}",
flush=True,
)
self.parallel_stats.append(
{
"term_index": term_index,
"term_factors": tuple(factors),
"path_cost": path_cost,
"search_stats": search_stats,
"tree_slices": 1,
"slice_assignment": "root",
"rank_slices": [1] + [0] * (size - 1),
"search_seconds": search_seconds,
"contract_seconds": contract_seconds,
"search_workers": search_workers,
"search_repeats": search_repeats,
"search_time": search_time,
@@ -334,14 +538,27 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
continue
if comm is None:
contract_start = time.perf_counter()
value = self._contract_term_unsliced(tn, tree, backend)
contract_seconds = time.perf_counter() - contract_start
if print_stats:
print(
"tn_contract_done "
f"term={term_index} "
f"contract_seconds={contract_seconds:.3f}",
flush=True,
)
self.parallel_stats.append(
{
"term_index": term_index,
"term_factors": tuple(factors),
"path_cost": contraction_tree_costs(tree),
"path_cost": path_cost,
"search_stats": search_stats,
"tree_slices": int(getattr(tree, "multiplicity", 1)),
"slice_assignment": "local",
"rank_slices": [int(getattr(tree, "multiplicity", 1))],
"search_seconds": search_seconds,
"contract_seconds": contract_seconds,
"search_workers": search_workers,
"search_repeats": search_repeats,
"search_time": search_time,
@@ -353,6 +570,7 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
total_value += coeff * complex(np.asarray(value).reshape(-1)[0])
continue
contract_start = time.perf_counter()
arrays = self._term_arrays(tn, backend)
value, stats = parallel_contract(
tree,
@@ -360,18 +578,31 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
method="mpi",
comm=comm,
return_stats=True,
implementation=self.parallel_opts.get("contract_implementation"),
)
contract_seconds = time.perf_counter() - contract_start
gathered_stats = comm.gather(stats, root=0)
if rank == 0:
if print_stats:
print(
"tn_contract_done "
f"term={term_index} "
f"contract_seconds={contract_seconds:.3f}",
flush=True,
)
self.parallel_stats.append(
{
"term_index": term_index,
"term_factors": tuple(factors),
"path_cost": contraction_tree_costs(tree),
"path_cost": path_cost,
"search_stats": search_stats,
"tree_slices": stats.nslices,
"slice_assignment": stats.assignment,
"rank_slices": [
item.local_slices for item in gathered_stats
],
"search_seconds": search_seconds,
"contract_seconds": contract_seconds,
"search_workers": search_workers,
"search_repeats": search_repeats,
"search_time": search_time,
@@ -382,22 +613,73 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
)
total_value += coeff * complex(np.asarray(value).reshape(-1)[0])
if self.rank == 0 and save_tree_path and loaded_trees is None:
path = Path(save_tree_path)
path.parent.mkdir(parents=True, exist_ok=True)
with path.open("wb") as f:
pickle.dump(
{
"trees": saved_trees,
"costs": saved_costs,
"nterms": len(saved_trees),
},
f,
protocol=pickle.HIGHEST_PROTOCOL,
)
if print_stats:
print(
f"tn_tree_saved path={save_tree_path} nterms={len(saved_trees)}",
flush=True,
)
if search_only:
return np.nan
return np.nan if rank != 0 else float(np.real(total_value))
def _contract_term_unsliced(self, tn, tree, backend):
contract_implementation = self.parallel_opts.get("contract_implementation")
if contract_implementation == "cpp":
if backend.backend != "torch":
raise ValueError("contract_implementation='cpp' requires torch backend.")
from qibotn.backends.quimb import _torch_cpu_array, _torch_dtype
from qibotn.torch_contractor import contract_tree_cpp
arrays = [
_torch_cpu_array(array, dtype=_torch_dtype(self.dtype))
for array in tn.arrays
]
nslices = int(getattr(tree, "multiplicity", 1))
if nslices > 1:
total = None
for slice_id in range(nslices):
value = contract_tree_cpp(tree, tree.slice_arrays(arrays, slice_id))
total = value if total is None else total + value
return total
return contract_tree_cpp(tree, arrays)
if backend.backend == "torch":
import torch
from qibotn.backends.quimb import _torch_cpu_array
from qibotn.backends.quimb import _torch_cpu_array, _torch_dtype
for tensor in tn.tensors:
tensor._data = _torch_cpu_array(tensor._data, dtype=torch.complex128)
return tn.contract(all, output_inds=(), optimize=tree, backend="torch")
tensor._data = _torch_cpu_array(
tensor._data,
dtype=_torch_dtype(self.dtype),
)
return tn.contract(
all,
output_inds=(),
optimize=tree,
backend="torch",
implementation=contract_implementation,
)
return tn.contract(
all,
output_inds=(),
optimize=tree,
backend=backend.backend,
implementation=contract_implementation,
)
def _mpi_slicing_opts(self, user_slicing_opts):
@@ -405,11 +687,12 @@ class CpuTensorNet(QibotnBackend, NumpyBackend):
def _term_arrays(self, tn, backend):
if backend.backend == "torch":
import torch
from qibotn.backends.quimb import _torch_cpu_array
from qibotn.backends.quimb import _torch_cpu_array, _torch_dtype
return [
_torch_cpu_array(array, dtype=torch.complex128)
_torch_cpu_array(array, dtype=_torch_dtype(self.dtype))
for array in tn.arrays
]
return [backend.engine.asarray(array) for array in tn.arrays]
from qibotn.backends.quimb import _numpy_dtype
return [backend.engine.asarray(array, dtype=_numpy_dtype(self.dtype)) for array in tn.arrays]

View File

@@ -62,12 +62,26 @@ def _torch_cpu_array(data, dtype=None):
return x
def _arrays_to_backend(arrays, backend, engine):
if backend == "torch":
def _torch_dtype(dtype):
import torch
return [_torch_cpu_array(array, dtype=torch.complex128) for array in arrays]
return [engine.asarray(array) for array in arrays]
if dtype in ("complex64", "single"):
return torch.complex64
return torch.complex128
def _numpy_dtype(dtype):
import numpy as np
if dtype in ("complex64", "single"):
return np.complex64
return np.complex128
def _arrays_to_backend(arrays, backend, engine, dtype="complex128"):
if backend == "torch":
return [_torch_cpu_array(array, dtype=_torch_dtype(dtype)) for array in arrays]
return [engine.asarray(array, dtype=_numpy_dtype(dtype)) for array in arrays]
def _pauli_term_to_dense_operator(factors):
@@ -145,7 +159,7 @@ def pauli_product_expectation(
return tn.contract(all, output_inds=(), optimize=optimize, backend=backend)
def __init__(self, quimb_backend="numpy", contraction_optimizer="auto-hq"):
def __init__(self, quimb_backend="torch", contraction_optimizer="auto-hq"):
super(self.__class__, self).__init__()
self.name = "qibotn"
@@ -198,7 +212,7 @@ def circuit_ansatz(self):
def setup_backend_specifics(
self, quimb_backend="numpy", contractions_optimizer="auto-hq"
self, quimb_backend="torch", contractions_optimizer="auto-hq"
):
"""Setup backend specifics.
Args:
@@ -645,7 +659,7 @@ METHODS = {
}
def _generate_backend(quimb_backend: str = "numpy"):
def _generate_backend(quimb_backend: str = "torch"):
bases = (QibotnBackend,)
if quimb_backend == "numpy":
@@ -653,8 +667,13 @@ def _generate_backend(quimb_backend: str = "numpy"):
bases += (NumpyBackend,)
elif quimb_backend == "torch":
try:
from qiboml.backends import PyTorchBackend
except ImportError:
from qibo.backends import NumpyBackend
bases += (NumpyBackend,)
else:
bases += (PyTorchBackend,)
elif quimb_backend == "jax":
from qiboml.backends import JaxBackend

View File

@@ -39,6 +39,162 @@ def _symbolic_hamiltonian_to_pauli_terms(hamiltonian):
return terms
def _symbolic_hamiltonian_to_operator_terms(hamiltonian):
terms = []
factor_pattern = re.compile(r"([^\d]+)(\d+)")
paulis = {
"I": np.eye(2, dtype=np.complex128),
"X": np.array([[0, 1], [1, 0]], dtype=np.complex128),
"Y": np.array([[0, -1j], [1j, 0]], dtype=np.complex128),
"Z": np.array([[1, 0], [0, -1]], dtype=np.complex128),
}
for term in hamiltonian.terms:
ops_by_site = {}
for factor in term.factors:
site = getattr(factor, "target_qubit", None)
matrix = getattr(factor, "matrix", None)
if site is None or matrix is None:
match = factor_pattern.match(str(factor))
if match is None:
raise ValueError(f"Unsupported observable factor {factor!r}.")
name = match.group(1).upper()
if name not in paulis:
raise ValueError(f"Unsupported observable operator {name!r}.")
site = int(match.group(2))
matrix = paulis[name]
matrix = np.asarray(matrix, dtype=np.complex128)
site = int(site)
if site in ops_by_site:
ops_by_site[site] = ops_by_site[site] @ matrix
else:
ops_by_site[site] = matrix
terms.append((complex(term.coefficient), tuple(ops_by_site.items())))
return terms
def _dense_operator_to_product_terms(coeff, qubits, matrix):
"""Expand a dense k-local operator into product-matrix terms.
The dense matrix basis is ordered by the provided ``qubits`` sequence. For
example, ``qubits=[2, 5]`` means matrix rows/columns are ordered as
``|q2 q5>``.
"""
qubits = tuple(int(qubit) for qubit in qubits)
if len(set(qubits)) != len(qubits):
raise ValueError("Dense observable qubits must be unique.")
matrix = np.asarray(matrix, dtype=np.complex128)
dim = 2 ** len(qubits)
if matrix.shape != (dim, dim):
raise ValueError(
"Dense observable matrix shape must be "
f"({dim}, {dim}) for {len(qubits)} qubits."
)
units = [
np.array([[1, 0], [0, 0]], dtype=np.complex128),
np.array([[0, 1], [0, 0]], dtype=np.complex128),
np.array([[0, 0], [1, 0]], dtype=np.complex128),
np.array([[0, 0], [0, 1]], dtype=np.complex128),
]
terms = []
for row in range(dim):
for col in range(dim):
value = complex(coeff) * complex(matrix[row, col])
if value == 0:
continue
ops = []
for offset, site in enumerate(qubits):
shift = len(qubits) - offset - 1
out_bit = (row >> shift) & 1
in_bit = (col >> shift) & 1
ops.append((site, units[2 * out_bit + in_bit]))
terms.append((value, tuple(ops)))
return terms
def _dense_observable_to_operator_terms(observable):
if not isinstance(observable, dict):
return None
if "matrix" in observable:
terms = [observable]
else:
terms = observable.get("dense_terms")
if terms is None:
raw_terms = observable.get("terms")
if not raw_terms or not any("matrix" in term for term in raw_terms):
return None
terms = raw_terms
operator_terms = []
for term in terms:
if "matrix" not in term:
raise ValueError("Dense observable terms must include a matrix.")
qubits = term.get("qubits", term.get("sites"))
if qubits is None:
raise ValueError("Dense observable terms must include qubits or sites.")
operator_terms.extend(
_dense_operator_to_product_terms(
term.get("coefficient", 1.0),
qubits,
term["matrix"],
)
)
return operator_terms
def _operator_terms_to_mpo(terms, nqubits):
"""Build an exact direct-sum MPO for product-operator terms.
This intentionally favors correctness and generality over compression: an
``m``-term sum becomes an MPO with bond dimension ``m``. Local Hamiltonians
can be compressed later without changing the public expectation path.
"""
identity = np.eye(2, dtype=np.complex128)
expanded_terms = []
for coeff, ops in terms:
local_ops = [identity for _ in range(nqubits)]
for site, matrix in ops:
site = int(site)
if site < 0 or site >= nqubits:
raise ValueError(f"Observable site {site} is outside the circuit.")
matrix = np.asarray(matrix, dtype=np.complex128)
if matrix.shape != (2, 2):
raise ValueError("Only qubit local operators with shape (2, 2) are supported.")
local_ops[site] = matrix
expanded_terms.append((complex(coeff), local_ops))
if not expanded_terms:
raise ValueError("Cannot build an MPO from an empty observable.")
bond_dim = len(expanded_terms)
mpo = []
for site in range(nqubits):
left_dim = 1 if site == 0 else bond_dim
right_dim = 1 if site == nqubits - 1 else bond_dim
tensor = np.zeros((left_dim, 2, 2, right_dim), dtype=np.complex128)
for term_index, (coeff, local_ops) in enumerate(expanded_terms):
left = 0 if site == 0 else term_index
right = 0 if site == nqubits - 1 else term_index
op = coeff * local_ops[site] if site == 0 else local_ops[site]
tensor[left, :, :, right] += op
mpo.append(tensor)
return mpo
def _observable_mpo_tensors(observable, nqubits=None):
if isinstance(observable, dict):
if "mpo_tensors" in observable:
return observable["mpo_tensors"]
if "mpo" in observable:
return observable["mpo"]
if nqubits is not None:
terms = _dense_observable_to_operator_terms(observable)
if terms is not None:
return _operator_terms_to_mpo(terms, nqubits)
return None
def _unsupported_reason(circuit):
for gate in circuit.queue:
name = getattr(gate, "name", gate.__class__.__name__)
@@ -71,6 +227,44 @@ def _can_route_non_adjacent(circuit):
return True
@dataclass
class _PreparedCircuit:
nqubits: int
queue: list
def _decompose_gate_for_mps(gate, nqubits, stack=()):
sites = _gate_sites(gate)
if len(sites) <= 2:
return [gate]
if gate in stack or not hasattr(gate, "decompose"):
name = getattr(gate, "name", gate.__class__.__name__)
raise ValueError(f"gate {name} acts on {len(sites)} qubits")
free = [qubit for qubit in range(nqubits) if qubit not in sites]
try:
decomposed = gate.decompose(*free, use_toffolis=False, method="standard")
except TypeError:
decomposed = gate.decompose(*free)
if not decomposed or decomposed == [gate]:
name = getattr(gate, "name", gate.__class__.__name__)
raise ValueError(f"gate {name} could not be decomposed for Vidal MPS")
result = []
for item in decomposed:
result.extend(_decompose_gate_for_mps(item, nqubits, stack + (gate,)))
return result
def _prepare_circuit_for_mps(circuit, decompose=True):
if not decompose:
return circuit
queue = []
for gate in circuit.queue:
queue.extend(_decompose_gate_for_mps(gate, circuit.nqubits))
return _PreparedCircuit(nqubits=circuit.nqubits, queue=queue)
@dataclass
class VidalBackend(QibotnBackend, NumpyBackend):
"""QiboTN backend using Vidal/TEBD when possible.
@@ -89,14 +283,16 @@ class VidalBackend(QibotnBackend, NumpyBackend):
self.name = "qibotn"
self.platform = "vidal"
self.precision = "double"
self.rank = 0
self.last_truncation_error = 0.0
self.last_max_truncation_error = 0.0
self.configure_tn_simulation()
def configure_tn_simulation(
self,
ansatz: str = "MPS",
max_bond_dimension: int = 10,
cut_ratio: float = 1e-9,
max_bond_dimension: int | None = 10,
cut_ratio: float | None = 1e-9,
trunc_tracking_mode: str = "C",
svd_control: str = "E!",
ini_bond_dimension: int = 1,
@@ -108,6 +304,7 @@ class VidalBackend(QibotnBackend, NumpyBackend):
mpi_num_procs: int = 1,
mpi_where_barriers: int = -1,
mpi_isometrization: int = -1,
mpi_term_batch_size: int | None = None,
fallback: bool = True,
):
self.ansatz = ansatz
@@ -124,6 +321,7 @@ class VidalBackend(QibotnBackend, NumpyBackend):
self.mpi_num_procs = mpi_num_procs
self.mpi_where_barriers = mpi_where_barriers
self.mpi_isometrization = mpi_isometrization
self.mpi_term_batch_size = mpi_term_batch_size
self.fallback = fallback
self._fallback_backend = None
@@ -157,10 +355,15 @@ class VidalBackend(QibotnBackend, NumpyBackend):
raise NotImplementedError(reason)
return self._qmatchatea_fallback()
def _preprocess_circuit(self, circuit, compile_circuit):
"""Decompose unsupported multi-qubit gates for the local Vidal path."""
return _prepare_circuit_for_mps(circuit, decompose=True)
def _run_fast_executor(self, circuit, compile_circuit=True):
if self.mpi_approach == "CT":
from mpi4py import MPI
self.rank = MPI.COMM_WORLD.Get_rank()
executor = SegmentVidalMPIExecutor(
nqubits=circuit.nqubits,
max_bond=self.max_bond_dimension,
@@ -169,6 +372,7 @@ class VidalBackend(QibotnBackend, NumpyBackend):
comm=MPI.COMM_WORLD,
)
else:
self.rank = 0
executor = VidalTEBDExecutor(
nqubits=circuit.nqubits,
max_bond=self.max_bond_dimension,
@@ -183,9 +387,21 @@ class VidalBackend(QibotnBackend, NumpyBackend):
backend = self._fallback_or_raise("VidalBackend supports only MPS.")
return backend.expectation(circuit, observable, preprocess, compile_circuit)
original_circuit = circuit
if compile_circuit is None:
compile_circuit = self.compile_circuit
if preprocess:
try:
circuit = self._preprocess_circuit(circuit, compile_circuit)
except Exception as exc:
backend = self._fallback_or_raise(
f"VidalBackend preprocessing failed: {exc}"
)
return backend.expectation(
original_circuit, observable, preprocess, compile_circuit
)
reason = _unsupported_reason(circuit)
if reason is not None:
# Non-adjacent gates can be routed at compile time
@@ -193,40 +409,51 @@ class VidalBackend(QibotnBackend, NumpyBackend):
pass # proceed with Vidal + SWAP routing
else:
backend = self._fallback_or_raise(reason)
return backend.expectation(circuit, observable, preprocess, compile_circuit)
# preprocess=True requests qmatchatea-style transpilation to MPS topology.
# The fast path validates MPS-compatibility via _unsupported_reason above;
# if the circuit is already MPS-compatible, preprocessing is a no-op.
# If it isn't, _unsupported_reason already routed to fallback.
# So the fast path skips preprocessing — just warn once.
if preprocess:
import warnings
warnings.warn(
"VidalBackend fast path does not preprocess circuits. "
"If the circuit passes _unsupported_reason (all gates are "
"MPS-compatible), fast-path execution proceeds directly.",
stacklevel=2,
return backend.expectation(
original_circuit, observable, preprocess, compile_circuit
)
hamiltonian = check_observable(observable, circuit.nqubits)
try:
terms = _symbolic_hamiltonian_to_pauli_terms(hamiltonian)
except ValueError as exc:
backend = self._fallback_or_raise(str(exc))
return backend.expectation(circuit, observable, preprocess, compile_circuit)
executor = self._run_fast_executor(circuit, compile_circuit=compile_circuit)
self.last_truncation_error = float(executor.truncation_error)
self.last_truncation_error = float(
executor.global_truncation_error()
if hasattr(executor, "global_truncation_error")
else executor.truncation_error
)
self.last_max_truncation_error = float(
executor.global_max_truncation_error()
if hasattr(executor, "global_max_truncation_error")
else executor.max_truncation_error
)
mpo_tensors = _observable_mpo_tensors(observable, circuit.nqubits)
if mpo_tensors is not None:
if self.mpi_approach == "CT":
value = executor.expectation_pauli_sum_root(terms)
value = executor.expectation_mpo_root(mpo_tensors)
from qtealeaves.tooling.mpisupport import MPI
if MPI is not None and MPI.COMM_WORLD.Get_rank() != 0:
return np.nan
return np.real(value)
return np.real(executor.expectation_pauli_sum(terms))
return value
return executor.expectation_mpo(mpo_tensors)
hamiltonian = check_observable(observable, circuit.nqubits)
try:
terms = _symbolic_hamiltonian_to_operator_terms(hamiltonian)
except ValueError as exc:
backend = self._fallback_or_raise(str(exc))
return backend.expectation(
original_circuit, observable, preprocess, compile_circuit
)
mpo_tensors = _operator_terms_to_mpo(terms, circuit.nqubits)
if self.mpi_approach == "CT":
value = executor.expectation_mpo_root(mpo_tensors)
from qtealeaves.tooling.mpisupport import MPI
if MPI is not None and MPI.COMM_WORLD.Get_rank() != 0:
return np.nan
return value
return executor.expectation_mpo(mpo_tensors)
def execute_circuit(
self,

View File

@@ -23,6 +23,7 @@ from qibotn.backends.vidal_tebd import (
_is_two_qubit_batch,
_make_two_site_update,
_ones,
_real_if_close,
_route_non_adjacent_gates,
_svd,
_tensor_update_from_numpy,
@@ -35,6 +36,8 @@ from qibotn.backends.vidal_tebd import (
_EDGE_TAG = 1701
_UPDATE_TAG = 1702
_EXPECT_ENV_TAG = 1703
_EXPECT_RESULT_TAG = 1704
def _partition_sites(nsites, nranks):
@@ -49,9 +52,9 @@ def _partition_sites(nsites, nranks):
@dataclass
class SegmentVidalMPIExecutor:
nqubits: int
max_bond: int
max_bond: int | None
comm: object
cut_ratio: float = 1e-12
cut_ratio: float | None = 1e-12
tensor_module: str = "torch"
def __post_init__(self):
@@ -63,6 +66,10 @@ class SegmentVidalMPIExecutor:
if self.start == self.end:
raise ValueError("SegmentVidalMPIExecutor requires at least one site per rank.")
from qibotn.backends.cpu import _bind_numa_node
self.numa_domain = _bind_numa_node(self.rank)
self.xp = _backend_module(self.tensor_module)
if self.xp is np:
self.dtype = np.complex128
@@ -82,11 +89,22 @@ class SegmentVidalMPIExecutor:
for bond in range(self.start, self.end + 1)
}
self._accumulated_truncation_error = 0.0
self._max_truncation_error = 0.0
@property
def truncation_error(self):
return self._accumulated_truncation_error
def global_truncation_error(self):
return self.comm.allreduce(self._accumulated_truncation_error, op=MPI.SUM)
@property
def max_truncation_error(self):
return self._max_truncation_error
def global_max_truncation_error(self):
return self.comm.allreduce(self._max_truncation_error, op=MPI.MAX)
def owns_site(self, site):
return self.start <= site < self.end
@@ -270,7 +288,12 @@ class SegmentVidalMPIExecutor:
def _install_update(self, update):
if isinstance(update["left"], np.ndarray):
update = _tensor_update_from_numpy(self.xp, update, self.dtype)
self._accumulated_truncation_error += update.get("truncation_error", 0.0)
truncation_error = update.get("truncation_error", 0.0)
self._accumulated_truncation_error += truncation_error
self._max_truncation_error = max(
self._max_truncation_error,
truncation_error,
)
site = update["site"]
if self.owns_site(site):
self.gammas[site] = update["left"]
@@ -288,26 +311,189 @@ class SegmentVidalMPIExecutor:
}
return self.comm.gather(payload, root=0)
def expectation_pauli_sum_root(self, terms):
payloads = self.gather_full_state()
if self.rank != 0:
def expectation_pauli_sum_root(self, terms, term_batch_size=None):
paulis = {
"I": self._eye(2),
"X": _asarray(self.xp, [[0, 1], [1, 0]], self.dtype),
"Y": _asarray(self.xp, [[0, -1j], [1j, 0]], self.dtype),
"Z": _asarray(self.xp, [[1, 0], [0, -1]], self.dtype),
}
operator_terms = [
(
coeff,
tuple((site, paulis[name.upper()]) for name, site in ops),
)
for coeff, ops in terms
]
return self.expectation_operator_sum_root(
operator_terms,
term_batch_size=term_batch_size,
)
def expectation_operator_sum_root(self, terms, term_batch_size=None):
if term_batch_size is None:
term_batch_size = max(1, len(terms))
norm = self._distributed_product_expectation({})
total = 0.0 + 0.0j
for start in range(0, len(terms), int(term_batch_size)):
batch = terms[start : start + int(term_batch_size)]
values = self._distributed_operator_batch_expectation(batch, norm)
if self.rank == 0:
for (coeff, _), term_value in zip(batch, values):
total += complex(coeff) * complex(term_value)
return None if self.rank != 0 else _real_if_close(total / norm)
def _eye(self, size):
if self.xp is np:
return np.eye(size, dtype=self.dtype)
return self.xp.eye(size, dtype=self.dtype, device=self.device)
def _distributed_product_expectation(self, operators):
if self.rank == 0:
env = self._segment_product_environment(operators)
if self.size == 1:
return env.reshape(-1)[0]
self.comm.send(_to_numpy(env), dest=1, tag=_EXPECT_ENV_TAG)
return self.comm.recv(source=self.size - 1, tag=_EXPECT_RESULT_TAG)
incoming = self.comm.recv(source=self.rank - 1, tag=_EXPECT_ENV_TAG)
env = self._segment_product_environment(operators, incoming)
if self.rank == self.size - 1:
self.comm.send(_to_numpy(env).reshape(-1)[0], dest=0, tag=_EXPECT_RESULT_TAG)
else:
self.comm.send(_to_numpy(env), dest=self.rank + 1, tag=_EXPECT_ENV_TAG)
return None
full = VidalTEBDExecutor(
self.nqubits,
self.max_bond,
cut_ratio=self.cut_ratio,
tensor_module=self.tensor_module,
)
for payload in payloads:
for site, tensor in payload["gammas"].items():
full.gammas[int(site)] = _asarray(self.xp, tensor, self.dtype)
for bond, tensor in payload["lambdas"].items():
full.lambdas[int(bond)] = _asarray(
def _segment_product_environment(self, operators, incoming=None):
if incoming is None:
env = _asarray(
self.xp,
tensor,
self.xp.float64 if self.xp is not np else np.float64,
np.eye(len(self.lambdas[self.start]), dtype=np.complex128),
self.dtype,
)
return full.expectation_pauli_sum(terms)
else:
env = _asarray(self.xp, incoming, self.dtype)
identity = self._eye(2)
for site in range(self.start, self.end):
tensor = self.gammas[site] * self.lambdas[site + 1].reshape(1, 1, -1)
op = operators.get(site, identity)
env = self.xp.einsum(
"xy,xsb,st,ytd->bd", env, self._conj(tensor), op, tensor
)
return env
def _distributed_operator_batch_expectation(self, terms, norm):
if not terms:
return []
if all(not ops for _, ops in terms):
return [norm] * len(terms) if self.rank == 0 else None
batch_ops = [
{int(site): _asarray(self.xp, matrix, self.dtype) for site, matrix in ops}
for _, ops in terms
]
if self.rank == 0:
env = self._segment_operator_batch_environment(batch_ops)
if self.size == 1:
return list(env.reshape(len(terms), -1)[:, 0])
self.comm.send(_to_numpy(env), dest=1, tag=_EXPECT_ENV_TAG)
return self.comm.recv(source=self.size - 1, tag=_EXPECT_RESULT_TAG)
incoming = self.comm.recv(source=self.rank - 1, tag=_EXPECT_ENV_TAG)
env = self._segment_operator_batch_environment(batch_ops, incoming)
if self.rank == self.size - 1:
values = list(_to_numpy(env).reshape(len(terms), -1)[:, 0])
self.comm.send(values, dest=0, tag=_EXPECT_RESULT_TAG)
else:
self.comm.send(_to_numpy(env), dest=self.rank + 1, tag=_EXPECT_ENV_TAG)
return None
def _segment_operator_batch_environment(self, batch_ops, incoming=None):
batch_size = len(batch_ops)
if incoming is None:
dim = len(self.lambdas[self.start])
env = _asarray(
self.xp,
np.tile(np.eye(dim, dtype=np.complex128), (batch_size, 1, 1)),
self.dtype,
)
else:
env = _asarray(self.xp, incoming, self.dtype)
identity = self._eye(2)
for site in range(self.start, self.end):
tensor = self.gammas[site] * self.lambdas[site + 1].reshape(1, 1, -1)
ops = self.xp.stack(
[operators.get(site, identity) for operators in batch_ops],
axis=0,
)
env = self.xp.einsum(
"nxy,xsb,nst,ytd->nbd",
env,
self._conj(tensor),
ops,
tensor,
)
return env
def _conj(self, tensor):
return np.conjugate(tensor) if self.xp is np else tensor.conj()
def expectation_mpo_root(self, mpo_tensors):
if len(mpo_tensors) != self.nqubits:
raise ValueError(
f"Expected {self.nqubits} MPO tensors, got {len(mpo_tensors)}."
)
norm = self._distributed_product_expectation({})
if self.rank == 0:
env = self._segment_mpo_environment(mpo_tensors)
if self.size == 1:
return _real_if_close(env.reshape(-1)[0] / norm)
self.comm.send(_to_numpy(env), dest=1, tag=_EXPECT_ENV_TAG)
value = self.comm.recv(source=self.size - 1, tag=_EXPECT_RESULT_TAG)
return _real_if_close(value / norm)
incoming = self.comm.recv(source=self.rank - 1, tag=_EXPECT_ENV_TAG)
env = self._segment_mpo_environment(mpo_tensors, incoming)
if self.rank == self.size - 1:
self.comm.send(
_to_numpy(env).reshape(-1)[0],
dest=0,
tag=_EXPECT_RESULT_TAG,
)
else:
self.comm.send(_to_numpy(env), dest=self.rank + 1, tag=_EXPECT_ENV_TAG)
return None
def _segment_mpo_environment(self, mpo_tensors, incoming=None):
if incoming is None:
left_dim = len(self.lambdas[self.start])
env = _asarray(
self.xp,
np.zeros((left_dim, 1, left_dim), dtype=np.complex128),
self.dtype,
)
env[:, 0, :] = self._eye(left_dim)
else:
env = _asarray(self.xp, incoming, self.dtype)
for site in range(self.start, self.end):
mpo = _asarray(self.xp, mpo_tensors[site], self.dtype)
if mpo.ndim != 4 or mpo.shape[1:3] != (2, 2):
raise ValueError(
"Each MPO tensor must have shape "
"(left_bond, 2, 2, right_bond)."
)
tensor = self.gammas[site] * self.lambdas[site + 1].reshape(1, 1, -1)
env = self.xp.einsum(
"xlc,xub,lutr,ctd->brd",
env,
self._conj(tensor),
mpo,
tensor,
)
return env
def expectation_ring_xz_root(self):
terms = [

View File

@@ -50,10 +50,10 @@ def _transpose(xp, tensor, axes):
return np.transpose(tensor, axes) if xp is np else tensor.permute(*axes)
def _vdot_real(xp, left, right):
def _vdot(xp, left, right):
if xp is np:
return np.vdot(left.reshape(-1), right.reshape(-1)).real
return xp.vdot(left.reshape(-1), right.reshape(-1)).real
return np.vdot(left.reshape(-1), right.reshape(-1))
return xp.vdot(left.reshape(-1), right.reshape(-1))
def _to_float(x):
@@ -62,6 +62,19 @@ def _to_float(x):
return float(x)
def _to_scalar(x):
if hasattr(x, "detach"):
return x.detach().cpu().item()
if isinstance(x, np.ndarray):
return x.item()
return x
def _real_if_close(x, tol=1000):
value = np.real_if_close(x, tol=tol)
return value.item() if isinstance(value, np.ndarray) else value
def _to_numpy(tensor):
if hasattr(tensor, "detach"):
return tensor.detach().cpu().numpy()
@@ -154,8 +167,8 @@ def _safe_inverse(xp, values):
@dataclass
class VidalTEBDExecutor:
nqubits: int
max_bond: int
cut_ratio: float = 1e-12
max_bond: int | None
cut_ratio: float | None = 1e-12
tensor_module: str = "torch"
def __post_init__(self):
@@ -175,6 +188,7 @@ class VidalTEBDExecutor:
_ones(self.xp, 1, self.dtype, self.device) for _ in range(self.nqubits + 1)
]
self._accumulated_truncation_error = 0.0
self._max_truncation_error = 0.0
def run_circuit(self, circuit, compile_circuit=True):
gates = circuit.queue
@@ -189,6 +203,10 @@ class VidalTEBDExecutor:
def truncation_error(self):
return self._accumulated_truncation_error
@property
def max_truncation_error(self):
return self._max_truncation_error
def _apply_gate(self, gate):
sites = _gate_sites(gate)
matrix = _asarray(self.xp, gate.matrix(), self.dtype)
@@ -232,6 +250,10 @@ class VidalTEBDExecutor:
update = _make_two_site_update(item, umat, singvals, vh,
self.max_bond, self.cut_ratio, self.xp)
self._accumulated_truncation_error += update["truncation_error"]
self._max_truncation_error = max(
self._max_truncation_error,
update["truncation_error"],
)
i = update["site"]
self.gammas[i] = update["left"]
self.gammas[i + 1] = update["right"]
@@ -252,25 +274,37 @@ class VidalTEBDExecutor:
"Y": _asarray(self.xp, [[0, -1j], [1j, 0]], self.dtype),
"Z": _asarray(self.xp, [[1, 0], [0, -1]], self.dtype),
}
value = 0.0
operator_terms = [
(
coeff,
tuple((site, paulis[name.upper()]) for name, site in ops),
)
for coeff, ops in terms
]
return self.expectation_operator_sum(operator_terms)
def expectation_operator_sum(self, terms):
value = 0.0 + 0.0j
norm = self.norm()
for coeff, ops in terms:
ops = tuple((name.upper(), site) for name, site in ops)
operators = {
int(site): _asarray(self.xp, matrix, self.dtype)
for site, matrix in ops
}
if len(ops) == 0:
term_value = norm
elif len(ops) == 1:
name, site = ops[0]
term_value = _to_float(self._expect_one_site(site, paulis[name]))
elif len(ops) == 2 and abs(ops[0][1] - ops[1][1]) == 1:
(name0, site0), (name1, site1) = sorted(ops, key=lambda item: item[1])
term_value = _to_float(
self._expect_adjacent(site0, paulis[name0], paulis[name1])
elif len(operators) == 1:
site, matrix = next(iter(operators.items()))
term_value = _to_scalar(self._expect_one_site(site, matrix))
elif len(operators) == 2 and abs(max(operators) - min(operators)) == 1:
site0, site1 = sorted(operators)
term_value = _to_scalar(
self._expect_adjacent(site0, operators[site0], operators[site1])
)
else:
operators = {site: paulis[name] for name, site in ops}
term_value = _to_float(self.expect_product_operators(operators))
value += float(np.real(coeff)) * term_value
return value / norm
term_value = _to_scalar(self.expect_product_operators(operators))
value += complex(coeff) * complex(term_value)
return _real_if_close(value / norm)
def _expect_one_site(self, site, op):
theta = self.xp.einsum(
@@ -280,7 +314,7 @@ class VidalTEBDExecutor:
self.lambdas[site + 1],
)
op_theta = self.xp.einsum("us,asb->aub", op, theta)
return _vdot_real(self.xp, theta, op_theta)
return _vdot(self.xp, theta, op_theta)
def _expect_adjacent(self, site, op_left, op_right):
theta = self.xp.einsum(
@@ -292,7 +326,7 @@ class VidalTEBDExecutor:
self.lambdas[site + 2],
)
op_theta = self.xp.einsum("us,vt,astc->auvc", op_left, op_right, theta)
return _vdot_real(self.xp, theta, op_theta)
return _vdot(self.xp, theta, op_theta)
def expect_product_operators(self, operators):
env = _asarray(self.xp, [[1.0 + 0.0j]], self.dtype)
@@ -303,10 +337,38 @@ class VidalTEBDExecutor:
env = self.xp.einsum(
"xy,xsb,st,ytd->bd", env, _conj(self.xp, tensor), op, tensor
)
return env.reshape(-1)[0].real
return env.reshape(-1)[0]
def norm(self):
return _to_float(self.expect_product_operators({}))
return float(np.real(_to_scalar(self.expect_product_operators({}))))
def expectation_mpo(self, mpo_tensors):
"""Compute ``<psi|MPO|psi> / <psi|psi>``.
MPO tensors are expected in ``(left_bond, phys_out, phys_in, right_bond)``
order, with physical dimension 2 on every site.
"""
if len(mpo_tensors) != self.nqubits:
raise ValueError(
f"Expected {self.nqubits} MPO tensors, got {len(mpo_tensors)}."
)
env = _asarray(self.xp, [[[1.0 + 0.0j]]], self.dtype)
for site, raw_mpo in enumerate(mpo_tensors):
mpo = _asarray(self.xp, raw_mpo, self.dtype)
if mpo.ndim != 4 or mpo.shape[1:3] != (2, 2):
raise ValueError(
"Each MPO tensor must have shape "
"(left_bond, 2, 2, right_bond)."
)
tensor = self.gammas[site] * self.lambdas[site + 1].reshape(1, 1, -1)
env = self.xp.einsum(
"xlc,xub,lutr,ctd->brd",
env,
_conj(self.xp, tensor),
mpo,
tensor,
)
return _real_if_close(_to_scalar(env.reshape(-1)[0]) / self.norm())
def _build_theta_svd_matrix(op, xp, lam_left, lam_mid, lam_right, gamma_left, gamma_right):
@@ -328,8 +390,8 @@ def _build_theta_svd_matrix(op, xp, lam_left, lam_mid, lam_right, gamma_left, ga
def _choose_bond(singvals, max_bond, cut_ratio, xp):
max_possible = int(singvals.shape[0])
keep = min(max_possible, max_bond)
if cut_ratio > 0 and max_possible > 0:
keep = max_possible if max_bond is None else min(max_possible, int(max_bond))
if cut_ratio is not None and cut_ratio > 0 and max_possible > 0:
threshold = singvals[0] * cut_ratio
if xp is np:
ratio_keep = int(np.count_nonzero(singvals > threshold))
@@ -415,8 +477,8 @@ class _RoutedTwoQubitGate:
name = "routed_two_qubit"
control_qubits = ()
def __init__(self, original_gate, left_phys, right_phys):
self.target_qubits = (left_phys, right_phys)
def __init__(self, original_gate, physical_sites):
self.target_qubits = tuple(physical_sites)
self._matrix = original_gate.matrix()
def matrix(self):
@@ -447,8 +509,10 @@ def _route_non_adjacent_gates(gates, nqubits):
for pos in range(right - 1, left, -1):
routed.append(_SWAPGate(pos, pos + 1))
# Apply the original gate at the adjacent physical positions
routed.append(_RoutedTwoQubitGate(gate, left, left + 1))
# Apply the original gate in its original qubit order. For gates like
# CNOT(5, 0), sorting the routed sites would swap control and target.
physical_map = {left: left, right: left + 1}
routed.append(_RoutedTwoQubitGate(gate, [physical_map[site] for site in sites]))
# Reverse SWAPs to restore original ordering
for pos in range(left + 1, right):

View File

@@ -1,6 +1,19 @@
import cupy as cp
import numpy as np
try:
import cupy as cp
except ImportError: # pragma: no cover - exercised on CPU-only installations
cp = None
def _require_cupy():
if cp is None:
raise ImportError(
"The cuQuantum circuit converter requires cupy. "
"Install the GPU dependencies or use the CPU backend."
)
return cp
# Reference: https://github.com/NVIDIA/cuQuantum/tree/main/python/samples/cutensornet/circuit_converter
@@ -19,7 +32,7 @@ class QiboCircuitToEinsum:
"""
def __init__(self, circuit, dtype="complex128"):
self.backend = cp
self.backend = _require_cupy()
self.dtype = getattr(self.backend, dtype)
self.init_basis_map(self.backend, dtype)
self.init_intermediate_circuit(circuit)
@@ -116,7 +129,9 @@ class QiboCircuitToEinsum:
required_shape = self.op_shape_from_qubits(len(gate_qubits))
self.gate_tensors.append(
(
cp.asarray(gate.matrix(), dtype=self.dtype).reshape(required_shape),
self.backend.asarray(gate.matrix(), dtype=self.dtype).reshape(
required_shape
),
gate_qubits,
)
)
@@ -161,7 +176,7 @@ class QiboCircuitToEinsum:
required_shape = self.op_shape_from_qubits(len(gate_qubits))
self.gate_tensors_inverse.append(
(
cp.asarray(gate.matrix()).reshape(required_shape),
self.backend.asarray(gate.matrix()).reshape(required_shape),
gate_qubits,
)
)
@@ -169,7 +184,7 @@ class QiboCircuitToEinsum:
# self.active_qubits is to identify qubits with at least 1 gate acting on it in the whole circuit.
self.active_qubits_inverse = np.unique(gates_qubits_inverse)
def get_pauli_gates(self, pauli_map, dtype="complex128", backend=cp):
def get_pauli_gates(self, pauli_map, dtype="complex128", backend=None):
"""Populate the gates for all pauli operators.
Parameters:
@@ -180,6 +195,8 @@ class QiboCircuitToEinsum:
Returns:
A sequence of pauli gates.
"""
if backend is None:
backend = _require_cupy()
asarray = backend.asarray
pauli_i = asarray([[1, 0], [0, 1]], dtype=dtype)
pauli_x = asarray([[0, 1], [1, 0]], dtype=dtype)

View File

@@ -1,10 +1,23 @@
import cupy as cp
import cuquantum.bindings.cutensornet as cutn
import numpy as np
from qibotn.circuit_convertor import QiboCircuitToEinsum
from qibotn.mps_utils import apply_gate, initial
try:
import cupy as cp
import cuquantum.bindings.cutensornet as cutn
except ImportError: # pragma: no cover - exercised on CPU-only installations
cp = None
cutn = None
def _require_cuquantum():
if cp is None or cutn is None:
raise ImportError(
"The cuQuantum MPS converter requires cupy and cuquantum. "
"Install the GPU dependencies or use the CPU backend."
)
class QiboCircuitToMPS:
"""A helper class to convert Qibo circuit to MPS.
@@ -23,6 +36,7 @@ class QiboCircuitToMPS:
dtype="complex128",
rand_seed=0,
):
_require_cuquantum()
np.random.seed(rand_seed)
cp.random.seed(rand_seed)
@@ -44,4 +58,6 @@ class QiboCircuitToMPS:
)
def __del__(self):
cutn.destroy(self.handle)
handle = getattr(self, "handle", None)
if cutn is not None and handle is not None:
cutn.destroy(handle)

File diff suppressed because it is too large Load Diff

View File

@@ -1,8 +1,3 @@
import cupy as cp
import cuquantum.bindings.cutensornet as cutn
from cupy.cuda import nccl
from cupy.cuda.runtime import getDeviceCount
from cuquantum.tensornet import Network, contract
from mpi4py import MPI
from qibotn.circuit_convertor import QiboCircuitToEinsum
@@ -15,8 +10,37 @@ from qibotn.observables import (
extract_gates_and_qubits,
)
try:
import cupy as cp
import cuquantum.bindings.cutensornet as cutn
from cupy.cuda import nccl
from cupy.cuda.runtime import getDeviceCount
from cuquantum.tensornet import Network, contract
except ImportError: # pragma: no cover - exercised on CPU-only installations
cp = None
cutn = None
nccl = None
getDeviceCount = None
Network = None
contract = None
def get_ham_gates(pauli_map, dtype="complex128", backend=cp):
def _require_cuquantum():
if (
cp is None
or cutn is None
or nccl is None
or getDeviceCount is None
or Network is None
or contract is None
):
raise ImportError(
"The legacy GPU evaluation helpers require cupy and cuquantum. "
"Install the GPU dependencies or use the CPU backend."
)
def get_ham_gates(pauli_map, dtype="complex128", backend=None):
"""Populate the gates for all pauli operators.
Parameters:
@@ -27,6 +51,13 @@ def get_ham_gates(pauli_map, dtype="complex128", backend=cp):
Returns:
A sequence of pauli gates.
"""
if backend is None:
backend = cp
if backend is None:
raise ImportError(
"get_ham_gates requires an array backend; cupy is unavailable "
"in this CPU-only environment."
)
asarray = backend.asarray
pauli_i = asarray([[1, 0], [0, 1]], dtype=dtype)
pauli_x = asarray([[0, 1], [1, 0]], dtype=dtype)
@@ -46,6 +77,7 @@ def get_ham_gates(pauli_map, dtype="complex128", backend=cp):
def initialize_mpi():
"""Initialize MPI communication and device selection."""
_require_cuquantum()
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
@@ -56,6 +88,7 @@ def initialize_mpi():
def initialize_nccl(comm_mpi, rank, size):
"""Initialize NCCL communication."""
_require_cuquantum()
nccl_id = nccl.get_unique_id() if rank == 0 else None
nccl_id = comm_mpi.bcast(nccl_id, root=0)
return nccl.NcclCommunicator(size, nccl_id, rank)
@@ -73,6 +106,7 @@ def get_operands(qibo_circ, datatype, rank, comm):
def compute_optimal_path(network, n_samples, size, comm):
"""Compute contraction path and broadcast optimal selection."""
_require_cuquantum()
path, info = network.contract_path(
optimize={
"samples": n_samples,
@@ -101,6 +135,8 @@ def compute_slices(info, rank, size):
def reduce_result(result, comm, method="MPI", root=0):
"""Reduce results across processes."""
if method == "NCCL":
_require_cuquantum()
if method == "MPI":
return comm.reduce(sendobj=result, op=MPI.SUM, root=root)
@@ -148,6 +184,7 @@ def dense_vector_tn_MPI(qibo_circ, datatype, n_samples=8):
Returns:
Dense vector of quantum circuit.
"""
_require_cuquantum()
comm, rank, size, device_id = initialize_mpi()
operands = get_operands(qibo_circ, datatype, rank, comm)
network = Network(*operands, options={"device_id": device_id})
@@ -179,6 +216,7 @@ def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):
Returns:
Dense vector of quantum circuit.
"""
_require_cuquantum()
comm_mpi, rank, size, device_id = initialize_mpi()
comm_nccl = initialize_nccl(comm_mpi, rank, size)
operands = get_operands(qibo_circ, datatype, rank, comm_mpi)
@@ -203,6 +241,7 @@ def dense_vector_tn(qibo_circ, datatype):
Returns:
Dense vector of quantum circuit.
"""
_require_cuquantum()
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
return contract(*myconvertor.state_vector_operands())
@@ -231,6 +270,7 @@ def expectation_tn_nccl(qibo_circ, datatype, observable, n_samples=8):
Expectation of quantum circuit due to pauli string.
"""
_require_cuquantum()
comm_mpi, rank, size, device_id = initialize_mpi()
comm_nccl = initialize_nccl(comm_mpi, rank, size)
@@ -299,6 +339,7 @@ def expectation_tn_MPI(qibo_circ, datatype, observable, n_samples=8):
Returns:
Expectation of quantum circuit due to pauli string.
"""
_require_cuquantum()
# Initialize MPI and device
comm, rank, size, device_id = initialize_mpi()
@@ -358,6 +399,7 @@ def expectation_tn(qibo_circ, datatype, observable):
Returns:
Expectation of quantum circuit due to pauli string.
"""
_require_cuquantum()
myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
observable = check_observable(observable, qibo_circ.nqubits)
@@ -383,6 +425,7 @@ def dense_vector_mps(qibo_circ, gate_algo, datatype):
Returns:
Dense vector of quantum circuit.
"""
_require_cuquantum()
myconvertor = QiboCircuitToMPS(qibo_circ, gate_algo, dtype=datatype)
mps_helper = MPSContractionHelper(myconvertor.num_qubits)

View File

@@ -16,9 +16,11 @@ from qibotn.observables import check_observable
class ExpectationConfig:
ansatz: str = "tn"
mpi: bool = False
bond: int = 1024
cut_ratio: float = 1e-12
bond: int | None = 1024
cut_ratio: float | None = 1e-12
tensor_module: str = "torch"
quimb_backend: str = "torch"
dtype: str = "complex128"
torch_threads: int = 8
parallel_opts: dict | None = None
@@ -28,6 +30,7 @@ class ExpectationResult:
value: float
seconds: float
rank: int = 0
parallel_stats: list | None = None
def exact_for_observable(circuit, observable, nqubits):
@@ -54,6 +57,8 @@ def run_cpu_expectation(circuit, observable, config):
"max_bond_dimension": config.bond,
"cut_ratio": config.cut_ratio,
"tensor_module": config.tensor_module,
"quimb_backend": config.quimb_backend,
"dtype": config.dtype,
"torch_threads": config.torch_threads,
"parallel_opts": config.parallel_opts or {},
}
@@ -68,4 +73,10 @@ def run_cpu_expectation(circuit, observable, config):
elapsed = time.perf_counter() - start
rank = getattr(backend, "rank", 0)
return ExpectationResult(float(np.real(value)), elapsed, rank=rank)
stats = getattr(backend, "parallel_stats", None)
return ExpectationResult(
float(np.real(value)),
elapsed,
rank=rank,
parallel_stats=list(stats) if stats is not None else None,
)

View File

@@ -1,4 +1,16 @@
try:
from cuquantum.tensornet import contract, contract_path
except ImportError: # pragma: no cover - exercised on CPU-only installations
contract = None
contract_path = None
def _require_cuquantum():
if contract is None or contract_path is None:
raise ImportError(
"The cuQuantum MPS contraction helper requires cuquantum. "
"Install the GPU dependencies or use the CPU backend."
)
# Reference: https://github.com/NVIDIA/cuQuantum/blob/main/python/samples/cutensornet/tn_algorithms/mps_algorithms.ipynb
@@ -113,6 +125,7 @@ class MPSContractionHelper:
return self._contract(interleaved_inputs, options=options) / norm
def _contract(self, interleaved_inputs, options=None):
_require_cuquantum()
path = contract_path(*interleaved_inputs, options=options)[0]
return contract(*interleaved_inputs, options=options, optimize={"path": path})

View File

@@ -1,6 +1,19 @@
try:
import cupy as cp
from cuquantum.tensornet import contract
from cuquantum.tensornet.experimental import contract_decompose
except ImportError: # pragma: no cover - exercised on CPU-only installations
cp = None
contract = None
contract_decompose = None
def _require_cuquantum():
if cp is None or contract is None or contract_decompose is None:
raise ImportError(
"The cuQuantum MPS helpers require cupy and cuquantum. "
"Install the GPU dependencies or use the CPU backend."
)
def initial(num_qubits, dtype):
@@ -13,6 +26,7 @@ def initial(num_qubits, dtype):
Returns:
The initial MPS tensors.
"""
_require_cuquantum()
state_tensor = cp.asarray([1, 0], dtype=dtype).reshape(1, 2, 1)
mps_tensors = [state_tensor] * num_qubits
return mps_tensors
@@ -28,6 +42,7 @@ def mps_site_right_swap(mps_tensors, i, **kwargs):
Returns:
The updated MPS tensors.
"""
_require_cuquantum()
# contraction followed by QR decomposition
a, _, b = contract_decompose(
"ipj,jqk->iqj,jpk",
@@ -60,6 +75,7 @@ def apply_gate(mps_tensors, gate, qubits, **kwargs):
The updated MPS tensors.
"""
_require_cuquantum()
n_qubits = len(qubits)
if n_qubits == 1:
# single-qubit gate

View File

@@ -17,6 +17,196 @@ except ImportError:
SEARCH_METHODS = ("greedy", "kahypar", "kahypar-agglom", "spinglass")
_COTENGRA_DASK_PATCHED = False
_COTENGRA_DASK_SUBMIT_PATCHED = False
_DASK_TRIAL_DEBUG = False
def _optimizer_search_stats(opt):
scores = list(getattr(opt, "scores", ()))
finite_scores = [score for score in scores if np.isfinite(score)]
times = list(getattr(opt, "times", ()))
best = getattr(opt, "best", {}) or {}
return {
"completed_trials": len(scores),
"finite_trials": len(finite_scores),
"failed_trials": len(scores) - len(finite_scores),
"requested_trials": int(getattr(opt, "max_repeats", 0) or 0),
"trial_seconds_sum": float(sum(times)),
"best_score": float(best.get("score", float("inf"))),
"best_flops": float(best.get("flops", float("inf"))),
"best_write": float(best.get("write", float("inf"))),
"best_size": float(best.get("size", float("inf"))),
}
def _attach_search_stats(tree, opt):
try:
tree.qibotn_search_stats = _optimizer_search_stats(opt)
except Exception:
pass
return tree
def _dask_worker_slots(client):
info = client.scheduler_info(n_workers=-1)
workers = info.get("workers", {})
return workers, sum(int(w.get("nthreads", 1) or 1) for w in workers.values())
def _print_dask_worker_summary(client):
workers, slots = _dask_worker_slots(client)
by_host = {}
for worker in workers.values():
host = worker.get("host", "unknown")
by_host.setdefault(host, {"workers": 0, "threads": 0})
by_host[host]["workers"] += 1
by_host[host]["threads"] += int(worker.get("nthreads", 1) or 1)
print(
"qibotn_dask_workers "
f"workers={len(workers)} threads={slots} by_host={by_host}",
flush=True,
)
def _run_trial_with_debug(fn, *args, **kwargs):
import os
import socket
try:
from distributed import get_worker
worker = get_worker()
worker_address = worker.address
except Exception:
worker_address = "unknown"
method = kwargs.get("method", "unknown")
pid = os.getpid()
host = socket.gethostname()
print(
"qibotn_trial_start "
f"worker={worker_address} host={host} pid={pid} method={method}",
flush=True,
)
start = time.perf_counter()
try:
trial = fn(*args, **kwargs)
except Exception as exc:
elapsed = time.perf_counter() - start
print(
"qibotn_trial_error "
f"worker={worker_address} host={host} pid={pid} "
f"method={method} seconds={elapsed:.3f} error={exc!r}",
flush=True,
)
raise
elapsed = time.perf_counter() - start
print(
"qibotn_trial_done "
f"worker={worker_address} host={host} pid={pid} method={method} "
f"seconds={elapsed:.3f} score={trial.get('score', float('nan')):.6g} "
f"flops={trial.get('flops', float('nan')):.6g} "
f"size={trial.get('size', float('nan')):.6g}",
flush=True,
)
return trial
def _patch_cotengra_dask_submit(debug_trials=False):
global _COTENGRA_DASK_SUBMIT_PATCHED, _DASK_TRIAL_DEBUG
_DASK_TRIAL_DEBUG = bool(debug_trials)
if _COTENGRA_DASK_SUBMIT_PATCHED:
return
import cotengra.parallel as ctg_parallel
import cotengra.hyperoptimizers.hyper as hyper
original_submit = ctg_parallel.submit
def submit(pool, fn, *args, **kwargs):
backend = pool.__class__.__module__.split(".", 1)[0]
if _DASK_TRIAL_DEBUG and backend == "distributed":
return original_submit(
pool,
_run_trial_with_debug,
fn,
*args,
**kwargs,
)
return original_submit(pool, fn, *args, **kwargs)
ctg_parallel.submit = submit
hyper.submit = submit
_COTENGRA_DASK_SUBMIT_PATCHED = True
def _patch_cotengra_dask_as_completed():
"""Make cotengra 0.7.5 handle distributed.Future objects.
This cotengra release routes all parallel futures through
``concurrent.futures.as_completed()``, which does not accept dask
``distributed.Future`` instances. Keep cotengra's optimizer/reporting logic
intact and only swap the wait primitive when the futures are from dask.
"""
global _COTENGRA_DASK_PATCHED
if _COTENGRA_DASK_PATCHED:
return
from cotengra.hyperoptimizers.hyper import HyperOptimizer
def _get_and_report_next_future(self):
futures_map = {future: setting for setting, future in self._futures}
if not futures_map:
return {
"score": float("inf"),
"flops": float("inf"),
"write": float("inf"),
"size": float("inf"),
"time": 0.0,
}
future0 = next(iter(futures_map))
if future0.__class__.__module__.split(".", 1)[0] == "distributed":
from distributed import as_completed
deadline = getattr(self, "_qibotn_deadline", None)
timeout = None if deadline is None else max(0.0, deadline - time.time())
try:
future = next(iter(as_completed(futures_map, timeout=timeout)))
except TimeoutError:
for future in futures_map:
future.cancel()
self._futures = []
return {
"score": float("inf"),
"flops": float("inf"),
"write": float("inf"),
"size": float("inf"),
"time": 0.0,
}
else:
import concurrent.futures as _cf
future = next(_cf.as_completed(futures_map))
setting = futures_map[future]
self._futures = [(s, f) for s, f in self._futures if f is not future]
try:
trial = future.result()
except Exception:
trial = {
"score": float("inf"),
"flops": float("inf"),
"write": float("inf"),
"size": float("inf"),
"time": 0.0,
}
self._maybe_report_result(setting, trial)
return trial
HyperOptimizer._get_and_report_next_future = _get_and_report_next_future
_COTENGRA_DASK_PATCHED = True
def _search_chunk(
@@ -46,7 +236,7 @@ def _search_chunk(
**kwargs,
)
tree = tn.contraction_tree(optimize=opt, output_inds=output_inds)
return tree.combo_cost(factor=256), tree
return tree.combo_cost(factor=256), _attach_search_stats(tree, opt)
def _run_single_trial(tn_bytes, output_inds, seed, slicing_opts):
@@ -164,6 +354,8 @@ def _dask_search(
dask_address=None,
n_workers=None,
optlib=None,
debug_trials=False,
close_workers=False,
):
"""Run one centralized cotengra hyper-optimizer over a dask pool.
@@ -180,6 +372,9 @@ def _dask_search(
import cotengra as ctg
_patch_cotengra_dask_as_completed()
_patch_cotengra_dask_submit(debug_trials=debug_trials)
close_client = False
close_cluster = False
cluster = None
@@ -205,7 +400,20 @@ def _dask_search(
if optlib is not None:
kwargs["optlib"] = optlib
retire_workers = []
try:
workers, worker_slots = _dask_worker_slots(client)
if close_workers:
retire_workers = list(workers)
if debug_trials:
_print_dask_worker_summary(client)
if total_repeats < worker_slots:
print(
"qibotn_dask_underutilized "
f"requested_trials={total_repeats} worker_slots={worker_slots} "
"hint='increase --tn-search-repeats to at least worker_slots'",
flush=True,
)
opt = ctg.HyperOptimizer(
methods=SEARCH_METHODS,
max_repeats=total_repeats,
@@ -216,8 +424,31 @@ def _dask_search(
progbar=False,
**kwargs,
)
return tn.contraction_tree(optimize=opt, output_inds=output_inds)
opt._num_workers = max(1, worker_slots)
opt.pre_dispatch = max(1, min(int(total_repeats), worker_slots))
if max_time is not None:
opt._qibotn_deadline = time.time() + max_time
tree = tn.contraction_tree(optimize=opt, output_inds=output_inds)
return _attach_search_stats(tree, opt)
finally:
if close_workers and retire_workers:
try:
retired = client.retire_workers(
workers=retire_workers,
close_workers=True,
remove=True,
)
print(
"qibotn_dask_workers_closed "
f"requested={len(retire_workers)} retired={len(retired)}",
flush=True,
)
except Exception as exc:
print(
"qibotn_dask_workers_close_failed "
f"requested={len(retire_workers)} error={exc!r}",
flush=True,
)
if close_client:
client.close()
if close_cluster:
@@ -234,6 +465,8 @@ def _mpi_search(
trial_timeout=None,
search_backend="processpool",
dask_address=None,
debug_trials=False,
dask_close_workers=False,
):
comm = MPI.COMM_WORLD
rank, size = comm.Get_rank(), comm.Get_size()
@@ -258,6 +491,8 @@ def _mpi_search(
slicing_opts=slicing_opts,
dask_address=dask_address,
n_workers=n_workers,
debug_trials=debug_trials,
close_workers=dask_close_workers,
)
payload = ("ok", tree)
except Exception as exc:
@@ -296,7 +531,8 @@ def _mpi_search(
def parallel_path_search(tn, output_inds, method='processpool', total_repeats=1024,
max_time=300, n_workers=48, slicing_opts=None,
trial_timeout=None, search_backend=None,
dask_address=None):
dask_address=None, debug_trials=False,
dask_close_workers=False):
"""Parallel contraction path search.
Args:
@@ -324,6 +560,8 @@ def parallel_path_search(tn, output_inds, method='processpool', total_repeats=10
trial_timeout,
search_backend=search_backend,
dask_address=dask_address,
debug_trials=debug_trials,
dask_close_workers=dask_close_workers,
)
elif method == 'processpool':
return _processpool_search(tn, output_inds, total_repeats, n_workers, max_time, slicing_opts, trial_timeout)
@@ -336,6 +574,8 @@ def parallel_path_search(tn, output_inds, method='processpool', total_repeats=10
slicing_opts=slicing_opts,
dask_address=dask_address,
n_workers=n_workers,
debug_trials=debug_trials,
close_workers=dask_close_workers,
)
else:
raise ValueError(f"Unknown method: {method}")
@@ -431,17 +671,37 @@ def _to_numpy_vector(value, is_torch):
def _zero_vector_like(arrays):
return np.zeros(1, dtype=np.complex128)
array = arrays[0]
if type(array).__module__.startswith("torch"):
return np.zeros(1, dtype=np.complex64 if "64" in str(array.dtype) else np.complex128)
return np.zeros(1, dtype=np.asarray(array).dtype)
def contract_tree_slices(tree, arrays, slice_indices, backend=None):
def contract_tree_slices(tree, arrays, slice_indices, backend=None, implementation=None):
"""Contract a subset of cotengra slices and return their local sum."""
backend = backend or _array_backend(arrays)
is_torch = backend == "torch"
local = None
cpp_contract = None
if implementation == "cpp":
if backend != "torch":
raise ValueError("implementation='cpp' requires torch arrays.")
from qibotn.torch_contractor import contract_tree_cpp
cpp_contract = contract_tree_cpp
for slice_id in slice_indices:
if cpp_contract is not None:
value = cpp_contract(tree, tree.slice_arrays(arrays, slice_id))
elif implementation is None:
value = tree.contract_slice(arrays, slice_id, backend=backend)
else:
value = tree.contract_slice(
arrays,
slice_id,
backend=backend,
implementation=implementation,
)
value = _to_numpy_vector(value, is_torch)
local = value if local is None else local + value
@@ -455,6 +715,7 @@ def parallel_contract(
comm=None,
assignment="block",
return_stats=False,
implementation=None,
):
if method == 'mpi':
if not _HAVE_MPI or comm is None:
@@ -465,11 +726,20 @@ def parallel_contract(
comm,
assignment=assignment,
return_stats=return_stats,
implementation=implementation,
)
raise ValueError(f"Unknown method: {method}")
def _contract_mpi(tree, arrays, comm, root=0, assignment="block", return_stats=False):
def _contract_mpi(
tree,
arrays,
comm,
root=0,
assignment="block",
return_stats=False,
implementation=None,
):
rank, size = comm.Get_rank(), comm.Get_size()
backend = _array_backend(arrays)
is_torch = backend == "torch"
@@ -482,15 +752,14 @@ def _contract_mpi(tree, arrays, comm, root=0, assignment="block", return_stats=F
"appear in the output."
)
if nslices <= 1:
if rank != root:
return (None, stats) if return_stats else None
result = tree.contract(arrays, backend=backend)
result = _to_numpy_vector(result, is_torch)
return (result, stats) if return_stats else result
plan = mpi_slice_plan(nslices, rank, size, assignment=assignment)
local = contract_tree_slices(tree, arrays, plan.indices, backend=backend)
local = contract_tree_slices(
tree,
arrays,
plan.indices,
backend=backend,
implementation=implementation,
)
stats = SlicedContractStats(rank, size, nslices, len(plan.indices), assignment)
result = np.zeros_like(local) if rank == root else None

View File

@@ -0,0 +1,252 @@
"""Torch C++ contraction backend for cotengra trees.
This module compiles a restricted cotengra contraction tree into a compact
execution plan, then executes that plan in a C++ torch extension. It is an
experimental CPU path for reducing Python-level overhead between many
pairwise contractions.
"""
from __future__ import annotations
import importlib
import os
from functools import lru_cache
from pathlib import Path
from collections import defaultdict
_EXTENSION = None
_CONTRACTORS = {}
SMALL_GEMM_BATCH_FLOPS = 1_000_000
def _load_extension():
global _EXTENSION
if _EXTENSION is not None:
return _EXTENSION
from torch.utils.cpp_extension import load
source = Path(__file__).resolve().parent / "csrc" / "torch_contractor.cpp"
mklroot = os.environ.get("MKLROOT")
extra_cflags = ["-O3"]
extra_ldflags = []
extra_include_paths = []
if mklroot:
mklroot_path = Path(mklroot)
mkl_include = mklroot_path / "include"
mkl_lib = mklroot_path / "lib"
if (mkl_include / "mkl_cblas.h").exists() and (
(mkl_lib / "libmkl_rt.so").exists()
or (mkl_lib / "libmkl_rt.so.2").exists()
):
extra_cflags.append("-DQIBOTN_USE_MKL")
extra_include_paths.append(str(mkl_include))
extra_ldflags.extend([f"-L{mkl_lib}", "-lmkl_rt"])
_EXTENSION = load(
name="qibotn_torch_contractor",
sources=[str(source)],
extra_cflags=extra_cflags,
extra_ldflags=extra_ldflags,
extra_include_paths=extra_include_paths,
verbose=False,
)
return _EXTENSION
def _is_plain_permutation(expr):
if expr is None:
return None
if isinstance(expr, tuple):
return tuple(int(i) for i in expr)
if not isinstance(expr, str):
return None
if "," in expr or "->" not in expr:
return None
source, target = expr.split("->", 1)
if len(source) != len(target):
return None
if len(set(source)) != len(source) or set(source) != set(target):
return None
return tuple(source.index(ix) for ix in target)
def _maybe_tuple(values):
return () if values is None else tuple(int(x) for x in values)
def _shape_from_inds(tree, node):
return tuple(int(tree.size_dict[ix]) for ix in tree.get_inds(node))
def _matmul_signature(op):
kind = op[3]
if kind != 0:
return None
left_shape = op[5]
right_shape = op[7]
if len(left_shape) == 2 and len(right_shape) == 2:
m, k, n = left_shape[-2], left_shape[-1], right_shape[-1]
return ("mm", int(m), int(k), int(n), int(m * k * n))
return None
def _normalize_node_ids(tree, contractions):
leaf_to_id = {
frozenset((i,)): i
for i in range(tree.N)
}
next_id = len(leaf_to_id)
node_to_id = dict(leaf_to_id)
for parent, _left, _right, _tdot, _arg, _perm in contractions:
if parent not in node_to_id:
node_to_id[parent] = next_id
next_id += 1
return node_to_id, next_id
@lru_cache(maxsize=32)
def compile_torch_plan(tree):
"""Compile ``tree`` into C++ contractor plan fields.
The supported subset is the same pairwise matmul lowering used by
cotengra for torch CPU. Single-tensor diagonal/sum preprocessing is not
supported yet because it appears only in less common trees; callers should
fall back to cotengra for those cases.
"""
contract_mod = importlib.import_module("cotengra.contract")
contractions = contract_mod.extract_contractions(tree)
node_to_id, ntemps = _normalize_node_ids(tree, contractions)
plan = []
for parent, left, right, tdot, arg, perm in contractions:
if left is None or right is None:
raise NotImplementedError(
"C++ torch contractor does not support cotengra preprocessing."
)
left_shape = _shape_from_inds(tree, left)
right_shape = _shape_from_inds(tree, right)
if tdot:
parsed = contract_mod._parse_tensordot_axes_to_matmul(
arg,
left_shape,
right_shape,
)
else:
parsed = contract_mod._parse_eq_to_batch_matmul(
arg,
left_shape,
right_shape,
)
(
eq_a,
eq_b,
new_shape_a,
new_shape_b,
new_shape_ab,
perm_ab,
pure_multiplication,
) = parsed
left_perm = _is_plain_permutation(eq_a)
right_perm = _is_plain_permutation(eq_b)
if left_perm is None and eq_a is not None:
raise NotImplementedError(f"Unsupported left preparation: {eq_a!r}")
if right_perm is None and eq_b is not None:
raise NotImplementedError(f"Unsupported right preparation: {eq_b!r}")
plan.append(
(
node_to_id[parent],
node_to_id[left],
node_to_id[right],
1 if pure_multiplication else 0,
left_perm or (),
_maybe_tuple(new_shape_a),
right_perm or (),
_maybe_tuple(new_shape_b),
_maybe_tuple(new_shape_ab),
_maybe_tuple(perm_ab),
)
)
if perm is not None:
raise NotImplementedError(
"C++ torch contractor does not support cotengra tensordot perm."
)
root_id = node_to_id[tree.root]
return tuple(plan), int(ntemps), int(root_id)
@lru_cache(maxsize=32)
def compile_batch_groups(tree, max_flops=SMALL_GEMM_BATCH_FLOPS):
plan, _ntemps, _root_id = compile_torch_plan(tree)
contractions = importlib.import_module("cotengra.contract").extract_contractions(tree)
node_to_id, _ntemps = _normalize_node_ids(tree, contractions)
depth = {frozenset((i,)): 0 for i in range(tree.N)}
tensor_depth = {i: 0 for i in range(tree.N)}
groups = defaultdict(list)
for op_index, (contract_op, contraction) in enumerate(zip(plan, contractions)):
parent, left, right, _tdot, _arg, _perm = contraction
d = max(depth[left], depth[right]) + 1
depth[parent] = d
tensor_depth[contract_op[0]] = d
sig = _matmul_signature(contract_op)
if sig is None:
continue
kind, m, k, n, flops = sig
if flops > max_flops:
continue
groups[(d, kind, m, k, n)].append(op_index)
batch_groups = tuple(
tuple(items)
for _key, items in sorted(groups.items(), key=lambda item: (item[0], item[1][0]))
if len(items) >= 2
)
return batch_groups
def batch_group_summary(tree, max_flops=SMALL_GEMM_BATCH_FLOPS):
plan, _ntemps, _root_id = compile_torch_plan(tree)
groups = compile_batch_groups(tree, max_flops=max_flops)
covered = sum(len(group) for group in groups)
calls_saved = sum(len(group) - 1 for group in groups)
by_shape = []
for group in groups:
op = plan[group[0]]
sig = _matmul_signature(op)
by_shape.append((sig[1:4], len(group), group[:8]))
return {
"groups": len(groups),
"covered_ops": covered,
"calls_saved": calls_saved,
"by_shape": by_shape,
}
def contract_tree_cpp(tree, arrays):
"""Contract a cotengra tree using the experimental C++ torch contractor."""
contractor = prepare_torch_cpp_contractor(tree)
return contractor.contract(list(arrays))
def prepare_torch_cpp_contractor(tree):
"""Load the extension and compile ``tree`` without running contraction."""
ext = _load_extension()
key = id(tree)
contractor = _CONTRACTORS.get(key)
if contractor is None:
plan, ntemps, root_id = compile_torch_plan(tree)
contractor = ext.Contractor(list(plan), ntemps, root_id)
_CONTRACTORS[key] = contractor
return contractor

View File

@@ -5,7 +5,10 @@ from qibo import Circuit, gates, hamiltonians
from qibo.symbols import X, Z
from qibotn.backends.cpu import CpuTensorNet
from qibotn.benchmark_cases import build_circuit as build_benchmark_circuit
from qibotn.benchmark_cases import (
build_circuit as build_benchmark_circuit,
exact_pauli_sum,
)
def build_circuit(nqubits=6):
@@ -109,6 +112,59 @@ def test_cpu_mps_sampling_uses_nshots():
assert set(result.frequencies()) <= {"0000", "1111"}
def test_cpu_mps_mpo_expectation_matches_statevector():
circuit = build_circuit(nqubits=4)
x = np.array([[0, 1], [1, 0]], dtype=complex)
z = np.array([[1, 0], [0, -1]], dtype=complex)
i2 = np.eye(2, dtype=complex)
mpo = [
x.reshape(1, 2, 2, 1),
z.reshape(1, 2, 2, 1),
i2.reshape(1, 2, 2, 1),
i2.reshape(1, 2, 2, 1),
]
exact = exact_pauli_sum(circuit, [(1.0, (("X", 0), ("Z", 1)))], 4)
backend = CpuTensorNet(
{
"MPI_enabled": False,
"MPS_enabled": True,
"NCCL_enabled": False,
"expectation_enabled": {"mpo_tensors": mpo},
"max_bond_dimension": 64,
"tensor_module": "torch",
"torch_threads": 1,
}
)
value = backend.execute_circuit(circuit)[0]
assert math.isclose(value, exact, abs_tol=1e-12)
def test_cpu_mps_dense_observable_dict_matches_known_value():
circuit = Circuit(2)
circuit.add(gates.H(0))
circuit.add(gates.CNOT(0, 1))
bell = np.zeros((4, 4), dtype=complex)
bell[0, 0] = bell[0, 3] = bell[3, 0] = bell[3, 3] = 0.5
backend = CpuTensorNet(
{
"MPI_enabled": False,
"MPS_enabled": True,
"NCCL_enabled": False,
"expectation_enabled": {"matrix": bell, "qubits": [0, 1]},
"max_bond_dimension": 16,
"tensor_module": "torch",
"torch_threads": 1,
}
)
value = backend.execute_circuit(circuit)[0]
assert math.isclose(value, 1.0, abs_tol=1e-12)
def test_cpu_generic_tn_long_pauli_string_matches_statevector():
circuit = build_benchmark_circuit("rxx_rzz", 10, 2, 42)
observable = {"pauli_string_pattern": "XZ"}

View File

@@ -2,10 +2,18 @@ import math
import numpy as np
from qibo import Circuit, gates, hamiltonians
from qibo.symbols import X, Y, Z
from qibo.symbols import Symbol, X, Y, Z
from qibotn.backends.vidal import VidalBackend, _can_route_non_adjacent, _unsupported_reason
from qibotn.benchmark_cases import exact_pauli_sum
from qibotn.backends.vidal import (
VidalBackend,
_can_route_non_adjacent,
_unsupported_reason,
_operator_terms_to_mpo,
_symbolic_hamiltonian_to_operator_terms,
)
from qibotn.backends.vidal_tebd import (
VidalTEBDExecutor,
_route_non_adjacent_gates,
_gate_sites,
)
@@ -37,6 +45,25 @@ def test_vidal_backend_expectation_matches_statevector():
np.testing.assert_allclose(value, exact, atol=1e-12)
def test_vidal_backend_accepts_unlimited_bond_and_no_cutoff():
circuit = build_local_circuit(nqubits=6, nlayers=2)
observable = hamiltonians.SymbolicHamiltonian(
form=0.5 * X(0) * Z(1) - 0.7 * Z(5)
)
exact = observable.expectation_from_state(circuit().state(numpy=True))
backend = VidalBackend()
backend.configure_tn_simulation(
max_bond_dimension=None,
cut_ratio=None,
tensor_module="torch",
fallback=False,
)
value = backend.expectation(circuit, observable, preprocess=False)
np.testing.assert_allclose(value, exact, atol=1e-12)
def test_vidal_backend_fallback_for_non_adjacent_gate():
"""compile_circuit=False (default) → falls back to qmatchatea for non-adjacent."""
circuit = Circuit(4)
@@ -128,6 +155,208 @@ def test_routing_non_adjacent_cnot():
np.testing.assert_allclose(value, exact, atol=1e-12)
def test_routing_preserves_reversed_non_adjacent_gate_order():
circuit = Circuit(6)
circuit.add(gates.X(5))
circuit.add(gates.H(0))
circuit.add(gates.CNOT(5, 0))
observable = hamiltonians.SymbolicHamiltonian(form=X(0) + Z(5) + Z(0) * Z(5))
exact = observable.expectation_from_state(circuit().state(numpy=True))
backend = VidalBackend()
backend.configure_tn_simulation(
max_bond_dimension=64,
tensor_module="torch",
compile_circuit=True,
fallback=False,
)
value = backend.expectation(circuit, observable, preprocess=False)
np.testing.assert_allclose(value, exact, atol=1e-12)
def test_vidal_backend_preprocesses_non_adjacent_circuit():
circuit = Circuit(4)
circuit.add(gates.H(0))
circuit.add(gates.CNOT(0, 3))
observable = hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(3))
exact = observable.expectation_from_state(circuit().state(numpy=True))
backend = VidalBackend()
backend.configure_tn_simulation(
max_bond_dimension=64,
tensor_module="torch",
compile_circuit=True,
fallback=False,
)
value = backend.expectation(circuit, observable, preprocess=True)
np.testing.assert_allclose(value, exact, atol=1e-12)
def test_vidal_backend_preprocesses_toffoli_locally():
circuit = Circuit(4)
circuit.add(gates.H(0))
circuit.add(gates.H(1))
circuit.add(gates.TOFFOLI(0, 1, 3))
observable = hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(3))
exact = observable.expectation_from_state(circuit().state(numpy=True))
backend = VidalBackend()
backend.configure_tn_simulation(
max_bond_dimension=128,
tensor_module="torch",
compile_circuit=True,
fallback=False,
)
value = backend.expectation(circuit, observable, preprocess=True)
np.testing.assert_allclose(value, exact, atol=1e-12)
def test_vidal_expectation_preserves_complex_coefficients():
circuit = Circuit(1)
observable = hamiltonians.SymbolicHamiltonian(form=(1.0 + 2.0j) * Z(0))
backend = VidalBackend()
backend.configure_tn_simulation(
max_bond_dimension=8,
tensor_module="torch",
fallback=False,
)
value = backend.expectation(circuit, observable, preprocess=False)
np.testing.assert_allclose(value, 1.0 + 2.0j, atol=1e-12)
def test_vidal_expectation_supports_custom_local_symbols():
circuit = build_local_circuit(nqubits=4, nlayers=2)
a0 = Symbol(0, np.array([[0.2, 1.0], [1.0, -0.3]], dtype=complex), name="A")
b2 = Symbol(2, np.array([[0.7, -0.4j], [0.4j, 0.1]], dtype=complex), name="B")
a3 = Symbol(3, np.array([[0.5, 0.2], [0.2, -0.8]], dtype=complex), name="A")
observable = hamiltonians.SymbolicHamiltonian(form=0.7 * a0 * b2 - 0.4 * a3)
exact = observable.expectation_from_state(circuit().state(numpy=True))
backend = VidalBackend()
backend.configure_tn_simulation(
max_bond_dimension=64,
tensor_module="torch",
fallback=False,
)
value = backend.expectation(circuit, observable, preprocess=False)
np.testing.assert_allclose(value, exact, atol=1e-12)
def test_vidal_executor_mpo_expectation_matches_pauli_sum():
circuit = build_local_circuit(nqubits=4, nlayers=2)
executor = VidalTEBDExecutor(
nqubits=circuit.nqubits,
max_bond=64,
tensor_module="torch",
)
executor.run_circuit(circuit)
x = np.array([[0, 1], [1, 0]], dtype=complex)
z = np.array([[1, 0], [0, -1]], dtype=complex)
i2 = np.eye(2, dtype=complex)
mpo = [
x.reshape(1, 2, 2, 1),
z.reshape(1, 2, 2, 1),
i2.reshape(1, 2, 2, 1),
i2.reshape(1, 2, 2, 1),
]
mpo_value = executor.expectation_mpo(mpo)
pauli_value = executor.expectation_pauli_sum([(1.0, (("X", 0), ("Z", 1)))])
np.testing.assert_allclose(mpo_value, pauli_value, atol=1e-12)
def test_vidal_backend_accepts_mpo_observable_dict():
circuit = build_local_circuit(nqubits=4, nlayers=2)
x = np.array([[0, 1], [1, 0]], dtype=complex)
z = np.array([[1, 0], [0, -1]], dtype=complex)
i2 = np.eye(2, dtype=complex)
mpo = [
x.reshape(1, 2, 2, 1),
z.reshape(1, 2, 2, 1),
i2.reshape(1, 2, 2, 1),
i2.reshape(1, 2, 2, 1),
]
exact = exact_pauli_sum(circuit, [(1.0, (("X", 0), ("Z", 1)))], 4)
backend = VidalBackend()
backend.configure_tn_simulation(
max_bond_dimension=64,
tensor_module="torch",
fallback=False,
)
value = backend.expectation(circuit, {"mpo_tensors": mpo}, preprocess=False)
np.testing.assert_allclose(value, exact, atol=1e-12)
def test_vidal_symbolic_hamiltonian_auto_mpo_matches_operator_sum():
circuit = build_local_circuit(nqubits=5, nlayers=2)
observable = hamiltonians.SymbolicHamiltonian(
form=0.3 * X(0) * Z(1) - 0.2j * Y(2) + 0.7 * Z(3) * X(4)
)
executor = VidalTEBDExecutor(
nqubits=circuit.nqubits,
max_bond=64,
tensor_module="torch",
)
executor.run_circuit(circuit)
terms = _symbolic_hamiltonian_to_operator_terms(observable)
term_value = executor.expectation_operator_sum(terms)
mpo_value = executor.expectation_mpo(_operator_terms_to_mpo(terms, circuit.nqubits))
np.testing.assert_allclose(mpo_value, term_value, atol=1e-12)
def test_vidal_backend_accepts_dense_two_qubit_observable():
circuit = Circuit(2)
circuit.add(gates.H(0))
circuit.add(gates.CNOT(0, 1))
bell = np.zeros((4, 4), dtype=complex)
bell[0, 0] = bell[0, 3] = bell[3, 0] = bell[3, 3] = 0.5
observable = {"matrix": bell, "qubits": [0, 1]}
backend = VidalBackend()
backend.configure_tn_simulation(
max_bond_dimension=16,
tensor_module="torch",
fallback=False,
)
value = backend.expectation(circuit, observable, preprocess=False)
np.testing.assert_allclose(value, 1.0, atol=1e-12)
def test_vidal_backend_dense_observable_preserves_complex_value():
circuit = Circuit(2)
circuit.add(gates.H(0))
circuit.add(gates.H(1))
op = np.zeros((4, 4), dtype=complex)
op[0, 3] = 1.0
observable = {"coefficient": 1.0j, "matrix": op, "qubits": [0, 1]}
backend = VidalBackend()
backend.configure_tn_simulation(
max_bond_dimension=16,
tensor_module="torch",
fallback=False,
)
value = backend.expectation(circuit, observable, preprocess=False)
np.testing.assert_allclose(value, 0.25j, atol=1e-12)
def test_truncation_error_no_truncation():
"""With large bond, truncation error should be essentially zero."""
circuit = build_local_circuit(nqubits=6, nlayers=2)
@@ -141,6 +370,10 @@ def test_truncation_error_no_truncation():
assert backend.last_truncation_error < 1e-14, (
f"Expected near-zero truncation error, got {backend.last_truncation_error}"
)
assert backend.last_max_truncation_error < 1e-14, (
"Expected near-zero max truncation error, got "
f"{backend.last_max_truncation_error}"
)
def test_vidal_backend_matches_statevector_multiterm():

View File

@@ -19,6 +19,22 @@ from qibotn.backends.qmatchatea import QMatchaTeaBackend
from qibotn.backends.vidal_tebd import run_vidal_ring_xz
def optional_int(text):
if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
return None
return int(text)
def optional_float(text):
if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
return None
return float(text)
def format_optional(value, fmt="g"):
return "None" if value is None else format(value, fmt)
def build_circuit(nqubits, nlayers, seed):
return build_benchmark_circuit("brickwall_cnot", nqubits, nlayers, seed)
@@ -35,7 +51,8 @@ def main():
parser = argparse.ArgumentParser()
parser.add_argument("--nqubits", type=int, default=40)
parser.add_argument("--nlayers", type=int, default=30)
parser.add_argument("--bond", "--bonds", dest="bond", type=int, default=512)
parser.add_argument("--bond", "--bonds", dest="bond", type=optional_int, default=512)
parser.add_argument("--cut-ratio", type=optional_float, default=1e-12)
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--tensor-module", choices=("numpy", "torch"), default="torch")
parser.add_argument("--torch-threads", type=int, default=32)
@@ -109,7 +126,8 @@ def main():
mpi_label = f"MPIMPS/{size}" if args.mpi_ct else "SR"
print(
f"nqubits={args.nqubits} nlayers={args.nlayers} "
f"bond={args.bond} seed={args.seed} "
f"bond={format_optional(args.bond)} "
f"cut_ratio={format_optional(args.cut_ratio)} seed={args.seed} "
f"tensor_module={args.tensor_module} svd_control=E! "
f"compile_circuit=True mpi={mpi_label} executor={args.executor}"
)
@@ -128,7 +146,7 @@ def main():
value, timings = run_segment_vidal_mpi_ring_xz(
circuit,
max_bond=args.bond,
cut_ratio=1e-12,
cut_ratio=args.cut_ratio,
tensor_module=args.tensor_module,
comm=MPI.COMM_WORLD,
)
@@ -136,7 +154,7 @@ def main():
value = run_vidal_ring_xz(
circuit,
max_bond=args.bond,
cut_ratio=1e-12,
cut_ratio=args.cut_ratio,
tensor_module=args.tensor_module,
)
else:
@@ -144,7 +162,7 @@ def main():
backend.configure_tn_simulation(
ansatz="MPS",
max_bond_dimension=args.bond,
cut_ratio=1e-12,
cut_ratio=args.cut_ratio,
svd_control="E!",
tensor_module=args.tensor_module,
compile_circuit=True,

33
tools/example_tn_case.py Normal file
View File

@@ -0,0 +1,33 @@
"""Example custom case for tools/run_tn_custom.py."""
from __future__ import annotations
import math
import numpy as np
from qibo import Circuit, gates
def build_circuit(nqubits, nlayers, seed):
rng = np.random.default_rng(seed)
circuit = Circuit(nqubits)
for layer in range(nlayers):
for qubit in range(nqubits):
circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi)))
circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi)))
for qubit in range(layer % 2, nqubits - 1, 2):
circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.7, 0.7)))
circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.7, 0.7)))
return circuit
def build_observable(nqubits, seed):
return {
"terms": [
{
"coefficient": 1.0 / max(1, nqubits - 1),
"operators": [("Z", site), ("Z", site + 1)],
}
for site in range(nqubits - 1)
]
}

View File

@@ -0,0 +1,208 @@
"""Inspect cotengra contraction trees for dominant torch matmul shapes."""
from __future__ import annotations
import argparse
import importlib
import math
import pickle
from collections import Counter, defaultdict
from pathlib import Path
def _prod(values):
out = 1
for value in values:
out *= int(value)
return out
def _broadcast_batch(a_batch, b_batch):
if a_batch == b_batch:
return _prod(a_batch)
if not a_batch:
return _prod(b_batch)
if not b_batch:
return _prod(a_batch)
ndim = max(len(a_batch), len(b_batch))
a_batch = (1,) * (ndim - len(a_batch)) + tuple(a_batch)
b_batch = (1,) * (ndim - len(b_batch)) + tuple(b_batch)
return _prod(max(a, b) for a, b in zip(a_batch, b_batch))
def _load_tree(path, index):
with Path(path).open("rb") as f:
payload = pickle.load(f)
trees = payload["trees"] if isinstance(payload, dict) else payload
if not isinstance(trees, (list, tuple)):
trees = [trees]
return trees[index]
def _analyze_tree(tree):
contract_mod = importlib.import_module("cotengra.contract")
contractions = contract_mod.extract_contractions(tree)
size_dict = tree.size_dict
ops = []
counts = Counter()
for op_index, (parent, left, right, tdot, arg, perm) in enumerate(contractions):
if left is None and right is None:
counts["preprocess"] += 1
continue
left_inds = tree.get_inds(left)
right_inds = tree.get_inds(right)
parent_inds = tree.get_inds(parent)
left_shape = tuple(size_dict[ix] for ix in left_inds)
right_shape = tuple(size_dict[ix] for ix in right_inds)
if tdot:
parsed = contract_mod._parse_tensordot_axes_to_matmul(
arg,
left_shape,
right_shape,
)
else:
parsed = contract_mod._parse_eq_to_batch_matmul(
arg,
left_shape,
right_shape,
)
(
_eq_a,
_eq_b,
new_shape_a,
new_shape_b,
_new_shape_ab,
_perm_ab,
pure_multiplication,
) = parsed
matmul_shape = None
matmul_flops = 0
if pure_multiplication:
kind = "mul"
else:
a_shape = tuple(new_shape_a or left_shape)
b_shape = tuple(new_shape_b or right_shape)
batch = _broadcast_batch(a_shape[:-2], b_shape[:-2])
m, k, n = int(a_shape[-2]), int(a_shape[-1]), int(b_shape[-1])
kind = "mm" if batch == 1 else "bmm"
matmul_shape = (batch, m, k, n)
matmul_flops = batch * m * k * n
tree_flops = int(tree.get_flops(parent))
out_size = int(tree.get_size(parent))
ops.append(
{
"index": op_index,
"kind": kind,
"matmul_shape": matmul_shape,
"matmul_flops": matmul_flops,
"tree_flops": tree_flops,
"out_size": out_size,
"left_shape": left_shape,
"right_shape": right_shape,
"left_rank": len(left_inds),
"right_rank": len(right_inds),
"out_rank": len(parent_inds),
"perm": perm,
}
)
counts[kind] += 1
return contractions, ops, counts
def _format_log(value, base):
return "-inf" if value <= 0 else f"{math.log(value, base):.3f}"
def main():
parser = argparse.ArgumentParser()
parser.add_argument("tree", help="Pickle file containing one tree or {'trees': [...]}.")
parser.add_argument("--index", type=int, default=0, help="Tree index in the file.")
parser.add_argument("--top", type=int, default=20, help="Number of top ops to print.")
parser.add_argument(
"--dtype-bytes",
type=int,
default=8,
help="Bytes per element for memory estimates, for example 8 for complex64.",
)
args = parser.parse_args()
tree = _load_tree(args.tree, args.index)
contractions, ops, counts = _analyze_tree(tree)
nslices = int(getattr(tree, "multiplicity", 1))
per_slice_flops = sum(op["tree_flops"] for op in ops)
per_slice_write = sum(op["out_size"] for op in ops)
max_out = max((op["out_size"] for op in ops), default=0)
all_flops = per_slice_flops * nslices
all_write = per_slice_write * nslices
print(f"tree={args.tree} index={args.index}")
print(
"summary "
f"slices={nslices} contractions={len(contractions)} "
f"counts={dict(counts)}"
)
print(
"per_slice "
f"log10_flops={_format_log(per_slice_flops, 10)} "
f"log10_write={_format_log(per_slice_write, 10)} "
f"log2_max_output={_format_log(max_out, 2)} "
f"max_output_gib={max_out * args.dtype_bytes / 1024**3:.6g}"
)
print(
"all_slices "
f"log10_flops={_format_log(all_flops, 10)} "
f"log10_write={_format_log(all_write, 10)}"
)
print(f"\ntop_{args.top}_ops_by_flops")
for op in sorted(ops, key=lambda item: item["tree_flops"], reverse=True)[: args.top]:
print(
f"op={op['index']} kind={op['kind']} "
f"flops={op['tree_flops']:.6e} out={op['out_size']:.6e} "
f"matmul={op['matmul_shape']} "
f"ranks=({op['left_rank']},{op['right_rank']}->{op['out_rank']}) "
f"lhs={op['left_shape']} rhs={op['right_shape']}"
)
by_shape = defaultdict(lambda: [0, 0, 0])
for op in ops:
shape = op["matmul_shape"]
if shape is None:
continue
by_shape[shape][0] += 1
by_shape[shape][1] += op["tree_flops"]
by_shape[shape][2] += op["out_size"]
print(f"\ntop_{args.top}_matmul_shapes_by_flops")
for shape, (count, flops, out_size) in sorted(
by_shape.items(),
key=lambda item: item[1][1],
reverse=True,
)[: args.top]:
print(
f"shape={shape} count={count} "
f"flops={flops:.6e} output={out_size:.6e}"
)
print(f"\ntop_{args.top}_matmul_shapes_by_count")
for shape, (count, flops, out_size) in sorted(
by_shape.items(),
key=lambda item: item[1][0],
reverse=True,
)[: args.top]:
print(
f"shape={shape} count={count} "
f"flops={flops:.6e} output={out_size:.6e}"
)
if __name__ == "__main__":
main()

223
tools/manage_tn_dask_cluster.sh Executable file
View File

@@ -0,0 +1,223 @@
#!/usr/bin/env bash
set -euo pipefail
# Manage the dask cluster used by TN path search.
#
# Defaults target two servers:
# scheduler: 10.20.1.103:8786
# workers: 10.20.1.103, 10.20.1.102
#
# Usage:
# tools/manage_tn_dask_cluster.sh start
# tools/manage_tn_dask_cluster.sh status
# tools/manage_tn_dask_cluster.sh stop
#
# Common overrides:
# SCHEDULER_HOST=10.20.1.103
# WORKER_HOSTS="10.20.1.103 10.20.1.102"
# NWORKERS=48
# NTHREADS=1
# ROOT_DIR=/home/yx/qibotn
# PYTHON_BIN=.venv/bin/python
ROOT_DIR="${ROOT_DIR:-/home/yx/qibotn}"
PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
SCHEDULER_HOST="${SCHEDULER_HOST:-10.20.1.103}"
SCHEDULER_PORT="${SCHEDULER_PORT:-8786}"
DASHBOARD_ADDRESS="${DASHBOARD_ADDRESS:-:8787}"
WORKER_HOSTS="${WORKER_HOSTS:-10.20.1.103 10.20.1.102}"
NWORKERS="${NWORKERS:-48}"
NTHREADS="${NTHREADS:-1}"
MEMORY_LIMIT="${MEMORY_LIMIT:-0}"
LOCAL_DIRECTORY="${LOCAL_DIRECTORY:-/tmp/qibotn-dask}"
LOG_DIR="${LOG_DIR:-$ROOT_DIR/logs/dask}"
SSH_BIN="${SSH_BIN:-ssh}"
DASK_WORKER_TTL="${DASK_WORKER_TTL:-24 hours}"
DASK_TICK_LIMIT="${DASK_TICK_LIMIT:-30 minutes}"
DASK_LOST_WORKER_TIMEOUT="${DASK_LOST_WORKER_TIMEOUT:-30 minutes}"
SCHEDULER_ADDR="tcp://${SCHEDULER_HOST}:${SCHEDULER_PORT}"
is_local_host() {
local host="$1"
[[ "$host" == "localhost" || "$host" == "127.0.0.1" ]] && return 0
[[ "$host" == "$(hostname)" ]] && return 0
[[ "$host" == "$(hostname -f 2>/dev/null || true)" ]] && return 0
hostname -I 2>/dev/null | tr ' ' '\n' | grep -qx "$host"
}
run_on_host() {
local host="$1"
shift
local cmd="$*"
if is_local_host "$host"; then
bash -lc "$cmd"
else
"$SSH_BIN" "$host" "bash -lc $(printf '%q' "$cmd")"
fi
}
start_scheduler() {
local host="$SCHEDULER_HOST"
local log="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.log"
local pid_file="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.pid"
run_on_host "$host" "
set -euo pipefail
cd '$ROOT_DIR'
mkdir -p '$LOG_DIR'
if [[ -s '$pid_file' ]]; then
pid=\$(cat '$pid_file')
if kill -0 \"\$pid\" 2>/dev/null; then
echo \"scheduler already running on $host pid=\$pid\"
exit 0
fi
fi
DASK_DISTRIBUTED__SCHEDULER__WORKER_TTL='$DASK_WORKER_TTL' \
DASK_DISTRIBUTED__ADMIN__TICK__LIMIT='$DASK_TICK_LIMIT' \
DASK_DISTRIBUTED__DEPLOY__LOST_WORKER_TIMEOUT='$DASK_LOST_WORKER_TIMEOUT' \
setsid '$PYTHON_BIN' -m distributed.cli.dask_scheduler \
--host '$SCHEDULER_HOST' \
--port '$SCHEDULER_PORT' \
--dashboard-address '$DASHBOARD_ADDRESS' \
> '$log' 2>&1 < /dev/null &
pid=\$!
echo \"\$pid\" > '$pid_file'
echo \"scheduler host=$host pid=\$pid addr=$SCHEDULER_ADDR log=$log\"
"
}
start_worker() {
local host="$1"
local log="$LOG_DIR/worker_${host}.log"
local pid_file="$LOG_DIR/worker_${host}.pid"
run_on_host "$host" "
set -euo pipefail
cd '$ROOT_DIR'
mkdir -p '$LOG_DIR' '$LOCAL_DIRECTORY'
if [[ -s '$pid_file' ]]; then
pid=\$(cat '$pid_file')
if kill -0 \"\$pid\" 2>/dev/null; then
echo \"worker already running on $host pid=\$pid\"
exit 0
fi
fi
TCM_ENABLE=1 \
DASK_DISTRIBUTED__SCHEDULER__WORKER_TTL='$DASK_WORKER_TTL' \
DASK_DISTRIBUTED__ADMIN__TICK__LIMIT='$DASK_TICK_LIMIT' \
DASK_DISTRIBUTED__DEPLOY__LOST_WORKER_TIMEOUT='$DASK_LOST_WORKER_TIMEOUT' \
setsid '$PYTHON_BIN' -m distributed.cli.dask_worker \
'$SCHEDULER_ADDR' \
--host '$host' \
--nworkers '$NWORKERS' \
--nthreads '$NTHREADS' \
--memory-limit '$MEMORY_LIMIT' \
--local-directory '$LOCAL_DIRECTORY' \
> '$log' 2>&1 < /dev/null &
pid=\$!
echo \"\$pid\" > '$pid_file'
echo \"worker host=$host pid=\$pid scheduler=$SCHEDULER_ADDR log=$log\"
"
}
stop_host() {
local host="$1"
local scheduler_pid_file="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.pid"
local worker_pid_file="$LOG_DIR/worker_${host}.pid"
run_on_host "$host" "
set +e
for pid_file in '$worker_pid_file' '$scheduler_pid_file'; do
[[ -f \"\$pid_file\" ]] || continue
if [[ \"\$pid_file\" == '$scheduler_pid_file' && '$host' != '$SCHEDULER_HOST' ]]; then
continue
fi
pid=\$(cat \"\$pid_file\")
kill \"\$pid\" 2>/dev/null || true
rm -f \"\$pid_file\"
done
pkill -f '[d]istributed.cli.dask_worker.*$SCHEDULER_ADDR'
pkill -f '[d]istributed.cli.dask_scheduler.*--port $SCHEDULER_PORT'
true
"
}
status_host() {
local host="$1"
local scheduler_pid_file="$LOG_DIR/scheduler_${SCHEDULER_HOST}_${SCHEDULER_PORT}.pid"
local worker_pid_file="$LOG_DIR/worker_${host}.pid"
echo "--------------------------------------------------------------------------------"
echo "host=$host"
run_on_host "$host" "
set +e
for pid_file in '$worker_pid_file' '$scheduler_pid_file'; do
[[ -f \"\$pid_file\" ]] || continue
if [[ \"\$pid_file\" == '$scheduler_pid_file' && '$host' != '$SCHEDULER_HOST' ]]; then
continue
fi
pid=\$(cat \"\$pid_file\")
if kill -0 \"\$pid\" 2>/dev/null; then
ps -p \"\$pid\" -o pid,ppid,stat,etime,cmd --no-headers
else
echo \"stale pid_file=\$pid_file pid=\$pid\"
fi
done
pgrep -af '[d]istributed.cli.dask' || true
"
}
case "${1:-help}" in
start)
start_scheduler
sleep 2
for host in $WORKER_HOSTS; do
start_worker "$host"
done
echo
echo "Dask scheduler: $SCHEDULER_ADDR"
echo "Dashboard: http://$SCHEDULER_HOST$DASHBOARD_ADDRESS"
;;
stop)
for host in $WORKER_HOSTS; do
stop_host "$host"
done
stop_host "$SCHEDULER_HOST"
;;
status)
status_host "$SCHEDULER_HOST"
for host in $WORKER_HOSTS; do
[[ "$host" == "$SCHEDULER_HOST" ]] && continue
status_host "$host"
done
;;
restart)
"$0" stop
sleep 2
"$0" start
;;
help|*)
cat <<EOF
Usage: tools/manage_tn_dask_cluster.sh [start|stop|restart|status]
Defaults:
SCHEDULER_HOST=$SCHEDULER_HOST
SCHEDULER_PORT=$SCHEDULER_PORT
WORKER_HOSTS="$WORKER_HOSTS"
NWORKERS=$NWORKERS
NTHREADS=$NTHREADS
ROOT_DIR=$ROOT_DIR
PYTHON_BIN=$PYTHON_BIN
DASK_WORKER_TTL="$DASK_WORKER_TTL"
DASK_TICK_LIMIT=$DASK_TICK_LIMIT
DASK_LOST_WORKER_TIMEOUT=$DASK_LOST_WORKER_TIMEOUT
Search command after start:
TCM_ENABLE=1 python -u tools/tn_contest_runner.py search \\
--case main1 \\
--dask-address $SCHEDULER_ADDR \\
--torch-threads 48 \\
--dtype complex64 \\
--tn-search-repeats 2048 \\
--tn-search-time 300
EOF
exit 2
;;
esac

313
tools/mps_contest_runner.py Normal file
View File

@@ -0,0 +1,313 @@
#!/usr/bin/env python
"""Contest-style multi-node Vidal/MPS expectation runner."""
from __future__ import annotations
import argparse
import math
import sys
import time
from dataclasses import dataclass
from pathlib import Path
import numpy as np
from mpi4py import MPI
from qibo import Circuit, gates, hamiltonians
from qibo.symbols import X, Y, Z
ROOT = Path(__file__).resolve().parents[1]
SRC = ROOT / "src"
if str(SRC) not in sys.path:
sys.path.insert(0, str(SRC))
from qibotn.backends.vidal import VidalBackend # noqa: E402
from qibotn.expectation_runner import exact_for_observable # noqa: E402
@dataclass(frozen=True)
class CaseSpec:
circuit_kind: str
observables: tuple[str, ...]
nqubits: int
nlayers: int
bond: int | None
seed: int
CASES = {
"main1": CaseSpec(
circuit_kind="reversed_cnot",
observables=("ring_xz",),
nqubits=128,
nlayers=24,
bond=512,
seed=31001,
),
"main2": CaseSpec(
circuit_kind="rxx_rzz",
observables=("open_zz", "range2_xx", "mixed_local"),
nqubits=128,
nlayers=32,
bond=1024,
seed=31002,
),
"strong": CaseSpec(
circuit_kind="scramble",
observables=("ring_xz", "long_z_string", "dense3_spread"),
nqubits=256,
nlayers=48,
bond=2048,
seed=41001,
),
}
def optional_int(text):
if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
return None
return int(text)
def optional_float(text):
if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
return None
return float(text)
def format_optional(value, fmt="g"):
return "None" if value is None else format(value, fmt)
def set_torch_threads(nthreads):
try:
import torch
torch.set_num_threads(nthreads)
except Exception:
pass
def add_single_qubit_layer(circuit, nqubits, rng, include_rx=False):
for qubit in range(nqubits):
circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi)))
circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi)))
if include_rx:
circuit.add(gates.RX(qubit, theta=rng.uniform(-math.pi, math.pi)))
def build_circuit(kind, nqubits, nlayers, seed):
rng = np.random.default_rng(seed)
circuit = Circuit(nqubits)
for layer in range(nlayers):
if kind == "reversed_cnot":
add_single_qubit_layer(circuit, nqubits, rng)
for qubit in range(0, nqubits - 1, 2):
gate = gates.CNOT(qubit + 1, qubit) if layer % 2 else gates.CNOT(qubit, qubit + 1)
circuit.add(gate)
for qubit in range(1, nqubits - 1, 2):
gate = gates.CNOT(qubit + 1, qubit) if layer % 2 == 0 else gates.CNOT(qubit, qubit + 1)
circuit.add(gate)
elif kind == "rxx_rzz":
add_single_qubit_layer(circuit, nqubits, rng, include_rx=True)
for qubit in range(layer % 2, nqubits - 1, 2):
circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.9, 0.9)))
circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.9, 0.9)))
elif kind == "scramble":
add_single_qubit_layer(circuit, nqubits, rng, include_rx=True)
for qubit in range(layer % 2, nqubits - 1, 2):
circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.8, 0.8)))
circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.8, 0.8)))
if layer % 5 == 4:
circuit.add(gates.SWAP(qubit, qubit + 1))
else:
raise ValueError(f"Unknown circuit kind {kind!r}.")
return circuit
def dense_observable(nqubits, qubits, seed, dim):
del nqubits
rng = np.random.default_rng(seed)
raw = rng.normal(size=(dim, dim)) + 1j * rng.normal(size=(dim, dim))
matrix = (raw + raw.conj().T) / 2.0
matrix = matrix / np.linalg.norm(matrix)
return {"matrix": matrix, "qubits": list(qubits)}
def observable(kind, nqubits, seed):
q1 = nqubits // 4
q2 = nqubits // 2
q3 = (3 * nqubits) // 4
last = nqubits - 1
if kind == "boundary_ZZ_q1":
return hamiltonians.SymbolicHamiltonian(form=Z(q1 - 1) * Z(q1))
if kind == "boundary_ZZ_q2":
return hamiltonians.SymbolicHamiltonian(form=Z(q2 - 1) * Z(q2))
if kind == "boundary_ZZ_q3":
return hamiltonians.SymbolicHamiltonian(form=Z(q3 - 1) * Z(q3))
if kind == "long_Z_5_sites":
return hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(q1) * Z(q2) * Z(q3) * Z(last))
if kind == "mixed_XZYZX":
return hamiltonians.SymbolicHamiltonian(form=X(0) * Z(q1) * Y(q2) * Z(q3) * X(last))
if kind == "ring_xz":
form = 0
for qubit in range(nqubits):
form += 0.5 * X(qubit) * Z((qubit + 1) % nqubits)
return hamiltonians.SymbolicHamiltonian(form=form)
if kind == "open_zz":
form = 0
for qubit in range(nqubits - 1):
form += (1.0 / max(1, nqubits - 1)) * Z(qubit) * Z(qubit + 1)
return hamiltonians.SymbolicHamiltonian(form=form)
if kind == "range2_xx":
form = 0
for qubit in range(nqubits - 2):
form += (1.0 / max(1, nqubits - 2)) * X(qubit) * X(qubit + 2)
return hamiltonians.SymbolicHamiltonian(form=form)
if kind == "mixed_local":
form = 0.25 * X(0) - 0.5 * Z(last) + 0.125 * X(q1) * Z(q2) * Y(q3)
return hamiltonians.SymbolicHamiltonian(form=form)
if kind == "complex_iZ0":
return hamiltonians.SymbolicHamiltonian(form=1.0j * Z(0))
if kind == "dense2_mid":
return dense_observable(nqubits, (q2 - 1, q2), seed + 101, 4)
if kind == "dense3_spread":
return dense_observable(nqubits, (q1, q2, q3), seed + 202, 8)
raise ValueError(f"Unknown observable kind {kind!r}.")
def selected_observables(args, case):
if args.observables:
return tuple(args.observables)
if args.obs_filter:
return tuple(x.strip() for x in args.obs_filter.split(",") if x.strip())
return case.observables
def apply_case_defaults(args):
case = CASES[args.case]
if args.nqubits is None:
args.nqubits = case.nqubits
if args.nlayers is None:
args.nlayers = case.nlayers
if args.bond == "case-default":
args.bond = case.bond
if args.seed is None:
args.seed = case.seed
args.observables = selected_observables(args, case)
def run_case(args):
set_torch_threads(args.torch_threads)
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
case = CASES[args.case]
circuit = build_circuit(case.circuit_kind, args.nqubits, args.nlayers, args.seed)
if rank == 0:
print("=" * 88, flush=True)
print(
"backend=vidal_mps "
f"case={args.case} circuit={case.circuit_kind} ranks={size} "
f"nqubits={args.nqubits} nlayers={args.nlayers} gates={len(circuit.queue)} "
f"bond={format_optional(args.bond)} cut_ratio={format_optional(args.cut_ratio)} "
f"torch_threads={args.torch_threads} seed={args.seed} "
f"observables={','.join(args.observables)}",
flush=True,
)
print("observable exact value abs_error rel_error seconds trunc_sum trunc_max status", flush=True)
for obs_name in args.observables:
obs = observable(obs_name, args.nqubits, args.seed)
exact = None
if args.exact and rank == 0:
if args.nqubits > args.exact_max_qubits:
raise ValueError(
f"--exact is limited to {args.exact_max_qubits} qubits by default."
)
exact = exact_for_observable(circuit, obs, args.nqubits)
backend = VidalBackend()
backend.configure_tn_simulation(
max_bond_dimension=args.bond,
cut_ratio=args.cut_ratio,
tensor_module="torch",
mpi_approach="CT",
mpi_num_procs=size,
fallback=False,
)
comm.Barrier()
start = time.perf_counter()
try:
value = backend.expectation(
circuit,
obs,
preprocess=True,
compile_circuit=False,
)
status = "ok"
except Exception as exc:
value = np.nan
status = type(exc).__name__ + ":" + str(exc).split("\n", 1)[0]
seconds = time.perf_counter() - start
if rank == 0:
abs_error = float("nan") if exact is None else abs(value - exact)
rel_error = float("nan") if exact is None else abs_error / max(abs(exact), 1e-15)
exact_text = "nan" if exact is None else f"{exact:.16e}"
print(
f"{obs_name} {exact_text} {value!r} "
f"{abs_error:.6e} {rel_error:.6e} {seconds:.3f} "
f"{backend.last_truncation_error:.6e} "
f"{backend.last_max_truncation_error:.6e} {status}",
flush=True,
)
def main():
parser = argparse.ArgumentParser()
parser.add_argument("mode", choices=("run", "validate", "list"))
parser.add_argument("--case", choices=sorted(CASES), default="main1")
parser.add_argument("--observables", nargs="+")
parser.add_argument("--obs-filter", default="")
parser.add_argument("--nqubits", type=int)
parser.add_argument("--nlayers", type=int)
parser.add_argument("--bond", "--bonds", dest="bond", default="case-default")
parser.add_argument("--cut-ratio", type=optional_float, default=1e-12)
parser.add_argument("--seed", type=int)
parser.add_argument("--torch-threads", type=int, default=8)
parser.add_argument("--exact", action="store_true")
parser.add_argument("--exact-max-qubits", type=int, default=24)
args = parser.parse_args()
if args.mode == "list":
for name, case in CASES.items():
print(
f"{name}: circuit={case.circuit_kind} "
f"observables={','.join(case.observables)} "
f"nqubits={case.nqubits} nlayers={case.nlayers} "
f"bond={case.bond} seed={case.seed}"
)
return
apply_case_defaults(args)
if isinstance(args.bond, str):
args.bond = optional_int(args.bond)
if args.mode == "validate":
args.exact = True
args.nqubits = min(args.nqubits, args.exact_max_qubits)
run_case(args)
if __name__ == "__main__":
main()

243
tools/run_tn_custom.py Normal file
View File

@@ -0,0 +1,243 @@
#!/usr/bin/env python
"""Run TN expectation for a user-provided circuit and observable.
The case module should define:
def build_circuit(nqubits, nlayers, seed): ...
def build_observable(nqubits, seed): ...
``build_observable`` may return a Qibo SymbolicHamiltonian/form or the qibotn
dict form:
{"terms": [
{"coefficient": 1.0, "operators": [("X", 0), ("Z", 1)]},
]}
For a single repeated Pauli string, pass ``--pauli-pattern`` instead of
defining ``build_observable``.
"""
from __future__ import annotations
import argparse
import importlib.util
import inspect
import json
import sys
from pathlib import Path
ROOT = Path(__file__).resolve().parents[1]
SRC = ROOT / "src"
if str(SRC) not in sys.path:
sys.path.insert(0, str(SRC))
from qibotn.expectation_runner import ( # noqa: E402
ExpectationConfig,
exact_for_observable,
run_cpu_expectation,
)
def optional_int(text):
if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
return None
return int(text)
def optional_float(text):
if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
return None
return float(text)
def load_module(path):
path = Path(path).resolve()
spec = importlib.util.spec_from_file_location(path.stem, path)
if spec is None or spec.loader is None:
raise RuntimeError(f"Cannot import case module from {path}.")
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
return module
def call_builder(fn, **kwargs):
sig = inspect.signature(fn)
if any(p.kind == p.VAR_KEYWORD for p in sig.parameters.values()):
return fn(**kwargs)
accepted = {
name: value
for name, value in kwargs.items()
if name in sig.parameters
}
return fn(**accepted)
def load_observable(args, module):
if args.pauli_pattern:
return {"pauli_string_pattern": args.pauli_pattern}
if args.observable_json:
with Path(args.observable_json).open() as f:
return json.load(f)
if hasattr(module, "build_observable"):
return call_builder(
module.build_observable,
nqubits=args.nqubits,
nlayers=args.nlayers,
seed=args.seed,
)
if hasattr(module, "OBSERVABLE"):
return module.OBSERVABLE
raise ValueError(
"No observable supplied. Define build_observable/OBSERVABLE in the case "
"module, or pass --pauli-pattern / --observable-json."
)
def build_parallel_opts(args):
slicing_opts = {}
if args.tn_target_slices is not None:
slicing_opts["target_slices"] = args.tn_target_slices
if args.tn_target_size is not None:
slicing_opts["target_size"] = args.tn_target_size
opts = {
"slicing_opts": slicing_opts or None,
"search_workers": args.tn_search_workers or args.torch_threads,
"max_repeats": args.tn_search_repeats,
"max_time": args.tn_search_time,
"print_stats": not args.no_tn_stats,
}
if args.tn_search_backend is not None:
opts["search_backend"] = args.tn_search_backend
if args.dask_address is not None:
opts["dask_address"] = args.dask_address
if args.dask_close_workers:
opts["dask_close_workers"] = True
if args.tn_save_tree is not None:
opts["save_tree_path"] = args.tn_save_tree
if args.tn_load_tree is not None:
opts["load_tree_path"] = args.tn_load_tree
if args.tn_search_only:
opts["search_only"] = True
return opts
def main():
parser = argparse.ArgumentParser(
description="Run CPU TN expectation for a custom qibo circuit module."
)
parser.add_argument("case_module", help="Python file defining build_circuit.")
parser.add_argument("--nqubits", type=int, required=True)
parser.add_argument("--nlayers", type=int, default=0)
parser.add_argument("--seed", type=int, default=42)
parser.add_argument("--mpi", action="store_true")
parser.add_argument("--exact", action="store_true")
parser.add_argument("--exact-max-qubits", type=int, default=24)
parser.add_argument("--bond", "--bonds", dest="bond", type=optional_int, default=1024)
parser.add_argument("--cut-ratio", type=optional_float, default=1e-12)
parser.add_argument("--torch-threads", type=int, default=8)
parser.add_argument("--quimb-backend", choices=("numpy", "torch"), default="torch")
parser.add_argument("--dtype", choices=("complex128", "complex64"), default="complex128")
parser.add_argument("--pauli-pattern")
parser.add_argument("--observable-json")
parser.add_argument("--tn-target-slices", type=int)
parser.add_argument("--tn-target-size", type=int, default=2**32)
parser.add_argument("--tn-search-workers", type=int)
parser.add_argument("--tn-search-repeats", type=int, default=128)
parser.add_argument("--tn-search-time", type=float, default=60.0)
parser.add_argument("--tn-search-backend", choices=("processpool", "dask"))
parser.add_argument("--dask-address")
parser.add_argument("--dask-close-workers", action="store_true")
parser.add_argument("--tn-save-tree")
parser.add_argument("--tn-load-tree")
parser.add_argument("--tn-search-only", action="store_true")
parser.add_argument("--no-tn-stats", action="store_true")
args = parser.parse_args()
rank = 0
if args.mpi:
from mpi4py import MPI
rank = MPI.COMM_WORLD.Get_rank()
module = load_module(args.case_module)
if not hasattr(module, "build_circuit"):
raise ValueError("case_module must define build_circuit.")
circuit = call_builder(
module.build_circuit,
nqubits=args.nqubits,
nlayers=args.nlayers,
seed=args.seed,
)
observable = load_observable(args, module)
config = ExpectationConfig(
ansatz="tn",
mpi=args.mpi,
bond=args.bond,
cut_ratio=args.cut_ratio,
tensor_module="torch",
quimb_backend=args.quimb_backend,
dtype=args.dtype,
torch_threads=args.torch_threads,
parallel_opts=build_parallel_opts(args),
)
if rank == 0:
mode = "MPI" if args.mpi else "serial"
print(
f"backend=cpu ansatz=TN mode={mode} case={Path(args.case_module).name} "
f"nqubits={args.nqubits} nlayers={args.nlayers} seed={args.seed} "
f"quimb_backend={args.quimb_backend} dtype={args.dtype} "
f"torch_threads={args.torch_threads}",
flush=True,
)
print("observable exact value abs_error rel_error seconds", flush=True)
exact = None
if args.exact and rank == 0:
if args.nqubits > args.exact_max_qubits:
raise ValueError(
f"--exact is limited to {args.exact_max_qubits} qubits by default."
)
exact = exact_for_observable(circuit, observable, args.nqubits)
result = run_cpu_expectation(circuit, observable, config)
if args.mpi and result.rank != 0:
return
abs_error = float("nan") if exact is None else abs(result.value - exact)
rel_error = float("nan") if exact is None else abs_error / max(abs(exact), 1e-15)
exact_text = "nan" if exact is None else f"{exact:.16e}"
print(
f"custom {exact_text} {result.value:.16e} "
f"{abs_error:.6e} {rel_error:.6e} {result.seconds:.3f}",
flush=True,
)
for stat in result.parallel_stats or ():
cost = stat["path_cost"]
search_stats = stat.get("search_stats", {})
print(
"tn_term_summary "
f"term={stat.get('term_index', 0)} "
f"search_seconds={stat.get('search_seconds', float('nan')):.3f} "
f"contract_seconds={stat.get('contract_seconds', float('nan')):.3f} "
f"completed_trials={search_stats.get('completed_trials', 'na')} "
f"finite_trials={search_stats.get('finite_trials', 'na')} "
f"failed_trials={search_stats.get('failed_trials', 'na')} "
f"requested_trials={search_stats.get('requested_trials', 'na')} "
f"best_score={search_stats.get('best_score', float('nan')):.6g} "
f"slices={cost.get('slices')} "
f"log10_flops={cost.get('log10_flops', float('nan')):.3f} "
f"log10_write={cost.get('log10_write', float('nan')):.3f} "
f"log2_size={cost.get('log2_size', float('nan')):.3f} "
f"peak_memory_gib={cost.get('peak_memory_gib', float('nan')):.3g} "
f"rank_slices={stat.get('rank_slices')}",
flush=True,
)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,340 @@
#!/usr/bin/env bash
set -euo pipefail
# Contest-style Vidal/MPI MPS cases.
#
# Usage:
# tools/run_vidal_mpi_contest_cases.sh main1
# tools/run_vidal_mpi_contest_cases.sh main2
# tools/run_vidal_mpi_contest_cases.sh strong
# tools/run_vidal_mpi_contest_cases.sh all
#
# Common overrides:
# PYTHON_BIN=.venv/bin/python
# MPIEXEC=mpiexec
# MPIEXEC_FULL="mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2"
# HOSTFILE=hostfile # optional; used only if the file exists
# RANKS=8
# TORCH_THREADS=8
# CUT_RATIO=1e-12
# OBS_FILTER="boundary_ZZ_q2 ring_xz dense3_spread complex_iZ0"
#
# Per-case overrides:
# MAIN1_NQ=128 MAIN1_LAYERS=50 MAIN1_BOND=1024 MAIN1_SEED=31001
# MAIN2_NQ=128 MAIN2_LAYERS=64 MAIN2_BOND=2048 MAIN2_SEED=31002
# STRONG_NQ=256 STRONG_LAYERS=64 STRONG_BOND=2048 STRONG_SEED=41001
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT_DIR"
PYTHON_BIN="${PYTHON_BIN:-.venv/bin/python}"
MPIEXEC="${MPIEXEC:-mpiexec}"
HOSTFILE="${HOSTFILE:-}"
RANKS="${RANKS:-4}"
TORCH_THREADS="${TORCH_THREADS:-1}"
CUT_RATIO="${CUT_RATIO:-1e-12}"
OBS_FILTER="${OBS_FILTER:-}"
RUNNER_DIR="$ROOT_DIR/.tmp"
mkdir -p "$RUNNER_DIR"
RUNNER="$(mktemp "$RUNNER_DIR/qibotn_vidal_contest.XXXXXX.py")"
cleanup() {
rm -f "$RUNNER"
}
trap cleanup EXIT
cat > "$RUNNER" <<'PY'
from __future__ import annotations
import argparse
import math
import time
import numpy as np
from mpi4py import MPI
from qibo import Circuit, gates, hamiltonians
from qibo.symbols import X, Y, Z
from qibotn.backends.vidal import VidalBackend
def set_torch_threads(nthreads):
try:
import torch
torch.set_num_threads(nthreads)
except Exception:
pass
def build_circuit(kind, nqubits, nlayers, seed):
rng = np.random.default_rng(seed)
circuit = Circuit(nqubits)
for layer in range(nlayers):
for q in range(nqubits):
circuit.add(gates.RY(q, theta=rng.uniform(-math.pi, math.pi)))
circuit.add(gates.RZ(q, theta=rng.uniform(-math.pi, math.pi)))
if kind in ("rxx_rzz", "scramble"):
circuit.add(gates.RX(q, theta=rng.uniform(-math.pi, math.pi)))
if kind == "reversed_cnot":
for q in range(0, nqubits - 1, 2):
circuit.add(gates.CNOT(q + 1, q) if layer % 2 else gates.CNOT(q, q + 1))
for q in range(1, nqubits - 1, 2):
circuit.add(gates.CNOT(q + 1, q) if layer % 2 == 0 else gates.CNOT(q, q + 1))
elif kind == "rxx_rzz":
for q in range(layer % 2, nqubits - 1, 2):
circuit.add(gates.RXX(q, q + 1, theta=rng.uniform(-0.9, 0.9)))
circuit.add(gates.RZZ(q, q + 1, theta=rng.uniform(-0.9, 0.9)))
elif kind == "scramble":
for q in range(layer % 2, nqubits - 1, 2):
circuit.add(gates.RXX(q, q + 1, theta=rng.uniform(-0.8, 0.8)))
circuit.add(gates.RZZ(q, q + 1, theta=rng.uniform(-0.8, 0.8)))
if layer % 5 == 4:
circuit.add(gates.SWAP(q, q + 1))
else:
raise ValueError(f"Unknown circuit kind {kind!r}.")
return circuit
def ring_xz(nqubits):
form = 0
for q in range(nqubits):
form += 0.5 * X(q) * Z((q + 1) % nqubits)
return hamiltonians.SymbolicHamiltonian(form=form)
def open_zz(nqubits):
form = 0
for q in range(nqubits - 1):
form += (1.0 / (nqubits - 1)) * Z(q) * Z(q + 1)
return hamiltonians.SymbolicHamiltonian(form=form)
def range2_xx(nqubits):
form = 0
for q in range(nqubits - 2):
form += (1.0 / (nqubits - 2)) * X(q) * X(q + 2)
return hamiltonians.SymbolicHamiltonian(form=form)
def dense_observable(nqubits, qubits, seed, dim):
rng = np.random.default_rng(seed)
raw = rng.normal(size=(dim, dim)) + 1j * rng.normal(size=(dim, dim))
matrix = (raw + raw.conj().T) / 2.0
matrix = matrix / np.linalg.norm(matrix)
return {"matrix": matrix, "qubits": list(qubits)}
def observables_for_case(nqubits, seed):
q1 = nqubits // 4
q2 = nqubits // 2
q3 = (3 * nqubits) // 4
last = nqubits - 1
return [
("boundary_ZZ_q1", hamiltonians.SymbolicHamiltonian(form=Z(q1 - 1) * Z(q1))),
("boundary_ZZ_q2", hamiltonians.SymbolicHamiltonian(form=Z(q2 - 1) * Z(q2))),
("boundary_ZZ_q3", hamiltonians.SymbolicHamiltonian(form=Z(q3 - 1) * Z(q3))),
(
"long_Z_5_sites",
hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(q1) * Z(q2) * Z(q3) * Z(last)),
),
(
"mixed_XZYZX",
hamiltonians.SymbolicHamiltonian(form=X(0) * Z(q1) * Y(q2) * Z(q3) * X(last)),
),
("ring_xz", ring_xz(nqubits)),
("open_zz", open_zz(nqubits)),
("range2_xx", range2_xx(nqubits)),
("complex_iZ0", hamiltonians.SymbolicHamiltonian(form=1.0j * Z(0))),
("dense2_mid", dense_observable(nqubits, (q2 - 1, q2), seed + 101, 4)),
("dense3_spread", dense_observable(nqubits, (q1, q2, q3), seed + 202, 8)),
]
def run_case(args):
set_torch_threads(args.torch_threads)
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
circuit = build_circuit(args.kind, args.nqubits, args.nlayers, args.seed)
observables = observables_for_case(args.nqubits, args.seed)
if args.obs_filter:
wanted = set(args.obs_filter.split(","))
observables = [(name, obs) for name, obs in observables if name in wanted]
if not observables:
raise ValueError(f"OBS_FILTER matched no observables: {args.obs_filter!r}")
if rank == 0:
print("=" * 88, flush=True)
print(
"case "
f"label={args.label} kind={args.kind} ranks={size} "
f"nqubits={args.nqubits} nlayers={args.nlayers} gates={len(circuit.queue)} "
f"bond={args.bond} cut_ratio={args.cut_ratio:g} "
f"torch_threads={args.torch_threads} seed={args.seed} "
f"obs_filter={args.obs_filter or 'all'}",
flush=True,
)
print(
"observable value seconds trunc_sum trunc_max status",
flush=True,
)
for obs_name, observable in observables:
backend = VidalBackend()
backend.configure_tn_simulation(
max_bond_dimension=args.bond,
cut_ratio=args.cut_ratio,
tensor_module="torch",
mpi_approach="CT",
mpi_num_procs=size,
fallback=False,
)
comm.Barrier()
start = time.perf_counter()
try:
value = backend.expectation(
circuit,
observable,
preprocess=True,
compile_circuit=False,
)
status = "ok"
except Exception as exc: # pragma: no cover - printed for manual runs
value = np.nan
status = type(exc).__name__ + ":" + str(exc).split("\n", 1)[0]
seconds = time.perf_counter() - start
if rank == 0:
print(
f"{obs_name} {value!r} {seconds:.3f} "
f"{backend.last_truncation_error:.6e} "
f"{backend.last_max_truncation_error:.6e} {status}",
flush=True,
)
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--label", required=True)
parser.add_argument("--kind", choices=("reversed_cnot", "rxx_rzz", "scramble"), required=True)
parser.add_argument("--nqubits", type=int, required=True)
parser.add_argument("--nlayers", type=int, required=True)
parser.add_argument("--bond", type=int, required=True)
parser.add_argument("--cut-ratio", type=float, required=True)
parser.add_argument("--seed", type=int, required=True)
parser.add_argument("--torch-threads", type=int, required=True)
parser.add_argument("--obs-filter", default="")
run_case(parser.parse_args())
if __name__ == "__main__":
main()
PY
if [[ -n "${MPIEXEC_FULL:-}" ]]; then
read -r -a mpi_prefix <<< "$MPIEXEC_FULL"
else
mpi_prefix=("$MPIEXEC")
if [[ -n "$HOSTFILE" && -f "$HOSTFILE" ]]; then
mpi_prefix+=("-hostfile" "$HOSTFILE")
fi
mpi_prefix+=("-n" "$RANKS")
fi
run_case() {
local label="$1"
local kind="$2"
local nq="$3"
local layers="$4"
local bond="$5"
local seed="$6"
echo
echo "Running $label: kind=$kind nqubits=$nq layers=$layers bond=$bond seed=$seed"
echo "MPI: ${mpi_prefix[*]}"
"${mpi_prefix[@]}" "$PYTHON_BIN" -u "$ROOT_DIR/tools/vidal_mpi_contest_runner.py" \
--label "$label" \
--kind "$kind" \
--nqubits "$nq" \
--nlayers "$layers" \
--bond "$bond" \
--cut-ratio "$CUT_RATIO" \
--seed "$seed" \
--torch-threads "$TORCH_THREADS" \
--obs-filter "$(tr ' ' ',' <<< "$OBS_FILTER")"
}
case "${1:-help}" in
main1)
run_case \
"main1-reversed-cnot" \
"reversed_cnot" \
"${MAIN1_NQ:-128}" \
"${MAIN1_LAYERS:-50}" \
"${MAIN1_BOND:-1024}" \
"${MAIN1_SEED:-31001}"
;;
main2)
run_case \
"main2-rxx-rzz" \
"rxx_rzz" \
"${MAIN2_NQ:-128}" \
"${MAIN2_LAYERS:-64}" \
"${MAIN2_BOND:-2048}" \
"${MAIN2_SEED:-31002}"
;;
strong)
run_case \
"strong-scramble" \
"scramble" \
"${STRONG_NQ:-256}" \
"${STRONG_LAYERS:-64}" \
"${STRONG_BOND:-2048}" \
"${STRONG_SEED:-41001}"
;;
all)
"$0" main1
"$0" main2
"$0" strong
;;
smoke)
MAIN1_NQ="${MAIN1_NQ:-32}" \
MAIN1_LAYERS="${MAIN1_LAYERS:-6}" \
MAIN1_BOND="${MAIN1_BOND:-128}" \
"$0" main1
;;
help|*)
cat >&2 <<'EOF'
Usage: tools/run_vidal_mpi_contest_cases.sh [main1|main2|strong|all|smoke]
Cases:
main1 128 qubits, 50 layers, reversed-CNOT brickwall, chi=1024
main2 128 qubits, 64 layers, RXX/RZZ brickwall, chi=2048
strong 256 qubits, 64 layers, RXX/RZZ + periodic SWAP scramble, chi=2048
smoke Small syntax/runtime check of main1
Common overrides:
PYTHON_BIN=.venv/bin/python
MPIEXEC=mpiexec
MPIEXEC_FULL="mpirun -np 4 -hostfile /home/yx/qibotn/hostfile -perhost 2"
HOSTFILE=hostfile
RANKS=8
TORCH_THREADS=8
CUT_RATIO=1e-12
OBS_FILTER="boundary_ZZ_q2 ring_xz dense3_spread complex_iZ0"
Per-case overrides:
MAIN1_NQ=128 MAIN1_LAYERS=50 MAIN1_BOND=1024 MAIN1_SEED=31001
MAIN2_NQ=128 MAIN2_LAYERS=64 MAIN2_BOND=2048 MAIN2_SEED=31002
STRONG_NQ=256 STRONG_LAYERS=64 STRONG_BOND=2048 STRONG_SEED=41001
EOF
exit 2
;;
esac

View File

@@ -0,0 +1,59 @@
"""Slice an existing saved cotengra tree without re-running path search."""
from __future__ import annotations
import argparse
import pickle
from pathlib import Path
from qibotn.parallel import contraction_tree_costs
def main():
parser = argparse.ArgumentParser()
parser.add_argument("input", help="Input pickle saved by --tn-save-tree.")
parser.add_argument("output", help="Output pickle path.")
parser.add_argument("--term", type=int, default=0)
parser.add_argument("--target-slices", type=int, default=2)
parser.add_argument("--max-repeats", type=int, default=64)
parser.add_argument("--seed", type=int, default=42)
args = parser.parse_args()
input_path = Path(args.input)
output_path = Path(args.output)
with input_path.open("rb") as f:
payload = pickle.load(f)
trees = payload["trees"] if isinstance(payload, dict) else payload
if not isinstance(trees, (list, tuple)):
trees = [trees]
tree = trees[args.term]
print("original", contraction_tree_costs(tree), flush=True)
sliced = tree.slice(
target_slices=args.target_slices,
max_repeats=args.max_repeats,
seed=args.seed,
)
print("sliced", contraction_tree_costs(sliced), flush=True)
print(f"sliced_inds={sliced.sliced_inds}", flush=True)
new_trees = list(trees)
new_trees[args.term] = sliced
if isinstance(payload, dict):
out_payload = dict(payload)
out_payload["trees"] = new_trees
out_payload["costs"] = [contraction_tree_costs(t) for t in new_trees]
out_payload["nterms"] = len(new_trees)
else:
out_payload = new_trees
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open("wb") as f:
pickle.dump(out_payload, f)
print(f"saved {output_path}", flush=True)
if __name__ == "__main__":
main()

435
tools/tn_contest_runner.py Normal file
View File

@@ -0,0 +1,435 @@
#!/usr/bin/env python
"""Contest-style CPU TN path search and contraction runner.
This file is intentionally self-contained: define contest circuits and
observables here, run path search once, then load the saved trees for repeated
MPI contractions.
"""
from __future__ import annotations
import argparse
import math
import os
import subprocess
import sys
from dataclasses import dataclass
from pathlib import Path
from urllib.parse import urlparse
import numpy as np
from qibo import Circuit, gates, hamiltonians
from qibo.symbols import X, Y, Z
ROOT = Path(__file__).resolve().parents[1]
SRC = ROOT / "src"
if str(SRC) not in sys.path:
sys.path.insert(0, str(SRC))
from qibotn.expectation_runner import ( # noqa: E402
ExpectationConfig,
exact_for_observable,
run_cpu_expectation,
)
@dataclass(frozen=True)
class CaseSpec:
circuit_kind: str
observables: tuple[str, ...]
nqubits: int
nlayers: int
seed: int
target_slices: int | None = None
CASES = {
"main1": CaseSpec(
circuit_kind="rxx_rzz_chain",
observables=("ring_xz",),
nqubits=34,
nlayers=20,
seed=31001,
target_slices=None,
),
"main2": CaseSpec(
circuit_kind="scramble_chain",
observables=("open_zz", "range2_xx"),
nqubits=36,
nlayers=18,
seed=31002,
target_slices=None,
),
"strong": CaseSpec(
circuit_kind="reversed_cnot",
observables=("ring_xz", "long_z_string"),
nqubits=40,
nlayers=24,
seed=41001,
target_slices=None,
),
}
def optional_int(text):
if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
return None
return int(text)
def optional_float(text):
if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
return None
return float(text)
def set_torch_threads(nthreads):
try:
import torch
torch.set_num_threads(nthreads)
except Exception:
pass
def add_single_qubit_layer(circuit, nqubits, rng, include_rx=False):
for qubit in range(nqubits):
circuit.add(gates.RY(qubit, theta=rng.uniform(-math.pi, math.pi)))
circuit.add(gates.RZ(qubit, theta=rng.uniform(-math.pi, math.pi)))
if include_rx:
circuit.add(gates.RX(qubit, theta=rng.uniform(-math.pi, math.pi)))
def build_circuit(kind, nqubits, nlayers, seed):
"""Define contest circuits here."""
rng = np.random.default_rng(seed)
circuit = Circuit(nqubits)
for layer in range(nlayers):
if kind == "rxx_rzz_chain":
add_single_qubit_layer(circuit, nqubits, rng, include_rx=True)
for qubit in range(layer % 2, nqubits - 1, 2):
circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.9, 0.9)))
circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.9, 0.9)))
elif kind == "scramble_chain":
add_single_qubit_layer(circuit, nqubits, rng, include_rx=True)
for qubit in range(layer % 2, nqubits - 1, 2):
circuit.add(gates.RXX(qubit, qubit + 1, theta=rng.uniform(-0.8, 0.8)))
circuit.add(gates.RZZ(qubit, qubit + 1, theta=rng.uniform(-0.8, 0.8)))
if layer % 5 == 4:
circuit.add(gates.SWAP(qubit, qubit + 1))
elif kind == "reversed_cnot":
add_single_qubit_layer(circuit, nqubits, rng)
for qubit in range(0, nqubits - 1, 2):
gate = gates.CNOT(qubit + 1, qubit) if layer % 2 else gates.CNOT(qubit, qubit + 1)
circuit.add(gate)
for qubit in range(1, nqubits - 1, 2):
gate = gates.CNOT(qubit + 1, qubit) if layer % 2 == 0 else gates.CNOT(qubit, qubit + 1)
circuit.add(gate)
else:
raise ValueError(f"Unknown circuit kind {kind!r}.")
return circuit
def pauli_sum_observable(kind, nqubits, seed):
"""Define contest observables here.
TN path currently expects Pauli products / SymbolicHamiltonian terms.
Keep production contest observables Hermitian unless complex output is
explicitly required by the scoring rule.
"""
del seed
if kind == "ring_xz":
form = 0
for qubit in range(nqubits):
form += 0.5 * X(qubit) * Z((qubit + 1) % nqubits)
return hamiltonians.SymbolicHamiltonian(form=form)
if kind == "open_zz":
form = 0
for qubit in range(nqubits - 1):
form += (1.0 / max(1, nqubits - 1)) * Z(qubit) * Z(qubit + 1)
return hamiltonians.SymbolicHamiltonian(form=form)
if kind == "range2_xx":
form = 0
for qubit in range(nqubits - 2):
form += (1.0 / max(1, nqubits - 2)) * X(qubit) * X(qubit + 2)
return hamiltonians.SymbolicHamiltonian(form=form)
if kind == "long_z_string":
stride = max(1, nqubits // 16)
form = None
for qubit in range(0, nqubits, stride):
form = Z(qubit) if form is None else form * Z(qubit)
return hamiltonians.SymbolicHamiltonian(form=form)
if kind == "mixed_local":
q1 = nqubits // 4
q2 = nqubits // 2
q3 = (3 * nqubits) // 4
form = 0.25 * X(0) - 0.5 * Z(nqubits - 1)
form += 0.125 * X(q1) * Z(q2) * Y(q3)
return hamiltonians.SymbolicHamiltonian(form=form)
raise ValueError(f"Unknown observable kind {kind!r}.")
def tree_path(tree_dir, case_name, obs_name, nqubits, nlayers, target_slices):
slice_label = "auto" if target_slices is None else f"s{target_slices}"
return (
Path(tree_dir)
/ f"{case_name}_{obs_name}_{nqubits}q{nlayers}l_{slice_label}.pkl"
)
def build_parallel_opts(args, tree_file=None, search_only=False):
slicing_opts = {}
if args.tn_target_slices is not None:
slicing_opts["target_slices"] = args.tn_target_slices
if args.tn_target_size is not None:
slicing_opts["target_size"] = args.tn_target_size
opts = {
"slicing_opts": slicing_opts or None,
"search_workers": args.tn_search_workers or args.torch_threads,
"max_repeats": args.tn_search_repeats,
"max_time": args.tn_search_time,
"print_stats": not args.no_tn_stats,
}
if args.tn_search_backend is not None:
opts["search_backend"] = args.tn_search_backend
if args.dask_address is not None:
opts["dask_address"] = args.dask_address
if args.dask_close_workers:
opts["dask_close_workers"] = True
if args.tn_debug_trials:
opts["debug_trials"] = True
if search_only:
opts["search_only"] = True
opts["save_tree_path"] = str(tree_file)
elif tree_file is not None:
opts["load_tree_path"] = str(tree_file)
return opts
def run_one(args, case_name, obs_name, mode):
case = CASES[case_name]
circuit = build_circuit(case.circuit_kind, args.nqubits, args.nlayers, args.seed)
observable = pauli_sum_observable(obs_name, args.nqubits, args.seed)
path = tree_path(
args.tree_dir,
case_name,
obs_name,
args.nqubits,
args.nlayers,
args.tn_target_slices,
)
path.parent.mkdir(parents=True, exist_ok=True)
rank = 0
if args.mpi:
from mpi4py import MPI
rank = MPI.COMM_WORLD.Get_rank()
if rank == 0:
print("=" * 88, flush=True)
print(
f"mode={mode} case={case_name} circuit={case.circuit_kind} "
f"observable={obs_name} nqubits={args.nqubits} nlayers={args.nlayers} "
f"seed={args.seed} gates={len(circuit.queue)} tree={path}",
flush=True,
)
if mode == "contract" and not path.exists():
raise FileNotFoundError(f"Missing tree file: {path}. Run search first.")
exact = None
if args.exact and rank == 0 and mode != "search":
if args.nqubits > args.exact_max_qubits:
raise ValueError(
f"--exact is limited to {args.exact_max_qubits} qubits by default."
)
exact = exact_for_observable(circuit, observable, args.nqubits)
config = ExpectationConfig(
ansatz="tn",
mpi=args.mpi,
bond=args.bond,
cut_ratio=args.cut_ratio,
tensor_module="torch",
quimb_backend=args.quimb_backend,
dtype=args.dtype,
torch_threads=args.torch_threads,
parallel_opts=build_parallel_opts(
args,
tree_file=path,
search_only=(mode == "search"),
),
)
result = run_cpu_expectation(circuit, observable, config)
if args.mpi and result.rank != 0:
return
if mode == "search":
print(f"searched observable={obs_name} tree={path}", flush=True)
else:
abs_error = float("nan") if exact is None else abs(result.value - exact)
rel_error = float("nan") if exact is None else abs_error / max(abs(exact), 1e-15)
exact_text = "nan" if exact is None else f"{exact:.16e}"
print(
f"result observable={obs_name} exact={exact_text} "
f"value={result.value:.16e} abs_error={abs_error:.6e} "
f"rel_error={rel_error:.6e} seconds={result.seconds:.3f}",
flush=True,
)
for stat in result.parallel_stats or ():
cost = stat["path_cost"]
search_stats = stat.get("search_stats", {})
print(
"tn_term_summary "
f"observable={obs_name} "
f"term={stat.get('term_index', 0)} "
f"search_seconds={stat.get('search_seconds', float('nan')):.3f} "
f"contract_seconds={stat.get('contract_seconds', float('nan')):.3f} "
f"completed_trials={search_stats.get('completed_trials', 'na')} "
f"finite_trials={search_stats.get('finite_trials', 'na')} "
f"failed_trials={search_stats.get('failed_trials', 'na')} "
f"requested_trials={search_stats.get('requested_trials', 'na')} "
f"best_score={search_stats.get('best_score', float('nan')):.6g} "
f"slices={cost.get('slices')} "
f"log10_flops={cost.get('log10_flops', float('nan')):.3f} "
f"log10_write={cost.get('log10_write', float('nan')):.3f} "
f"log2_size={cost.get('log2_size', float('nan')):.3f} "
f"peak_memory_gib={cost.get('peak_memory_gib', float('nan')):.3g} "
f"rank_slices={stat.get('rank_slices')}",
flush=True,
)
def selected_observables(args, case):
if args.observables:
return tuple(args.observables)
if args.obs_filter:
return tuple(x.strip() for x in args.obs_filter.split(",") if x.strip())
return case.observables
def apply_case_defaults(args):
case = CASES[args.case]
if args.nqubits is None:
args.nqubits = case.nqubits
if args.nlayers is None:
args.nlayers = case.nlayers
if args.seed is None:
args.seed = case.seed
if args.tn_target_slices is None:
args.tn_target_slices = case.target_slices
args.observables = selected_observables(args, case)
def stop_dask_cluster(args):
if args.keep_dask or args.tn_search_backend != "dask" or not args.dask_address:
return
script = ROOT / "tools" / "manage_tn_dask_cluster.sh"
if not script.exists():
print(f"dask_stop_skipped reason=missing_script path={script}", flush=True)
return
env = os.environ.copy()
parsed = urlparse(args.dask_address)
if parsed.hostname:
env.setdefault("SCHEDULER_HOST", parsed.hostname)
if parsed.port:
env.setdefault("SCHEDULER_PORT", str(parsed.port))
print("dask_stop_after_search start", flush=True)
subprocess.run([str(script), "stop"], cwd=str(ROOT), env=env, check=False)
print("dask_stop_after_search done", flush=True)
def main():
parser = argparse.ArgumentParser()
parser.add_argument("mode", choices=("search", "contract", "all", "validate", "list"))
parser.add_argument("--case", choices=sorted(CASES), default="main1")
parser.add_argument("--observables", nargs="+")
parser.add_argument("--obs-filter", default="")
parser.add_argument("--tree-dir", default="trees/contest_tn")
parser.add_argument("--nqubits", type=int)
parser.add_argument("--nlayers", type=int)
parser.add_argument("--seed", type=int)
parser.add_argument("--mpi", action="store_true")
parser.add_argument("--exact", action="store_true")
parser.add_argument("--exact-max-qubits", type=int, default=24)
parser.add_argument("--bond", "--bonds", dest="bond", type=optional_int, default=1024)
parser.add_argument("--cut-ratio", type=optional_float, default=1e-12)
parser.add_argument("--torch-threads", type=int, default=8)
parser.add_argument("--quimb-backend", choices=("numpy", "torch"), default="torch")
parser.add_argument("--dtype", choices=("complex128", "complex64"), default="complex64")
parser.add_argument("--tn-target-slices", type=int)
parser.add_argument("--tn-target-size", type=int, default=2**32)
parser.add_argument("--tn-search-workers", type=int)
parser.add_argument("--tn-search-repeats", type=int, default=2048)
parser.add_argument("--tn-search-time", type=float, default=300.0)
parser.add_argument(
"--tn-search-backend",
choices=("processpool", "dask"),
default="dask",
help=(
"Path-search backend. Defaults to dask. Without --dask-address, "
"non-MPI search starts a local dask cluster."
),
)
parser.add_argument("--dask-address")
parser.add_argument("--dask-close-workers", action="store_true")
parser.add_argument(
"--keep-dask",
action="store_true",
help=(
"Keep an external dask cluster running after search. By default, "
"tools/manage_tn_dask_cluster.sh stop is called after search when "
"--dask-address is used."
),
)
parser.add_argument(
"--tn-debug-trials",
action="store_true",
help="Print dask worker summary and per-trial start/done logs.",
)
parser.add_argument("--no-tn-stats", action="store_true")
args = parser.parse_args()
if args.mode == "list":
for name, case in CASES.items():
print(
f"{name}: circuit={case.circuit_kind} "
f"observables={','.join(case.observables)} "
f"nqubits={case.nqubits} nlayers={case.nlayers} "
f"seed={case.seed} target_slices={case.target_slices}"
)
return
apply_case_defaults(args)
set_torch_threads(args.torch_threads)
modes = ("search", "contract") if args.mode == "all" else (args.mode,)
if args.mode == "validate":
args.exact = True
args.nqubits = min(args.nqubits, args.exact_max_qubits)
modes = ("search", "contract")
for mode in modes:
for obs_name in args.observables:
run_one(args, args.case, obs_name, mode)
if mode == "search":
stop_dask_cluster(args)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,114 @@
"""Run the 34q/20L TN complex64 benchmark under torch.profiler briefly."""
from __future__ import annotations
import argparse
import os
import signal
import sys
from pathlib import Path
from mpi4py import MPI
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--seconds", type=float, default=30.0)
parser.add_argument("--out-dir", default="torch_profiles/tn_complex64")
parser.add_argument("--torch-threads", type=int, default=48)
args = parser.parse_args()
repo_root = Path(__file__).resolve().parents[1]
os.chdir(repo_root)
sys.path.insert(0, str(repo_root))
import torch
from torch.profiler import ProfilerActivity, profile
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
out_dir = Path(args.out_dir)
if rank == 0:
out_dir.mkdir(parents=True, exist_ok=True)
comm.Barrier()
torch.set_num_threads(args.torch_threads)
def run_benchmark():
import benchmark_cpu_expectation
sys.argv = [
"benchmark_cpu_expectation.py",
"--mpi",
"--ansatz",
"tn",
"--nqubits",
"34",
"--nlayers",
"20",
"--circuits",
"rxx_rzz",
"--pauli-pattern",
"XZ",
"--tn-load-tree",
"trees/rxx_rzz_34q20l_s4.pkl",
"--quimb-backend",
"torch",
"--torch-threads",
str(args.torch_threads),
"--dtype",
"complex64",
]
benchmark_cpu_expectation.main()
trace_path = out_dir / f"rank{rank}_trace.json"
stacks_path = out_dir / f"rank{rank}_stacks.txt"
summary_path = out_dir / f"rank{rank}_summary.txt"
prof = profile(
activities=[ProfilerActivity.CPU],
record_shapes=True,
profile_memory=True,
with_stack=True,
)
class ProfileTimeout(Exception):
pass
def alarm_handler(signum, frame):
raise ProfileTimeout()
old_handler = signal.signal(signal.SIGALRM, alarm_handler)
signal.setitimer(signal.ITIMER_REAL, args.seconds)
try:
with prof:
try:
run_benchmark()
except ProfileTimeout:
pass
finally:
signal.setitimer(signal.ITIMER_REAL, 0)
signal.signal(signal.SIGALRM, old_handler)
prof.export_chrome_trace(str(trace_path))
try:
prof.export_stacks(str(stacks_path), "self_cpu_time_total")
except Exception as exc: # pragma: no cover - diagnostic only
stacks_path.write_text(f"export_stacks failed: {exc}\n", encoding="utf-8")
summary = prof.key_averages(group_by_stack_n=5).table(
sort_by="self_cpu_time_total",
row_limit=40,
)
summary_path.write_text(summary, encoding="utf-8")
print(
f"torch_profile_done rank={rank}/{size} "
f"trace={trace_path} summary={summary_path}",
flush=True,
)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,209 @@
from __future__ import annotations
import argparse
import math
import time
import numpy as np
from mpi4py import MPI
from qibo import Circuit, gates, hamiltonians
from qibo.symbols import X, Y, Z
from qibotn.backends.vidal import VidalBackend
def optional_int(text):
if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
return None
return int(text)
def optional_float(text):
if isinstance(text, str) and text.lower() in {"none", "null", "inf", "unlimited"}:
return None
return float(text)
def format_optional(value, fmt="g"):
return "None" if value is None else format(value, fmt)
def set_torch_threads(nthreads):
try:
import torch
torch.set_num_threads(nthreads)
except Exception:
pass
def build_circuit(kind, nqubits, nlayers, seed):
rng = np.random.default_rng(seed)
circuit = Circuit(nqubits)
for layer in range(nlayers):
for q in range(nqubits):
circuit.add(gates.RY(q, theta=rng.uniform(-math.pi, math.pi)))
circuit.add(gates.RZ(q, theta=rng.uniform(-math.pi, math.pi)))
if kind in ("rxx_rzz", "scramble"):
circuit.add(gates.RX(q, theta=rng.uniform(-math.pi, math.pi)))
if kind == "reversed_cnot":
for q in range(0, nqubits - 1, 2):
circuit.add(gates.CNOT(q + 1, q) if layer % 2 else gates.CNOT(q, q + 1))
for q in range(1, nqubits - 1, 2):
circuit.add(gates.CNOT(q + 1, q) if layer % 2 == 0 else gates.CNOT(q, q + 1))
elif kind == "rxx_rzz":
for q in range(layer % 2, nqubits - 1, 2):
circuit.add(gates.RXX(q, q + 1, theta=rng.uniform(-0.9, 0.9)))
circuit.add(gates.RZZ(q, q + 1, theta=rng.uniform(-0.9, 0.9)))
elif kind == "scramble":
for q in range(layer % 2, nqubits - 1, 2):
circuit.add(gates.RXX(q, q + 1, theta=rng.uniform(-0.8, 0.8)))
circuit.add(gates.RZZ(q, q + 1, theta=rng.uniform(-0.8, 0.8)))
if layer % 5 == 4:
circuit.add(gates.SWAP(q, q + 1))
else:
raise ValueError(f"Unknown circuit kind {kind!r}.")
return circuit
def ring_xz(nqubits):
form = 0
for q in range(nqubits):
form += 0.5 * X(q) * Z((q + 1) % nqubits)
return hamiltonians.SymbolicHamiltonian(form=form)
def open_zz(nqubits):
form = 0
for q in range(nqubits - 1):
form += (1.0 / (nqubits - 1)) * Z(q) * Z(q + 1)
return hamiltonians.SymbolicHamiltonian(form=form)
def range2_xx(nqubits):
form = 0
for q in range(nqubits - 2):
form += (1.0 / (nqubits - 2)) * X(q) * X(q + 2)
return hamiltonians.SymbolicHamiltonian(form=form)
def dense_observable(nqubits, qubits, seed, dim):
rng = np.random.default_rng(seed)
raw = rng.normal(size=(dim, dim)) + 1j * rng.normal(size=(dim, dim))
matrix = (raw + raw.conj().T) / 2.0
matrix = matrix / np.linalg.norm(matrix)
return {"matrix": matrix, "qubits": list(qubits)}
def observables_for_case(nqubits, seed):
q1 = nqubits // 4
q2 = nqubits // 2
q3 = (3 * nqubits) // 4
last = nqubits - 1
return [
("boundary_ZZ_q1", hamiltonians.SymbolicHamiltonian(form=Z(q1 - 1) * Z(q1))),
("boundary_ZZ_q2", hamiltonians.SymbolicHamiltonian(form=Z(q2 - 1) * Z(q2))),
("boundary_ZZ_q3", hamiltonians.SymbolicHamiltonian(form=Z(q3 - 1) * Z(q3))),
(
"long_Z_5_sites",
hamiltonians.SymbolicHamiltonian(form=Z(0) * Z(q1) * Z(q2) * Z(q3) * Z(last)),
),
(
"mixed_XZYZX",
hamiltonians.SymbolicHamiltonian(form=X(0) * Z(q1) * Y(q2) * Z(q3) * X(last)),
),
("ring_xz", ring_xz(nqubits)),
("open_zz", open_zz(nqubits)),
("range2_xx", range2_xx(nqubits)),
("complex_iZ0", hamiltonians.SymbolicHamiltonian(form=1.0j * Z(0))),
("dense2_mid", dense_observable(nqubits, (q2 - 1, q2), seed + 101, 4)),
("dense3_spread", dense_observable(nqubits, (q1, q2, q3), seed + 202, 8)),
]
def run_case(args):
set_torch_threads(args.torch_threads)
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
circuit = build_circuit(args.kind, args.nqubits, args.nlayers, args.seed)
observables = observables_for_case(args.nqubits, args.seed)
if args.obs_filter:
wanted = set(args.obs_filter.split(","))
observables = [(name, obs) for name, obs in observables if name in wanted]
if not observables:
raise ValueError(f"OBS_FILTER matched no observables: {args.obs_filter!r}")
if rank == 0:
print("=" * 88, flush=True)
print(
"case "
f"label={args.label} kind={args.kind} ranks={size} "
f"nqubits={args.nqubits} nlayers={args.nlayers} gates={len(circuit.queue)} "
f"bond={format_optional(args.bond)} "
f"cut_ratio={format_optional(args.cut_ratio)} "
f"torch_threads={args.torch_threads} seed={args.seed} "
f"obs_filter={args.obs_filter or 'all'}",
flush=True,
)
print(
"observable value seconds trunc_sum trunc_max status",
flush=True,
)
for obs_name, observable in observables:
backend = VidalBackend()
backend.configure_tn_simulation(
max_bond_dimension=args.bond,
cut_ratio=args.cut_ratio,
tensor_module="torch",
mpi_approach="CT",
mpi_num_procs=size,
fallback=False,
)
comm.Barrier()
start = time.perf_counter()
try:
value = backend.expectation(
circuit,
observable,
preprocess=True,
compile_circuit=False,
)
status = "ok"
except Exception as exc: # pragma: no cover - printed for manual runs
value = np.nan
status = type(exc).__name__ + ":" + str(exc).split("\n", 1)[0]
seconds = time.perf_counter() - start
if rank == 0:
print(
f"{obs_name} {value!r} {seconds:.3f} "
f"{backend.last_truncation_error:.6e} "
f"{backend.last_max_truncation_error:.6e} {status}",
flush=True,
)
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--label", required=True)
parser.add_argument("--kind", choices=("reversed_cnot", "rxx_rzz", "scramble"), required=True)
parser.add_argument("--nqubits", type=int, required=True)
parser.add_argument("--nlayers", type=int, required=True)
parser.add_argument("--bond", type=optional_int, required=True)
parser.add_argument("--cut-ratio", type=optional_float, required=True)
parser.add_argument("--seed", type=int, required=True)
parser.add_argument("--torch-threads", type=int, required=True)
parser.add_argument("--obs-filter", default="")
run_case(parser.parse_args())
if __name__ == "__main__":
main()

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
trees/rxx_rzz_30q20l.pkl Normal file

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
trees/rxx_rzz_34q20l_s4.pkl Normal file

Binary file not shown.